linux/fs/btrfs/tree-log.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include "ctree.h"
  22#include "transaction.h"
  23#include "disk-io.h"
  24#include "locking.h"
  25#include "print-tree.h"
  26#include "compat.h"
  27#include "tree-log.h"
  28
  29/* magic values for the inode_only field in btrfs_log_inode:
  30 *
  31 * LOG_INODE_ALL means to log everything
  32 * LOG_INODE_EXISTS means to log just enough to recreate the inode
  33 * during log replay
  34 */
  35#define LOG_INODE_ALL 0
  36#define LOG_INODE_EXISTS 1
  37
  38/*
  39 * directory trouble cases
  40 *
  41 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  42 * log, we must force a full commit before doing an fsync of the directory
  43 * where the unlink was done.
  44 * ---> record transid of last unlink/rename per directory
  45 *
  46 * mkdir foo/some_dir
  47 * normal commit
  48 * rename foo/some_dir foo2/some_dir
  49 * mkdir foo/some_dir
  50 * fsync foo/some_dir/some_file
  51 *
  52 * The fsync above will unlink the original some_dir without recording
  53 * it in its new location (foo2).  After a crash, some_dir will be gone
  54 * unless the fsync of some_file forces a full commit
  55 *
  56 * 2) we must log any new names for any file or dir that is in the fsync
  57 * log. ---> check inode while renaming/linking.
  58 *
  59 * 2a) we must log any new names for any file or dir during rename
  60 * when the directory they are being removed from was logged.
  61 * ---> check inode and old parent dir during rename
  62 *
  63 *  2a is actually the more important variant.  With the extra logging
  64 *  a crash might unlink the old name without recreating the new one
  65 *
  66 * 3) after a crash, we must go through any directories with a link count
  67 * of zero and redo the rm -rf
  68 *
  69 * mkdir f1/foo
  70 * normal commit
  71 * rm -rf f1/foo
  72 * fsync(f1)
  73 *
  74 * The directory f1 was fully removed from the FS, but fsync was never
  75 * called on f1, only its parent dir.  After a crash the rm -rf must
  76 * be replayed.  This must be able to recurse down the entire
  77 * directory tree.  The inode link count fixup code takes care of the
  78 * ugly details.
  79 */
  80
  81/*
  82 * stages for the tree walking.  The first
  83 * stage (0) is to only pin down the blocks we find
  84 * the second stage (1) is to make sure that all the inodes
  85 * we find in the log are created in the subvolume.
  86 *
  87 * The last stage is to deal with directories and links and extents
  88 * and all the other fun semantics
  89 */
  90#define LOG_WALK_PIN_ONLY 0
  91#define LOG_WALK_REPLAY_INODES 1
  92#define LOG_WALK_REPLAY_ALL 2
  93
  94static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  95                             struct btrfs_root *root, struct inode *inode,
  96                             int inode_only);
  97static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  98                             struct btrfs_root *root,
  99                             struct btrfs_path *path, u64 objectid);
 100static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 101                                       struct btrfs_root *root,
 102                                       struct btrfs_root *log,
 103                                       struct btrfs_path *path,
 104                                       u64 dirid, int del_all);
 105
 106/*
 107 * tree logging is a special write ahead log used to make sure that
 108 * fsyncs and O_SYNCs can happen without doing full tree commits.
 109 *
 110 * Full tree commits are expensive because they require commonly
 111 * modified blocks to be recowed, creating many dirty pages in the
 112 * extent tree an 4x-6x higher write load than ext3.
 113 *
 114 * Instead of doing a tree commit on every fsync, we use the
 115 * key ranges and transaction ids to find items for a given file or directory
 116 * that have changed in this transaction.  Those items are copied into
 117 * a special tree (one per subvolume root), that tree is written to disk
 118 * and then the fsync is considered complete.
 119 *
 120 * After a crash, items are copied out of the log-tree back into the
 121 * subvolume tree.  Any file data extents found are recorded in the extent
 122 * allocation tree, and the log-tree freed.
 123 *
 124 * The log tree is read three times, once to pin down all the extents it is
 125 * using in ram and once, once to create all the inodes logged in the tree
 126 * and once to do all the other items.
 127 */
 128
 129/*
 130 * start a sub transaction and setup the log tree
 131 * this increments the log tree writer count to make the people
 132 * syncing the tree wait for us to finish
 133 */
 134static int start_log_trans(struct btrfs_trans_handle *trans,
 135                           struct btrfs_root *root)
 136{
 137        int ret;
 138        int err = 0;
 139
 140        mutex_lock(&root->log_mutex);
 141        if (root->log_root) {
 142                if (!root->log_start_pid) {
 143                        root->log_start_pid = current->pid;
 144                        root->log_multiple_pids = false;
 145                } else if (root->log_start_pid != current->pid) {
 146                        root->log_multiple_pids = true;
 147                }
 148
 149                root->log_batch++;
 150                atomic_inc(&root->log_writers);
 151                mutex_unlock(&root->log_mutex);
 152                return 0;
 153        }
 154        root->log_multiple_pids = false;
 155        root->log_start_pid = current->pid;
 156        mutex_lock(&root->fs_info->tree_log_mutex);
 157        if (!root->fs_info->log_root_tree) {
 158                ret = btrfs_init_log_root_tree(trans, root->fs_info);
 159                if (ret)
 160                        err = ret;
 161        }
 162        if (err == 0 && !root->log_root) {
 163                ret = btrfs_add_log_tree(trans, root);
 164                if (ret)
 165                        err = ret;
 166        }
 167        mutex_unlock(&root->fs_info->tree_log_mutex);
 168        root->log_batch++;
 169        atomic_inc(&root->log_writers);
 170        mutex_unlock(&root->log_mutex);
 171        return err;
 172}
 173
 174/*
 175 * returns 0 if there was a log transaction running and we were able
 176 * to join, or returns -ENOENT if there were not transactions
 177 * in progress
 178 */
 179static int join_running_log_trans(struct btrfs_root *root)
 180{
 181        int ret = -ENOENT;
 182
 183        smp_mb();
 184        if (!root->log_root)
 185                return -ENOENT;
 186
 187        mutex_lock(&root->log_mutex);
 188        if (root->log_root) {
 189                ret = 0;
 190                atomic_inc(&root->log_writers);
 191        }
 192        mutex_unlock(&root->log_mutex);
 193        return ret;
 194}
 195
 196/*
 197 * This either makes the current running log transaction wait
 198 * until you call btrfs_end_log_trans() or it makes any future
 199 * log transactions wait until you call btrfs_end_log_trans()
 200 */
 201int btrfs_pin_log_trans(struct btrfs_root *root)
 202{
 203        int ret = -ENOENT;
 204
 205        mutex_lock(&root->log_mutex);
 206        atomic_inc(&root->log_writers);
 207        mutex_unlock(&root->log_mutex);
 208        return ret;
 209}
 210
 211/*
 212 * indicate we're done making changes to the log tree
 213 * and wake up anyone waiting to do a sync
 214 */
 215void btrfs_end_log_trans(struct btrfs_root *root)
 216{
 217        if (atomic_dec_and_test(&root->log_writers)) {
 218                smp_mb();
 219                if (waitqueue_active(&root->log_writer_wait))
 220                        wake_up(&root->log_writer_wait);
 221        }
 222}
 223
 224
 225/*
 226 * the walk control struct is used to pass state down the chain when
 227 * processing the log tree.  The stage field tells us which part
 228 * of the log tree processing we are currently doing.  The others
 229 * are state fields used for that specific part
 230 */
 231struct walk_control {
 232        /* should we free the extent on disk when done?  This is used
 233         * at transaction commit time while freeing a log tree
 234         */
 235        int free;
 236
 237        /* should we write out the extent buffer?  This is used
 238         * while flushing the log tree to disk during a sync
 239         */
 240        int write;
 241
 242        /* should we wait for the extent buffer io to finish?  Also used
 243         * while flushing the log tree to disk for a sync
 244         */
 245        int wait;
 246
 247        /* pin only walk, we record which extents on disk belong to the
 248         * log trees
 249         */
 250        int pin;
 251
 252        /* what stage of the replay code we're currently in */
 253        int stage;
 254
 255        /* the root we are currently replaying */
 256        struct btrfs_root *replay_dest;
 257
 258        /* the trans handle for the current replay */
 259        struct btrfs_trans_handle *trans;
 260
 261        /* the function that gets used to process blocks we find in the
 262         * tree.  Note the extent_buffer might not be up to date when it is
 263         * passed in, and it must be checked or read if you need the data
 264         * inside it
 265         */
 266        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 267                            struct walk_control *wc, u64 gen);
 268};
 269
 270/*
 271 * process_func used to pin down extents, write them or wait on them
 272 */
 273static int process_one_buffer(struct btrfs_root *log,
 274                              struct extent_buffer *eb,
 275                              struct walk_control *wc, u64 gen)
 276{
 277        if (wc->pin)
 278                btrfs_pin_extent_for_log_replay(wc->trans,
 279                                                log->fs_info->extent_root,
 280                                                eb->start, eb->len);
 281
 282        if (btrfs_buffer_uptodate(eb, gen, 0)) {
 283                if (wc->write)
 284                        btrfs_write_tree_block(eb);
 285                if (wc->wait)
 286                        btrfs_wait_tree_block_writeback(eb);
 287        }
 288        return 0;
 289}
 290
 291/*
 292 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 293 * to the src data we are copying out.
 294 *
 295 * root is the tree we are copying into, and path is a scratch
 296 * path for use in this function (it should be released on entry and
 297 * will be released on exit).
 298 *
 299 * If the key is already in the destination tree the existing item is
 300 * overwritten.  If the existing item isn't big enough, it is extended.
 301 * If it is too large, it is truncated.
 302 *
 303 * If the key isn't in the destination yet, a new item is inserted.
 304 */
 305static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 306                                   struct btrfs_root *root,
 307                                   struct btrfs_path *path,
 308                                   struct extent_buffer *eb, int slot,
 309                                   struct btrfs_key *key)
 310{
 311        int ret;
 312        u32 item_size;
 313        u64 saved_i_size = 0;
 314        int save_old_i_size = 0;
 315        unsigned long src_ptr;
 316        unsigned long dst_ptr;
 317        int overwrite_root = 0;
 318
 319        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 320                overwrite_root = 1;
 321
 322        item_size = btrfs_item_size_nr(eb, slot);
 323        src_ptr = btrfs_item_ptr_offset(eb, slot);
 324
 325        /* look for the key in the destination tree */
 326        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 327        if (ret == 0) {
 328                char *src_copy;
 329                char *dst_copy;
 330                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 331                                                  path->slots[0]);
 332                if (dst_size != item_size)
 333                        goto insert;
 334
 335                if (item_size == 0) {
 336                        btrfs_release_path(path);
 337                        return 0;
 338                }
 339                dst_copy = kmalloc(item_size, GFP_NOFS);
 340                src_copy = kmalloc(item_size, GFP_NOFS);
 341                if (!dst_copy || !src_copy) {
 342                        btrfs_release_path(path);
 343                        kfree(dst_copy);
 344                        kfree(src_copy);
 345                        return -ENOMEM;
 346                }
 347
 348                read_extent_buffer(eb, src_copy, src_ptr, item_size);
 349
 350                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 351                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 352                                   item_size);
 353                ret = memcmp(dst_copy, src_copy, item_size);
 354
 355                kfree(dst_copy);
 356                kfree(src_copy);
 357                /*
 358                 * they have the same contents, just return, this saves
 359                 * us from cowing blocks in the destination tree and doing
 360                 * extra writes that may not have been done by a previous
 361                 * sync
 362                 */
 363                if (ret == 0) {
 364                        btrfs_release_path(path);
 365                        return 0;
 366                }
 367
 368        }
 369insert:
 370        btrfs_release_path(path);
 371        /* try to insert the key into the destination tree */
 372        ret = btrfs_insert_empty_item(trans, root, path,
 373                                      key, item_size);
 374
 375        /* make sure any existing item is the correct size */
 376        if (ret == -EEXIST) {
 377                u32 found_size;
 378                found_size = btrfs_item_size_nr(path->nodes[0],
 379                                                path->slots[0]);
 380                if (found_size > item_size)
 381                        btrfs_truncate_item(trans, root, path, item_size, 1);
 382                else if (found_size < item_size)
 383                        btrfs_extend_item(trans, root, path,
 384                                          item_size - found_size);
 385        } else if (ret) {
 386                return ret;
 387        }
 388        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 389                                        path->slots[0]);
 390
 391        /* don't overwrite an existing inode if the generation number
 392         * was logged as zero.  This is done when the tree logging code
 393         * is just logging an inode to make sure it exists after recovery.
 394         *
 395         * Also, don't overwrite i_size on directories during replay.
 396         * log replay inserts and removes directory items based on the
 397         * state of the tree found in the subvolume, and i_size is modified
 398         * as it goes
 399         */
 400        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 401                struct btrfs_inode_item *src_item;
 402                struct btrfs_inode_item *dst_item;
 403
 404                src_item = (struct btrfs_inode_item *)src_ptr;
 405                dst_item = (struct btrfs_inode_item *)dst_ptr;
 406
 407                if (btrfs_inode_generation(eb, src_item) == 0)
 408                        goto no_copy;
 409
 410                if (overwrite_root &&
 411                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 412                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 413                        save_old_i_size = 1;
 414                        saved_i_size = btrfs_inode_size(path->nodes[0],
 415                                                        dst_item);
 416                }
 417        }
 418
 419        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 420                           src_ptr, item_size);
 421
 422        if (save_old_i_size) {
 423                struct btrfs_inode_item *dst_item;
 424                dst_item = (struct btrfs_inode_item *)dst_ptr;
 425                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 426        }
 427
 428        /* make sure the generation is filled in */
 429        if (key->type == BTRFS_INODE_ITEM_KEY) {
 430                struct btrfs_inode_item *dst_item;
 431                dst_item = (struct btrfs_inode_item *)dst_ptr;
 432                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 433                        btrfs_set_inode_generation(path->nodes[0], dst_item,
 434                                                   trans->transid);
 435                }
 436        }
 437no_copy:
 438        btrfs_mark_buffer_dirty(path->nodes[0]);
 439        btrfs_release_path(path);
 440        return 0;
 441}
 442
 443/*
 444 * simple helper to read an inode off the disk from a given root
 445 * This can only be called for subvolume roots and not for the log
 446 */
 447static noinline struct inode *read_one_inode(struct btrfs_root *root,
 448                                             u64 objectid)
 449{
 450        struct btrfs_key key;
 451        struct inode *inode;
 452
 453        key.objectid = objectid;
 454        key.type = BTRFS_INODE_ITEM_KEY;
 455        key.offset = 0;
 456        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 457        if (IS_ERR(inode)) {
 458                inode = NULL;
 459        } else if (is_bad_inode(inode)) {
 460                iput(inode);
 461                inode = NULL;
 462        }
 463        return inode;
 464}
 465
 466/* replays a single extent in 'eb' at 'slot' with 'key' into the
 467 * subvolume 'root'.  path is released on entry and should be released
 468 * on exit.
 469 *
 470 * extents in the log tree have not been allocated out of the extent
 471 * tree yet.  So, this completes the allocation, taking a reference
 472 * as required if the extent already exists or creating a new extent
 473 * if it isn't in the extent allocation tree yet.
 474 *
 475 * The extent is inserted into the file, dropping any existing extents
 476 * from the file that overlap the new one.
 477 */
 478static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 479                                      struct btrfs_root *root,
 480                                      struct btrfs_path *path,
 481                                      struct extent_buffer *eb, int slot,
 482                                      struct btrfs_key *key)
 483{
 484        int found_type;
 485        u64 mask = root->sectorsize - 1;
 486        u64 extent_end;
 487        u64 alloc_hint;
 488        u64 start = key->offset;
 489        u64 saved_nbytes;
 490        struct btrfs_file_extent_item *item;
 491        struct inode *inode = NULL;
 492        unsigned long size;
 493        int ret = 0;
 494
 495        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 496        found_type = btrfs_file_extent_type(eb, item);
 497
 498        if (found_type == BTRFS_FILE_EXTENT_REG ||
 499            found_type == BTRFS_FILE_EXTENT_PREALLOC)
 500                extent_end = start + btrfs_file_extent_num_bytes(eb, item);
 501        else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 502                size = btrfs_file_extent_inline_len(eb, item);
 503                extent_end = (start + size + mask) & ~mask;
 504        } else {
 505                ret = 0;
 506                goto out;
 507        }
 508
 509        inode = read_one_inode(root, key->objectid);
 510        if (!inode) {
 511                ret = -EIO;
 512                goto out;
 513        }
 514
 515        /*
 516         * first check to see if we already have this extent in the
 517         * file.  This must be done before the btrfs_drop_extents run
 518         * so we don't try to drop this extent.
 519         */
 520        ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
 521                                       start, 0);
 522
 523        if (ret == 0 &&
 524            (found_type == BTRFS_FILE_EXTENT_REG ||
 525             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 526                struct btrfs_file_extent_item cmp1;
 527                struct btrfs_file_extent_item cmp2;
 528                struct btrfs_file_extent_item *existing;
 529                struct extent_buffer *leaf;
 530
 531                leaf = path->nodes[0];
 532                existing = btrfs_item_ptr(leaf, path->slots[0],
 533                                          struct btrfs_file_extent_item);
 534
 535                read_extent_buffer(eb, &cmp1, (unsigned long)item,
 536                                   sizeof(cmp1));
 537                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 538                                   sizeof(cmp2));
 539
 540                /*
 541                 * we already have a pointer to this exact extent,
 542                 * we don't have to do anything
 543                 */
 544                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 545                        btrfs_release_path(path);
 546                        goto out;
 547                }
 548        }
 549        btrfs_release_path(path);
 550
 551        saved_nbytes = inode_get_bytes(inode);
 552        /* drop any overlapping extents */
 553        ret = btrfs_drop_extents(trans, inode, start, extent_end,
 554                                 &alloc_hint, 1);
 555        BUG_ON(ret);
 556
 557        if (found_type == BTRFS_FILE_EXTENT_REG ||
 558            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 559                u64 offset;
 560                unsigned long dest_offset;
 561                struct btrfs_key ins;
 562
 563                ret = btrfs_insert_empty_item(trans, root, path, key,
 564                                              sizeof(*item));
 565                BUG_ON(ret);
 566                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 567                                                    path->slots[0]);
 568                copy_extent_buffer(path->nodes[0], eb, dest_offset,
 569                                (unsigned long)item,  sizeof(*item));
 570
 571                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 572                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 573                ins.type = BTRFS_EXTENT_ITEM_KEY;
 574                offset = key->offset - btrfs_file_extent_offset(eb, item);
 575
 576                if (ins.objectid > 0) {
 577                        u64 csum_start;
 578                        u64 csum_end;
 579                        LIST_HEAD(ordered_sums);
 580                        /*
 581                         * is this extent already allocated in the extent
 582                         * allocation tree?  If so, just add a reference
 583                         */
 584                        ret = btrfs_lookup_extent(root, ins.objectid,
 585                                                ins.offset);
 586                        if (ret == 0) {
 587                                ret = btrfs_inc_extent_ref(trans, root,
 588                                                ins.objectid, ins.offset,
 589                                                0, root->root_key.objectid,
 590                                                key->objectid, offset, 0);
 591                                BUG_ON(ret);
 592                        } else {
 593                                /*
 594                                 * insert the extent pointer in the extent
 595                                 * allocation tree
 596                                 */
 597                                ret = btrfs_alloc_logged_file_extent(trans,
 598                                                root, root->root_key.objectid,
 599                                                key->objectid, offset, &ins);
 600                                BUG_ON(ret);
 601                        }
 602                        btrfs_release_path(path);
 603
 604                        if (btrfs_file_extent_compression(eb, item)) {
 605                                csum_start = ins.objectid;
 606                                csum_end = csum_start + ins.offset;
 607                        } else {
 608                                csum_start = ins.objectid +
 609                                        btrfs_file_extent_offset(eb, item);
 610                                csum_end = csum_start +
 611                                        btrfs_file_extent_num_bytes(eb, item);
 612                        }
 613
 614                        ret = btrfs_lookup_csums_range(root->log_root,
 615                                                csum_start, csum_end - 1,
 616                                                &ordered_sums, 0);
 617                        BUG_ON(ret);
 618                        while (!list_empty(&ordered_sums)) {
 619                                struct btrfs_ordered_sum *sums;
 620                                sums = list_entry(ordered_sums.next,
 621                                                struct btrfs_ordered_sum,
 622                                                list);
 623                                ret = btrfs_csum_file_blocks(trans,
 624                                                root->fs_info->csum_root,
 625                                                sums);
 626                                BUG_ON(ret);
 627                                list_del(&sums->list);
 628                                kfree(sums);
 629                        }
 630                } else {
 631                        btrfs_release_path(path);
 632                }
 633        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 634                /* inline extents are easy, we just overwrite them */
 635                ret = overwrite_item(trans, root, path, eb, slot, key);
 636                BUG_ON(ret);
 637        }
 638
 639        inode_set_bytes(inode, saved_nbytes);
 640        ret = btrfs_update_inode(trans, root, inode);
 641out:
 642        if (inode)
 643                iput(inode);
 644        return ret;
 645}
 646
 647/*
 648 * when cleaning up conflicts between the directory names in the
 649 * subvolume, directory names in the log and directory names in the
 650 * inode back references, we may have to unlink inodes from directories.
 651 *
 652 * This is a helper function to do the unlink of a specific directory
 653 * item
 654 */
 655static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 656                                      struct btrfs_root *root,
 657                                      struct btrfs_path *path,
 658                                      struct inode *dir,
 659                                      struct btrfs_dir_item *di)
 660{
 661        struct inode *inode;
 662        char *name;
 663        int name_len;
 664        struct extent_buffer *leaf;
 665        struct btrfs_key location;
 666        int ret;
 667
 668        leaf = path->nodes[0];
 669
 670        btrfs_dir_item_key_to_cpu(leaf, di, &location);
 671        name_len = btrfs_dir_name_len(leaf, di);
 672        name = kmalloc(name_len, GFP_NOFS);
 673        if (!name)
 674                return -ENOMEM;
 675
 676        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 677        btrfs_release_path(path);
 678
 679        inode = read_one_inode(root, location.objectid);
 680        if (!inode) {
 681                kfree(name);
 682                return -EIO;
 683        }
 684
 685        ret = link_to_fixup_dir(trans, root, path, location.objectid);
 686        BUG_ON(ret);
 687
 688        ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 689        BUG_ON(ret);
 690        kfree(name);
 691
 692        iput(inode);
 693
 694        btrfs_run_delayed_items(trans, root);
 695        return ret;
 696}
 697
 698/*
 699 * helper function to see if a given name and sequence number found
 700 * in an inode back reference are already in a directory and correctly
 701 * point to this inode
 702 */
 703static noinline int inode_in_dir(struct btrfs_root *root,
 704                                 struct btrfs_path *path,
 705                                 u64 dirid, u64 objectid, u64 index,
 706                                 const char *name, int name_len)
 707{
 708        struct btrfs_dir_item *di;
 709        struct btrfs_key location;
 710        int match = 0;
 711
 712        di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 713                                         index, name, name_len, 0);
 714        if (di && !IS_ERR(di)) {
 715                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 716                if (location.objectid != objectid)
 717                        goto out;
 718        } else
 719                goto out;
 720        btrfs_release_path(path);
 721
 722        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 723        if (di && !IS_ERR(di)) {
 724                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 725                if (location.objectid != objectid)
 726                        goto out;
 727        } else
 728                goto out;
 729        match = 1;
 730out:
 731        btrfs_release_path(path);
 732        return match;
 733}
 734
 735/*
 736 * helper function to check a log tree for a named back reference in
 737 * an inode.  This is used to decide if a back reference that is
 738 * found in the subvolume conflicts with what we find in the log.
 739 *
 740 * inode backreferences may have multiple refs in a single item,
 741 * during replay we process one reference at a time, and we don't
 742 * want to delete valid links to a file from the subvolume if that
 743 * link is also in the log.
 744 */
 745static noinline int backref_in_log(struct btrfs_root *log,
 746                                   struct btrfs_key *key,
 747                                   char *name, int namelen)
 748{
 749        struct btrfs_path *path;
 750        struct btrfs_inode_ref *ref;
 751        unsigned long ptr;
 752        unsigned long ptr_end;
 753        unsigned long name_ptr;
 754        int found_name_len;
 755        int item_size;
 756        int ret;
 757        int match = 0;
 758
 759        path = btrfs_alloc_path();
 760        if (!path)
 761                return -ENOMEM;
 762
 763        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 764        if (ret != 0)
 765                goto out;
 766
 767        item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 768        ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 769        ptr_end = ptr + item_size;
 770        while (ptr < ptr_end) {
 771                ref = (struct btrfs_inode_ref *)ptr;
 772                found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 773                if (found_name_len == namelen) {
 774                        name_ptr = (unsigned long)(ref + 1);
 775                        ret = memcmp_extent_buffer(path->nodes[0], name,
 776                                                   name_ptr, namelen);
 777                        if (ret == 0) {
 778                                match = 1;
 779                                goto out;
 780                        }
 781                }
 782                ptr = (unsigned long)(ref + 1) + found_name_len;
 783        }
 784out:
 785        btrfs_free_path(path);
 786        return match;
 787}
 788
 789
 790/*
 791 * replay one inode back reference item found in the log tree.
 792 * eb, slot and key refer to the buffer and key found in the log tree.
 793 * root is the destination we are replaying into, and path is for temp
 794 * use by this function.  (it should be released on return).
 795 */
 796static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 797                                  struct btrfs_root *root,
 798                                  struct btrfs_root *log,
 799                                  struct btrfs_path *path,
 800                                  struct extent_buffer *eb, int slot,
 801                                  struct btrfs_key *key)
 802{
 803        struct btrfs_inode_ref *ref;
 804        struct btrfs_dir_item *di;
 805        struct inode *dir;
 806        struct inode *inode;
 807        unsigned long ref_ptr;
 808        unsigned long ref_end;
 809        char *name;
 810        int namelen;
 811        int ret;
 812        int search_done = 0;
 813
 814        /*
 815         * it is possible that we didn't log all the parent directories
 816         * for a given inode.  If we don't find the dir, just don't
 817         * copy the back ref in.  The link count fixup code will take
 818         * care of the rest
 819         */
 820        dir = read_one_inode(root, key->offset);
 821        if (!dir)
 822                return -ENOENT;
 823
 824        inode = read_one_inode(root, key->objectid);
 825        if (!inode) {
 826                iput(dir);
 827                return -EIO;
 828        }
 829
 830        ref_ptr = btrfs_item_ptr_offset(eb, slot);
 831        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
 832
 833again:
 834        ref = (struct btrfs_inode_ref *)ref_ptr;
 835
 836        namelen = btrfs_inode_ref_name_len(eb, ref);
 837        name = kmalloc(namelen, GFP_NOFS);
 838        BUG_ON(!name);
 839
 840        read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
 841
 842        /* if we already have a perfect match, we're done */
 843        if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
 844                         btrfs_inode_ref_index(eb, ref),
 845                         name, namelen)) {
 846                goto out;
 847        }
 848
 849        /*
 850         * look for a conflicting back reference in the metadata.
 851         * if we find one we have to unlink that name of the file
 852         * before we add our new link.  Later on, we overwrite any
 853         * existing back reference, and we don't want to create
 854         * dangling pointers in the directory.
 855         */
 856
 857        if (search_done)
 858                goto insert;
 859
 860        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 861        if (ret == 0) {
 862                char *victim_name;
 863                int victim_name_len;
 864                struct btrfs_inode_ref *victim_ref;
 865                unsigned long ptr;
 866                unsigned long ptr_end;
 867                struct extent_buffer *leaf = path->nodes[0];
 868
 869                /* are we trying to overwrite a back ref for the root directory
 870                 * if so, just jump out, we're done
 871                 */
 872                if (key->objectid == key->offset)
 873                        goto out_nowrite;
 874
 875                /* check all the names in this back reference to see
 876                 * if they are in the log.  if so, we allow them to stay
 877                 * otherwise they must be unlinked as a conflict
 878                 */
 879                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 880                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
 881                while (ptr < ptr_end) {
 882                        victim_ref = (struct btrfs_inode_ref *)ptr;
 883                        victim_name_len = btrfs_inode_ref_name_len(leaf,
 884                                                                   victim_ref);
 885                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
 886                        BUG_ON(!victim_name);
 887
 888                        read_extent_buffer(leaf, victim_name,
 889                                           (unsigned long)(victim_ref + 1),
 890                                           victim_name_len);
 891
 892                        if (!backref_in_log(log, key, victim_name,
 893                                            victim_name_len)) {
 894                                btrfs_inc_nlink(inode);
 895                                btrfs_release_path(path);
 896
 897                                ret = btrfs_unlink_inode(trans, root, dir,
 898                                                         inode, victim_name,
 899                                                         victim_name_len);
 900                                btrfs_run_delayed_items(trans, root);
 901                        }
 902                        kfree(victim_name);
 903                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 904                }
 905                BUG_ON(ret);
 906
 907                /*
 908                 * NOTE: we have searched root tree and checked the
 909                 * coresponding ref, it does not need to check again.
 910                 */
 911                search_done = 1;
 912        }
 913        btrfs_release_path(path);
 914
 915        /* look for a conflicting sequence number */
 916        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
 917                                         btrfs_inode_ref_index(eb, ref),
 918                                         name, namelen, 0);
 919        if (di && !IS_ERR(di)) {
 920                ret = drop_one_dir_item(trans, root, path, dir, di);
 921                BUG_ON(ret);
 922        }
 923        btrfs_release_path(path);
 924
 925        /* look for a conflicing name */
 926        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
 927                                   name, namelen, 0);
 928        if (di && !IS_ERR(di)) {
 929                ret = drop_one_dir_item(trans, root, path, dir, di);
 930                BUG_ON(ret);
 931        }
 932        btrfs_release_path(path);
 933
 934insert:
 935        /* insert our name */
 936        ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
 937                             btrfs_inode_ref_index(eb, ref));
 938        BUG_ON(ret);
 939
 940        btrfs_update_inode(trans, root, inode);
 941
 942out:
 943        ref_ptr = (unsigned long)(ref + 1) + namelen;
 944        kfree(name);
 945        if (ref_ptr < ref_end)
 946                goto again;
 947
 948        /* finally write the back reference in the inode */
 949        ret = overwrite_item(trans, root, path, eb, slot, key);
 950        BUG_ON(ret);
 951
 952out_nowrite:
 953        btrfs_release_path(path);
 954        iput(dir);
 955        iput(inode);
 956        return 0;
 957}
 958
 959static int insert_orphan_item(struct btrfs_trans_handle *trans,
 960                              struct btrfs_root *root, u64 offset)
 961{
 962        int ret;
 963        ret = btrfs_find_orphan_item(root, offset);
 964        if (ret > 0)
 965                ret = btrfs_insert_orphan_item(trans, root, offset);
 966        return ret;
 967}
 968
 969
 970/*
 971 * There are a few corners where the link count of the file can't
 972 * be properly maintained during replay.  So, instead of adding
 973 * lots of complexity to the log code, we just scan the backrefs
 974 * for any file that has been through replay.
 975 *
 976 * The scan will update the link count on the inode to reflect the
 977 * number of back refs found.  If it goes down to zero, the iput
 978 * will free the inode.
 979 */
 980static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 981                                           struct btrfs_root *root,
 982                                           struct inode *inode)
 983{
 984        struct btrfs_path *path;
 985        int ret;
 986        struct btrfs_key key;
 987        u64 nlink = 0;
 988        unsigned long ptr;
 989        unsigned long ptr_end;
 990        int name_len;
 991        u64 ino = btrfs_ino(inode);
 992
 993        key.objectid = ino;
 994        key.type = BTRFS_INODE_REF_KEY;
 995        key.offset = (u64)-1;
 996
 997        path = btrfs_alloc_path();
 998        if (!path)
 999                return -ENOMEM;
1000
1001        while (1) {
1002                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003                if (ret < 0)
1004                        break;
1005                if (ret > 0) {
1006                        if (path->slots[0] == 0)
1007                                break;
1008                        path->slots[0]--;
1009                }
1010                btrfs_item_key_to_cpu(path->nodes[0], &key,
1011                                      path->slots[0]);
1012                if (key.objectid != ino ||
1013                    key.type != BTRFS_INODE_REF_KEY)
1014                        break;
1015                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1016                ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1017                                                   path->slots[0]);
1018                while (ptr < ptr_end) {
1019                        struct btrfs_inode_ref *ref;
1020
1021                        ref = (struct btrfs_inode_ref *)ptr;
1022                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
1023                                                            ref);
1024                        ptr = (unsigned long)(ref + 1) + name_len;
1025                        nlink++;
1026                }
1027
1028                if (key.offset == 0)
1029                        break;
1030                key.offset--;
1031                btrfs_release_path(path);
1032        }
1033        btrfs_release_path(path);
1034        if (nlink != inode->i_nlink) {
1035                set_nlink(inode, nlink);
1036                btrfs_update_inode(trans, root, inode);
1037        }
1038        BTRFS_I(inode)->index_cnt = (u64)-1;
1039
1040        if (inode->i_nlink == 0) {
1041                if (S_ISDIR(inode->i_mode)) {
1042                        ret = replay_dir_deletes(trans, root, NULL, path,
1043                                                 ino, 1);
1044                        BUG_ON(ret);
1045                }
1046                ret = insert_orphan_item(trans, root, ino);
1047                BUG_ON(ret);
1048        }
1049        btrfs_free_path(path);
1050
1051        return 0;
1052}
1053
1054static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1055                                            struct btrfs_root *root,
1056                                            struct btrfs_path *path)
1057{
1058        int ret;
1059        struct btrfs_key key;
1060        struct inode *inode;
1061
1062        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1063        key.type = BTRFS_ORPHAN_ITEM_KEY;
1064        key.offset = (u64)-1;
1065        while (1) {
1066                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1067                if (ret < 0)
1068                        break;
1069
1070                if (ret == 1) {
1071                        if (path->slots[0] == 0)
1072                                break;
1073                        path->slots[0]--;
1074                }
1075
1076                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1077                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1078                    key.type != BTRFS_ORPHAN_ITEM_KEY)
1079                        break;
1080
1081                ret = btrfs_del_item(trans, root, path);
1082                if (ret)
1083                        goto out;
1084
1085                btrfs_release_path(path);
1086                inode = read_one_inode(root, key.offset);
1087                if (!inode)
1088                        return -EIO;
1089
1090                ret = fixup_inode_link_count(trans, root, inode);
1091                BUG_ON(ret);
1092
1093                iput(inode);
1094
1095                /*
1096                 * fixup on a directory may create new entries,
1097                 * make sure we always look for the highset possible
1098                 * offset
1099                 */
1100                key.offset = (u64)-1;
1101        }
1102        ret = 0;
1103out:
1104        btrfs_release_path(path);
1105        return ret;
1106}
1107
1108
1109/*
1110 * record a given inode in the fixup dir so we can check its link
1111 * count when replay is done.  The link count is incremented here
1112 * so the inode won't go away until we check it
1113 */
1114static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1115                                      struct btrfs_root *root,
1116                                      struct btrfs_path *path,
1117                                      u64 objectid)
1118{
1119        struct btrfs_key key;
1120        int ret = 0;
1121        struct inode *inode;
1122
1123        inode = read_one_inode(root, objectid);
1124        if (!inode)
1125                return -EIO;
1126
1127        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1128        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1129        key.offset = objectid;
1130
1131        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1132
1133        btrfs_release_path(path);
1134        if (ret == 0) {
1135                btrfs_inc_nlink(inode);
1136                ret = btrfs_update_inode(trans, root, inode);
1137        } else if (ret == -EEXIST) {
1138                ret = 0;
1139        } else {
1140                BUG();
1141        }
1142        iput(inode);
1143
1144        return ret;
1145}
1146
1147/*
1148 * when replaying the log for a directory, we only insert names
1149 * for inodes that actually exist.  This means an fsync on a directory
1150 * does not implicitly fsync all the new files in it
1151 */
1152static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1153                                    struct btrfs_root *root,
1154                                    struct btrfs_path *path,
1155                                    u64 dirid, u64 index,
1156                                    char *name, int name_len, u8 type,
1157                                    struct btrfs_key *location)
1158{
1159        struct inode *inode;
1160        struct inode *dir;
1161        int ret;
1162
1163        inode = read_one_inode(root, location->objectid);
1164        if (!inode)
1165                return -ENOENT;
1166
1167        dir = read_one_inode(root, dirid);
1168        if (!dir) {
1169                iput(inode);
1170                return -EIO;
1171        }
1172        ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1173
1174        /* FIXME, put inode into FIXUP list */
1175
1176        iput(inode);
1177        iput(dir);
1178        return ret;
1179}
1180
1181/*
1182 * take a single entry in a log directory item and replay it into
1183 * the subvolume.
1184 *
1185 * if a conflicting item exists in the subdirectory already,
1186 * the inode it points to is unlinked and put into the link count
1187 * fix up tree.
1188 *
1189 * If a name from the log points to a file or directory that does
1190 * not exist in the FS, it is skipped.  fsyncs on directories
1191 * do not force down inodes inside that directory, just changes to the
1192 * names or unlinks in a directory.
1193 */
1194static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1195                                    struct btrfs_root *root,
1196                                    struct btrfs_path *path,
1197                                    struct extent_buffer *eb,
1198                                    struct btrfs_dir_item *di,
1199                                    struct btrfs_key *key)
1200{
1201        char *name;
1202        int name_len;
1203        struct btrfs_dir_item *dst_di;
1204        struct btrfs_key found_key;
1205        struct btrfs_key log_key;
1206        struct inode *dir;
1207        u8 log_type;
1208        int exists;
1209        int ret;
1210
1211        dir = read_one_inode(root, key->objectid);
1212        if (!dir)
1213                return -EIO;
1214
1215        name_len = btrfs_dir_name_len(eb, di);
1216        name = kmalloc(name_len, GFP_NOFS);
1217        if (!name)
1218                return -ENOMEM;
1219
1220        log_type = btrfs_dir_type(eb, di);
1221        read_extent_buffer(eb, name, (unsigned long)(di + 1),
1222                   name_len);
1223
1224        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1225        exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1226        if (exists == 0)
1227                exists = 1;
1228        else
1229                exists = 0;
1230        btrfs_release_path(path);
1231
1232        if (key->type == BTRFS_DIR_ITEM_KEY) {
1233                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1234                                       name, name_len, 1);
1235        } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1236                dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1237                                                     key->objectid,
1238                                                     key->offset, name,
1239                                                     name_len, 1);
1240        } else {
1241                BUG();
1242        }
1243        if (IS_ERR_OR_NULL(dst_di)) {
1244                /* we need a sequence number to insert, so we only
1245                 * do inserts for the BTRFS_DIR_INDEX_KEY types
1246                 */
1247                if (key->type != BTRFS_DIR_INDEX_KEY)
1248                        goto out;
1249                goto insert;
1250        }
1251
1252        btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1253        /* the existing item matches the logged item */
1254        if (found_key.objectid == log_key.objectid &&
1255            found_key.type == log_key.type &&
1256            found_key.offset == log_key.offset &&
1257            btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1258                goto out;
1259        }
1260
1261        /*
1262         * don't drop the conflicting directory entry if the inode
1263         * for the new entry doesn't exist
1264         */
1265        if (!exists)
1266                goto out;
1267
1268        ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1269        BUG_ON(ret);
1270
1271        if (key->type == BTRFS_DIR_INDEX_KEY)
1272                goto insert;
1273out:
1274        btrfs_release_path(path);
1275        kfree(name);
1276        iput(dir);
1277        return 0;
1278
1279insert:
1280        btrfs_release_path(path);
1281        ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1282                              name, name_len, log_type, &log_key);
1283
1284        BUG_ON(ret && ret != -ENOENT);
1285        goto out;
1286}
1287
1288/*
1289 * find all the names in a directory item and reconcile them into
1290 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1291 * one name in a directory item, but the same code gets used for
1292 * both directory index types
1293 */
1294static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1295                                        struct btrfs_root *root,
1296                                        struct btrfs_path *path,
1297                                        struct extent_buffer *eb, int slot,
1298                                        struct btrfs_key *key)
1299{
1300        int ret;
1301        u32 item_size = btrfs_item_size_nr(eb, slot);
1302        struct btrfs_dir_item *di;
1303        int name_len;
1304        unsigned long ptr;
1305        unsigned long ptr_end;
1306
1307        ptr = btrfs_item_ptr_offset(eb, slot);
1308        ptr_end = ptr + item_size;
1309        while (ptr < ptr_end) {
1310                di = (struct btrfs_dir_item *)ptr;
1311                if (verify_dir_item(root, eb, di))
1312                        return -EIO;
1313                name_len = btrfs_dir_name_len(eb, di);
1314                ret = replay_one_name(trans, root, path, eb, di, key);
1315                BUG_ON(ret);
1316                ptr = (unsigned long)(di + 1);
1317                ptr += name_len;
1318        }
1319        return 0;
1320}
1321
1322/*
1323 * directory replay has two parts.  There are the standard directory
1324 * items in the log copied from the subvolume, and range items
1325 * created in the log while the subvolume was logged.
1326 *
1327 * The range items tell us which parts of the key space the log
1328 * is authoritative for.  During replay, if a key in the subvolume
1329 * directory is in a logged range item, but not actually in the log
1330 * that means it was deleted from the directory before the fsync
1331 * and should be removed.
1332 */
1333static noinline int find_dir_range(struct btrfs_root *root,
1334                                   struct btrfs_path *path,
1335                                   u64 dirid, int key_type,
1336                                   u64 *start_ret, u64 *end_ret)
1337{
1338        struct btrfs_key key;
1339        u64 found_end;
1340        struct btrfs_dir_log_item *item;
1341        int ret;
1342        int nritems;
1343
1344        if (*start_ret == (u64)-1)
1345                return 1;
1346
1347        key.objectid = dirid;
1348        key.type = key_type;
1349        key.offset = *start_ret;
1350
1351        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1352        if (ret < 0)
1353                goto out;
1354        if (ret > 0) {
1355                if (path->slots[0] == 0)
1356                        goto out;
1357                path->slots[0]--;
1358        }
1359        if (ret != 0)
1360                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1361
1362        if (key.type != key_type || key.objectid != dirid) {
1363                ret = 1;
1364                goto next;
1365        }
1366        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1367                              struct btrfs_dir_log_item);
1368        found_end = btrfs_dir_log_end(path->nodes[0], item);
1369
1370        if (*start_ret >= key.offset && *start_ret <= found_end) {
1371                ret = 0;
1372                *start_ret = key.offset;
1373                *end_ret = found_end;
1374                goto out;
1375        }
1376        ret = 1;
1377next:
1378        /* check the next slot in the tree to see if it is a valid item */
1379        nritems = btrfs_header_nritems(path->nodes[0]);
1380        if (path->slots[0] >= nritems) {
1381                ret = btrfs_next_leaf(root, path);
1382                if (ret)
1383                        goto out;
1384        } else {
1385                path->slots[0]++;
1386        }
1387
1388        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1389
1390        if (key.type != key_type || key.objectid != dirid) {
1391                ret = 1;
1392                goto out;
1393        }
1394        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1395                              struct btrfs_dir_log_item);
1396        found_end = btrfs_dir_log_end(path->nodes[0], item);
1397        *start_ret = key.offset;
1398        *end_ret = found_end;
1399        ret = 0;
1400out:
1401        btrfs_release_path(path);
1402        return ret;
1403}
1404
1405/*
1406 * this looks for a given directory item in the log.  If the directory
1407 * item is not in the log, the item is removed and the inode it points
1408 * to is unlinked
1409 */
1410static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1411                                      struct btrfs_root *root,
1412                                      struct btrfs_root *log,
1413                                      struct btrfs_path *path,
1414                                      struct btrfs_path *log_path,
1415                                      struct inode *dir,
1416                                      struct btrfs_key *dir_key)
1417{
1418        int ret;
1419        struct extent_buffer *eb;
1420        int slot;
1421        u32 item_size;
1422        struct btrfs_dir_item *di;
1423        struct btrfs_dir_item *log_di;
1424        int name_len;
1425        unsigned long ptr;
1426        unsigned long ptr_end;
1427        char *name;
1428        struct inode *inode;
1429        struct btrfs_key location;
1430
1431again:
1432        eb = path->nodes[0];
1433        slot = path->slots[0];
1434        item_size = btrfs_item_size_nr(eb, slot);
1435        ptr = btrfs_item_ptr_offset(eb, slot);
1436        ptr_end = ptr + item_size;
1437        while (ptr < ptr_end) {
1438                di = (struct btrfs_dir_item *)ptr;
1439                if (verify_dir_item(root, eb, di)) {
1440                        ret = -EIO;
1441                        goto out;
1442                }
1443
1444                name_len = btrfs_dir_name_len(eb, di);
1445                name = kmalloc(name_len, GFP_NOFS);
1446                if (!name) {
1447                        ret = -ENOMEM;
1448                        goto out;
1449                }
1450                read_extent_buffer(eb, name, (unsigned long)(di + 1),
1451                                  name_len);
1452                log_di = NULL;
1453                if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1454                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
1455                                                       dir_key->objectid,
1456                                                       name, name_len, 0);
1457                } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1458                        log_di = btrfs_lookup_dir_index_item(trans, log,
1459                                                     log_path,
1460                                                     dir_key->objectid,
1461                                                     dir_key->offset,
1462                                                     name, name_len, 0);
1463                }
1464                if (IS_ERR_OR_NULL(log_di)) {
1465                        btrfs_dir_item_key_to_cpu(eb, di, &location);
1466                        btrfs_release_path(path);
1467                        btrfs_release_path(log_path);
1468                        inode = read_one_inode(root, location.objectid);
1469                        if (!inode) {
1470                                kfree(name);
1471                                return -EIO;
1472                        }
1473
1474                        ret = link_to_fixup_dir(trans, root,
1475                                                path, location.objectid);
1476                        BUG_ON(ret);
1477                        btrfs_inc_nlink(inode);
1478                        ret = btrfs_unlink_inode(trans, root, dir, inode,
1479                                                 name, name_len);
1480                        BUG_ON(ret);
1481
1482                        btrfs_run_delayed_items(trans, root);
1483
1484                        kfree(name);
1485                        iput(inode);
1486
1487                        /* there might still be more names under this key
1488                         * check and repeat if required
1489                         */
1490                        ret = btrfs_search_slot(NULL, root, dir_key, path,
1491                                                0, 0);
1492                        if (ret == 0)
1493                                goto again;
1494                        ret = 0;
1495                        goto out;
1496                }
1497                btrfs_release_path(log_path);
1498                kfree(name);
1499
1500                ptr = (unsigned long)(di + 1);
1501                ptr += name_len;
1502        }
1503        ret = 0;
1504out:
1505        btrfs_release_path(path);
1506        btrfs_release_path(log_path);
1507        return ret;
1508}
1509
1510/*
1511 * deletion replay happens before we copy any new directory items
1512 * out of the log or out of backreferences from inodes.  It
1513 * scans the log to find ranges of keys that log is authoritative for,
1514 * and then scans the directory to find items in those ranges that are
1515 * not present in the log.
1516 *
1517 * Anything we don't find in the log is unlinked and removed from the
1518 * directory.
1519 */
1520static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1521                                       struct btrfs_root *root,
1522                                       struct btrfs_root *log,
1523                                       struct btrfs_path *path,
1524                                       u64 dirid, int del_all)
1525{
1526        u64 range_start;
1527        u64 range_end;
1528        int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1529        int ret = 0;
1530        struct btrfs_key dir_key;
1531        struct btrfs_key found_key;
1532        struct btrfs_path *log_path;
1533        struct inode *dir;
1534
1535        dir_key.objectid = dirid;
1536        dir_key.type = BTRFS_DIR_ITEM_KEY;
1537        log_path = btrfs_alloc_path();
1538        if (!log_path)
1539                return -ENOMEM;
1540
1541        dir = read_one_inode(root, dirid);
1542        /* it isn't an error if the inode isn't there, that can happen
1543         * because we replay the deletes before we copy in the inode item
1544         * from the log
1545         */
1546        if (!dir) {
1547                btrfs_free_path(log_path);
1548                return 0;
1549        }
1550again:
1551        range_start = 0;
1552        range_end = 0;
1553        while (1) {
1554                if (del_all)
1555                        range_end = (u64)-1;
1556                else {
1557                        ret = find_dir_range(log, path, dirid, key_type,
1558                                             &range_start, &range_end);
1559                        if (ret != 0)
1560                                break;
1561                }
1562
1563                dir_key.offset = range_start;
1564                while (1) {
1565                        int nritems;
1566                        ret = btrfs_search_slot(NULL, root, &dir_key, path,
1567                                                0, 0);
1568                        if (ret < 0)
1569                                goto out;
1570
1571                        nritems = btrfs_header_nritems(path->nodes[0]);
1572                        if (path->slots[0] >= nritems) {
1573                                ret = btrfs_next_leaf(root, path);
1574                                if (ret)
1575                                        break;
1576                        }
1577                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1578                                              path->slots[0]);
1579                        if (found_key.objectid != dirid ||
1580                            found_key.type != dir_key.type)
1581                                goto next_type;
1582
1583                        if (found_key.offset > range_end)
1584                                break;
1585
1586                        ret = check_item_in_log(trans, root, log, path,
1587                                                log_path, dir,
1588                                                &found_key);
1589                        BUG_ON(ret);
1590                        if (found_key.offset == (u64)-1)
1591                                break;
1592                        dir_key.offset = found_key.offset + 1;
1593                }
1594                btrfs_release_path(path);
1595                if (range_end == (u64)-1)
1596                        break;
1597                range_start = range_end + 1;
1598        }
1599
1600next_type:
1601        ret = 0;
1602        if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1603                key_type = BTRFS_DIR_LOG_INDEX_KEY;
1604                dir_key.type = BTRFS_DIR_INDEX_KEY;
1605                btrfs_release_path(path);
1606                goto again;
1607        }
1608out:
1609        btrfs_release_path(path);
1610        btrfs_free_path(log_path);
1611        iput(dir);
1612        return ret;
1613}
1614
1615/*
1616 * the process_func used to replay items from the log tree.  This
1617 * gets called in two different stages.  The first stage just looks
1618 * for inodes and makes sure they are all copied into the subvolume.
1619 *
1620 * The second stage copies all the other item types from the log into
1621 * the subvolume.  The two stage approach is slower, but gets rid of
1622 * lots of complexity around inodes referencing other inodes that exist
1623 * only in the log (references come from either directory items or inode
1624 * back refs).
1625 */
1626static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1627                             struct walk_control *wc, u64 gen)
1628{
1629        int nritems;
1630        struct btrfs_path *path;
1631        struct btrfs_root *root = wc->replay_dest;
1632        struct btrfs_key key;
1633        int level;
1634        int i;
1635        int ret;
1636
1637        ret = btrfs_read_buffer(eb, gen);
1638        if (ret)
1639                return ret;
1640
1641        level = btrfs_header_level(eb);
1642
1643        if (level != 0)
1644                return 0;
1645
1646        path = btrfs_alloc_path();
1647        if (!path)
1648                return -ENOMEM;
1649
1650        nritems = btrfs_header_nritems(eb);
1651        for (i = 0; i < nritems; i++) {
1652                btrfs_item_key_to_cpu(eb, &key, i);
1653
1654                /* inode keys are done during the first stage */
1655                if (key.type == BTRFS_INODE_ITEM_KEY &&
1656                    wc->stage == LOG_WALK_REPLAY_INODES) {
1657                        struct btrfs_inode_item *inode_item;
1658                        u32 mode;
1659
1660                        inode_item = btrfs_item_ptr(eb, i,
1661                                            struct btrfs_inode_item);
1662                        mode = btrfs_inode_mode(eb, inode_item);
1663                        if (S_ISDIR(mode)) {
1664                                ret = replay_dir_deletes(wc->trans,
1665                                         root, log, path, key.objectid, 0);
1666                                BUG_ON(ret);
1667                        }
1668                        ret = overwrite_item(wc->trans, root, path,
1669                                             eb, i, &key);
1670                        BUG_ON(ret);
1671
1672                        /* for regular files, make sure corresponding
1673                         * orhpan item exist. extents past the new EOF
1674                         * will be truncated later by orphan cleanup.
1675                         */
1676                        if (S_ISREG(mode)) {
1677                                ret = insert_orphan_item(wc->trans, root,
1678                                                         key.objectid);
1679                                BUG_ON(ret);
1680                        }
1681
1682                        ret = link_to_fixup_dir(wc->trans, root,
1683                                                path, key.objectid);
1684                        BUG_ON(ret);
1685                }
1686                if (wc->stage < LOG_WALK_REPLAY_ALL)
1687                        continue;
1688
1689                /* these keys are simply copied */
1690                if (key.type == BTRFS_XATTR_ITEM_KEY) {
1691                        ret = overwrite_item(wc->trans, root, path,
1692                                             eb, i, &key);
1693                        BUG_ON(ret);
1694                } else if (key.type == BTRFS_INODE_REF_KEY) {
1695                        ret = add_inode_ref(wc->trans, root, log, path,
1696                                            eb, i, &key);
1697                        BUG_ON(ret && ret != -ENOENT);
1698                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1699                        ret = replay_one_extent(wc->trans, root, path,
1700                                                eb, i, &key);
1701                        BUG_ON(ret);
1702                } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1703                           key.type == BTRFS_DIR_INDEX_KEY) {
1704                        ret = replay_one_dir_item(wc->trans, root, path,
1705                                                  eb, i, &key);
1706                        BUG_ON(ret);
1707                }
1708        }
1709        btrfs_free_path(path);
1710        return 0;
1711}
1712
1713static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1714                                   struct btrfs_root *root,
1715                                   struct btrfs_path *path, int *level,
1716                                   struct walk_control *wc)
1717{
1718        u64 root_owner;
1719        u64 bytenr;
1720        u64 ptr_gen;
1721        struct extent_buffer *next;
1722        struct extent_buffer *cur;
1723        struct extent_buffer *parent;
1724        u32 blocksize;
1725        int ret = 0;
1726
1727        WARN_ON(*level < 0);
1728        WARN_ON(*level >= BTRFS_MAX_LEVEL);
1729
1730        while (*level > 0) {
1731                WARN_ON(*level < 0);
1732                WARN_ON(*level >= BTRFS_MAX_LEVEL);
1733                cur = path->nodes[*level];
1734
1735                if (btrfs_header_level(cur) != *level)
1736                        WARN_ON(1);
1737
1738                if (path->slots[*level] >=
1739                    btrfs_header_nritems(cur))
1740                        break;
1741
1742                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1743                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1744                blocksize = btrfs_level_size(root, *level - 1);
1745
1746                parent = path->nodes[*level];
1747                root_owner = btrfs_header_owner(parent);
1748
1749                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1750                if (!next)
1751                        return -ENOMEM;
1752
1753                if (*level == 1) {
1754                        ret = wc->process_func(root, next, wc, ptr_gen);
1755                        if (ret)
1756                                return ret;
1757
1758                        path->slots[*level]++;
1759                        if (wc->free) {
1760                                ret = btrfs_read_buffer(next, ptr_gen);
1761                                if (ret) {
1762                                        free_extent_buffer(next);
1763                                        return ret;
1764                                }
1765
1766                                btrfs_tree_lock(next);
1767                                btrfs_set_lock_blocking(next);
1768                                clean_tree_block(trans, root, next);
1769                                btrfs_wait_tree_block_writeback(next);
1770                                btrfs_tree_unlock(next);
1771
1772                                WARN_ON(root_owner !=
1773                                        BTRFS_TREE_LOG_OBJECTID);
1774                                ret = btrfs_free_and_pin_reserved_extent(root,
1775                                                         bytenr, blocksize);
1776                                BUG_ON(ret); /* -ENOMEM or logic errors */
1777                        }
1778                        free_extent_buffer(next);
1779                        continue;
1780                }
1781                ret = btrfs_read_buffer(next, ptr_gen);
1782                if (ret) {
1783                        free_extent_buffer(next);
1784                        return ret;
1785                }
1786
1787                WARN_ON(*level <= 0);
1788                if (path->nodes[*level-1])
1789                        free_extent_buffer(path->nodes[*level-1]);
1790                path->nodes[*level-1] = next;
1791                *level = btrfs_header_level(next);
1792                path->slots[*level] = 0;
1793                cond_resched();
1794        }
1795        WARN_ON(*level < 0);
1796        WARN_ON(*level >= BTRFS_MAX_LEVEL);
1797
1798        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1799
1800        cond_resched();
1801        return 0;
1802}
1803
1804static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1805                                 struct btrfs_root *root,
1806                                 struct btrfs_path *path, int *level,
1807                                 struct walk_control *wc)
1808{
1809        u64 root_owner;
1810        int i;
1811        int slot;
1812        int ret;
1813
1814        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1815                slot = path->slots[i];
1816                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1817                        path->slots[i]++;
1818                        *level = i;
1819                        WARN_ON(*level == 0);
1820                        return 0;
1821                } else {
1822                        struct extent_buffer *parent;
1823                        if (path->nodes[*level] == root->node)
1824                                parent = path->nodes[*level];
1825                        else
1826                                parent = path->nodes[*level + 1];
1827
1828                        root_owner = btrfs_header_owner(parent);
1829                        ret = wc->process_func(root, path->nodes[*level], wc,
1830                                 btrfs_header_generation(path->nodes[*level]));
1831                        if (ret)
1832                                return ret;
1833
1834                        if (wc->free) {
1835                                struct extent_buffer *next;
1836
1837                                next = path->nodes[*level];
1838
1839                                btrfs_tree_lock(next);
1840                                btrfs_set_lock_blocking(next);
1841                                clean_tree_block(trans, root, next);
1842                                btrfs_wait_tree_block_writeback(next);
1843                                btrfs_tree_unlock(next);
1844
1845                                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1846                                ret = btrfs_free_and_pin_reserved_extent(root,
1847                                                path->nodes[*level]->start,
1848                                                path->nodes[*level]->len);
1849                                BUG_ON(ret);
1850                        }
1851                        free_extent_buffer(path->nodes[*level]);
1852                        path->nodes[*level] = NULL;
1853                        *level = i + 1;
1854                }
1855        }
1856        return 1;
1857}
1858
1859/*
1860 * drop the reference count on the tree rooted at 'snap'.  This traverses
1861 * the tree freeing any blocks that have a ref count of zero after being
1862 * decremented.
1863 */
1864static int walk_log_tree(struct btrfs_trans_handle *trans,
1865                         struct btrfs_root *log, struct walk_control *wc)
1866{
1867        int ret = 0;
1868        int wret;
1869        int level;
1870        struct btrfs_path *path;
1871        int i;
1872        int orig_level;
1873
1874        path = btrfs_alloc_path();
1875        if (!path)
1876                return -ENOMEM;
1877
1878        level = btrfs_header_level(log->node);
1879        orig_level = level;
1880        path->nodes[level] = log->node;
1881        extent_buffer_get(log->node);
1882        path->slots[level] = 0;
1883
1884        while (1) {
1885                wret = walk_down_log_tree(trans, log, path, &level, wc);
1886                if (wret > 0)
1887                        break;
1888                if (wret < 0) {
1889                        ret = wret;
1890                        goto out;
1891                }
1892
1893                wret = walk_up_log_tree(trans, log, path, &level, wc);
1894                if (wret > 0)
1895                        break;
1896                if (wret < 0) {
1897                        ret = wret;
1898                        goto out;
1899                }
1900        }
1901
1902        /* was the root node processed? if not, catch it here */
1903        if (path->nodes[orig_level]) {
1904                ret = wc->process_func(log, path->nodes[orig_level], wc,
1905                         btrfs_header_generation(path->nodes[orig_level]));
1906                if (ret)
1907                        goto out;
1908                if (wc->free) {
1909                        struct extent_buffer *next;
1910
1911                        next = path->nodes[orig_level];
1912
1913                        btrfs_tree_lock(next);
1914                        btrfs_set_lock_blocking(next);
1915                        clean_tree_block(trans, log, next);
1916                        btrfs_wait_tree_block_writeback(next);
1917                        btrfs_tree_unlock(next);
1918
1919                        WARN_ON(log->root_key.objectid !=
1920                                BTRFS_TREE_LOG_OBJECTID);
1921                        ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1922                                                         next->len);
1923                        BUG_ON(ret); /* -ENOMEM or logic errors */
1924                }
1925        }
1926
1927out:
1928        for (i = 0; i <= orig_level; i++) {
1929                if (path->nodes[i]) {
1930                        free_extent_buffer(path->nodes[i]);
1931                        path->nodes[i] = NULL;
1932                }
1933        }
1934        btrfs_free_path(path);
1935        return ret;
1936}
1937
1938/*
1939 * helper function to update the item for a given subvolumes log root
1940 * in the tree of log roots
1941 */
1942static int update_log_root(struct btrfs_trans_handle *trans,
1943                           struct btrfs_root *log)
1944{
1945        int ret;
1946
1947        if (log->log_transid == 1) {
1948                /* insert root item on the first sync */
1949                ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1950                                &log->root_key, &log->root_item);
1951        } else {
1952                ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1953                                &log->root_key, &log->root_item);
1954        }
1955        return ret;
1956}
1957
1958static int wait_log_commit(struct btrfs_trans_handle *trans,
1959                           struct btrfs_root *root, unsigned long transid)
1960{
1961        DEFINE_WAIT(wait);
1962        int index = transid % 2;
1963
1964        /*
1965         * we only allow two pending log transactions at a time,
1966         * so we know that if ours is more than 2 older than the
1967         * current transaction, we're done
1968         */
1969        do {
1970                prepare_to_wait(&root->log_commit_wait[index],
1971                                &wait, TASK_UNINTERRUPTIBLE);
1972                mutex_unlock(&root->log_mutex);
1973
1974                if (root->fs_info->last_trans_log_full_commit !=
1975                    trans->transid && root->log_transid < transid + 2 &&
1976                    atomic_read(&root->log_commit[index]))
1977                        schedule();
1978
1979                finish_wait(&root->log_commit_wait[index], &wait);
1980                mutex_lock(&root->log_mutex);
1981        } while (root->fs_info->last_trans_log_full_commit !=
1982                 trans->transid && root->log_transid < transid + 2 &&
1983                 atomic_read(&root->log_commit[index]));
1984        return 0;
1985}
1986
1987static void wait_for_writer(struct btrfs_trans_handle *trans,
1988                            struct btrfs_root *root)
1989{
1990        DEFINE_WAIT(wait);
1991        while (root->fs_info->last_trans_log_full_commit !=
1992               trans->transid && atomic_read(&root->log_writers)) {
1993                prepare_to_wait(&root->log_writer_wait,
1994                                &wait, TASK_UNINTERRUPTIBLE);
1995                mutex_unlock(&root->log_mutex);
1996                if (root->fs_info->last_trans_log_full_commit !=
1997                    trans->transid && atomic_read(&root->log_writers))
1998                        schedule();
1999                mutex_lock(&root->log_mutex);
2000                finish_wait(&root->log_writer_wait, &wait);
2001        }
2002}
2003
2004/*
2005 * btrfs_sync_log does sends a given tree log down to the disk and
2006 * updates the super blocks to record it.  When this call is done,
2007 * you know that any inodes previously logged are safely on disk only
2008 * if it returns 0.
2009 *
2010 * Any other return value means you need to call btrfs_commit_transaction.
2011 * Some of the edge cases for fsyncing directories that have had unlinks
2012 * or renames done in the past mean that sometimes the only safe
2013 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2014 * that has happened.
2015 */
2016int btrfs_sync_log(struct btrfs_trans_handle *trans,
2017                   struct btrfs_root *root)
2018{
2019        int index1;
2020        int index2;
2021        int mark;
2022        int ret;
2023        struct btrfs_root *log = root->log_root;
2024        struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2025        unsigned long log_transid = 0;
2026
2027        mutex_lock(&root->log_mutex);
2028        index1 = root->log_transid % 2;
2029        if (atomic_read(&root->log_commit[index1])) {
2030                wait_log_commit(trans, root, root->log_transid);
2031                mutex_unlock(&root->log_mutex);
2032                return 0;
2033        }
2034        atomic_set(&root->log_commit[index1], 1);
2035
2036        /* wait for previous tree log sync to complete */
2037        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2038                wait_log_commit(trans, root, root->log_transid - 1);
2039        while (1) {
2040                unsigned long batch = root->log_batch;
2041                /* when we're on an ssd, just kick the log commit out */
2042                if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2043                        mutex_unlock(&root->log_mutex);
2044                        schedule_timeout_uninterruptible(1);
2045                        mutex_lock(&root->log_mutex);
2046                }
2047                wait_for_writer(trans, root);
2048                if (batch == root->log_batch)
2049                        break;
2050        }
2051
2052        /* bail out if we need to do a full commit */
2053        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2054                ret = -EAGAIN;
2055                mutex_unlock(&root->log_mutex);
2056                goto out;
2057        }
2058
2059        log_transid = root->log_transid;
2060        if (log_transid % 2 == 0)
2061                mark = EXTENT_DIRTY;
2062        else
2063                mark = EXTENT_NEW;
2064
2065        /* we start IO on  all the marked extents here, but we don't actually
2066         * wait for them until later.
2067         */
2068        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2069        if (ret) {
2070                btrfs_abort_transaction(trans, root, ret);
2071                mutex_unlock(&root->log_mutex);
2072                goto out;
2073        }
2074
2075        btrfs_set_root_node(&log->root_item, log->node);
2076
2077        root->log_batch = 0;
2078        root->log_transid++;
2079        log->log_transid = root->log_transid;
2080        root->log_start_pid = 0;
2081        smp_mb();
2082        /*
2083         * IO has been started, blocks of the log tree have WRITTEN flag set
2084         * in their headers. new modifications of the log will be written to
2085         * new positions. so it's safe to allow log writers to go in.
2086         */
2087        mutex_unlock(&root->log_mutex);
2088
2089        mutex_lock(&log_root_tree->log_mutex);
2090        log_root_tree->log_batch++;
2091        atomic_inc(&log_root_tree->log_writers);
2092        mutex_unlock(&log_root_tree->log_mutex);
2093
2094        ret = update_log_root(trans, log);
2095
2096        mutex_lock(&log_root_tree->log_mutex);
2097        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2098                smp_mb();
2099                if (waitqueue_active(&log_root_tree->log_writer_wait))
2100                        wake_up(&log_root_tree->log_writer_wait);
2101        }
2102
2103        if (ret) {
2104                if (ret != -ENOSPC) {
2105                        btrfs_abort_transaction(trans, root, ret);
2106                        mutex_unlock(&log_root_tree->log_mutex);
2107                        goto out;
2108                }
2109                root->fs_info->last_trans_log_full_commit = trans->transid;
2110                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2111                mutex_unlock(&log_root_tree->log_mutex);
2112                ret = -EAGAIN;
2113                goto out;
2114        }
2115
2116        index2 = log_root_tree->log_transid % 2;
2117        if (atomic_read(&log_root_tree->log_commit[index2])) {
2118                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119                wait_log_commit(trans, log_root_tree,
2120                                log_root_tree->log_transid);
2121                mutex_unlock(&log_root_tree->log_mutex);
2122                ret = 0;
2123                goto out;
2124        }
2125        atomic_set(&log_root_tree->log_commit[index2], 1);
2126
2127        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2128                wait_log_commit(trans, log_root_tree,
2129                                log_root_tree->log_transid - 1);
2130        }
2131
2132        wait_for_writer(trans, log_root_tree);
2133
2134        /*
2135         * now that we've moved on to the tree of log tree roots,
2136         * check the full commit flag again
2137         */
2138        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2139                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2140                mutex_unlock(&log_root_tree->log_mutex);
2141                ret = -EAGAIN;
2142                goto out_wake_log_root;
2143        }
2144
2145        ret = btrfs_write_and_wait_marked_extents(log_root_tree,
2146                                &log_root_tree->dirty_log_pages,
2147                                EXTENT_DIRTY | EXTENT_NEW);
2148        if (ret) {
2149                btrfs_abort_transaction(trans, root, ret);
2150                mutex_unlock(&log_root_tree->log_mutex);
2151                goto out_wake_log_root;
2152        }
2153        btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2154
2155        btrfs_set_super_log_root(root->fs_info->super_for_commit,
2156                                log_root_tree->node->start);
2157        btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2158                                btrfs_header_level(log_root_tree->node));
2159
2160        log_root_tree->log_batch = 0;
2161        log_root_tree->log_transid++;
2162        smp_mb();
2163
2164        mutex_unlock(&log_root_tree->log_mutex);
2165
2166        /*
2167         * nobody else is going to jump in and write the the ctree
2168         * super here because the log_commit atomic below is protecting
2169         * us.  We must be called with a transaction handle pinning
2170         * the running transaction open, so a full commit can't hop
2171         * in and cause problems either.
2172         */
2173        btrfs_scrub_pause_super(root);
2174        write_ctree_super(trans, root->fs_info->tree_root, 1);
2175        btrfs_scrub_continue_super(root);
2176        ret = 0;
2177
2178        mutex_lock(&root->log_mutex);
2179        if (root->last_log_commit < log_transid)
2180                root->last_log_commit = log_transid;
2181        mutex_unlock(&root->log_mutex);
2182
2183out_wake_log_root:
2184        atomic_set(&log_root_tree->log_commit[index2], 0);
2185        smp_mb();
2186        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2187                wake_up(&log_root_tree->log_commit_wait[index2]);
2188out:
2189        atomic_set(&root->log_commit[index1], 0);
2190        smp_mb();
2191        if (waitqueue_active(&root->log_commit_wait[index1]))
2192                wake_up(&root->log_commit_wait[index1]);
2193        return ret;
2194}
2195
2196static void free_log_tree(struct btrfs_trans_handle *trans,
2197                          struct btrfs_root *log)
2198{
2199        int ret;
2200        u64 start;
2201        u64 end;
2202        struct walk_control wc = {
2203                .free = 1,
2204                .process_func = process_one_buffer
2205        };
2206
2207        ret = walk_log_tree(trans, log, &wc);
2208        BUG_ON(ret);
2209
2210        while (1) {
2211                ret = find_first_extent_bit(&log->dirty_log_pages,
2212                                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
2213                if (ret)
2214                        break;
2215
2216                clear_extent_bits(&log->dirty_log_pages, start, end,
2217                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2218        }
2219
2220        free_extent_buffer(log->node);
2221        kfree(log);
2222}
2223
2224/*
2225 * free all the extents used by the tree log.  This should be called
2226 * at commit time of the full transaction
2227 */
2228int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2229{
2230        if (root->log_root) {
2231                free_log_tree(trans, root->log_root);
2232                root->log_root = NULL;
2233        }
2234        return 0;
2235}
2236
2237int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2238                             struct btrfs_fs_info *fs_info)
2239{
2240        if (fs_info->log_root_tree) {
2241                free_log_tree(trans, fs_info->log_root_tree);
2242                fs_info->log_root_tree = NULL;
2243        }
2244        return 0;
2245}
2246
2247/*
2248 * If both a file and directory are logged, and unlinks or renames are
2249 * mixed in, we have a few interesting corners:
2250 *
2251 * create file X in dir Y
2252 * link file X to X.link in dir Y
2253 * fsync file X
2254 * unlink file X but leave X.link
2255 * fsync dir Y
2256 *
2257 * After a crash we would expect only X.link to exist.  But file X
2258 * didn't get fsync'd again so the log has back refs for X and X.link.
2259 *
2260 * We solve this by removing directory entries and inode backrefs from the
2261 * log when a file that was logged in the current transaction is
2262 * unlinked.  Any later fsync will include the updated log entries, and
2263 * we'll be able to reconstruct the proper directory items from backrefs.
2264 *
2265 * This optimizations allows us to avoid relogging the entire inode
2266 * or the entire directory.
2267 */
2268int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2269                                 struct btrfs_root *root,
2270                                 const char *name, int name_len,
2271                                 struct inode *dir, u64 index)
2272{
2273        struct btrfs_root *log;
2274        struct btrfs_dir_item *di;
2275        struct btrfs_path *path;
2276        int ret;
2277        int err = 0;
2278        int bytes_del = 0;
2279        u64 dir_ino = btrfs_ino(dir);
2280
2281        if (BTRFS_I(dir)->logged_trans < trans->transid)
2282                return 0;
2283
2284        ret = join_running_log_trans(root);
2285        if (ret)
2286                return 0;
2287
2288        mutex_lock(&BTRFS_I(dir)->log_mutex);
2289
2290        log = root->log_root;
2291        path = btrfs_alloc_path();
2292        if (!path) {
2293                err = -ENOMEM;
2294                goto out_unlock;
2295        }
2296
2297        di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2298                                   name, name_len, -1);
2299        if (IS_ERR(di)) {
2300                err = PTR_ERR(di);
2301                goto fail;
2302        }
2303        if (di) {
2304                ret = btrfs_delete_one_dir_name(trans, log, path, di);
2305                bytes_del += name_len;
2306                BUG_ON(ret);
2307        }
2308        btrfs_release_path(path);
2309        di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2310                                         index, name, name_len, -1);
2311        if (IS_ERR(di)) {
2312                err = PTR_ERR(di);
2313                goto fail;
2314        }
2315        if (di) {
2316                ret = btrfs_delete_one_dir_name(trans, log, path, di);
2317                bytes_del += name_len;
2318                BUG_ON(ret);
2319        }
2320
2321        /* update the directory size in the log to reflect the names
2322         * we have removed
2323         */
2324        if (bytes_del) {
2325                struct btrfs_key key;
2326
2327                key.objectid = dir_ino;
2328                key.offset = 0;
2329                key.type = BTRFS_INODE_ITEM_KEY;
2330                btrfs_release_path(path);
2331
2332                ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2333                if (ret < 0) {
2334                        err = ret;
2335                        goto fail;
2336                }
2337                if (ret == 0) {
2338                        struct btrfs_inode_item *item;
2339                        u64 i_size;
2340
2341                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2342                                              struct btrfs_inode_item);
2343                        i_size = btrfs_inode_size(path->nodes[0], item);
2344                        if (i_size > bytes_del)
2345                                i_size -= bytes_del;
2346                        else
2347                                i_size = 0;
2348                        btrfs_set_inode_size(path->nodes[0], item, i_size);
2349                        btrfs_mark_buffer_dirty(path->nodes[0]);
2350                } else
2351                        ret = 0;
2352                btrfs_release_path(path);
2353        }
2354fail:
2355        btrfs_free_path(path);
2356out_unlock:
2357        mutex_unlock(&BTRFS_I(dir)->log_mutex);
2358        if (ret == -ENOSPC) {
2359                root->fs_info->last_trans_log_full_commit = trans->transid;
2360                ret = 0;
2361        } else if (ret < 0)
2362                btrfs_abort_transaction(trans, root, ret);
2363
2364        btrfs_end_log_trans(root);
2365
2366        return err;
2367}
2368
2369/* see comments for btrfs_del_dir_entries_in_log */
2370int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2371                               struct btrfs_root *root,
2372                               const char *name, int name_len,
2373                               struct inode *inode, u64 dirid)
2374{
2375        struct btrfs_root *log;
2376        u64 index;
2377        int ret;
2378
2379        if (BTRFS_I(inode)->logged_trans < trans->transid)
2380                return 0;
2381
2382        ret = join_running_log_trans(root);
2383        if (ret)
2384                return 0;
2385        log = root->log_root;
2386        mutex_lock(&BTRFS_I(inode)->log_mutex);
2387
2388        ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2389                                  dirid, &index);
2390        mutex_unlock(&BTRFS_I(inode)->log_mutex);
2391        if (ret == -ENOSPC) {
2392                root->fs_info->last_trans_log_full_commit = trans->transid;
2393                ret = 0;
2394        } else if (ret < 0 && ret != -ENOENT)
2395                btrfs_abort_transaction(trans, root, ret);
2396        btrfs_end_log_trans(root);
2397
2398        return ret;
2399}
2400
2401/*
2402 * creates a range item in the log for 'dirid'.  first_offset and
2403 * last_offset tell us which parts of the key space the log should
2404 * be considered authoritative for.
2405 */
2406static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2407                                       struct btrfs_root *log,
2408                                       struct btrfs_path *path,
2409                                       int key_type, u64 dirid,
2410                                       u64 first_offset, u64 last_offset)
2411{
2412        int ret;
2413        struct btrfs_key key;
2414        struct btrfs_dir_log_item *item;
2415
2416        key.objectid = dirid;
2417        key.offset = first_offset;
2418        if (key_type == BTRFS_DIR_ITEM_KEY)
2419                key.type = BTRFS_DIR_LOG_ITEM_KEY;
2420        else
2421                key.type = BTRFS_DIR_LOG_INDEX_KEY;
2422        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2423        if (ret)
2424                return ret;
2425
2426        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2427                              struct btrfs_dir_log_item);
2428        btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2429        btrfs_mark_buffer_dirty(path->nodes[0]);
2430        btrfs_release_path(path);
2431        return 0;
2432}
2433
2434/*
2435 * log all the items included in the current transaction for a given
2436 * directory.  This also creates the range items in the log tree required
2437 * to replay anything deleted before the fsync
2438 */
2439static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2440                          struct btrfs_root *root, struct inode *inode,
2441                          struct btrfs_path *path,
2442                          struct btrfs_path *dst_path, int key_type,
2443                          u64 min_offset, u64 *last_offset_ret)
2444{
2445        struct btrfs_key min_key;
2446        struct btrfs_key max_key;
2447        struct btrfs_root *log = root->log_root;
2448        struct extent_buffer *src;
2449        int err = 0;
2450        int ret;
2451        int i;
2452        int nritems;
2453        u64 first_offset = min_offset;
2454        u64 last_offset = (u64)-1;
2455        u64 ino = btrfs_ino(inode);
2456
2457        log = root->log_root;
2458        max_key.objectid = ino;
2459        max_key.offset = (u64)-1;
2460        max_key.type = key_type;
2461
2462        min_key.objectid = ino;
2463        min_key.type = key_type;
2464        min_key.offset = min_offset;
2465
2466        path->keep_locks = 1;
2467
2468        ret = btrfs_search_forward(root, &min_key, &max_key,
2469                                   path, 0, trans->transid);
2470
2471        /*
2472         * we didn't find anything from this transaction, see if there
2473         * is anything at all
2474         */
2475        if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2476                min_key.objectid = ino;
2477                min_key.type = key_type;
2478                min_key.offset = (u64)-1;
2479                btrfs_release_path(path);
2480                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2481                if (ret < 0) {
2482                        btrfs_release_path(path);
2483                        return ret;
2484                }
2485                ret = btrfs_previous_item(root, path, ino, key_type);
2486
2487                /* if ret == 0 there are items for this type,
2488                 * create a range to tell us the last key of this type.
2489                 * otherwise, there are no items in this directory after
2490                 * *min_offset, and we create a range to indicate that.
2491                 */
2492                if (ret == 0) {
2493                        struct btrfs_key tmp;
2494                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2495                                              path->slots[0]);
2496                        if (key_type == tmp.type)
2497                                first_offset = max(min_offset, tmp.offset) + 1;
2498                }
2499                goto done;
2500        }
2501
2502        /* go backward to find any previous key */
2503        ret = btrfs_previous_item(root, path, ino, key_type);
2504        if (ret == 0) {
2505                struct btrfs_key tmp;
2506                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2507                if (key_type == tmp.type) {
2508                        first_offset = tmp.offset;
2509                        ret = overwrite_item(trans, log, dst_path,
2510                                             path->nodes[0], path->slots[0],
2511                                             &tmp);
2512                        if (ret) {
2513                                err = ret;
2514                                goto done;
2515                        }
2516                }
2517        }
2518        btrfs_release_path(path);
2519
2520        /* find the first key from this transaction again */
2521        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2522        if (ret != 0) {
2523                WARN_ON(1);
2524                goto done;
2525        }
2526
2527        /*
2528         * we have a block from this transaction, log every item in it
2529         * from our directory
2530         */
2531        while (1) {
2532                struct btrfs_key tmp;
2533                src = path->nodes[0];
2534                nritems = btrfs_header_nritems(src);
2535                for (i = path->slots[0]; i < nritems; i++) {
2536                        btrfs_item_key_to_cpu(src, &min_key, i);
2537
2538                        if (min_key.objectid != ino || min_key.type != key_type)
2539                                goto done;
2540                        ret = overwrite_item(trans, log, dst_path, src, i,
2541                                             &min_key);
2542                        if (ret) {
2543                                err = ret;
2544                                goto done;
2545                        }
2546                }
2547                path->slots[0] = nritems;
2548
2549                /*
2550                 * look ahead to the next item and see if it is also
2551                 * from this directory and from this transaction
2552                 */
2553                ret = btrfs_next_leaf(root, path);
2554                if (ret == 1) {
2555                        last_offset = (u64)-1;
2556                        goto done;
2557                }
2558                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2559                if (tmp.objectid != ino || tmp.type != key_type) {
2560                        last_offset = (u64)-1;
2561                        goto done;
2562                }
2563                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2564                        ret = overwrite_item(trans, log, dst_path,
2565                                             path->nodes[0], path->slots[0],
2566                                             &tmp);
2567                        if (ret)
2568                                err = ret;
2569                        else
2570                                last_offset = tmp.offset;
2571                        goto done;
2572                }
2573        }
2574done:
2575        btrfs_release_path(path);
2576        btrfs_release_path(dst_path);
2577
2578        if (err == 0) {
2579                *last_offset_ret = last_offset;
2580                /*
2581                 * insert the log range keys to indicate where the log
2582                 * is valid
2583                 */
2584                ret = insert_dir_log_key(trans, log, path, key_type,
2585                                         ino, first_offset, last_offset);
2586                if (ret)
2587                        err = ret;
2588        }
2589        return err;
2590}
2591
2592/*
2593 * logging directories is very similar to logging inodes, We find all the items
2594 * from the current transaction and write them to the log.
2595 *
2596 * The recovery code scans the directory in the subvolume, and if it finds a
2597 * key in the range logged that is not present in the log tree, then it means
2598 * that dir entry was unlinked during the transaction.
2599 *
2600 * In order for that scan to work, we must include one key smaller than
2601 * the smallest logged by this transaction and one key larger than the largest
2602 * key logged by this transaction.
2603 */
2604static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2605                          struct btrfs_root *root, struct inode *inode,
2606                          struct btrfs_path *path,
2607                          struct btrfs_path *dst_path)
2608{
2609        u64 min_key;
2610        u64 max_key;
2611        int ret;
2612        int key_type = BTRFS_DIR_ITEM_KEY;
2613
2614again:
2615        min_key = 0;
2616        max_key = 0;
2617        while (1) {
2618                ret = log_dir_items(trans, root, inode, path,
2619                                    dst_path, key_type, min_key,
2620                                    &max_key);
2621                if (ret)
2622                        return ret;
2623                if (max_key == (u64)-1)
2624                        break;
2625                min_key = max_key + 1;
2626        }
2627
2628        if (key_type == BTRFS_DIR_ITEM_KEY) {
2629                key_type = BTRFS_DIR_INDEX_KEY;
2630                goto again;
2631        }
2632        return 0;
2633}
2634
2635/*
2636 * a helper function to drop items from the log before we relog an
2637 * inode.  max_key_type indicates the highest item type to remove.
2638 * This cannot be run for file data extents because it does not
2639 * free the extents they point to.
2640 */
2641static int drop_objectid_items(struct btrfs_trans_handle *trans,
2642                                  struct btrfs_root *log,
2643                                  struct btrfs_path *path,
2644                                  u64 objectid, int max_key_type)
2645{
2646        int ret;
2647        struct btrfs_key key;
2648        struct btrfs_key found_key;
2649
2650        key.objectid = objectid;
2651        key.type = max_key_type;
2652        key.offset = (u64)-1;
2653
2654        while (1) {
2655                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2656                BUG_ON(ret == 0);
2657                if (ret < 0)
2658                        break;
2659
2660                if (path->slots[0] == 0)
2661                        break;
2662
2663                path->slots[0]--;
2664                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2665                                      path->slots[0]);
2666
2667                if (found_key.objectid != objectid)
2668                        break;
2669
2670                ret = btrfs_del_item(trans, log, path);
2671                if (ret)
2672                        break;
2673                btrfs_release_path(path);
2674        }
2675        btrfs_release_path(path);
2676        if (ret > 0)
2677                ret = 0;
2678        return ret;
2679}
2680
2681static noinline int copy_items(struct btrfs_trans_handle *trans,
2682                               struct btrfs_root *log,
2683                               struct btrfs_path *dst_path,
2684                               struct extent_buffer *src,
2685                               int start_slot, int nr, int inode_only)
2686{
2687        unsigned long src_offset;
2688        unsigned long dst_offset;
2689        struct btrfs_file_extent_item *extent;
2690        struct btrfs_inode_item *inode_item;
2691        int ret;
2692        struct btrfs_key *ins_keys;
2693        u32 *ins_sizes;
2694        char *ins_data;
2695        int i;
2696        struct list_head ordered_sums;
2697
2698        INIT_LIST_HEAD(&ordered_sums);
2699
2700        ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2701                           nr * sizeof(u32), GFP_NOFS);
2702        if (!ins_data)
2703                return -ENOMEM;
2704
2705        ins_sizes = (u32 *)ins_data;
2706        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2707
2708        for (i = 0; i < nr; i++) {
2709                ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2710                btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2711        }
2712        ret = btrfs_insert_empty_items(trans, log, dst_path,
2713                                       ins_keys, ins_sizes, nr);
2714        if (ret) {
2715                kfree(ins_data);
2716                return ret;
2717        }
2718
2719        for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2720                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2721                                                   dst_path->slots[0]);
2722
2723                src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2724
2725                copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2726                                   src_offset, ins_sizes[i]);
2727
2728                if (inode_only == LOG_INODE_EXISTS &&
2729                    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2730                        inode_item = btrfs_item_ptr(dst_path->nodes[0],
2731                                                    dst_path->slots[0],
2732                                                    struct btrfs_inode_item);
2733                        btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2734
2735                        /* set the generation to zero so the recover code
2736                         * can tell the difference between an logging
2737                         * just to say 'this inode exists' and a logging
2738                         * to say 'update this inode with these values'
2739                         */
2740                        btrfs_set_inode_generation(dst_path->nodes[0],
2741                                                   inode_item, 0);
2742                }
2743                /* take a reference on file data extents so that truncates
2744                 * or deletes of this inode don't have to relog the inode
2745                 * again
2746                 */
2747                if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2748                        int found_type;
2749                        extent = btrfs_item_ptr(src, start_slot + i,
2750                                                struct btrfs_file_extent_item);
2751
2752                        if (btrfs_file_extent_generation(src, extent) < trans->transid)
2753                                continue;
2754
2755                        found_type = btrfs_file_extent_type(src, extent);
2756                        if (found_type == BTRFS_FILE_EXTENT_REG ||
2757                            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2758                                u64 ds, dl, cs, cl;
2759                                ds = btrfs_file_extent_disk_bytenr(src,
2760                                                                extent);
2761                                /* ds == 0 is a hole */
2762                                if (ds == 0)
2763                                        continue;
2764
2765                                dl = btrfs_file_extent_disk_num_bytes(src,
2766                                                                extent);
2767                                cs = btrfs_file_extent_offset(src, extent);
2768                                cl = btrfs_file_extent_num_bytes(src,
2769                                                                extent);
2770                                if (btrfs_file_extent_compression(src,
2771                                                                  extent)) {
2772                                        cs = 0;
2773                                        cl = dl;
2774                                }
2775
2776                                ret = btrfs_lookup_csums_range(
2777                                                log->fs_info->csum_root,
2778                                                ds + cs, ds + cs + cl - 1,
2779                                                &ordered_sums, 0);
2780                                BUG_ON(ret);
2781                        }
2782                }
2783        }
2784
2785        btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2786        btrfs_release_path(dst_path);
2787        kfree(ins_data);
2788
2789        /*
2790         * we have to do this after the loop above to avoid changing the
2791         * log tree while trying to change the log tree.
2792         */
2793        ret = 0;
2794        while (!list_empty(&ordered_sums)) {
2795                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2796                                                   struct btrfs_ordered_sum,
2797                                                   list);
2798                if (!ret)
2799                        ret = btrfs_csum_file_blocks(trans, log, sums);
2800                list_del(&sums->list);
2801                kfree(sums);
2802        }
2803        return ret;
2804}
2805
2806/* log a single inode in the tree log.
2807 * At least one parent directory for this inode must exist in the tree
2808 * or be logged already.
2809 *
2810 * Any items from this inode changed by the current transaction are copied
2811 * to the log tree.  An extra reference is taken on any extents in this
2812 * file, allowing us to avoid a whole pile of corner cases around logging
2813 * blocks that have been removed from the tree.
2814 *
2815 * See LOG_INODE_ALL and related defines for a description of what inode_only
2816 * does.
2817 *
2818 * This handles both files and directories.
2819 */
2820static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2821                             struct btrfs_root *root, struct inode *inode,
2822                             int inode_only)
2823{
2824        struct btrfs_path *path;
2825        struct btrfs_path *dst_path;
2826        struct btrfs_key min_key;
2827        struct btrfs_key max_key;
2828        struct btrfs_root *log = root->log_root;
2829        struct extent_buffer *src = NULL;
2830        int err = 0;
2831        int ret;
2832        int nritems;
2833        int ins_start_slot = 0;
2834        int ins_nr;
2835        u64 ino = btrfs_ino(inode);
2836
2837        log = root->log_root;
2838
2839        path = btrfs_alloc_path();
2840        if (!path)
2841                return -ENOMEM;
2842        dst_path = btrfs_alloc_path();
2843        if (!dst_path) {
2844                btrfs_free_path(path);
2845                return -ENOMEM;
2846        }
2847
2848        min_key.objectid = ino;
2849        min_key.type = BTRFS_INODE_ITEM_KEY;
2850        min_key.offset = 0;
2851
2852        max_key.objectid = ino;
2853
2854        /* today the code can only do partial logging of directories */
2855        if (!S_ISDIR(inode->i_mode))
2856            inode_only = LOG_INODE_ALL;
2857
2858        if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2859                max_key.type = BTRFS_XATTR_ITEM_KEY;
2860        else
2861                max_key.type = (u8)-1;
2862        max_key.offset = (u64)-1;
2863
2864        ret = btrfs_commit_inode_delayed_items(trans, inode);
2865        if (ret) {
2866                btrfs_free_path(path);
2867                btrfs_free_path(dst_path);
2868                return ret;
2869        }
2870
2871        mutex_lock(&BTRFS_I(inode)->log_mutex);
2872
2873        /*
2874         * a brute force approach to making sure we get the most uptodate
2875         * copies of everything.
2876         */
2877        if (S_ISDIR(inode->i_mode)) {
2878                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2879
2880                if (inode_only == LOG_INODE_EXISTS)
2881                        max_key_type = BTRFS_XATTR_ITEM_KEY;
2882                ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2883        } else {
2884                ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2885        }
2886        if (ret) {
2887                err = ret;
2888                goto out_unlock;
2889        }
2890        path->keep_locks = 1;
2891
2892        while (1) {
2893                ins_nr = 0;
2894                ret = btrfs_search_forward(root, &min_key, &max_key,
2895                                           path, 0, trans->transid);
2896                if (ret != 0)
2897                        break;
2898again:
2899                /* note, ins_nr might be > 0 here, cleanup outside the loop */
2900                if (min_key.objectid != ino)
2901                        break;
2902                if (min_key.type > max_key.type)
2903                        break;
2904
2905                src = path->nodes[0];
2906                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2907                        ins_nr++;
2908                        goto next_slot;
2909                } else if (!ins_nr) {
2910                        ins_start_slot = path->slots[0];
2911                        ins_nr = 1;
2912                        goto next_slot;
2913                }
2914
2915                ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2916                                 ins_nr, inode_only);
2917                if (ret) {
2918                        err = ret;
2919                        goto out_unlock;
2920                }
2921                ins_nr = 1;
2922                ins_start_slot = path->slots[0];
2923next_slot:
2924
2925                nritems = btrfs_header_nritems(path->nodes[0]);
2926                path->slots[0]++;
2927                if (path->slots[0] < nritems) {
2928                        btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2929                                              path->slots[0]);
2930                        goto again;
2931                }
2932                if (ins_nr) {
2933                        ret = copy_items(trans, log, dst_path, src,
2934                                         ins_start_slot,
2935                                         ins_nr, inode_only);
2936                        if (ret) {
2937                                err = ret;
2938                                goto out_unlock;
2939                        }
2940                        ins_nr = 0;
2941                }
2942                btrfs_release_path(path);
2943
2944                if (min_key.offset < (u64)-1)
2945                        min_key.offset++;
2946                else if (min_key.type < (u8)-1)
2947                        min_key.type++;
2948                else if (min_key.objectid < (u64)-1)
2949                        min_key.objectid++;
2950                else
2951                        break;
2952        }
2953        if (ins_nr) {
2954                ret = copy_items(trans, log, dst_path, src,
2955                                 ins_start_slot,
2956                                 ins_nr, inode_only);
2957                if (ret) {
2958                        err = ret;
2959                        goto out_unlock;
2960                }
2961                ins_nr = 0;
2962        }
2963        WARN_ON(ins_nr);
2964        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2965                btrfs_release_path(path);
2966                btrfs_release_path(dst_path);
2967                ret = log_directory_changes(trans, root, inode, path, dst_path);
2968                if (ret) {
2969                        err = ret;
2970                        goto out_unlock;
2971                }
2972        }
2973        BTRFS_I(inode)->logged_trans = trans->transid;
2974out_unlock:
2975        mutex_unlock(&BTRFS_I(inode)->log_mutex);
2976
2977        btrfs_free_path(path);
2978        btrfs_free_path(dst_path);
2979        return err;
2980}
2981
2982/*
2983 * follow the dentry parent pointers up the chain and see if any
2984 * of the directories in it require a full commit before they can
2985 * be logged.  Returns zero if nothing special needs to be done or 1 if
2986 * a full commit is required.
2987 */
2988static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2989                                               struct inode *inode,
2990                                               struct dentry *parent,
2991                                               struct super_block *sb,
2992                                               u64 last_committed)
2993{
2994        int ret = 0;
2995        struct btrfs_root *root;
2996        struct dentry *old_parent = NULL;
2997
2998        /*
2999         * for regular files, if its inode is already on disk, we don't
3000         * have to worry about the parents at all.  This is because
3001         * we can use the last_unlink_trans field to record renames
3002         * and other fun in this file.
3003         */
3004        if (S_ISREG(inode->i_mode) &&
3005            BTRFS_I(inode)->generation <= last_committed &&
3006            BTRFS_I(inode)->last_unlink_trans <= last_committed)
3007                        goto out;
3008
3009        if (!S_ISDIR(inode->i_mode)) {
3010                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3011                        goto out;
3012                inode = parent->d_inode;
3013        }
3014
3015        while (1) {
3016                BTRFS_I(inode)->logged_trans = trans->transid;
3017                smp_mb();
3018
3019                if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
3020                        root = BTRFS_I(inode)->root;
3021
3022                        /*
3023                         * make sure any commits to the log are forced
3024                         * to be full commits
3025                         */
3026                        root->fs_info->last_trans_log_full_commit =
3027                                trans->transid;
3028                        ret = 1;
3029                        break;
3030                }
3031
3032                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3033                        break;
3034
3035                if (IS_ROOT(parent))
3036                        break;
3037
3038                parent = dget_parent(parent);
3039                dput(old_parent);
3040                old_parent = parent;
3041                inode = parent->d_inode;
3042
3043        }
3044        dput(old_parent);
3045out:
3046        return ret;
3047}
3048
3049/*
3050 * helper function around btrfs_log_inode to make sure newly created
3051 * parent directories also end up in the log.  A minimal inode and backref
3052 * only logging is done of any parent directories that are older than
3053 * the last committed transaction
3054 */
3055int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3056                    struct btrfs_root *root, struct inode *inode,
3057                    struct dentry *parent, int exists_only)
3058{
3059        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
3060        struct super_block *sb;
3061        struct dentry *old_parent = NULL;
3062        int ret = 0;
3063        u64 last_committed = root->fs_info->last_trans_committed;
3064
3065        sb = inode->i_sb;
3066
3067        if (btrfs_test_opt(root, NOTREELOG)) {
3068                ret = 1;
3069                goto end_no_trans;
3070        }
3071
3072        if (root->fs_info->last_trans_log_full_commit >
3073            root->fs_info->last_trans_committed) {
3074                ret = 1;
3075                goto end_no_trans;
3076        }
3077
3078        if (root != BTRFS_I(inode)->root ||
3079            btrfs_root_refs(&root->root_item) == 0) {
3080                ret = 1;
3081                goto end_no_trans;
3082        }
3083
3084        ret = check_parent_dirs_for_sync(trans, inode, parent,
3085                                         sb, last_committed);
3086        if (ret)
3087                goto end_no_trans;
3088
3089        if (btrfs_inode_in_log(inode, trans->transid)) {
3090                ret = BTRFS_NO_LOG_SYNC;
3091                goto end_no_trans;
3092        }
3093
3094        ret = start_log_trans(trans, root);
3095        if (ret)
3096                goto end_trans;
3097
3098        ret = btrfs_log_inode(trans, root, inode, inode_only);
3099        if (ret)
3100                goto end_trans;
3101
3102        /*
3103         * for regular files, if its inode is already on disk, we don't
3104         * have to worry about the parents at all.  This is because
3105         * we can use the last_unlink_trans field to record renames
3106         * and other fun in this file.
3107         */
3108        if (S_ISREG(inode->i_mode) &&
3109            BTRFS_I(inode)->generation <= last_committed &&
3110            BTRFS_I(inode)->last_unlink_trans <= last_committed) {
3111                ret = 0;
3112                goto end_trans;
3113        }
3114
3115        inode_only = LOG_INODE_EXISTS;
3116        while (1) {
3117                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3118                        break;
3119
3120                inode = parent->d_inode;
3121                if (root != BTRFS_I(inode)->root)
3122                        break;
3123
3124                if (BTRFS_I(inode)->generation >
3125                    root->fs_info->last_trans_committed) {
3126                        ret = btrfs_log_inode(trans, root, inode, inode_only);
3127                        if (ret)
3128                                goto end_trans;
3129                }
3130                if (IS_ROOT(parent))
3131                        break;
3132
3133                parent = dget_parent(parent);
3134                dput(old_parent);
3135                old_parent = parent;
3136        }
3137        ret = 0;
3138end_trans:
3139        dput(old_parent);
3140        if (ret < 0) {
3141                BUG_ON(ret != -ENOSPC);
3142                root->fs_info->last_trans_log_full_commit = trans->transid;
3143                ret = 1;
3144        }
3145        btrfs_end_log_trans(root);
3146end_no_trans:
3147        return ret;
3148}
3149
3150/*
3151 * it is not safe to log dentry if the chunk root has added new
3152 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
3153 * If this returns 1, you must commit the transaction to safely get your
3154 * data on disk.
3155 */
3156int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3157                          struct btrfs_root *root, struct dentry *dentry)
3158{
3159        struct dentry *parent = dget_parent(dentry);
3160        int ret;
3161
3162        ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3163        dput(parent);
3164
3165        return ret;
3166}
3167
3168/*
3169 * should be called during mount to recover any replay any log trees
3170 * from the FS
3171 */
3172int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3173{
3174        int ret;
3175        struct btrfs_path *path;
3176        struct btrfs_trans_handle *trans;
3177        struct btrfs_key key;
3178        struct btrfs_key found_key;
3179        struct btrfs_key tmp_key;
3180        struct btrfs_root *log;
3181        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
3182        struct walk_control wc = {
3183                .process_func = process_one_buffer,
3184                .stage = 0,
3185        };
3186
3187        path = btrfs_alloc_path();
3188        if (!path)
3189                return -ENOMEM;
3190
3191        fs_info->log_root_recovering = 1;
3192
3193        trans = btrfs_start_transaction(fs_info->tree_root, 0);
3194        if (IS_ERR(trans)) {
3195                ret = PTR_ERR(trans);
3196                goto error;
3197        }
3198
3199        wc.trans = trans;
3200        wc.pin = 1;
3201
3202        ret = walk_log_tree(trans, log_root_tree, &wc);
3203        if (ret) {
3204                btrfs_error(fs_info, ret, "Failed to pin buffers while "
3205                            "recovering log root tree.");
3206                goto error;
3207        }
3208
3209again:
3210        key.objectid = BTRFS_TREE_LOG_OBJECTID;
3211        key.offset = (u64)-1;
3212        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
3213
3214        while (1) {
3215                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
3216
3217                if (ret < 0) {
3218                        btrfs_error(fs_info, ret,
3219                                    "Couldn't find tree log root.");
3220                        goto error;
3221                }
3222                if (ret > 0) {
3223                        if (path->slots[0] == 0)
3224                                break;
3225                        path->slots[0]--;
3226                }
3227                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3228                                      path->slots[0]);
3229                btrfs_release_path(path);
3230                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
3231                        break;
3232
3233                log = btrfs_read_fs_root_no_radix(log_root_tree,
3234                                                  &found_key);
3235                if (IS_ERR(log)) {
3236                        ret = PTR_ERR(log);
3237                        btrfs_error(fs_info, ret,
3238                                    "Couldn't read tree log root.");
3239                        goto error;
3240                }
3241
3242                tmp_key.objectid = found_key.offset;
3243                tmp_key.type = BTRFS_ROOT_ITEM_KEY;
3244                tmp_key.offset = (u64)-1;
3245
3246                wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
3247                if (IS_ERR(wc.replay_dest)) {
3248                        ret = PTR_ERR(wc.replay_dest);
3249                        btrfs_error(fs_info, ret, "Couldn't read target root "
3250                                    "for tree log recovery.");
3251                        goto error;
3252                }
3253
3254                wc.replay_dest->log_root = log;
3255                btrfs_record_root_in_trans(trans, wc.replay_dest);
3256                ret = walk_log_tree(trans, log, &wc);
3257                BUG_ON(ret);
3258
3259                if (wc.stage == LOG_WALK_REPLAY_ALL) {
3260                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
3261                                                      path);
3262                        BUG_ON(ret);
3263                }
3264
3265                key.offset = found_key.offset - 1;
3266                wc.replay_dest->log_root = NULL;
3267                free_extent_buffer(log->node);
3268                free_extent_buffer(log->commit_root);
3269                kfree(log);
3270
3271                if (found_key.offset == 0)
3272                        break;
3273        }
3274        btrfs_release_path(path);
3275
3276        /* step one is to pin it all, step two is to replay just inodes */
3277        if (wc.pin) {
3278                wc.pin = 0;
3279                wc.process_func = replay_one_buffer;
3280                wc.stage = LOG_WALK_REPLAY_INODES;
3281                goto again;
3282        }
3283        /* step three is to replay everything */
3284        if (wc.stage < LOG_WALK_REPLAY_ALL) {
3285                wc.stage++;
3286                goto again;
3287        }
3288
3289        btrfs_free_path(path);
3290
3291        free_extent_buffer(log_root_tree->node);
3292        log_root_tree->log_root = NULL;
3293        fs_info->log_root_recovering = 0;
3294
3295        /* step 4: commit the transaction, which also unpins the blocks */
3296        btrfs_commit_transaction(trans, fs_info->tree_root);
3297
3298        kfree(log_root_tree);
3299        return 0;
3300
3301error:
3302        btrfs_free_path(path);
3303        return ret;
3304}
3305
3306/*
3307 * there are some corner cases where we want to force a full
3308 * commit instead of allowing a directory to be logged.
3309 *
3310 * They revolve around files there were unlinked from the directory, and
3311 * this function updates the parent directory so that a full commit is
3312 * properly done if it is fsync'd later after the unlinks are done.
3313 */
3314void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
3315                             struct inode *dir, struct inode *inode,
3316                             int for_rename)
3317{
3318        /*
3319         * when we're logging a file, if it hasn't been renamed
3320         * or unlinked, and its inode is fully committed on disk,
3321         * we don't have to worry about walking up the directory chain
3322         * to log its parents.
3323         *
3324         * So, we use the last_unlink_trans field to put this transid
3325         * into the file.  When the file is logged we check it and
3326         * don't log the parents if the file is fully on disk.
3327         */
3328        if (S_ISREG(inode->i_mode))
3329                BTRFS_I(inode)->last_unlink_trans = trans->transid;
3330
3331        /*
3332         * if this directory was already logged any new
3333         * names for this file/dir will get recorded
3334         */
3335        smp_mb();
3336        if (BTRFS_I(dir)->logged_trans == trans->transid)
3337                return;
3338
3339        /*
3340         * if the inode we're about to unlink was logged,
3341         * the log will be properly updated for any new names
3342         */
3343        if (BTRFS_I(inode)->logged_trans == trans->transid)
3344                return;
3345
3346        /*
3347         * when renaming files across directories, if the directory
3348         * there we're unlinking from gets fsync'd later on, there's
3349         * no way to find the destination directory later and fsync it
3350         * properly.  So, we have to be conservative and force commits
3351         * so the new name gets discovered.
3352         */
3353        if (for_rename)
3354                goto record;
3355
3356        /* we can safely do the unlink without any special recording */
3357        return;
3358
3359record:
3360        BTRFS_I(dir)->last_unlink_trans = trans->transid;
3361}
3362
3363/*
3364 * Call this after adding a new name for a file and it will properly
3365 * update the log to reflect the new name.
3366 *
3367 * It will return zero if all goes well, and it will return 1 if a
3368 * full transaction commit is required.
3369 */
3370int btrfs_log_new_name(struct btrfs_trans_handle *trans,
3371                        struct inode *inode, struct inode *old_dir,
3372                        struct dentry *parent)
3373{
3374        struct btrfs_root * root = BTRFS_I(inode)->root;
3375
3376        /*
3377         * this will force the logging code to walk the dentry chain
3378         * up for the file
3379         */
3380        if (S_ISREG(inode->i_mode))
3381                BTRFS_I(inode)->last_unlink_trans = trans->transid;
3382
3383        /*
3384         * if this inode hasn't been logged and directory we're renaming it
3385         * from hasn't been logged, we don't need to log it
3386         */
3387        if (BTRFS_I(inode)->logged_trans <=
3388            root->fs_info->last_trans_committed &&
3389            (!old_dir || BTRFS_I(old_dir)->logged_trans <=
3390                    root->fs_info->last_trans_committed))
3391                return 0;
3392
3393        return btrfs_log_inode_parent(trans, root, inode, parent, 1);
3394}
3395
3396
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.