/* linux/fs/btrfs/ctree.c */
   1/*
   2 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/rbtree.h>
  22#include "ctree.h"
  23#include "disk-io.h"
  24#include "transaction.h"
  25#include "print-tree.h"
  26#include "locking.h"
  27
  28static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  29                      *root, struct btrfs_path *path, int level);
  30static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
  31                      *root, struct btrfs_key *ins_key,
  32                      struct btrfs_path *path, int data_size, int extend);
  33static int push_node_left(struct btrfs_trans_handle *trans,
  34                          struct btrfs_root *root, struct extent_buffer *dst,
  35                          struct extent_buffer *src, int empty);
  36static int balance_node_right(struct btrfs_trans_handle *trans,
  37                              struct btrfs_root *root,
  38                              struct extent_buffer *dst_buf,
  39                              struct extent_buffer *src_buf);
  40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  41                    struct btrfs_path *path, int level, int slot,
  42                    int tree_mod_log);
  43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
  44                                 struct extent_buffer *eb);
  45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
  46                                          u32 blocksize, u64 parent_transid,
  47                                          u64 time_seq);
  48struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
  49                                                u64 bytenr, u32 blocksize,
  50                                                u64 time_seq);
  51
  52struct btrfs_path *btrfs_alloc_path(void)
  53{
  54        struct btrfs_path *path;
  55        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  56        return path;
  57}
  58
  59/*
  60 * set all locked nodes in the path to blocking locks.  This should
  61 * be done before scheduling
  62 */
  63noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  64{
  65        int i;
  66        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  67                if (!p->nodes[i] || !p->locks[i])
  68                        continue;
  69                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
  70                if (p->locks[i] == BTRFS_READ_LOCK)
  71                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  72                else if (p->locks[i] == BTRFS_WRITE_LOCK)
  73                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
  74        }
  75}
  76
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		/* track the new state so we can undo it at the bottom */
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	/* walk from the root down so spinlocks are retaken top-down */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
 122
 123/* this also releases the path */
 124void btrfs_free_path(struct btrfs_path *p)
 125{
 126        if (!p)
 127                return;
 128        btrfs_release_path(p);
 129        kmem_cache_free(btrfs_path_cachep, p);
 130}
 131
 132/*
 133 * path release drops references on the extent buffers in the path
 134 * and it drops any locks held by this path
 135 *
 136 * It is safe to call this on paths that no locks or extent buffers held.
 137 */
 138noinline void btrfs_release_path(struct btrfs_path *p)
 139{
 140        int i;
 141
 142        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 143                p->slots[i] = 0;
 144                if (!p->nodes[i])
 145                        continue;
 146                if (p->locks[i]) {
 147                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 148                        p->locks[i] = 0;
 149                }
 150                free_extent_buffer(p->nodes[i]);
 151                p->nodes[i] = NULL;
 152        }
 153}
 154
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* refcount was non-zero, the buffer is ours */
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* wait for the concurrent freer, then retry the lookup */
		synchronize_rcu();
	}
	return eb;
}
 188
 189/* loop around taking references on and locking the root node of the
 190 * tree until you end up with a lock on the root.  A locked buffer
 191 * is returned, with a reference held.
 192 */
 193struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 194{
 195        struct extent_buffer *eb;
 196
 197        while (1) {
 198                eb = btrfs_root_node(root);
 199                btrfs_tree_lock(eb);
 200                if (eb == root->node)
 201                        break;
 202                btrfs_tree_unlock(eb);
 203                free_extent_buffer(eb);
 204        }
 205        return eb;
 206}
 207
 208/* loop around taking references on and locking the root node of the
 209 * tree until you end up with a lock on the root.  A locked buffer
 210 * is returned, with a reference held.
 211 */
 212struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 213{
 214        struct extent_buffer *eb;
 215
 216        while (1) {
 217                eb = btrfs_root_node(root);
 218                btrfs_tree_read_lock(eb);
 219                if (eb == root->node)
 220                        break;
 221                btrfs_tree_read_unlock(eb);
 222                free_extent_buffer(eb);
 223        }
 224        return eb;
 225}
 226
 227/* cowonly root (everything not a reference counted cow subvolume), just get
 228 * put onto a simple dirty list.  transaction.c walks this to make sure they
 229 * get properly updated on disk.
 230 */
 231static void add_root_to_dirty_list(struct btrfs_root *root)
 232{
 233        spin_lock(&root->fs_info->trans_lock);
 234        if (root->track_dirty && list_empty(&root->dirty_list)) {
 235                list_add(&root->dirty_list,
 236                         &root->fs_info->dirty_cowonly_roots);
 237        }
 238        spin_unlock(&root->fs_info->trans_lock);
 239}
 240
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* ref-counted trees must only be copied inside the running transaction */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	/* the first key of the buffer seeds the new block's allocation hint */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the block, then rewrite the header for its new identity */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* reloc trees take full backrefs, everything else normal refs */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
 300
/* operation types recorded as elements of the tree modification log */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
 310
/* payload for MOD_LOG_MOVE_KEYS: destination slot and item count */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};
 315
/* payload for MOD_LOG_ROOT_REPLACE: location and level of the old root */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
 320
/*
 * a single recorded tree modification, stored in fs_info->tree_mod_log
 * (an rb tree keyed by index, then seq).
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical of the affected block */
	u64 seq;		/* tree mod log sequence number */
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
 343
/* take the tree mod log rwlock for reading (log lookups) */
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}
 348
/* release the tree mod log rwlock taken by tree_mod_log_read_lock */
static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}
 353
/* take the tree mod log rwlock for writing (log insertion/removal) */
static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}
 358
/* release the tree mod log rwlock taken by tree_mod_log_write_lock */
static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}
 363
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	u64 seq;

	/* lock order: tree_mod_log_lock before tree_mod_seq_lock */
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		/* first call for this elem: register it as a blocker */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	seq = btrfs_inc_tree_mod_seq(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return seq;
}
 389
/*
 * Drop the blocker @elem.  If @elem was the blocker with the lowest
 * sequence number, log entries no longer protected by any remaining
 * blocker are removed and freed.  A no-op for an unregistered elem
 * (elem->seq == 0).
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the smallest seq among the remaining blockers */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* save the successor before erasing the current node */
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
 439
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * Takes ownership of @tm: it is freed here if an element with the same
 * (index, seq) already exists (-EEXIST).  Callers hold
 * tree_mod_log_write_lock (taken via tree_mod_dont_log).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	/* a valid element always carries a non-zero sequence number */
	BUG_ON(!tm || !tm->seq);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else {
			/* duplicate (index, seq): drop the new element */
			kfree(tm);
			return -EEXIST;
		}
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
 481
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	/* barrier before the unlocked peek at the blocker list */
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	/* leaves are never logged, only node-level changes are */
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	/* re-check the list now that we hold the lock */
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/*
		 * someone emptied the list while we were waiting for the lock.
		 * we must not add to the list when no blocker exists.
		 */
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}
 508
/*
 * This allocates memory and gets a tree modification sequence number.
 *
 * Returns <0 on error.
 * Returns >0 (the added sequence number) on success.
 */
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
				 struct tree_mod_elem **tm_ret)
{
	struct tree_mod_elem *tm;

	/*
	 * once we switch from spin locks to something different, we should
	 * honor the flags parameter here.  Callers reach this with
	 * tree_mod_log_write_lock held, so the allocation must not sleep;
	 * hence GFP_ATOMIC regardless of @flags for now.
	 */
	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
	if (!tm)
		return -ENOMEM;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
	return tm->seq;
}
 531
/*
 * Record a single key modification of @eb at @slot in the tree mod log.
 * For anything but a key addition, the key and block pointer currently in
 * the slot are saved in the element so the change can be rewound later.
 * Callers hold tree_mod_log_write_lock.
 */
static inline int
__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot,
			  enum mod_log_op op, gfp_t flags)
{
	int ret;
	struct tree_mod_elem *tm;

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		return ret;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		/* preserve what the slot held before the modification */
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);

	return __tree_mod_log_insert(fs_info, tm);
}
 555
/*
 * Log a key modification, taking and releasing the tree mod log write
 * lock itself.  Returns 0 immediately when logging can be skipped.
 */
static noinline int
tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
			     struct extent_buffer *eb, int slot,
			     enum mod_log_op op, gfp_t flags)
{
	int ret;

	/* returns with the write lock held when logging is required */
	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);

	tree_mod_log_write_unlock(fs_info);
	return ret;
}
 571
/* convenience wrapper: log a key modification with GFP_NOFS */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
			int slot, enum mod_log_op op)
{
	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
}
 578
/*
 * same as tree_mod_log_insert_key, but the caller already holds
 * tree_mod_log_write_lock and has verified that logging is required
 */
static noinline int
tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
			     struct extent_buffer *eb, int slot,
			     enum mod_log_op op)
{
	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
}
 586
/*
 * Log a move of @nr_items within node @eb from @src_slot to @dst_slot.
 * Slots that the move overwrites are logged individually as removals
 * first, then a single MOVE_KEYS element describes the move itself.
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	/* log removal of the destination slots the move will overwrite */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		BUG_ON(ret < 0);
	}

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	tree_mod_log_write_unlock(fs_info);
	return ret;
}
 620
/*
 * Log the removal of all keys of node @eb prior to freeing the block.
 * Slots are logged last-to-first.  Leaves are not logged.  Callers hold
 * tree_mod_log_write_lock.
 */
static inline void
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	int i;
	u32 nritems;
	int ret;

	if (btrfs_header_level(eb) == 0)
		return;

	nritems = btrfs_header_nritems(eb);
	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		BUG_ON(ret < 0);
	}
}
 638
/*
 * Log the replacement of tree root @old_root by @new_root.  All keys of
 * the old root are logged as freed first; the ROOT_REPLACE element is
 * keyed by the *new* root's logical address and remembers the old root's
 * location, level and generation.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (tree_mod_dont_log(fs_info, NULL))
		return 0;

	__tree_mod_log_free_eb(fs_info, old_root);

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	tree_mod_log_write_unlock(fs_info);
	return ret;
}
 667
/*
 * Search the tree mod log for an element of the block at @start with
 * seq >= @min_seq.  With @smallest set, the element with the smallest
 * qualifying seq is returned; otherwise the one with the largest seq.
 * Returns NULL when no qualifying element exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			/* too old to be of interest */
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on (index, min_seq) */
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}
 710
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
 722
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
 733
/*
 * Log the copy of @nr_items pointers from node @src (starting at
 * @src_offset) into node @dst (starting at @dst_offset): each item is
 * recorded as a removal from src plus an addition to dst.  Nothing is
 * logged when both buffers are leaves.
 */
static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, NULL))
		return;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
		/* leaf-to-leaf copies are not logged */
		tree_mod_log_write_unlock(fs_info);
		return;
	}

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert_key_locked(fs_info, src,
						     i + src_offset,
						     MOD_LOG_KEY_REMOVE);
		BUG_ON(ret < 0);
		ret = tree_mod_log_insert_key_locked(fs_info, dst,
						     i + dst_offset,
						     MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}

	tree_mod_log_write_unlock(fs_info);
}
 763
/* log a move within node @dst; any logging failure is fatal (BUG) */
static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}
 773
/*
 * log the replacement of the key in @slot of node @eb; @atomic selects
 * GFP_ATOMIC allocation for callers in atomic context
 */
static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb,
			  struct btrfs_disk_key *disk_key, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
					   MOD_LOG_KEY_REPLACE,
					   atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}
 786
/*
 * log the freeing of node @eb, handling the tree mod log locking and the
 * don't-log fast path itself
 */
static noinline void
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	if (tree_mod_dont_log(fs_info, eb))
		return;

	__tree_mod_log_free_eb(fs_info, eb);

	tree_mod_log_write_unlock(fs_info);
}
 797
/*
 * log that @root's node is being replaced by @new_root_node; any logging
 * failure is fatal (BUG)
 */
static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS);
	BUG_ON(ret < 0);
}
 807
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* old-style backrefs can't prove exclusivity, assume shared */
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
 833
/*
 * Adjust extent backrefs when @buf is about to be replaced by its COW
 * copy @cow.  Sets *last_ref to 1 when this COW drops the last
 * reference to @buf (the caller is then responsible for freeing it).
 * Returns 0 on success or a negative errno.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is been relocating
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is some operations (such as drop tree)
         * are only allowed for blocks use full backrefs.
         */

        /* shared blocks need their real refcount; otherwise assume one ref */
        if (btrfs_block_can_be_shared(root, buf)) {
                ret = btrfs_lookup_extent_info(trans, root, buf->start,
                                               buf->len, &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        /* a shareable block with no refs is corruption:
                         * force the filesystem read-only */
                        ret = -EROFS;
                        btrfs_std_error(root->fs_info, ret);
                        return ret;
                }
        } else {
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        /* reloc-owned blocks must already carry full backrefs */
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        /* convert the block to full backrefs */
                        ret = btrfs_inc_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret); /* -ENOMEM */

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0, 1);
                                BUG_ON(ret); /* -ENOMEM */
                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        /* take a reference on behalf of the new copy */
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
                if (new_flags != 0) {
                        /* persist the new backref flavor on disk */
                        ret = btrfs_set_disk_extent_flags(trans, root,
                                                          buf->start,
                                                          buf->len,
                                                          new_flags, 0);
                        if (ret)
                                return ret;
                }
        } else {
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret); /* -ENOMEM */
                        ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
                /*
                 * don't log freeing in case we're freeing the root node, this
                 * is done by tree_mod_log_set_root_pointer later
                 */
                if (buf != root->node && btrfs_header_level(buf) != 0)
                        tree_mod_log_free_eb(root->fs_info, buf);
                clean_tree_block(trans, root, buf);
                *last_ref = 1;
        }
        return 0;
}
 940
 941/*
 942 * does the dirty work in cow of a single block.  The parent block (if
 943 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 944 * dirty and returned locked.  If you modify the block it needs to be marked
 945 * dirty again.
 946 *
 947 * search_start -- an allocation hint for the new block
 948 *
 949 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 950 * bytes the allocator should try to find free next to the block it returns.
 951 * This is just a hint and may be ignored by the allocator.
 952 */
 953static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 954                             struct btrfs_root *root,
 955                             struct extent_buffer *buf,
 956                             struct extent_buffer *parent, int parent_slot,
 957                             struct extent_buffer **cow_ret,
 958                             u64 search_start, u64 empty_size)
 959{
 960        struct btrfs_disk_key disk_key;
 961        struct extent_buffer *cow;
 962        int level, ret;
 963        int last_ref = 0;
 964        int unlock_orig = 0;
 965        u64 parent_start;
 966
 967        if (*cow_ret == buf)
 968                unlock_orig = 1;
 969
 970        btrfs_assert_tree_locked(buf);
 971
 972        WARN_ON(root->ref_cows && trans->transid !=
 973                root->fs_info->running_transaction->transid);
 974        WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 975
 976        level = btrfs_header_level(buf);
 977
 978        if (level == 0)
 979                btrfs_item_key(buf, &disk_key, 0);
 980        else
 981                btrfs_node_key(buf, &disk_key, 0);
 982
 983        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
 984                if (parent)
 985                        parent_start = parent->start;
 986                else
 987                        parent_start = 0;
 988        } else
 989                parent_start = 0;
 990
 991        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
 992                                     root->root_key.objectid, &disk_key,
 993                                     level, search_start, empty_size);
 994        if (IS_ERR(cow))
 995                return PTR_ERR(cow);
 996
 997        /* cow is set to blocking by btrfs_init_new_buffer */
 998
 999        copy_extent_buffer(cow, buf, 0, 0, cow->len);
1000        btrfs_set_header_bytenr(cow, cow->start);
1001        btrfs_set_header_generation(cow, trans->transid);
1002        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1003        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1004                                     BTRFS_HEADER_FLAG_RELOC);
1005        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1006                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1007        else
1008                btrfs_set_header_owner(cow, root->root_key.objectid);
1009
1010        write_extent_buffer(cow, root->fs_info->fsid,
1011                            (unsigned long)btrfs_header_fsid(cow),
1012                            BTRFS_FSID_SIZE);
1013
1014        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1015        if (ret) {
1016                btrfs_abort_transaction(trans, root, ret);
1017                return ret;
1018        }
1019
1020        if (root->ref_cows)
1021                btrfs_reloc_cow_block(trans, root, buf, cow);
1022
1023        if (buf == root->node) {
1024                WARN_ON(parent && parent != buf);
1025                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1026                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1027                        parent_start = buf->start;
1028                else
1029                        parent_start = 0;
1030
1031                extent_buffer_get(cow);
1032                tree_mod_log_set_root_pointer(root, cow);
1033                rcu_assign_pointer(root->node, cow);
1034
1035                btrfs_free_tree_block(trans, root, buf, parent_start,
1036                                      last_ref);
1037                free_extent_buffer(buf);
1038                add_root_to_dirty_list(root);
1039        } else {
1040                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1041                        parent_start = parent->start;
1042                else
1043                        parent_start = 0;
1044
1045                WARN_ON(trans->transid != btrfs_header_generation(parent));
1046                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1047                                        MOD_LOG_KEY_REPLACE);
1048                btrfs_set_node_blockptr(parent, parent_slot,
1049                                        cow->start);
1050                btrfs_set_node_ptr_generation(parent, parent_slot,
1051                                              trans->transid);
1052                btrfs_mark_buffer_dirty(parent);
1053                btrfs_free_tree_block(trans, root, buf, parent_start,
1054                                      last_ref);
1055        }
1056        if (unlock_orig)
1057                btrfs_tree_unlock(buf);
1058        free_extent_buffer_stale(buf);
1059        btrfs_mark_buffer_dirty(cow);
1060        *cow_ret = cow;
1061        return 0;
1062}
1063
1064/*
1065 * returns the logical address of the oldest predecessor of the given root.
1066 * entries older than time_seq are ignored.
1067 */
1068static struct tree_mod_elem *
1069__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1070                           struct btrfs_root *root, u64 time_seq)
1071{
1072        struct tree_mod_elem *tm;
1073        struct tree_mod_elem *found = NULL;
1074        u64 root_logical = root->node->start;
1075        int looped = 0;
1076
1077        if (!time_seq)
1078                return 0;
1079
1080        /*
1081         * the very last operation that's logged for a root is the replacement
1082         * operation (if it is replaced at all). this has the index of the *new*
1083         * root, making it the very first operation that's logged for this root.
1084         */
1085        while (1) {
1086                tm = tree_mod_log_search_oldest(fs_info, root_logical,
1087                                                time_seq);
1088                if (!looped && !tm)
1089                        return 0;
1090                /*
1091                 * if there are no tree operation for the oldest root, we simply
1092                 * return it. this should only happen if that (old) root is at
1093                 * level 0.
1094                 */
1095                if (!tm)
1096                        break;
1097
1098                /*
1099                 * if there's an operation that's not a root replacement, we
1100                 * found the oldest version of our root. normally, we'll find a
1101                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1102                 */
1103                if (tm->op != MOD_LOG_ROOT_REPLACE)
1104                        break;
1105
1106                found = tm;
1107                root_logical = tm->old_root.logical;
1108                BUG_ON(root_logical == root->node->start);
1109                looped = 1;
1110        }
1111
1112        /* if there's no old root to return, return what we found instead */
1113        if (!found)
1114                found = tm;
1115
1116        return found;
1117}
1118
1119/*
1120 * tm is a pointer to the first operation to rewind within eb. then, all
1121 * previous operations will be rewinded (until we reach something older than
1122 * time_seq).
1123 */
1124static void
1125__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1126                      struct tree_mod_elem *first_tm)
1127{
1128        u32 n;
1129        struct rb_node *next;
1130        struct tree_mod_elem *tm = first_tm;
1131        unsigned long o_dst;
1132        unsigned long o_src;
1133        unsigned long p_size = sizeof(struct btrfs_key_ptr);
1134
1135        n = btrfs_header_nritems(eb);
1136        while (tm && tm->seq >= time_seq) {
1137                /*
1138                 * all the operations are recorded with the operator used for
1139                 * the modification. as we're going backwards, we do the
1140                 * opposite of each operation here.
1141                 */
1142                switch (tm->op) {
1143                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1144                        BUG_ON(tm->slot < n);
1145                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1146                case MOD_LOG_KEY_REMOVE:
1147                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1148                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1149                        btrfs_set_node_ptr_generation(eb, tm->slot,
1150                                                      tm->generation);
1151                        n++;
1152                        break;
1153                case MOD_LOG_KEY_REPLACE:
1154                        BUG_ON(tm->slot >= n);
1155                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1156                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1157                        btrfs_set_node_ptr_generation(eb, tm->slot,
1158                                                      tm->generation);
1159                        break;
1160                case MOD_LOG_KEY_ADD:
1161                        /* if a move operation is needed it's in the log */
1162                        n--;
1163                        break;
1164                case MOD_LOG_MOVE_KEYS:
1165                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
1166                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1167                        memmove_extent_buffer(eb, o_dst, o_src,
1168                                              tm->move.nr_items * p_size);
1169                        break;
1170                case MOD_LOG_ROOT_REPLACE:
1171                        /*
1172                         * this operation is special. for roots, this must be
1173                         * handled explicitly before rewinding.
1174                         * for non-roots, this operation may exist if the node
1175                         * was a root: root A -> child B; then A gets empty and
1176                         * B is promoted to the new root. in the mod log, we'll
1177                         * have a root-replace operation for B, a tree block
1178                         * that is no root. we simply ignore that operation.
1179                         */
1180                        break;
1181                }
1182                next = rb_next(&tm->node);
1183                if (!next)
1184                        break;
1185                tm = container_of(next, struct tree_mod_elem, node);
1186                if (tm->index != first_tm->index)
1187                        break;
1188        }
1189        btrfs_set_header_nritems(eb, n);
1190}
1191
/*
 * Return a tree block equivalent to @eb as it looked at @time_seq.
 * When no rewinding is needed, @eb itself is returned; otherwise the
 * rewound state is built on a dummy/cloned buffer, which is returned,
 * and the caller's reference on @eb is dropped.
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                    u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        /* time_seq == 0 means "current state": nothing to rewind */
        if (!time_seq)
                return eb;

        /* leaves are handed back unchanged */
        if (btrfs_header_level(eb) == 0)
                return eb;

        /* no log entry for this block at time_seq: nothing to rewind */
        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                /*
                 * the block was freed: rebuild it from scratch on a dummy
                 * buffer, seeding the header from the live one
                 */
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(eb->start,
                                                fs_info->tree_root->nodesize);
                BUG_ON(!eb_rewin); /* allocation failure is fatal here */
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                /* block still exists: rewind on a private copy */
                eb_rewin = btrfs_clone_extent_buffer(eb);
                BUG_ON(!eb_rewin); /* allocation failure is fatal here */
        }

        /* take a ref on the copy, drop the caller's ref on the original */
        extent_buffer_get(eb_rewin);
        free_extent_buffer(eb);

        __tree_mod_log_rewind(eb_rewin, time_seq, tm);

        return eb_rewin;
}
1231
1232/*
1233 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1234 * value. If there are no changes, the current root->root_node is returned. If
1235 * anything changed in between, there's a fresh buffer allocated on which the
1236 * rewind operations are done. In any case, the returned buffer is read locked.
1237 * Returns NULL on error (with no locks held).
1238 */
1239static inline struct extent_buffer *
1240get_old_root(struct btrfs_root *root, u64 time_seq)
1241{
1242        struct tree_mod_elem *tm;
1243        struct extent_buffer *eb;
1244        struct tree_mod_root *old_root = NULL;
1245        u64 old_generation = 0;
1246        u64 logical;
1247
1248        eb = btrfs_read_lock_root_node(root);
1249        tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1250        if (!tm)
1251                return root->node;
1252
1253        if (tm->op == MOD_LOG_ROOT_REPLACE) {
1254                old_root = &tm->old_root;
1255                old_generation = tm->generation;
1256                logical = old_root->logical;
1257        } else {
1258                logical = root->node->start;
1259        }
1260
1261        tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1262        if (old_root)
1263                eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1264        else
1265                eb = btrfs_clone_extent_buffer(root->node);
1266        btrfs_tree_read_unlock(root->node);
1267        free_extent_buffer(root->node);
1268        if (!eb)
1269                return NULL;
1270        btrfs_tree_read_lock(eb);
1271        if (old_root) {
1272                btrfs_set_header_bytenr(eb, eb->start);
1273                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1274                btrfs_set_header_owner(eb, root->root_key.objectid);
1275                btrfs_set_header_level(eb, old_root->level);
1276                btrfs_set_header_generation(eb, old_generation);
1277        }
1278        if (tm)
1279                __tree_mod_log_rewind(eb, time_seq, tm);
1280        else
1281                WARN_ON(btrfs_header_level(eb) != 0);
1282        extent_buffer_get(eb);
1283
1284        return eb;
1285}
1286
1287static inline int should_cow_block(struct btrfs_trans_handle *trans,
1288                                   struct btrfs_root *root,
1289                                   struct extent_buffer *buf)
1290{
1291        /* ensure we can see the force_cow */
1292        smp_rmb();
1293
1294        /*
1295         * We do not need to cow a block if
1296         * 1) this block is not created or changed in this transaction;
1297         * 2) this block does not belong to TREE_RELOC tree;
1298         * 3) the root is not forced COW.
1299         *
1300         * What is forced COW:
1301         *    when we create snapshot during commiting the transaction,
1302         *    after we've finished coping src root, we must COW the shared
1303         *    block to ensure the metadata consistency.
1304         */
1305        if (btrfs_header_generation(buf) == trans->transid &&
1306            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1307            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1308              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1309            !root->force_cow)
1310                return 0;
1311        return 1;
1312}
1313
1314/*
1315 * cows a single block, see __btrfs_cow_block for the real work.
1316 * This version of it has extra checks so that a block isn't cow'd more than
1317 * once per transaction, as long as it hasn't been written yet
1318 */
1319noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1320                    struct btrfs_root *root, struct extent_buffer *buf,
1321                    struct extent_buffer *parent, int parent_slot,
1322                    struct extent_buffer **cow_ret)
1323{
1324        u64 search_start;
1325        int ret;
1326
1327        if (trans->transaction != root->fs_info->running_transaction) {
1328                printk(KERN_CRIT "trans %llu running %llu\n",
1329                       (unsigned long long)trans->transid,
1330                       (unsigned long long)
1331                       root->fs_info->running_transaction->transid);
1332                WARN_ON(1);
1333        }
1334        if (trans->transid != root->fs_info->generation) {
1335                printk(KERN_CRIT "trans %llu running %llu\n",
1336                       (unsigned long long)trans->transid,
1337                       (unsigned long long)root->fs_info->generation);
1338                WARN_ON(1);
1339        }
1340
1341        if (!should_cow_block(trans, root, buf)) {
1342                *cow_ret = buf;
1343                return 0;
1344        }
1345
1346        search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1347
1348        if (parent)
1349                btrfs_set_lock_blocking(parent);
1350        btrfs_set_lock_blocking(buf);
1351
1352        ret = __btrfs_cow_block(trans, root, buf, parent,
1353                                 parent_slot, cow_ret, search_start, 0);
1354
1355        trace_btrfs_cow_block(root, buf, *cow_ret);
1356
1357        return ret;
1358}
1359
1360/*
1361 * helper function for defrag to decide if two blocks pointed to by a
1362 * node are actually close by
1363 */
1364static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1365{
1366        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1367                return 1;
1368        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1369                return 1;
1370        return 0;
1371}
1372
1373/*
1374 * compare two keys in a memcmp fashion
1375 */
1376static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1377{
1378        struct btrfs_key k1;
1379
1380        btrfs_disk_key_to_cpu(&k1, disk);
1381
1382        return btrfs_comp_cpu_keys(&k1, k2);
1383}
1384
1385/*
1386 * same as comp_keys only with two btrfs_key's
1387 */
1388int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1389{
1390        if (k1->objectid > k2->objectid)
1391                return 1;
1392        if (k1->objectid < k2->objectid)
1393                return -1;
1394        if (k1->type > k2->type)
1395                return 1;
1396        if (k1->type < k2->type)
1397                return -1;
1398        if (k1->offset > k2->offset)
1399                return 1;
1400        if (k1->offset < k2->offset)
1401                return -1;
1402        return 0;
1403}
1404
1405/*
1406 * this is used by the defrag code to go through all the
1407 * leaves pointed to by a node and reallocate them so that
1408 * disk order is close to key order
1409 */
1410int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1411                       struct btrfs_root *root, struct extent_buffer *parent,
1412                       int start_slot, int cache_only, u64 *last_ret,
1413                       struct btrfs_key *progress)
1414{
1415        struct extent_buffer *cur;
1416        u64 blocknr;
1417        u64 gen;
1418        u64 search_start = *last_ret;
1419        u64 last_block = 0;
1420        u64 other;
1421        u32 parent_nritems;
1422        int end_slot;
1423        int i;
1424        int err = 0;
1425        int parent_level;
1426        int uptodate;
1427        u32 blocksize;
1428        int progress_passed = 0;
1429        struct btrfs_disk_key disk_key;
1430
1431        parent_level = btrfs_header_level(parent);
1432        if (cache_only && parent_level != 1)
1433                return 0;
1434
1435        if (trans->transaction != root->fs_info->running_transaction)
1436                WARN_ON(1);
1437        if (trans->transid != root->fs_info->generation)
1438                WARN_ON(1);
1439
1440        parent_nritems = btrfs_header_nritems(parent);
1441        blocksize = btrfs_level_size(root, parent_level - 1);
1442        end_slot = parent_nritems;
1443
1444        if (parent_nritems == 1)
1445                return 0;
1446
1447        btrfs_set_lock_blocking(parent);
1448
1449        for (i = start_slot; i < end_slot; i++) {
1450                int close = 1;
1451
1452                btrfs_node_key(parent, &disk_key, i);
1453                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1454                        continue;
1455
1456                progress_passed = 1;
1457                blocknr = btrfs_node_blockptr(parent, i);
1458                gen = btrfs_node_ptr_generation(parent, i);
1459                if (last_block == 0)
1460                        last_block = blocknr;
1461
1462                if (i > 0) {
1463                        other = btrfs_node_blockptr(parent, i - 1);
1464                        close = close_blocks(blocknr, other, blocksize);
1465                }
1466                if (!close && i < end_slot - 2) {
1467                        other = btrfs_node_blockptr(parent, i + 1);
1468                        close = close_blocks(blocknr, other, blocksize);
1469                }
1470                if (close) {
1471                        last_block = blocknr;
1472                        continue;
1473                }
1474
1475                cur = btrfs_find_tree_block(root, blocknr, blocksize);
1476                if (cur)
1477                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1478                else
1479                        uptodate = 0;
1480                if (!cur || !uptodate) {
1481                        if (cache_only) {
1482                                free_extent_buffer(cur);
1483                                continue;
1484                        }
1485                        if (!cur) {
1486                                cur = read_tree_block(root, blocknr,
1487                                                         blocksize, gen);
1488                                if (!cur)
1489                                        return -EIO;
1490                        } else if (!uptodate) {
1491                                err = btrfs_read_buffer(cur, gen);
1492                                if (err) {
1493                                        free_extent_buffer(cur);
1494                                        return err;
1495                                }
1496                        }
1497                }
1498                if (search_start == 0)
1499                        search_start = last_block;
1500
1501                btrfs_tree_lock(cur);
1502                btrfs_set_lock_blocking(cur);
1503                err = __btrfs_cow_block(trans, root, cur, parent, i,
1504                                        &cur, search_start,
1505                                        min(16 * blocksize,
1506                                            (end_slot - i) * blocksize));
1507                if (err) {
1508                        btrfs_tree_unlock(cur);
1509                        free_extent_buffer(cur);
1510                        break;
1511                }
1512                search_start = cur->start;
1513                last_block = cur->start;
1514                *last_ret = search_start;
1515                btrfs_tree_unlock(cur);
1516                free_extent_buffer(cur);
1517        }
1518        return err;
1519}
1520
1521/*
1522 * The leaf data grows from end-to-front in the node.
1523 * this returns the address of the start of the last item,
1524 * which is the stop of the leaf data stack
1525 */
1526static inline unsigned int leaf_data_end(struct btrfs_root *root,
1527                                         struct extent_buffer *leaf)
1528{
1529        u32 nr = btrfs_header_nritems(leaf);
1530        if (nr == 0)
1531                return BTRFS_LEAF_DATA_SIZE(root);
1532        return btrfs_item_offset_nr(leaf, nr - 1);
1533}
1534
1535
1536/*
1537 * search for key in the extent_buffer.  The items start at offset p,
1538 * and they are item_size apart.  There are 'max' items in p.
1539 *
1540 * the slot in the array is returned via slot, and it points to
1541 * the place where you would insert key if it is not found in
1542 * the array.
1543 *
1544 * slot may point to max if the key is bigger than all of the keys
1545 */
1546static noinline int generic_bin_search(struct extent_buffer *eb,
1547                                       unsigned long p,
1548                                       int item_size, struct btrfs_key *key,
1549                                       int max, int *slot)
1550{
1551        int low = 0;
1552        int high = max;
1553        int mid;
1554        int ret;
1555        struct btrfs_disk_key *tmp = NULL;
1556        struct btrfs_disk_key unaligned;
1557        unsigned long offset;
1558        char *kaddr = NULL;
1559        unsigned long map_start = 0;
1560        unsigned long map_len = 0;
1561        int err;
1562
1563        while (low < high) {
1564                mid = (low + high) / 2;
1565                offset = p + mid * item_size;
1566
1567                if (!kaddr || offset < map_start ||
1568                    (offset + sizeof(struct btrfs_disk_key)) >
1569                    map_start + map_len) {
1570
1571                        err = map_private_extent_buffer(eb, offset,
1572                                                sizeof(struct btrfs_disk_key),
1573                                                &kaddr, &map_start, &map_len);
1574
1575                        if (!err) {
1576                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
1577                                                        map_start);
1578                        } else {
1579                                read_extent_buffer(eb, &unaligned,
1580                                                   offset, sizeof(unaligned));
1581                                tmp = &unaligned;
1582                        }
1583
1584                } else {
1585                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
1586                                                        map_start);
1587                }
1588                ret = comp_keys(tmp, key);
1589
1590                if (ret < 0)
1591                        low = mid + 1;
1592                else if (ret > 0)
1593                        high = mid;
1594                else {
1595                        *slot = mid;
1596                        return 0;
1597                }
1598        }
1599        *slot = low;
1600        return 1;
1601}
1602
1603/*
1604 * simple bin_search frontend that does the right thing for
1605 * leaves vs nodes
1606 */
1607static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1608                      int level, int *slot)
1609{
1610        if (level == 0)
1611                return generic_bin_search(eb,
1612                                          offsetof(struct btrfs_leaf, items),
1613                                          sizeof(struct btrfs_item),
1614                                          key, btrfs_header_nritems(eb),
1615                                          slot);
1616        else
1617                return generic_bin_search(eb,
1618                                          offsetof(struct btrfs_node, ptrs),
1619                                          sizeof(struct btrfs_key_ptr),
1620                                          key, btrfs_header_nritems(eb),
1621                                          slot);
1622}
1623
/*
 * Public wrapper around bin_search() so code outside this file can look
 * up a key in a single leaf or node without knowing the item layout.
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1629
1630static void root_add_used(struct btrfs_root *root, u32 size)
1631{
1632        spin_lock(&root->accounting_lock);
1633        btrfs_set_root_used(&root->root_item,
1634                            btrfs_root_used(&root->root_item) + size);
1635        spin_unlock(&root->accounting_lock);
1636}
1637
1638static void root_sub_used(struct btrfs_root *root, u32 size)
1639{
1640        spin_lock(&root->accounting_lock);
1641        btrfs_set_root_used(&root->root_item,
1642                            btrfs_root_used(&root->root_item) - size);
1643        spin_unlock(&root->accounting_lock);
1644}
1645
1646/* given a node and slot number, this reads the blocks it points to.  The
1647 * extent buffer is returned with a reference taken (but unlocked).
1648 * NULL is returned on error.
1649 */
1650static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1651                                   struct extent_buffer *parent, int slot)
1652{
1653        int level = btrfs_header_level(parent);
1654        if (slot < 0)
1655                return NULL;
1656        if (slot >= btrfs_header_nritems(parent))
1657                return NULL;
1658
1659        BUG_ON(level == 0);
1660
1661        return read_tree_block(root, btrfs_node_blockptr(parent, slot),
1662                       btrfs_level_size(root, level - 1),
1663                       btrfs_node_ptr_generation(parent, slot));
1664}
1665
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * Expects path->nodes[level] to be write locked and already COWed in the
 * current transaction (enforced by the WARN_ONs below).  Returns 0 on
 * success or a negative errno; on success the path may have been
 * repointed at the left sibling of the original node.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	/* pslot is only meaningful when parent is non-NULL */
	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/*
		 * mid is still pinned by the old root pointer's reference,
		 * which is only dropped by free_extent_buffer_stale below,
		 * so reading mid->len here is safe.
		 */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* more than a quarter full: nothing to rebalance */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	/* lock and COW both siblings so we can shift pointers around */
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty; unlink and free it */
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(trans, root, path, level + 1, pslot + 1, 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right still has items; refresh its parent key */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  &right_key, pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid ended up empty; unlink and free it */
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(trans, root, path, level + 1, pslot, 1);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the slot we were pointing at moved into left */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		/* don't unlock left if the path now owns its lock */
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
1884
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if some room was made (the path may then point at a
 * sibling), 1 otherwise.  Note that errors from COW or from the push
 * helpers are folded into the "no room made" result here.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* pslot is only meaningful when parent is non-NULL */
	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* no parent means no sibling to push into */
	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left is (nearly) full, can't push into it */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* pointers moved left; mid's first key changed */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  &disk_key, pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot moved into the left sibling */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right is (nearly) full, can't push into it */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* pointers moved right; right's first key changed */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  &disk_key, pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot moved into the right sibling */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2020
2021/*
2022 * readahead one full node of leaves, finding things that are close
2023 * to the block in 'slot', and triggering ra on them.
2024 */
2025static void reada_for_search(struct btrfs_root *root,
2026                             struct btrfs_path *path,
2027                             int level, int slot, u64 objectid)
2028{
2029        struct extent_buffer *node;
2030        struct btrfs_disk_key disk_key;
2031        u32 nritems;
2032        u64 search;
2033        u64 target;
2034        u64 nread = 0;
2035        u64 gen;
2036        int direction = path->reada;
2037        struct extent_buffer *eb;
2038        u32 nr;
2039        u32 blocksize;
2040        u32 nscan = 0;
2041
2042        if (level != 1)
2043                return;
2044
2045        if (!path->nodes[level])
2046                return;
2047
2048        node = path->nodes[level];
2049
2050        search = btrfs_node_blockptr(node, slot);
2051        blocksize = btrfs_level_size(root, level - 1);
2052        eb = btrfs_find_tree_block(root, search, blocksize);
2053        if (eb) {
2054                free_extent_buffer(eb);
2055                return;
2056        }
2057
2058        target = search;
2059
2060        nritems = btrfs_header_nritems(node);
2061        nr = slot;
2062
2063        while (1) {
2064                if (direction < 0) {
2065                        if (nr == 0)
2066                                break;
2067                        nr--;
2068                } else if (direction > 0) {
2069                        nr++;
2070                        if (nr >= nritems)
2071                                break;
2072                }
2073                if (path->reada < 0 && objectid) {
2074                        btrfs_node_key(node, &disk_key, nr);
2075                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2076                                break;
2077                }
2078                search = btrfs_node_blockptr(node, nr);
2079                if ((search <= target && target - search <= 65536) ||
2080                    (search > target && search - target <= 65536)) {
2081                        gen = btrfs_node_ptr_generation(node, nr);
2082                        readahead_tree_block(root, search, blocksize, gen);
2083                        nread += blocksize;
2084                }
2085                nscan++;
2086                if ((nread > 65536 || nscan > 32))
2087                        break;
2088        }
2089}
2090
2091/*
2092 * returns -EAGAIN if it had to drop the path, or zero if everything was in
2093 * cache
2094 */
2095static noinline int reada_for_balance(struct btrfs_root *root,
2096                                      struct btrfs_path *path, int level)
2097{
2098        int slot;
2099        int nritems;
2100        struct extent_buffer *parent;
2101        struct extent_buffer *eb;
2102        u64 gen;
2103        u64 block1 = 0;
2104        u64 block2 = 0;
2105        int ret = 0;
2106        int blocksize;
2107
2108        parent = path->nodes[level + 1];
2109        if (!parent)
2110                return 0;
2111
2112        nritems = btrfs_header_nritems(parent);
2113        slot = path->slots[level + 1];
2114        blocksize = btrfs_level_size(root, level);
2115
2116        if (slot > 0) {
2117                block1 = btrfs_node_blockptr(parent, slot - 1);
2118                gen = btrfs_node_ptr_generation(parent, slot - 1);
2119                eb = btrfs_find_tree_block(root, block1, blocksize);
2120                /*
2121                 * if we get -eagain from btrfs_buffer_uptodate, we
2122                 * don't want to return eagain here.  That will loop
2123                 * forever
2124                 */
2125                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2126                        block1 = 0;
2127                free_extent_buffer(eb);
2128        }
2129        if (slot + 1 < nritems) {
2130                block2 = btrfs_node_blockptr(parent, slot + 1);
2131                gen = btrfs_node_ptr_generation(parent, slot + 1);
2132                eb = btrfs_find_tree_block(root, block2, blocksize);
2133                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2134                        block2 = 0;
2135                free_extent_buffer(eb);
2136        }
2137        if (block1 || block2) {
2138                ret = -EAGAIN;
2139
2140                /* release the whole path */
2141                btrfs_release_path(path);
2142
2143                /* read the blocks */
2144                if (block1)
2145                        readahead_tree_block(root, block1, blocksize, 0);
2146                if (block2)
2147                        readahead_tree_block(root, block2, blocksize, 0);
2148
2149                if (block1) {
2150                        eb = read_tree_block(root, block1, blocksize, 0);
2151                        free_extent_buffer(eb);
2152                }
2153                if (block2) {
2154                        eb = read_tree_block(root, block2, blocksize, 0);
2155                        free_extent_buffer(eb);
2156                }
2157        }
2158        return ret;
2159}
2160
2161
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;	/* levels <= skip_level keep their locks */
	int no_skips = 0;	/* once set, skip_level stops growing */
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key update may propagate to the parent */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold the lock on last-slot levels */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		/* past the protected range: no more skipping above here */
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower write_lock_level as write locks are dropped */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2217
2218/*
2219 * This releases any locks held in the path starting at level and
2220 * going all the way up to the root.
2221 *
2222 * btrfs_search_slot will keep the lock held on higher nodes in a few
2223 * corner cases, such as COW of the block at slot zero in the node.  This
2224 * ignores those rules, and it should only be called when there are no
2225 * more updates to be done higher up in the tree.
2226 */
2227noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2228{
2229        int i;
2230
2231        if (path->keep_locks)
2232                return;
2233
2234        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2235                if (!path->nodes[i])
2236                        continue;
2237                if (!path->locks[i])
2238                        continue;
2239                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2240                path->locks[i] = 0;
2241        }
2242}
2243
/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 *
 * NOTE(review): the trans and time_seq parameters are not referenced in
 * this body; presumably kept for signature symmetry with callers.
 */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key, u64 time_seq)
{
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
				/*
				 * we found an up to date block without
				 * sleeping, return
				 * right away
				 */
				*eb_ret = tmp;
				return 0;
			}
			/* the pages were up to date, but we failed
			 * the generation number check.  Do a full
			 * read for the generation number that is correct.
			 * We must do this without dropping locks so
			 * we can trust our generation number
			 */
			free_extent_buffer(tmp);
			btrfs_set_path_blocking(p);

			/* now we're allowed to do a blocking uptodate check */
			tmp = read_tree_block(root, blocknr, blocksize, gen);
			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
				*eb_ret = tmp;
				return 0;
			}
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	/* drop the ref from btrfs_find_tree_block, if it returned one */
	free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
2334
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	/* inserting into a nearly full node: split it first */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* splitting requires a write lock one level up */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	/* deleting from a sparse node: merge/balance it first */
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		int sret;

		/* balancing also requires a write lock one level up */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			/* balance_level removed this node; search again */
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2411
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	/* keep_locks / lowest_level callers need everything write locked */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	/* walk down the tree one level per iteration */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			btrfs_set_path_blocking(p);

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level + 1 > write_lock_level) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);

		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * key not found: step back one slot so we descend
			 * into the child that covers the key's range
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && cow &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key, 0);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try the non-blocking lock first */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_try_tree_read_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* leaf level: make room for the insert if needed */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);
	return ret;
}
2680
/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* commit roots are immutable; no rewinding needed */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	level = btrfs_header_level(b);
	/* read-only walk: read locks only */
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			/* not found: step back into the covering child */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(NULL, root, p, &b, level,
						    slot, key, time_seq);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_try_tree_read_lock(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
			/*
			 * rewind the block to its state at time_seq; the
			 * rewind may hand back a private copy, in which
			 * case the lock on the live buffer is dropped.
			 */
			b = tree_mod_log_rewind(root->fs_info, b, time_seq);
			if (b != p->nodes[level]) {
				btrfs_tree_unlock_rw(p->nodes[level],
						     p->locks[level]);
				p->locks[level] = 0;
				p->nodes[level] = b;
			}
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
2787
2788/*
2789 * helper to use instead of search slot if no exact match is needed but
2790 * instead the next or previous item should be returned.
2791 * When find_higher is true, the next higher item is returned, the next lower
2792 * otherwise.
2793 * When return_any and find_higher are both true, and no higher item is found,
2794 * return the next lower instead.
2795 * When return_any is true and find_higher is false, and no lower item is found,
2796 * return the next higher instead.
2797 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2798 * < 0 on error
2799 */
2800int btrfs_search_slot_for_read(struct btrfs_root *root,
2801                               struct btrfs_key *key, struct btrfs_path *p,
2802                               int find_higher, int return_any)
2803{
2804        int ret;
2805        struct extent_buffer *leaf;
2806
2807again:
2808        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2809        if (ret <= 0)
2810                return ret;
2811        /*
2812         * a return value of 1 means the path is at the position where the
2813         * item should be inserted. Normally this is the next bigger item,
2814         * but in case the previous item is the last in a leaf, path points
2815         * to the first free slot in the previous leaf, i.e. at an invalid
2816         * item.
2817         */
2818        leaf = p->nodes[0];
2819
2820        if (find_higher) {
2821                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2822                        ret = btrfs_next_leaf(root, p);
2823                        if (ret <= 0)
2824                                return ret;
2825                        if (!return_any)
2826                                return 1;
2827                        /*
2828                         * no higher item found, return the next
2829                         * lower instead
2830                         */
2831                        return_any = 0;
2832                        find_higher = 0;
2833                        btrfs_release_path(p);
2834                        goto again;
2835                }
2836        } else {
2837                if (p->slots[0] == 0) {
2838                        ret = btrfs_prev_leaf(root, p);
2839                        if (ret < 0)
2840                                return ret;
2841                        if (!ret) {
2842                                p->slots[0] = btrfs_header_nritems(leaf) - 1;
2843                                return 0;
2844                        }
2845                        if (!return_any)
2846                                return 1;
2847                        /*
2848                         * no lower item found, return the next
2849                         * higher instead
2850                         */
2851                        return_any = 0;
2852                        find_higher = 1;
2853                        btrfs_release_path(p);
2854                        goto again;
2855                } else {
2856                        --p->slots[0];
2857                }
2858        }
2859        return 0;
2860}
2861
2862/*
2863 * adjust the pointers going up the tree, starting at level
2864 * making sure the right key of each node is points to 'key'.
2865 * This is used after shifting pointers to the left, so it stops
2866 * fixing up pointers when a given leaf/node is not in slot 0 of the
2867 * higher levels
2868 *
2869 */
2870static void fixup_low_keys(struct btrfs_trans_handle *trans,
2871                           struct btrfs_root *root, struct btrfs_path *path,
2872                           struct btrfs_disk_key *key, int level)
2873{
2874        int i;
2875        struct extent_buffer *t;
2876
2877        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2878                int tslot = path->slots[i];
2879                if (!path->nodes[i])
2880                        break;
2881                t = path->nodes[i];
2882                tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
2883                btrfs_set_node_key(t, key, tslot);
2884                btrfs_mark_buffer_dirty(path->nodes[i]);
2885                if (tslot != 0)
2886                        break;
2887        }
2888}
2889
2890/*
2891 * update item key.
2892 *
2893 * This function isn't completely safe. It's the caller's responsibility
2894 * that the new key won't break the order
2895 */
2896void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2897                             struct btrfs_root *root, struct btrfs_path *path,
2898                             struct btrfs_key *new_key)
2899{
2900        struct btrfs_disk_key disk_key;
2901        struct extent_buffer *eb;
2902        int slot;
2903
2904        eb = path->nodes[0];
2905        slot = path->slots[0];
2906        if (slot > 0) {
2907                btrfs_item_key(eb, &disk_key, slot - 1);
2908                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2909        }
2910        if (slot < btrfs_header_nritems(eb) - 1) {
2911                btrfs_item_key(eb, &disk_key, slot + 1);
2912                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2913        }
2914
2915        btrfs_cpu_key_to_disk(&disk_key, new_key);
2916        btrfs_set_item_key(eb, &disk_key, slot);
2917        btrfs_mark_buffer_dirty(eb);
2918        if (slot == 0)
2919                fixup_low_keys(trans, root, path, &disk_key, 1);
2920}
2921
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination (left) node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	/* both buffers must already belong to this transaction (COWed) */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* not worth pushing from an already-small node unless emptying */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* record the copy in the tree mod log before mutating the buffers */
	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
			     push_items);
	/* append the first push_items pointers of src to the end of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/* slide the remaining src pointers down to slot 0 */
		tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
				     src_nritems - push_items);
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2987
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both buffers must already belong to this transaction (COWed) */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free slots available in the destination (right) node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* too small to be worth balancing */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* make room at the front of dst for the incoming pointers */
	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* copy the tail of src into the head of dst */
	tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
			     src_nritems - push_items, push_items);
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3049
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	/* the caller must be at the current root with no node above it */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the old root's first key */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, &lower_key,
				   level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the new node's header from scratch */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

	/* single pointer: the old root becomes our only child */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* swap in the new root; readers see it via RCU */
	old = root->node;
	tree_mod_log_set_root_pointer(root, c);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* extra ref for the path's reference to c */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
3123
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* caller guarantees room: the node must not already be full */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
	if (slot != nritems) {
		/* shift existing pointers right to open up 'slot' */
		if (level)
			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
					     slot, nritems - slot);
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		/* record the insertion in the tree mod log */
		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
					      MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3167
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	/* the node must already be COWed into this transaction */
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/* trying to split the root, lets make a new one */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try to avoid the split by pushing into the neighbors */
		ret = push_nodes_for_insert(trans, root, path, level);
		/* pushing may have swapped path->nodes[level]; reload it */
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* cut at the midpoint; the upper half moves to the new node */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					root->root_key.objectid,
					&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the new node's header */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);

	/* log, then copy the key_ptrs [mid, c_nritems) into the new node */
	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after 'c' */
	insert_ptr(trans, root, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* point the path at whichever half now holds the target slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3257
3258/*
3259 * how many bytes are required to store the items in a leaf.  start
3260 * and nr indicate which items in the leaf to check.  This totals up the
3261 * space used both by the item structs and the item data
3262 */
3263static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3264{
3265        int data_len;
3266        int nritems = btrfs_header_nritems(l);
3267        int end = min(nritems, start + nr) - 1;
3268
3269        if (!nr)
3270                return 0;
3271        data_len = btrfs_item_end_nr(l, start);
3272        data_len = data_len - btrfs_item_offset_nr(l, end);
3273        data_len += sizeof(struct btrfs_item) * nr;
3274        WARN_ON(data_len < 0);
3275        return data_len;
3276}
3277
3278/*
3279 * The space between the end of the leaf items and
3280 * the start of the leaf data.  IOW, how much room
3281 * the leaf has left for both items and data
3282 */
3283noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3284                                   struct extent_buffer *leaf)
3285{
3286        int nritems = btrfs_header_nritems(leaf);
3287        int ret;
3288        ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3289        if (ret < 0) {
3290                printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3291                       "used %d nritems %d\n",
3292                       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3293                       leaf_space_used(leaf, 0, nritems), nritems);
3294        }
3295        return ret;
3296}
3297
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * 'right' is the already-locked destination leaf; on return it has been
 * unlocked and released unless the path was switched over to it.
 * returns 0 if something was pushed, 1 if nothing could be pushed.
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	/* when not emptying, always leave at least one item behind */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* if our slot is past the end, reserve room for the pending insert */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/*
	 * walk backwards from the last item, counting how many items
	 * (push_items) and how many data bytes (push_space) fit in 'right'
	 */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(left, i);

		if (!empty && push_items > 0) {
			/* never push the slot we're trying to insert at */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				/*
				 * stop early if pushing further wouldn't
				 * leave the split roughly balanced
				 */
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	if (!empty && push_items == left_nritems)
		WARN_ON(1);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total data bytes being moved */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's existing item headers to make room at slot 0 */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every offset in 'right' from the end of the leaf down */
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		/* we emptied the left leaf entirely */
		clean_tree_block(trans, root, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update it in the parent */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		/* our slot moved into 'right'; swap the path over to it */
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3441
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent means we're the root leaf; nowhere to push */
	if (!path->nodes[1])
		return 1;

	/* there is no right sibling if we're in the parent's last slot */
	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(root, upper, slot + 1);
	if (right == NULL)
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	/* cheap check before paying for the COW */
	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the free space; re-verify */
	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	/* __push_leaf_right takes over ownership of 'right' and its lock */
	return __push_leaf_right(trans, root, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3507
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * 'left' is the already-locked destination leaf; on return it has been
 * unlocked and released unless the path was switched over to it.
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* when not emptying, always leave at least one item behind */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/*
	 * walk forward from slot 0, counting how many items (push_items)
	 * and how many data bytes (push_space) fit in 'left'
	 */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(right, i);

		if (!empty && push_items > 0) {
			/* never push the slot we're trying to insert at */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				/*
				 * stop early if pushing further wouldn't
				 * leave the split roughly balanced
				 */
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	if (!empty && push_items == btrfs_header_nritems(right))
		WARN_ON(1);

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* total data bytes being moved */
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/*
	 * rebase the copied items' offsets: they were relative to the
	 * end of 'right', now they sit below left's old last item
	 */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(left, i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems) {
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		/* compact the data and item headers left in 'right' */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every offset in 'right' from the end of the leaf down */
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		/* we emptied the right leaf entirely */
		clean_tree_block(trans, root, right);

	/* right's first key changed; propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(trans, root, path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our slot moved into 'left'; swap the path over to it */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3663
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/*
	 * there is no left sibling if we're in the parent's slot 0.
	 * NOTE(review): slots[1] is read before the nodes[1] NULL check
	 * below; that's safe (slots is an array in the path), but the
	 * ordering looks accidental — worth confirming upstream intent.
	 */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* no parent means we're the root leaf; nowhere to push */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = read_node_slot(root, path->nodes[1], slot - 1);
	if (left == NULL)
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	/* cheap check before paying for the COW */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the free space; re-verify */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left takes over ownership of 'left' and its lock */
	return __push_leaf_left(trans, root, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3732
/*
 * copy the items at and after slot 'mid' from leaf 'l' into the freshly
 * allocated leaf 'right', then link 'right' into the parent node.  This
 * is the copy half of a leaf split; the caller has already chosen 'mid'.
 * The path is fixed up to point at whichever leaf now holds 'slot'.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* number of items moving to the right leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers [mid, old nritems) to right's slot 0 */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy their data, packed against the end of the right leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/* offsets shift up by the space the left leaf's lower items used */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	/* link the new leaf into the parent, right after 'l' */
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, root, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now contains the target slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
3799
3800/*
3801 * double splits happen when we need to insert a big item in the middle
3802 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3803 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3804 *          A                 B                 C
3805 *
3806 * We avoid this by trying to push the items on either side of our target
3807 * into the adjacent leaves.  If all goes well we can avoid the double split
3808 * completely.
3809 */
3810static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3811                                          struct btrfs_root *root,
3812                                          struct btrfs_path *path,
3813                                          int data_size)
3814{
3815        int ret;
3816        int progress = 0;
3817        int slot;
3818        u32 nritems;
3819
3820        slot = path->slots[0];
3821
3822        /*
3823         * try to push all the items after our slot into the
3824         * right leaf
3825         */
3826        ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3827        if (ret < 0)
3828                return ret;
3829
3830        if (ret == 0)
3831                progress++;
3832
3833        nritems = btrfs_header_nritems(path->nodes[0]);
3834        /*
3835         * our goal is to get our slot at the start or end of a leaf.  If
3836         * we've done so we're done
3837         */
3838        if (path->slots[0] == 0 || path->slots[0] == nritems)
3839                return 0;
3840
3841        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3842                return 0;
3843
3844        /* try to push all the items before our slot into the next leaf */
3845        slot = path->slots[0];
3846        ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3847        if (ret < 0)
3848                return ret;
3849
3850        if (ret == 0)
3851                progress++;
3852
3853        if (progress)
3854                return 0;
3855        return 1;
3856}
3857
3858/*
3859 * split the path's leaf in two, making sure there is at least data_size
3860 * available for the resulting leaf level of the path.
3861 *
3862 * returns 0 if all went well and < 0 on failure.
3863 */
3864static noinline int split_leaf(struct btrfs_trans_handle *trans,
3865                               struct btrfs_root *root,
3866                               struct btrfs_key *ins_key,
3867                               struct btrfs_path *path, int data_size,
3868                               int extend)
3869{
3870        struct btrfs_disk_key disk_key;
3871        struct extent_buffer *l;
3872        u32 nritems;
3873        int mid;
3874        int slot;
3875        struct extent_buffer *right;
3876        int ret = 0;
3877        int wret;
3878        int split;
3879        int num_doubles = 0;
3880        int tried_avoid_double = 0;
3881
3882        l = path->nodes[0];
3883        slot = path->slots[0];
3884        if (extend && data_size + btrfs_item_size_nr(l, slot) +
3885            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
3886                return -EOVERFLOW;
3887
3888        /* first try to make some room by pushing left and right */
3889        if (data_size) {
3890                wret = push_leaf_right(trans, root, path, data_size,
3891                                       data_size, 0, 0);
3892                if (wret < 0)
3893                        return wret;
3894                if (wret) {
3895                        wret = push_leaf_left(trans, root, path, data_size,
3896                                              data_size, 0, (u32)-1);
3897                        if (wret < 0)
3898                                return wret;
3899                }
3900                l = path->nodes[0];
3901
3902                /* did the pushes work? */
3903                if (btrfs_leaf_free_space(root, l) >= data_size)
3904                        return 0;
3905        }
3906
3907        if (!path->nodes[1]) {
3908                ret = insert_new_root(trans, root, path, 1);
3909                if (ret)
3910                        return ret;
3911        }
3912again:
3913        split = 1;
3914        l = path->nodes[0];
3915        slot = path->slots[0];
3916        nritems = btrfs_header_nritems(l);
3917        mid = (nritems + 1) / 2;
3918
3919        if (mid <= slot) {
3920                if (nritems == 1 ||
3921                    leaf_space_used(l, mid, nritems - mid) + data_size >
3922                        BTRFS_LEAF_DATA_SIZE(root)) {
3923                        if (slot >= nritems) {
3924                                split = 0;
3925                        } else {
3926                                mid = slot;
3927                                if (mid != nritems &&
3928                                    leaf_space_used(l, mid, nritems - mid) +
3929                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3930                                        if (data_size && !tried_avoid_double)
3931                                                goto push_for_double;
3932                                        split = 2;
3933                                }
3934                        }
3935                }
3936        } else {
3937                if (leaf_space_used(l, 0, mid) + data_size >
3938                        BTRFS_LEAF_DATA_SIZE(root)) {
3939                        if (!extend && data_size && slot == 0) {
3940                                split = 0;
3941                        } else if ((extend || !data_size) && slot == 0) {
3942                                mid = 1;
3943                        } else {
3944                                mid = slot;
3945                                if (mid != nritems &&
3946                                    leaf_space_used(l, mid, nritems - mid) +
3947                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3948                                        if (data_size && !tried_avoid_double)
3949                                                goto push_for_double;
3950                                        split = 2 ;
3951                                }
3952                        }
3953                }
3954        }
3955
3956        if (split == 0)
3957                btrfs_cpu_key_to_disk(&disk_key, ins_key);
3958        else
3959                btrfs_item_key(l, &disk_key, mid);
3960
3961        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3962                                        root->root_key.objectid,
3963                                        &disk_key, 0, l->start, 0);
3964        if (IS_ERR(right))
3965                return PTR_ERR(right);
3966
3967        root_add_used(root, root->leafsize);
3968
3969        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
3970        btrfs_set_header_bytenr(right, right->start);
3971        btrfs_set_header_generation(right, trans->transid);
3972        btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
3973        btrfs_set_header_owner(right, root->root_key.objectid);
3974        btrfs_set_header_level(right, 0);
3975        write_extent_buffer(right, root->fs_info->fsid,
3976                            (unsigned long)btrfs_header_fsid(right),
3977                            BTRFS_FSID_SIZE);
3978
3979        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
3980                            (unsigned long)btrfs_header_chunk_tree_uuid(right),
3981                            BTRFS_UUID_SIZE);
3982
3983        if (split == 0) {
3984                if (mid <= slot) {
3985                        btrfs_set_header_nritems(right, 0);
3986                        insert_ptr(trans, root, path, &disk_key, right->start,
3987                                   path->slots[1] + 1, 1);
3988                        btrfs_tree_unlock(path->nodes[0]);
3989                        free_extent_buffer(path->nodes[0]);
3990                        path->nodes[0] = right;
3991                        path->slots[0] = 0;
3992                        path->slots[1] += 1;
3993                } else {
3994                        btrfs_set_header_nritems(right, 0);
3995                        insert_ptr(trans, root, path, &disk_key, right->start,
3996                                          path->slots[1], 1);
3997                        btrfs_tree_unlock(path->nodes[0]);
3998                        free_extent_buffer(path->nodes[0]);
3999                        path->nodes[0] = right;
4000                        path->slots[0] = 0;
4001                        if (path->slots[1] == 0)
4002                                fixup_low_keys(trans, root, path,
4003                                               &disk_key, 1);
4004                }
4005                btrfs_mark_buffer_dirty(right);
4006                return ret;
4007        }
4008
4009        copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4010
4011        if (split == 2) {
4012                BUG_ON(num_doubles != 0);
4013                num_doubles++;
4014                goto again;
4015        }
4016
4017        return 0;
4018
4019push_for_double:
4020        push_for_double_split(trans, root, path, data_size);
4021        tried_avoid_double = 1;
4022        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4023                return 0;
4024        goto again;
4025}
4026
4027static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4028                                         struct btrfs_root *root,
4029                                         struct btrfs_path *path, int ins_len)
4030{
4031        struct btrfs_key key;
4032        struct extent_buffer *leaf;
4033        struct btrfs_file_extent_item *fi;
4034        u64 extent_len = 0;
4035        u32 item_size;
4036        int ret;
4037
4038        leaf = path->nodes[0];
4039        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4040
4041        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4042               key.type != BTRFS_EXTENT_CSUM_KEY);
4043
4044        if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4045                return 0;
4046
4047        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4048        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4049                fi = btrfs_item_ptr(leaf, path->slots[0],
4050                                    struct btrfs_file_extent_item);
4051                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4052        }
4053        btrfs_release_path(path);
4054
4055        path->keep_locks = 1;
4056        path->search_for_split = 1;
4057        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4058        path->search_for_split = 0;
4059        if (ret < 0)
4060                goto err;
4061
4062        ret = -EAGAIN;
4063        leaf = path->nodes[0];
4064        /* if our item isn't there or got smaller, return now */
4065        if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4066                goto err;
4067
4068        /* the leaf has  changed, it now has room.  return now */
4069        if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4070                goto err;
4071
4072        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4073                fi = btrfs_item_ptr(leaf, path->slots[0],
4074                                    struct btrfs_file_extent_item);
4075                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4076                        goto err;
4077        }
4078
4079        btrfs_set_path_blocking(path);
4080        ret = split_leaf(trans, root, &key, path, ins_len, 1);
4081        if (ret)
4082                goto err;
4083
4084        path->keep_locks = 0;
4085        btrfs_unlock_up_safe(path, 1);
4086        return 0;
4087err:
4088        path->keep_locks = 0;
4089        return ret;
4090}
4091
/*
 * Split a single item into two, placing the new item directly after the
 * original in the same leaf.  The caller must already have guaranteed
 * (e.g. via setup_leaf_for_split) room for one more item header.
 *
 * @new_key:      key given to the second half
 * @split_offset: byte offset inside the item where it is cut; bytes
 *                [0, split_offset) stay in the original item, the rest
 *                become the new item's data
 *
 * Returns 0 on success or -ENOMEM if the bounce buffer can't be
 * allocated.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* space for the extra item header must be reserved up front */
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* copy the whole item out so we can rewrite it as two pieces */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	/*
	 * the data area is not moved: the new item takes the tail of the
	 * old data region, the original keeps the head at a higher offset
	 */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
4162
4163/*
4164 * This function splits a single item into two items,
4165 * giving 'new_key' to the new item and splitting the
4166 * old one at split_offset (from the start of the item).
4167 *
4168 * The path may be released by this operation.  After
4169 * the split, the path is pointing to the old item.  The
4170 * new item is going to be in the same node as the old one.
4171 *
4172 * Note, the item being split must be smaller enough to live alone on
4173 * a tree block with room for one extra struct btrfs_item
4174 *
4175 * This allows us to split the item in place, keeping a lock on the
4176 * leaf the entire time.
4177 */
4178int btrfs_split_item(struct btrfs_trans_handle *trans,
4179                     struct btrfs_root *root,
4180                     struct btrfs_path *path,
4181                     struct btrfs_key *new_key,
4182                     unsigned long split_offset)
4183{
4184        int ret;
4185        ret = setup_leaf_for_split(trans, root, path,
4186                                   sizeof(struct btrfs_item));
4187        if (ret)
4188                return ret;
4189
4190        ret = split_item(trans, root, path, new_key, split_offset);
4191        return ret;
4192}
4193
4194/*
4195 * This function duplicate a item, giving 'new_key' to the new item.
4196 * It guarantees both items live in the same tree leaf and the new item
4197 * is contiguous with the original item.
4198 *
4199 * This allows us to split file extent in place, keeping a lock on the
4200 * leaf the entire time.
4201 */
4202int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4203                         struct btrfs_root *root,
4204                         struct btrfs_path *path,
4205                         struct btrfs_key *new_key)
4206{
4207        struct extent_buffer *leaf;
4208        int ret;
4209        u32 item_size;
4210
4211        leaf = path->nodes[0];
4212        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4213        ret = setup_leaf_for_split(trans, root, path,
4214                                   item_size + sizeof(struct btrfs_item));
4215        if (ret)
4216                return ret;
4217
4218        path->slots[0]++;
4219        setup_items_for_insert(trans, root, path, new_key, &item_size,
4220                               item_size, item_size +
4221                               sizeof(struct btrfs_item), 1);
4222        leaf = path->nodes[0];
4223        memcpy_extent_buffer(leaf,
4224                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4225                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4226                             item_size);
4227        return 0;
4228}
4229
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 *
 * In the from-front case the item's key offset is bumped by the number
 * of removed bytes so the key still describes the remaining data.
 */
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* already the requested size, nothing to do */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(leaf, i);

		/* data grows toward the header, so offsets move up */
		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* keep the item's head; slide everything below it up */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			/*
			 * inline extents keep their payload inside the item;
			 * move the fixed header down first so it stays in
			 * front of the surviving inline data
			 */
			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
						 disk_bytenr));
			}
		}

		/* slide the whole item (minus the chopped head) up */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		/* the key offset advances past the removed bytes */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity: the accounting above must never oversubscribe the leaf */
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4334
/*
 * make the item pointed to by the path bigger; data_size is the number
 * of bytes to grow the item by (the new size is old_size + data_size).
 * The caller must ensure the leaf has at least data_size bytes free.
 */
void btrfs_extend_item(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* growing past the free space is a caller bug */
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		printk(KERN_CRIT "slot %d too large, nritems %d\n",
		       slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(leaf, i);

		/* data for this slot and everything after it moves down */
		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity: the leaf must not be oversubscribed afterwards */
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4403
4404/*
4405 * Given a key and some data, insert items into the tree.
4406 * This does all the path init required, making room in the tree if needed.
4407 * Returns the number of keys that were inserted.
4408 */
4409int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
4410                            struct btrfs_root *root,
4411                            struct btrfs_path *path,
4412                            struct btrfs_key *cpu_key, u32 *data_size,
4413                            int nr)
4414{
4415        struct extent_buffer *leaf;
4416        struct btrfs_item *item;
4417        int ret = 0;
4418        int slot;
4419        int i;
4420        u32 nritems;
4421        u32 total_data = 0;
4422        u32 total_size = 0;
4423        unsigned int data_end;
4424        struct btrfs_disk_key disk_key;
4425        struct btrfs_key found_key;
4426        struct btrfs_map_token token;
4427
4428        btrfs_init_map_token(&token);
4429
4430        for (i = 0; i < nr; i++) {
4431                if (total_size + data_size[i] + sizeof(struct btrfs_item) >
4432                    BTRFS_LEAF_DATA_SIZE(root)) {
4433                        break;
4434                        nr = i;
4435                }
4436                total_data += data_size[i];
4437                total_size += data_size[i] + sizeof(struct btrfs_item);
4438        }
4439        BUG_ON(nr == 0);
4440
4441        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4442        if (ret == 0)
4443                return -EEXIST;
4444        if (ret < 0)
4445                goto out;
4446
4447        leaf = path->nodes[0];
4448
4449        nritems = btrfs_header_nritems(leaf);
4450        data_end = leaf_data_end(root, leaf);
4451
4452        if (btrfs_leaf_free_space(root, leaf) < total_size) {
4453                for (i = nr; i >= 0; i--) {
4454                        total_data -= data_size[i];
4455                        total_size -= data_size[i] + sizeof(struct btrfs_item);
4456                        if (total_size < btrfs_leaf_free_space(root, leaf))
4457                                break;
4458                }
4459                nr = i;
4460        }
4461
4462        slot = path->slots[0];
4463        BUG_ON(slot < 0);
4464
4465        if (slot != nritems) {
4466                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4467
4468                item = btrfs_item_nr(leaf, slot);
4469                btrfs_item_key_to_cpu(leaf, &found_key, slot);
4470
4471                /* figure out how many keys we can insert in here */
4472                total_data = data_size[0];
4473                for (i = 1; i < nr; i++) {
4474                        if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
4475                                break;
4476                        total_data += data_size[i];
4477                }
4478                nr = i;
4479
4480                if (old_data < data_end) {
4481                        btrfs_print_leaf(root, leaf);
4482                        printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4483                               slot, old_data, data_end);
4484                        BUG_ON(1);
4485                }
4486                /*
4487                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4488                 */
4489                /* first correct the data pointers */
4490                for (i = slot; i < nritems; i++) {
4491                        u32 ioff;
4492
4493                        item = btrfs_item_nr(leaf, i);
4494                        ioff = btrfs_token_item_offset(leaf, item, &token);
4495                        btrfs_set_token_item_offset(leaf, item,
4496                                                    ioff - total_data, &token);
4497                }
4498                /* shift the items */
4499                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4500                              btrfs_item_nr_offset(slot),
4501                              (nritems - slot) * sizeof(struct btrfs_item));
4502
4503                /* shift the data */
4504                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4505                              data_end - total_data, btrfs_leaf_data(leaf) +
4506                              data_end, old_data - data_end);
4507                data_end = old_data;
4508        } else {
4509                /*
4510                 * this sucks but it has to be done, if we are inserting at
4511                 * the end of the leaf only insert 1 of the items, since we
4512                 * have no way of knowing whats on the next leaf and we'd have
4513                 * to drop our current locks to figure it out
4514                 */
4515                nr = 1;
4516        }
4517
4518        /* setup the item for the new data */
4519        for (i = 0; i < nr; i++) {
4520                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4521                btrfs_set_item_key(leaf, &disk_key, slot + i);
4522                item = btrfs_item_nr(leaf, slot + i);
4523                btrfs_set_token_item_offset(leaf, item,
4524                                            data_end - data_size[i], &token);
4525                data_end -= data_size[i];
4526                btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4527        }
4528        btrfs_set_header_nritems(leaf, nritems + nr);
4529        btrfs_mark_buffer_dirty(leaf);
4530
4531        ret = 0;
4532        if (slot == 0) {
4533                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4534                fixup_low_keys(trans, root, path, &disk_key, 1);
4535        }
4536
4537        if (btrfs_leaf_free_space(root, leaf) < 0) {
4538                btrfs_print_leaf(root, leaf);
4539                BUG();
4540        }
4541out:
4542        if (!ret)
4543                ret = nr;
4544        return ret;
4545}
4546
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * The path must already point at the insertion slot and the leaf must
 * have total_size bytes free (total_data payload bytes plus one
 * struct btrfs_item header per key); a shortfall is a caller bug.
 */
void setup_items_for_insert(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* the caller reserved space via btrfs_search_slot */
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		btrfs_print_leaf(root, leaf);
		printk(KERN_CRIT "not enough freespace need %u have %d\n",
		       total_size, btrfs_leaf_free_space(root, leaf));
		BUG();
	}

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		/* items at or after slot must start below data_end */
		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);

	/* inserting at slot 0 changes the leaf's lowest key */
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(trans, root, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4639
4640/*
4641 * Given a key and some data, insert items into the tree.
4642 * This does all the path init required, making room in the tree if needed.
4643 */
4644int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4645                            struct btrfs_root *root,
4646                            struct btrfs_path *path,
4647                            struct btrfs_key *cpu_key, u32 *data_size,
4648                            int nr)
4649{
4650        int ret = 0;
4651        int slot;
4652        int i;
4653        u32 total_size = 0;
4654        u32 total_data = 0;
4655
4656        for (i = 0; i < nr; i++)
4657                total_data += data_size[i];
4658
4659        total_size = total_data + (nr * sizeof(struct btrfs_item));
4660        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4661        if (ret == 0)
4662                return -EEXIST;
4663        if (ret < 0)
4664                return ret;
4665
4666        slot = path->slots[0];
4667        BUG_ON(slot < 0);
4668
4669        setup_items_for_insert(trans, root, path, cpu_key, data_size,
4670                               total_data, total_size, nr);
4671        return 0;
4672}
4673
4674/*
4675 * Given a key and some data, insert an item into the tree.
4676 * This does all the path init required, making room in the tree if needed.
4677 */
4678int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4679                      *root, struct btrfs_key *cpu_key, void *data, u32
4680                      data_size)
4681{
4682        int ret = 0;
4683        struct btrfs_path *path;
4684        struct extent_buffer *leaf;
4685        unsigned long ptr;
4686
4687        path = btrfs_alloc_path();
4688        if (!path)
4689                return -ENOMEM;
4690        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4691        if (!ret) {
4692                leaf = path->nodes[0];
4693                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4694                write_extent_buffer(leaf, data, ptr, data_size);
4695                btrfs_mark_buffer_dirty(leaf);
4696        }
4697        btrfs_free_path(path);
4698        return ret;
4699}
4700
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * When tree_mod_log is set and this is not a leaf level, the change is
 * recorded in the tree modification log for old-root replay.
 */
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int level, int slot,
		    int tree_mod_log)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/* removing from the middle: log, then close the gap */
		if (tree_mod_log && level)
			tree_mod_log_eb_move(root->fs_info, parent, slot,
					     slot + 1, nritems - slot - 1);
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (tree_mod_log && level) {
		/* removing the last slot: nothing to move, just log it */
		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
					      MOD_LOG_KEY_REMOVE);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* the node's lowest key changed; propagate it upward */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(trans, root, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4745
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1], subtracts the leaf's size
 * from the root's used accounting and frees the leaf block extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* only leaves cowed in the running transaction may be deleted */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(trans, root, path, 1, path->slots[1], 1);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* extra ref keeps the extent_buffer alive across the free below */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete @nr contiguous items from the leaf in @path, starting at @slot.
 * If that empties the leaf, remove it from the tree; if it leaves the
 * leaf mostly empty, try to merge it into its neighbours.
 *
 * Returns 0 on success, < 0 on error.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int last_off;
	int dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	/* data offset of the last item being removed */
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes occupied by the items being removed */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(root, leaf);

		/*
		 * item data grows back from the end of the leaf; slide
		 * the surviving data up by dsize to close the hole
		 */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
			      last_off - data_end);

		/* fix the stored data offsets of every item that moved */
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		/* close the gap in the item headers (these grow forward) */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* the root leaf stays; it just becomes empty */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(trans, root, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* the first key changed; fix the keys upwards */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(trans, root, path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* everything was pushed away, drop the leaf */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
4890
4891/*
4892 * search the tree again to find a leaf with lesser keys
4893 * returns 0 if it found something or 1 if there are no lesser leaves.
4894 * returns < 0 on io errors.
4895 *
4896 * This may release the path, and so you may lose any locks held at the
4897 * time you call it.
4898 */
4899int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4900{
4901        struct btrfs_key key;
4902        struct btrfs_disk_key found_key;
4903        int ret;
4904
4905        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4906
4907        if (key.offset > 0)
4908                key.offset--;
4909        else if (key.type > 0)
4910                key.type--;
4911        else if (key.objectid > 0)
4912                key.objectid--;
4913        else
4914                return 1;
4915
4916        btrfs_release_path(path);
4917        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4918        if (ret < 0)
4919                return ret;
4920        btrfs_item_key(path->nodes[0], &found_key, 0);
4921        ret = comp_keys(&found_key, &key);
4922        if (ret < 0)
4923                return 0;
4924        return 1;
4925}
4926
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
 * transaction id.  This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_key *max_key,
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;

	WARN_ON(!path->keep_locks);
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* the whole tree is older than min_trans: nothing can match */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* inexact match: back up to the slot that covers min_key */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
			struct btrfs_disk_key disk_key;

			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

			/* stop the walk once the key passes max_key */
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

			/* cache_only: accept only blocks already in memory */
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  cache_only, min_trans);
			if (sret == 0) {
				/* found a new min_key, restart from the top */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1, 0, NULL);
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = read_node_slot(root, cur, slot);
		BUG_ON(!cur); /* -ENOMEM */

		btrfs_tree_read_lock(cur);

		/* lock the child first, then release the levels above it */
		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
		btrfs_clear_path_blocking(path, NULL, 0);
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	btrfs_set_path_blocking(path);
	return ret;
}
5071
5072static void tree_move_down(struct btrfs_root *root,
5073                           struct btrfs_path *path,
5074                           int *level, int root_level)
5075{
5076        path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5077                                        path->slots[*level]);
5078        path->slots[*level - 1] = 0;
5079        (*level)--;
5080}
5081
5082static int tree_move_next_or_upnext(struct btrfs_root *root,
5083                                    struct btrfs_path *path,
5084                                    int *level, int root_level)
5085{
5086        int ret = 0;
5087        int nritems;
5088        nritems = btrfs_header_nritems(path->nodes[*level]);
5089
5090        path->slots[*level]++;
5091
5092        while (path->slots[*level] == nritems) {
5093                if (*level == root_level)
5094                        return -1;
5095
5096                /* move upnext */
5097                path->slots[*level] = 0;
5098                free_extent_buffer(path->nodes[*level]);
5099                path->nodes[*level] = NULL;
5100                (*level)++;
5101                path->slots[*level]++;
5102
5103                nritems = btrfs_header_nritems(path->nodes[*level]);
5104                ret = 1;
5105        }
5106        return ret;
5107}
5108
5109/*
5110 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5111 * or down.
5112 */
5113static int tree_advance(struct btrfs_root *root,
5114                        struct btrfs_path *path,
5115                        int *level, int root_level,
5116                        int allow_down,
5117                        struct btrfs_key *key)
5118{
5119        int ret;
5120
5121        if (*level == 0 || !allow_down) {
5122                ret = tree_move_next_or_upnext(root, path, level, root_level);
5123        } else {
5124                tree_move_down(root, path, level, root_level);
5125                ret = 0;
5126        }
5127        if (ret >= 0) {
5128                if (*level == 0)
5129                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5130                                        path->slots[*level]);
5131                else
5132                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5133                                        path->slots[*level]);
5134        }
5135        return ret;
5136}
5137
5138static int tree_compare_item(struct btrfs_root *left_root,
5139                             struct btrfs_path *left_path,
5140                             struct btrfs_path *right_path,
5141                             char *tmp_buf)
5142{
5143        int cmp;
5144        int len1, len2;
5145        unsigned long off1, off2;
5146
5147        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5148        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5149        if (len1 != len2)
5150                return 1;
5151
5152        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5153        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5154                                right_path->slots[0]);
5155
5156        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5157
5158        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5159        if (cmp)
5160                return 1;
5161        return 0;
5162}
5163
5164#define ADVANCE 1
5165#define ADVANCE_ONLY_NEXT -1
5166
5167/*
5168 * This function compares two trees and calls the provided callback for
5169 * every changed/new/deleted item it finds.
5170 * If shared tree blocks are encountered, whole subtrees are skipped, making
5171 * the compare pretty fast on snapshotted subvolumes.
5172 *
5173 * This currently works on commit roots only. As commit roots are read only,
5174 * we don't do any locking. The commit roots are protected with transactions.
5175 * Transactions are ended and rejoined when a commit is tried in between.
5176 *
5177 * This function checks for modifications done to the trees while comparing.
5178 * If it detects a change, it aborts immediately.
5179 */
5180int btrfs_compare_trees(struct btrfs_root *left_root,
5181                        struct btrfs_root *right_root,
5182                        btrfs_changed_cb_t changed_cb, void *ctx)
5183{
5184        int ret;
5185        int cmp;
5186        struct btrfs_trans_handle *trans = NULL;
5187        struct btrfs_path *left_path = NULL;
5188        struct btrfs_path *right_path = NULL;
5189        struct btrfs_key left_key;
5190        struct btrfs_key right_key;
5191        char *tmp_buf = NULL;
5192        int left_root_level;
5193        int right_root_level;
5194        int left_level;
5195        int right_level;
5196        int left_end_reached;
5197        int right_end_reached;
5198        int advance_left;
5199        int advance_right;
5200        u64 left_blockptr;
5201        u64 right_blockptr;
5202        u64 left_start_ctransid;
5203        u64 right_start_ctransid;
5204        u64 ctransid;
5205
5206        left_path = btrfs_alloc_path();
5207        if (!left_path) {
5208                ret = -ENOMEM;
5209                goto out;
5210        }
5211        right_path = btrfs_alloc_path();
5212        if (!right_path) {
5213                ret = -ENOMEM;
5214                goto out;
5215        }
5216
5217        tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5218        if (!tmp_buf) {
5219                ret = -ENOMEM;
5220                goto out;
5221        }
5222
5223        left_path->search_commit_root = 1;
5224        left_path->skip_locking = 1;
5225        right_path->search_commit_root = 1;
5226        right_path->skip_locking = 1;
5227
5228        spin_lock(&left_root->root_times_lock);
5229        left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5230        spin_unlock(&left_root->root_times_lock);
5231
5232        spin_lock(&right_root->root_times_lock);
5233        right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5234        spin_unlock(&right_root->root_times_lock);
5235
5236        trans = btrfs_join_transaction(left_root);
5237        if (IS_ERR(trans)) {
5238                ret = PTR_ERR(trans);
5239                trans = NULL;
5240                goto out;
5241        }
5242
5243        /*
5244         * Strategy: Go to the first items of both trees. Then do
5245         *
5246         * If both trees are at level 0
5247         *   Compare keys of current items
5248         *     If left < right treat left item as new, advance left tree
5249         *       and repeat
5250         *     If left > right treat right item as deleted, advance right tree
5251         *       and repeat
5252         *     If left == right do deep compare of items, treat as changed if
5253         *       needed, advance both trees and repeat
5254         * If both trees are at the same level but not at level 0
5255         *   Compare keys of current nodes/leafs
5256         *     If left < right advance left tree and repeat
5257         *     If left > right advance right tree and repeat
5258         *     If left == right compare blockptrs of the next nodes/leafs
5259         *       If they match advance both trees but stay at the same level
5260         *         and repeat
5261         *       If they don't match advance both trees while allowing to go
5262         *         deeper and repeat
5263         * If tree levels are different
5264         *   Advance the tree that needs it and repeat
5265         *
5266         * Advancing a tree means:
5267         *   If we are at level 0, try to go to the next slot. If that's not
5268         *   possible, go one level up and repeat. Stop when we found a level
5269         *   where we could go to the next slot. We may at this point be on a
5270         *   node or a leaf.
5271         *
5272         *   If we are not at level 0 and not on shared tree blocks, go one
5273         *   level deeper.
5274         *
5275         *   If we are not at level 0 and on shared tree blocks, go one slot to
5276         *   the right if possible or go up and right.
5277         */
5278
5279        left_level = btrfs_header_level(left_root->commit_root);
5280        left_root_level = left_level;
5281        left_path->nodes[left_level] = left_root->commit_root;
5282        extent_buffer_get(left_path->nodes[left_level]);
5283
5284        right_level = btrfs_header_level(right_root->commit_root);
5285        right_root_level = right_level;
5286        right_path->nodes[right_level] = right_root->commit_root;
5287        extent_buffer_get(right_path->nodes[right_level]);
5288
5289        if (left_level == 0)
5290                btrfs_item_key_to_cpu(left_path->nodes[left_level],
5291                                &left_key, left_path->slots[left_level]);
5292        else
5293                btrfs_node_key_to_cpu(left_path->nodes[left_level],
5294                                &left_key, left_path->slots[left_level]);
5295        if (right_level == 0)
5296                btrfs_item_key_to_cpu(right_path->nodes[right_level],
5297                                &right_key, right_path->slots[right_level]);
5298        else
5299                btrfs_node_key_to_cpu(right_path->nodes[right_level],
5300                                &right_key, right_path->slots[right_level]);
5301
5302        left_end_reached = right_end_reached = 0;
5303        advance_left = advance_right = 0;
5304
5305        while (1) {
5306                /*
5307                 * We need to make sure the transaction does not get committed
5308                 * while we do anything on commit roots. This means, we need to
5309                 * join and leave transactions for every item that we process.
5310                 */
5311                if (trans && btrfs_should_end_transaction(trans, left_root)) {
5312                        btrfs_release_path(left_path);
5313                        btrfs_release_path(right_path);
5314
5315                        ret = btrfs_end_transaction(trans, left_root);
5316                        trans = NULL;
5317                        if (ret < 0)
5318                                goto out;
5319                }
5320                /* now rejoin the transaction */
5321                if (!trans) {
5322                        trans = btrfs_join_transaction(left_root);
5323                        if (IS_ERR(trans)) {
5324                                ret = PTR_ERR(trans);
5325                                trans = NULL;
5326                                goto out;
5327                        }
5328
5329                        spin_lock(&left_root->root_times_lock);
5330                        ctransid = btrfs_root_ctransid(&left_root->root_item);
5331                        spin_unlock(&left_root->root_times_lock);
5332                        if (ctransid != left_start_ctransid)
5333                                left_start_ctransid = 0;
5334
5335                        spin_lock(&right_root->root_times_lock);
5336                        ctransid = btrfs_root_ctransid(&right_root->root_item);
5337                        spin_unlock(&right_root->root_times_lock);
5338                        if (ctransid != right_start_ctransid)
5339                                right_start_ctransid = 0;
5340
5341                        if (!left_start_ctransid || !right_start_ctransid) {
5342                                WARN(1, KERN_WARNING
5343                                        "btrfs: btrfs_compare_tree detected "
5344