linux/fs/btrfs/extent-tree.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
        CHUNK_ALLOC_NO_FORCE = 0,
        CHUNK_ALLOC_LIMITED = 1,
        CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *   ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *   bytes_may_use as the ENOSPC accounting is done elsewhere
 */
enum {
        RESERVE_FREE = 0,
        RESERVE_ALLOC = 1,
        RESERVE_ALLOC_NO_ACCOUNT = 2,
};

static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                       u64 num_bytes, int reserve);

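/*
 * Return non-zero once the free space in this block group has been
 * fully loaded into the in-memory cache, i.e. caching_thread() (or the
 * fast on-disk cache load) has finished.
 */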
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
        smp_mb();
        return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

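/*
 * Block group caches are reference counted; the final put below frees
 * the free space ctl and the cache struct itself.
 */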
static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
        atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
        if (atomic_dec_and_test(&cache->count)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
                kfree(cache->free_space_ctl);
                kfree(cache);
        }
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                struct btrfs_block_group_cache *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group_cache *cache;

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group_cache,
                                 cache_node);
                if (block_group->key.objectid < cache->key.objectid) {
                        p = &(*p)->rb_left;
                } else if (block_group->key.objectid > cache->key.objectid) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);
        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                              int contains)
{
        struct btrfs_block_group_cache *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group_cache,
                                 cache_node);
                end = cache->key.objectid + cache->key.offset - 1;
                start = cache->key.objectid;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->key.objectid))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret)
                btrfs_get_block_group(ret);
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}

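/*
 * Mark a byte range as excluded from free space caching.  The range is
 * recorded in both freed_extents trees so it is treated as unavailable
 * no matter which of the two is currently acting as pinned_extents.
 */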
static int add_excluded_extent(struct btrfs_root *root,
                               u64 start, u64 num_bytes)
{
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        set_extent_bits(&root->fs_info->freed_extents[1],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
                                  struct btrfs_block_group_cache *cache)
{
        u64 start, end;

        start = cache->key.objectid;
        end = start + cache->key.offset - 1;

        clear_extent_bits(&root->fs_info->freed_extents[0],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
        clear_extent_bits(&root->fs_info->freed_extents[1],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
                                 struct btrfs_block_group_cache *cache)
{
        u64 bytenr;
        u64 *logical;
        int stripe_len;
        int i, nr, ret;

        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
                cache->bytes_super += stripe_len;
                ret = add_excluded_extent(root, cache->key.objectid,
                                          stripe_len);
                BUG_ON(ret); /* -ENOMEM */
        }

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                BUG_ON(ret); /* -ENOMEM */

                while (nr--) {
                        cache->bytes_super += stripe_len;
                        ret = add_excluded_extent(root, logical[nr],
                                                  stripe_len);
                        BUG_ON(ret); /* -ENOMEM */
                }

                kfree(logical);
        }
        return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
        struct btrfs_caching_control *ctl;

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_STARTED) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        /* We're loading it the fast way, so we don't have a caching_ctl. */
        if (!cache->caching_ctl) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
        if (atomic_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * This is only called by cache_block_group; since we could have freed extents,
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
{
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY | EXTENT_UPTODATE,
                                            NULL);
                if (ret)
                        break;

                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret); /* -ENOMEM or logic error */
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret); /* -ENOMEM or logic error */
        }

        return total_added;
}

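/*
 * Background worker that fills in a block group's free space cache by
 * scanning the extent tree in the commit root: every gap between
 * extent items inside the block group is added as free space via
 * add_new_free_space().
 */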
static noinline void caching_thread(struct btrfs_work *work)
{
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret = 0;

        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        extent_root = fs_info->extent_root;

        path = btrfs_alloc_path();
        if (!path)
                goto out;

        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = 1;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
again:
        mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                if (btrfs_fs_closing(fs_info) > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        ret = find_next_key(path, 0, &key);
                        if (ret)
                                break;

                        if (need_resched() ||
                            btrfs_next_leaf(extent_root, path)) {
                                caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->extent_commit_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
                                goto again;
                        }
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        continue;
                }

                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
                }

                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        last = key.objectid + key.offset;

                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
                                wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;

        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

err:
        btrfs_free_path(path);
        up_read(&fs_info->extent_commit_sem);

        free_excluded_extents(extent_root, block_group);

        mutex_unlock(&caching_ctl->mutex);
out:
        wake_up(&caching_ctl->wait);

        put_caching_control(caching_ctl);
        btrfs_put_block_group(block_group);
}

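/*
 * Start caching a block group's free space.  With the space_cache
 * mount option we first try to load the free space cache from disk;
 * if that isn't possible and a full cache was requested, the
 * caching_thread() extent tree walk is queued instead.
 */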
static int cache_block_group(struct btrfs_block_group_cache *cache,
                             struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             int load_cache_only)
{
        DEFINE_WAIT(wait);
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        int ret = 0;

        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
        if (!caching_ctl)
                return -ENOMEM;

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        atomic_set(&caching_ctl->count, 1);
        caching_ctl->work.func = caching_thread;

        spin_lock(&cache->lock);
        /*
         * This should be a rare occasion, but this could happen I think in the
         * case where one thread starts to load the space cache info, and then
         * some other thread starts a transaction commit which tries to do an
         * allocation while the other thread is still loading the space cache
         * info.  The previous loop should have kept us from choosing this block
         * group, but if we've moved to the state where we will wait on caching
         * block groups we need to first check if we're doing a fast load here,
         * so we can wait for it to finish, otherwise we could end up allocating
         * from a block group whose cache gets evicted for one reason or
         * another.
         */
        while (cache->cached == BTRFS_CACHE_FAST) {
                struct btrfs_caching_control *ctl;

                ctl = cache->caching_ctl;
                atomic_inc(&ctl->count);
                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
                spin_unlock(&cache->lock);

                schedule();

                finish_wait(&ctl->wait, &wait);
                put_caching_control(ctl);
                spin_lock(&cache->lock);
        }

        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                kfree(caching_ctl);
                return 0;
        }
        WARN_ON(cache->caching_ctl);
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_FAST;
        spin_unlock(&cache->lock);

        /*
         * We can't do the read from on-disk cache during a commit since we need
         * to have the normal tree locking.  Also if we are currently trying to
         * allocate blocks for the tree root we can't do the fast caching since
         * we likely hold important locks.
         */
        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
                ret = load_free_space_cache(fs_info, cache);

                spin_lock(&cache->lock);
                if (ret == 1) {
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_FINISHED;
                        cache->last_byte_to_unpin = (u64)-1;
                } else {
                        if (load_cache_only) {
                                cache->caching_ctl = NULL;
                                cache->cached = BTRFS_CACHE_NO;
                        } else {
                                cache->cached = BTRFS_CACHE_STARTED;
                        }
                }
                spin_unlock(&cache->lock);
                wake_up(&caching_ctl->wait);
                if (ret == 1) {
                        put_caching_control(caching_ctl);
                        free_excluded_extents(fs_info->extent_root, cache);
                        return 0;
                }
        } else {
                /*
                 * We are not going to do the fast caching, set cached to the
                 * appropriate value and wakeup any waiters.
                 */
                spin_lock(&cache->lock);
                if (load_cache_only) {
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_NO;
                } else {
                        cache->cached = BTRFS_CACHE_STARTED;
                }
                spin_unlock(&cache->lock);
                wake_up(&caching_ctl->wait);
        }

        if (load_cache_only) {
                put_caching_control(caching_ctl);
                return 0;
        }

        down_write(&fs_info->extent_commit_sem);
        atomic_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->extent_commit_sem);

        btrfs_get_block_group(cache);

        btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);

        return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 0);

        return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 1);

        return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

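/*
 * Scale num by factor, where factor is expressed in tenths
 * (div_factor) or in hundredths (div_factor_fine).
 */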
static u64 div_factor(u64 num, int factor)
{
        if (factor == 10)
                return num;
        num *= factor;
        do_div(num, 10);
        return num;
}

static u64 div_factor_fine(u64 num, int factor)
{
        if (factor == 100)
                return num;
        num *= factor;
        do_div(num, 100);
        return num;
}

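/*
 * Scan the metadata block groups starting at search_hint (wrapping
 * back around to search_start) and return the start of the first one
 * that is less than ~90% used; a second, full pass relaxes that
 * threshold.  The owner argument is unused here.
 */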
u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
{
        struct btrfs_block_group_cache *cache;
        u64 used;
        u64 last = max(search_hint, search_start);
        u64 group_start = 0;
        int full_search = 0;
        int factor = 9;
        int wrapped = 0;
again:
        while (1) {
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                if (!cache)
                        break;

                spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);

                if ((full_search || !cache->ro) &&
                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
                        if (used + cache->pinned + cache->reserved <
                            div_factor(cache->key.offset, factor)) {
                                group_start = cache->key.objectid;
                                spin_unlock(&cache->lock);
                                btrfs_put_block_group(cache);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
                btrfs_put_block_group(cache);
                cond_resched();
        }
        if (!wrapped) {
                last = search_start;
                wrapped = 1;
                goto again;
        }
        if (!full_search && factor < 10) {
                last = search_start;
                full_search = 1;
                factor = 10;
                goto again;
        }
found:
        return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = start;
        key.offset = len;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
                                0, 0);
        btrfs_free_path(path);
        return ret;
}

/*
 * helper function to lookup reference count and flags of extent.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 bytenr,
                             u64 num_bytes, u64 *refs, u64 *flags)
{
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;
        if (!trans) {
                path->skip_locking = 1;
                path->search_commit_root = 1;
        }
again:
        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
                                &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        if (ret == 0) {
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        ei = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
                } else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                        struct btrfs_extent_item_v0 *ei0;
                        BUG_ON(item_size != sizeof(*ei0));
                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_item_v0);
                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
                        /* FIXME: this isn't correct for data */
                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
                        BUG();
#endif
                }
                BUG_ON(num_refs == 0);
        } else {
                num_refs = 0;
                extent_flags = 0;
                ret = 0;
        }

        if (!trans)
                goto out;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
                        atomic_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);

                        btrfs_release_path(path);

                        /*
                         * Mutex was contended, block until it's released and try
                         * again
                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref(&head->node);
                        goto again;
                }
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);

                num_refs += head->node.ref_mod;
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
out:
        WARN_ON(num_refs == 0);
        if (refs)
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
out_free:
        btrfs_free_path(path);
        return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is recorded
 * in the back refs. Actually the full back refs are generic, and can be
 * used in all cases where the implicit back refs are used. The major
 * shortcoming of the full back refs is their overhead. Every time a tree
 * block gets COWed, we have to update the back refs entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COW'd through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are inherited by the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0); /* Corruption */
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret); /* Corruption */

        btrfs_extend_item(trans, root, path, new_size);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif

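/*
 * Hash (root objectid, inode objectid, file offset) into the 64-bit
 * value used as the key offset of EXTENT_DATA_REF items.  Two crc32c
 * accumulators are combined so the root objectid mostly determines the
 * high bits of the result.
 */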
static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
                                    btrfs_extent_data_ref_objectid(leaf, ref),
                                    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
            btrfs_extent_data_ref_offset(leaf, ref) != offset)
                return 0;
        return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int recow;
        int err = -ENOENT;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
                err = ret;
                goto fail;
        }

        if (parent) {
                if (!ret)
                        return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                btrfs_release_path(path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0) {
                        err = ret;
                        goto fail;
                }
                if (!ret)
                        return 0;
#endif
                goto fail;
        }

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                err = ret;
                        if (ret)
                                goto fail;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        goto fail;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                btrfs_release_path(path);
                                goto again;
                        }
                        err = 0;
                        break;
                }
                path->slots[0]++;
        }
fail:
        return err;
}

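/*
 * Insert a new data back reference item, or bump the reference count
 * on an existing one.  Collisions on the hashed EXTENT_DATA_REF key
 * offset are resolved by probing the following offsets until a free
 * slot or the matching ref is found.
 */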
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid, u64 owner,
                                           u64 offset, int refs_to_add)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 size;
        u32 num_refs;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
                size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
                size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
        if (ret && ret != -EEXIST)
                goto fail;

        leaf = path->nodes[0];
        if (parent) {
                struct btrfs_shared_data_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_shared_data_ref);
                if (ret == 0) {
                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
                }
        } else {
                struct btrfs_extent_data_ref *ref;
                while (ret == -EEXIST) {
                        ref = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
                        btrfs_release_path(path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
                        if (ret && ret != -EEXIST)
                                goto fail;

                        leaf = path->nodes[0];
                }
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);
                if (ret == 0) {
                        btrfs_set_extent_data_ref_root(leaf, ref,
                                                       root_objectid);
                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
                }
        }
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
fail:
        btrfs_release_path(path);
        return ret;
}

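/*
 * Drop refs_to_drop references from the data ref item the path points
 * at, deleting the item entirely once its count reaches zero.
 */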
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           int refs_to_drop)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
        struct btrfs_shared_data_ref *ref2 = NULL;
        struct extent_buffer *leaf;
        u32 num_refs = 0;
        int ret = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                BUG();
        }

        BUG_ON(num_refs < refs_to_drop);
        num_refs -= refs_to_drop;

        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                else {
                        struct btrfs_extent_ref_v0 *ref0;
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_extent_ref_v0);
                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
                }
#endif
                btrfs_mark_buffer_dirty(leaf);
        }
        return ret;
}

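/* Return the reference count stored in an inline or keyed data ref. */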
static noinline u32 extent_data_ref_count(struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *ref1;
        struct btrfs_shared_data_ref *ref2;
        u32 num_refs = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (iref) {
                if (btrfs_extent_inline_ref_type(leaf, iref) ==
                    BTRFS_EXTENT_DATA_REF_KEY) {
                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
                } else {
                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
                }
        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                WARN_ON(1);
        }
        return num_refs;
}

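/*
 * Look up a tree block back reference item: a SHARED_BLOCK_REF keyed
 * on the parent block when one is given, otherwise a TREE_BLOCK_REF
 * keyed on the owning root.
 */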
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (ret == -ENOENT && parent) {
                btrfs_release_path(path);
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
                        ret = -ENOENT;
        }
#endif
        return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(path);
        return ret;
}

1394static inline int extent_ref_type(u64 parent, u64 owner)
1395{
1396        int type;
1397        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1398                if (parent > 0)
1399                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1400                else
1401                        type = BTRFS_TREE_BLOCK_REF_KEY;
1402        } else {
1403                if (parent > 0)
1404                        type = BTRFS_SHARED_DATA_REF_KEY;
1405                else
1406                        type = BTRFS_EXTENT_DATA_REF_KEY;
1407        }
1408        return type;
1409}
1410
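/*
 * The mapping implemented by extent_ref_type() above, for reference:
 *
 *	owner < BTRFS_FIRST_FREE_OBJECTID (tree block), parent set
 *		-> BTRFS_SHARED_BLOCK_REF_KEY
 *	owner < BTRFS_FIRST_FREE_OBJECTID, no parent
 *		-> BTRFS_TREE_BLOCK_REF_KEY
 *	owner >= BTRFS_FIRST_FREE_OBJECTID (file data), parent set
 *		-> BTRFS_SHARED_DATA_REF_KEY
 *	owner >= BTRFS_FIRST_FREE_OBJECTID, no parent
 *		-> BTRFS_EXTENT_DATA_REF_KEY
 */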
1411static int find_next_key(struct btrfs_path *path, int level,
1412                         struct btrfs_key *key)
1413
1414{
1415        for (; level < BTRFS_MAX_LEVEL; level++) {
1416                if (!path->nodes[level])
1417                        break;
1418                if (path->slots[level] + 1 >=
1419                    btrfs_header_nritems(path->nodes[level]))
1420                        continue;
1421                if (level == 0)
1422                        btrfs_item_key_to_cpu(path->nodes[level], key,
1423                                              path->slots[level] + 1);
1424                else
1425                        btrfs_node_key_to_cpu(path->nodes[level], key,
1426                                              path->slots[level] + 1);
1427                return 0;
1428        }
1429        return 1;
1430}
1431
1432/*
1433 * Look for an inline back ref.  If the back ref is found, *ref_ret is
1434 * set to the address of the inline back ref, and 0 is returned.
1435 *
1436 * If the back ref isn't found, *ref_ret is set to the address where it
1437 * should be inserted, and -ENOENT is returned.
1438 *
1439 * If insert is true and there are too many inline back refs, the path
1440 * points to the extent item, and -EAGAIN is returned.
1441 *
1442 * NOTE: inline back refs are ordered in the same way that back ref
1443 *       items in the tree are ordered.
1444 */
1445static noinline_for_stack
1446int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1447                                 struct btrfs_root *root,
1448                                 struct btrfs_path *path,
1449                                 struct btrfs_extent_inline_ref **ref_ret,
1450                                 u64 bytenr, u64 num_bytes,
1451                                 u64 parent, u64 root_objectid,
1452                                 u64 owner, u64 offset, int insert)
1453{
1454        struct btrfs_key key;
1455        struct extent_buffer *leaf;
1456        struct btrfs_extent_item *ei;
1457        struct btrfs_extent_inline_ref *iref;
1458        u64 flags;
1459        u64 item_size;
1460        unsigned long ptr;
1461        unsigned long end;
1462        int extra_size;
1463        int type;
1464        int want;
1465        int ret;
1466        int err = 0;
1467
1468        key.objectid = bytenr;
1469        key.type = BTRFS_EXTENT_ITEM_KEY;
1470        key.offset = num_bytes;
1471
1472        want = extent_ref_type(parent, owner);
1473        if (insert) {
1474                extra_size = btrfs_extent_inline_ref_size(want);
1475                path->keep_locks = 1;
1476        } else
1477                extra_size = -1;
1478        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1479        if (ret < 0) {
1480                err = ret;
1481                goto out;
1482        }
1483        if (ret && !insert) {
1484                err = -ENOENT;
1485                goto out;
1486        }
1487        BUG_ON(ret); /* Corruption */
1488
1489        leaf = path->nodes[0];
1490        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1491#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1492        if (item_size < sizeof(*ei)) {
1493                if (!insert) {
1494                        err = -ENOENT;
1495                        goto out;
1496                }
1497                ret = convert_extent_item_v0(trans, root, path, owner,
1498                                             extra_size);
1499                if (ret < 0) {
1500                        err = ret;
1501                        goto out;
1502                }
1503                leaf = path->nodes[0];
1504                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1505        }
1506#endif
1507        BUG_ON(item_size < sizeof(*ei));
1508
1509        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1510        flags = btrfs_extent_flags(leaf, ei);
1511
1512        ptr = (unsigned long)(ei + 1);
1513        end = (unsigned long)ei + item_size;
1514
1515        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1516                ptr += sizeof(struct btrfs_tree_block_info);
1517                BUG_ON(ptr > end);
1518        } else {
1519                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1520        }
1521
1522        err = -ENOENT;
1523        while (1) {
1524                if (ptr >= end) {
1525                        WARN_ON(ptr > end);
1526                        break;
1527                }
1528                iref = (struct btrfs_extent_inline_ref *)ptr;
1529                type = btrfs_extent_inline_ref_type(leaf, iref);
1530                if (want < type)
1531                        break;
1532                if (want > type) {
1533                        ptr += btrfs_extent_inline_ref_size(type);
1534                        continue;
1535                }
1536
1537                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1538                        struct btrfs_extent_data_ref *dref;
1539                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1540                        if (match_extent_data_ref(leaf, dref, root_objectid,
1541                                                  owner, offset)) {
1542                                err = 0;
1543                                break;
1544                        }
1545                        if (hash_extent_data_ref_item(leaf, dref) <
1546                            hash_extent_data_ref(root_objectid, owner, offset))
1547                                break;
1548                } else {
1549                        u64 ref_offset;
1550                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1551                        if (parent > 0) {
1552                                if (parent == ref_offset) {
1553                                        err = 0;
1554                                        break;
1555                                }
1556                                if (ref_offset < parent)
1557                                        break;
1558                        } else {
1559                                if (root_objectid == ref_offset) {
1560                                        err = 0;
1561                                        break;
1562                                }
1563                                if (ref_offset < root_objectid)
1564                                        break;
1565                        }
1566                }
1567                ptr += btrfs_extent_inline_ref_size(type);
1568        }
1569        if (err == -ENOENT && insert) {
1570                if (item_size + extra_size >=
1571                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1572                        err = -EAGAIN;
1573                        goto out;
1574                }
1575                /*
1576                 * To add a new inline back ref, we have to make sure
1577                 * there is no corresponding back ref item.
1578                 * For simplicity, we just do not add a new inline back
1579                 * ref if there is any kind of item for this block.
1580                 */
1581                if (find_next_key(path, 0, &key) == 0 &&
1582                    key.objectid == bytenr &&
1583                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1584                        err = -EAGAIN;
1585                        goto out;
1586                }
1587        }
1588        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1589out:
1590        if (insert) {
1591                path->keep_locks = 0;
1592                btrfs_unlock_up_safe(path, 1);
1593        }
1594        return err;
1595}
1596
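/*
 * Rough sketch of the extent item layout that the loop in
 * lookup_inline_extent_backref() walks (sizes depend on the ref types
 * actually present):
 *
 *	struct btrfs_extent_item         refs, generation, flags
 *	struct btrfs_tree_block_info     only if FLAG_TREE_BLOCK is set
 *	inline ref 0, inline ref 1, ...  each a btrfs_extent_inline_ref;
 *	                                 for EXTENT_DATA_REF a
 *	                                 btrfs_extent_data_ref overlays the
 *	                                 offset field, for SHARED_DATA_REF a
 *	                                 btrfs_shared_data_ref count follows
 *
 * The inline refs are kept sorted by type, and within a type by hash or
 * by parent/root offset, which is what allows the loop to stop early.
 */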
1597/*
1598 * helper to add new inline back ref
1599 */
1600static noinline_for_stack
1601void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1602                                 struct btrfs_root *root,
1603                                 struct btrfs_path *path,
1604                                 struct btrfs_extent_inline_ref *iref,
1605                                 u64 parent, u64 root_objectid,
1606                                 u64 owner, u64 offset, int refs_to_add,
1607                                 struct btrfs_delayed_extent_op *extent_op)
1608{
1609        struct extent_buffer *leaf;
1610        struct btrfs_extent_item *ei;
1611        unsigned long ptr;
1612        unsigned long end;
1613        unsigned long item_offset;
1614        u64 refs;
1615        int size;
1616        int type;
1617
1618        leaf = path->nodes[0];
1619        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1620        item_offset = (unsigned long)iref - (unsigned long)ei;
1621
1622        type = extent_ref_type(parent, owner);
1623        size = btrfs_extent_inline_ref_size(type);
1624
1625        btrfs_extend_item(trans, root, path, size);
1626
1627        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1628        refs = btrfs_extent_refs(leaf, ei);
1629        refs += refs_to_add;
1630        btrfs_set_extent_refs(leaf, ei, refs);
1631        if (extent_op)
1632                __run_delayed_extent_op(extent_op, leaf, ei);
1633
1634        ptr = (unsigned long)ei + item_offset;
1635        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1636        if (ptr < end - size)
1637                memmove_extent_buffer(leaf, ptr + size, ptr,
1638                                      end - size - ptr);
1639
1640        iref = (struct btrfs_extent_inline_ref *)ptr;
1641        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1642        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1643                struct btrfs_extent_data_ref *dref;
1644                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1645                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1646                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1647                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1648                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1649        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1650                struct btrfs_shared_data_ref *sref;
1651                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1652                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1653                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1654        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1655                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1656        } else {
1657                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1658        }
1659        btrfs_mark_buffer_dirty(leaf);
1660}
1661
1662static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1663                                 struct btrfs_root *root,
1664                                 struct btrfs_path *path,
1665                                 struct btrfs_extent_inline_ref **ref_ret,
1666                                 u64 bytenr, u64 num_bytes, u64 parent,
1667                                 u64 root_objectid, u64 owner, u64 offset)
1668{
1669        int ret;
1670
1671        ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1672                                           bytenr, num_bytes, parent,
1673                                           root_objectid, owner, offset, 0);
1674        if (ret != -ENOENT)
1675                return ret;
1676
1677        btrfs_release_path(path);
1678        *ref_ret = NULL;
1679
1680        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1681                ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1682                                            root_objectid);
1683        } else {
1684                ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1685                                             root_objectid, owner, offset);
1686        }
1687        return ret;
1688}
1689
1690/*
1691 * helper to update/remove inline back ref
1692 */
1693static noinline_for_stack
1694void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1695                                  struct btrfs_root *root,
1696                                  struct btrfs_path *path,
1697                                  struct btrfs_extent_inline_ref *iref,
1698                                  int refs_to_mod,
1699                                  struct btrfs_delayed_extent_op *extent_op)
1700{
1701        struct extent_buffer *leaf;
1702        struct btrfs_extent_item *ei;
1703        struct btrfs_extent_data_ref *dref = NULL;
1704        struct btrfs_shared_data_ref *sref = NULL;
1705        unsigned long ptr;
1706        unsigned long end;
1707        u32 item_size;
1708        int size;
1709        int type;
1710        u64 refs;
1711
1712        leaf = path->nodes[0];
1713        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1714        refs = btrfs_extent_refs(leaf, ei);
1715        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1716        refs += refs_to_mod;
1717        btrfs_set_extent_refs(leaf, ei, refs);
1718        if (extent_op)
1719                __run_delayed_extent_op(extent_op, leaf, ei);
1720
1721        type = btrfs_extent_inline_ref_type(leaf, iref);
1722
1723        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1724                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1725                refs = btrfs_extent_data_ref_count(leaf, dref);
1726        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1727                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1728                refs = btrfs_shared_data_ref_count(leaf, sref);
1729        } else {
1730                refs = 1;
1731                BUG_ON(refs_to_mod != -1);
1732        }
1733
1734        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1735        refs += refs_to_mod;
1736
1737        if (refs > 0) {
1738                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1739                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1740                else
1741                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1742        } else {
1743                size =  btrfs_extent_inline_ref_size(type);
1744                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1745                ptr = (unsigned long)iref;
1746                end = (unsigned long)ei + item_size;
1747                if (ptr + size < end)
1748                        memmove_extent_buffer(leaf, ptr, ptr + size,
1749                                              end - ptr - size);
1750                item_size -= size;
1751                btrfs_truncate_item(trans, root, path, item_size, 1);
1752        }
1753        btrfs_mark_buffer_dirty(leaf);
1754}
1755
1756static noinline_for_stack
1757int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1758                                 struct btrfs_root *root,
1759                                 struct btrfs_path *path,
1760                                 u64 bytenr, u64 num_bytes, u64 parent,
1761                                 u64 root_objectid, u64 owner,
1762                                 u64 offset, int refs_to_add,
1763                                 struct btrfs_delayed_extent_op *extent_op)
1764{
1765        struct btrfs_extent_inline_ref *iref;
1766        int ret;
1767
1768        ret = lookup_inline_extent_backref(trans, root, path, &iref,
1769                                           bytenr, num_bytes, parent,
1770                                           root_objectid, owner, offset, 1);
1771        if (ret == 0) {
1772                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1773                update_inline_extent_backref(trans, root, path, iref,
1774                                             refs_to_add, extent_op);
1775        } else if (ret == -ENOENT) {
1776                setup_inline_extent_backref(trans, root, path, iref, parent,
1777                                            root_objectid, owner, offset,
1778                                            refs_to_add, extent_op);
1779                ret = 0;
1780        }
1781        return ret;
1782}
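/*
 * insert_inline_extent_backref() relies on the three-way result of the
 * lookup above: 0 means a matching inline ref already exists and only its
 * count is bumped, -ENOENT means the path points at the slot where a new
 * inline ref can be spliced in, and -EAGAIN (extent item too large, or a
 * keyed ref item already present for this block) is passed back so the
 * caller falls back to insert_extent_backref() and a separate keyed item.
 */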
1783
1784static int insert_extent_backref(struct btrfs_trans_handle *trans,
1785                                 struct btrfs_root *root,
1786                                 struct btrfs_path *path,
1787                                 u64 bytenr, u64 parent, u64 root_objectid,
1788                                 u64 owner, u64 offset, int refs_to_add)
1789{
1790        int ret;
1791        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1792                BUG_ON(refs_to_add != 1);
1793                ret = insert_tree_block_ref(trans, root, path, bytenr,
1794                                            parent, root_objectid);
1795        } else {
1796                ret = insert_extent_data_ref(trans, root, path, bytenr,
1797                                             parent, root_objectid,
1798                                             owner, offset, refs_to_add);
1799        }
1800        return ret;
1801}
1802
1803static int remove_extent_backref(struct btrfs_trans_handle *trans,
1804                                 struct btrfs_root *root,
1805                                 struct btrfs_path *path,
1806                                 struct btrfs_extent_inline_ref *iref,
1807                                 int refs_to_drop, int is_data)
1808{
1809        int ret = 0;
1810
1811        BUG_ON(!is_data && refs_to_drop != 1);
1812        if (iref) {
1813                update_inline_extent_backref(trans, root, path, iref,
1814                                             -refs_to_drop, NULL);
1815        } else if (is_data) {
1816                ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1817        } else {
1818                ret = btrfs_del_item(trans, root, path);
1819        }
1820        return ret;
1821}
1822
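/*
 * blkdev_issue_discard() works in 512-byte sectors, hence the >> 9 on both
 * the start offset and the length below.
 */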
1823static int btrfs_issue_discard(struct block_device *bdev,
1824                                u64 start, u64 len)
1825{
1826        return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1827}
1828
1829static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1830                                u64 num_bytes, u64 *actual_bytes)
1831{
1832        int ret;
1833        u64 discarded_bytes = 0;
1834        struct btrfs_bio *bbio = NULL;
1835
1836
1837        /* Tell the block device(s) that the sectors can be discarded */
1838        ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1839                              bytenr, &num_bytes, &bbio, 0);
1840        /* Error condition is -ENOMEM */
1841        if (!ret) {
1842                struct btrfs_bio_stripe *stripe = bbio->stripes;
1843                int i;
1844
1845
1846                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1847                        if (!stripe->dev->can_discard)
1848                                continue;
1849
1850                        ret = btrfs_issue_discard(stripe->dev->bdev,
1851                                                  stripe->physical,
1852                                                  stripe->length);
1853                        if (!ret)
1854                                discarded_bytes += stripe->length;
1855                        else if (ret != -EOPNOTSUPP)
1856                                break; /* Logic errors or -ENOMEM, or -EIO, though it's unclear how -EIO could happen here */
1857
1858                        /*
1859                         * Just in case we get back EOPNOTSUPP for some reason,
1860                         * just ignore the return value so we don't screw up
1861                         * people calling discard_extent.
1862                         */
1863                        ret = 0;
1864                }
1865                kfree(bbio);
1866        }
1867
1868        if (actual_bytes)
1869                *actual_bytes = discarded_bytes;
1870
1871
1872        return ret;
1873}
1874
1875/* Can return -ENOMEM */
1876int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1877                         struct btrfs_root *root,
1878                         u64 bytenr, u64 num_bytes, u64 parent,
1879                         u64 root_objectid, u64 owner, u64 offset, int for_cow)
1880{
1881        int ret;
1882        struct btrfs_fs_info *fs_info = root->fs_info;
1883
1884        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1885               root_objectid == BTRFS_TREE_LOG_OBJECTID);
1886
1887        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1888                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1889                                        num_bytes,
1890                                        parent, root_objectid, (int)owner,
1891                                        BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1892        } else {
1893                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1894                                        num_bytes,
1895                                        parent, root_objectid, owner, offset,
1896                                        BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1897        }
1898        return ret;
1899}
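/*
 * Nothing is written to the extent tree here; the call only queues a
 * delayed ref that btrfs_run_delayed_refs() later turns into real backref
 * updates.  An illustrative call for a file data extent (the variable
 * names are made up for the example):
 *
 *	ret = btrfs_inc_extent_ref(trans, root, extent_bytenr, extent_len,
 *				   0, root->root_key.objectid,
 *				   inode_objectid, file_offset, 0);
 *
 * Passing 0 for parent asks for a normal backref keyed on the owning root
 * rather than a shared (full) backref keyed on the parent block.
 */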
1900
1901static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1902                                  struct btrfs_root *root,
1903                                  u64 bytenr, u64 num_bytes,
1904                                  u64 parent, u64 root_objectid,
1905                                  u64 owner, u64 offset, int refs_to_add,
1906                                  struct btrfs_delayed_extent_op *extent_op)
1907{
1908        struct btrfs_path *path;
1909        struct extent_buffer *leaf;
1910        struct btrfs_extent_item *item;
1911        u64 refs;
1912        int ret;
1913        int err = 0;
1914
1915        path = btrfs_alloc_path();
1916        if (!path)
1917                return -ENOMEM;
1918
1919        path->reada = 1;
1920        path->leave_spinning = 1;
1921        /* this will set up the path even if it fails to insert the back ref */
1922        ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1923                                           path, bytenr, num_bytes, parent,
1924                                           root_objectid, owner, offset,
1925                                           refs_to_add, extent_op);
1926        if (ret == 0)
1927                goto out;
1928
1929        if (ret != -EAGAIN) {
1930                err = ret;
1931                goto out;
1932        }
1933
1934        leaf = path->nodes[0];
1935        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1936        refs = btrfs_extent_refs(leaf, item);
1937        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1938        if (extent_op)
1939                __run_delayed_extent_op(extent_op, leaf, item);
1940
1941        btrfs_mark_buffer_dirty(leaf);
1942        btrfs_release_path(path);
1943
1944        path->reada = 1;
1945        path->leave_spinning = 1;
1946
1947        /* now insert the actual backref */
1948        ret = insert_extent_backref(trans, root->fs_info->extent_root,
1949                                    path, bytenr, parent, root_objectid,
1950                                    owner, offset, refs_to_add);
1951        if (ret)
1952                btrfs_abort_transaction(trans, root, ret);
1953out:
1954        btrfs_free_path(path);
1955        return err;
1956}
1957
1958static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1959                                struct btrfs_root *root,
1960                                struct btrfs_delayed_ref_node *node,
1961                                struct btrfs_delayed_extent_op *extent_op,
1962                                int insert_reserved)
1963{
1964        int ret = 0;
1965        struct btrfs_delayed_data_ref *ref;
1966        struct btrfs_key ins;
1967        u64 parent = 0;
1968        u64 ref_root = 0;
1969        u64 flags = 0;
1970
1971        ins.objectid = node->bytenr;
1972        ins.offset = node->num_bytes;
1973        ins.type = BTRFS_EXTENT_ITEM_KEY;
1974
1975        ref = btrfs_delayed_node_to_data_ref(node);
1976        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1977                parent = ref->parent;
1978        else
1979                ref_root = ref->root;
1980
1981        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1982                if (extent_op) {
1983                        BUG_ON(extent_op->update_key);
1984                        flags |= extent_op->flags_to_set;
1985                }
1986                ret = alloc_reserved_file_extent(trans, root,
1987                                                 parent, ref_root, flags,
1988                                                 ref->objectid, ref->offset,
1989                                                 &ins, node->ref_mod);
1990        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1991                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1992                                             node->num_bytes, parent,
1993                                             ref_root, ref->objectid,
1994                                             ref->offset, node->ref_mod,
1995                                             extent_op);
1996        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1997                ret = __btrfs_free_extent(trans, root, node->bytenr,
1998                                          node->num_bytes, parent,
1999                                          ref_root, ref->objectid,
2000                                          ref->offset, node->ref_mod,
2001                                          extent_op);
2002        } else {
2003                BUG();
2004        }
2005        return ret;
2006}
2007
2008static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2009                                    struct extent_buffer *leaf,
2010                                    struct btrfs_extent_item *ei)
2011{
2012        u64 flags = btrfs_extent_flags(leaf, ei);
2013        if (extent_op->update_flags) {
2014                flags |= extent_op->flags_to_set;
2015                btrfs_set_extent_flags(leaf, ei, flags);
2016        }
2017
2018        if (extent_op->update_key) {
2019                struct btrfs_tree_block_info *bi;
2020                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2021                bi = (struct btrfs_tree_block_info *)(ei + 1);
2022                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2023        }
2024}
2025
2026static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2027                                 struct btrfs_root *root,
2028                                 struct btrfs_delayed_ref_node *node,
2029                                 struct btrfs_delayed_extent_op *extent_op)
2030{
2031        struct btrfs_key key;
2032        struct btrfs_path *path;
2033        struct btrfs_extent_item *ei;
2034        struct extent_buffer *leaf;
2035        u32 item_size;
2036        int ret;
2037        int err = 0;
2038
2039        if (trans->aborted)
2040                return 0;
2041
2042        path = btrfs_alloc_path();
2043        if (!path)
2044                return -ENOMEM;
2045
2046        key.objectid = node->bytenr;
2047        key.type = BTRFS_EXTENT_ITEM_KEY;
2048        key.offset = node->num_bytes;
2049
2050        path->reada = 1;
2051        path->leave_spinning = 1;
2052        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2053                                path, 0, 1);
2054        if (ret < 0) {
2055                err = ret;
2056                goto out;
2057        }
2058        if (ret > 0) {
2059                err = -EIO;
2060                goto out;
2061        }
2062
2063        leaf = path->nodes[0];
2064        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2065#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2066        if (item_size < sizeof(*ei)) {
2067                ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2068                                             path, (u64)-1, 0);
2069                if (ret < 0) {
2070                        err = ret;
2071                        goto out;
2072                }
2073                leaf = path->nodes[0];
2074                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2075        }
2076#endif
2077        BUG_ON(item_size < sizeof(*ei));
2078        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2079        __run_delayed_extent_op(extent_op, leaf, ei);
2080
2081        btrfs_mark_buffer_dirty(leaf);
2082out:
2083        btrfs_free_path(path);
2084        return err;
2085}
2086
2087static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2088                                struct btrfs_root *root,
2089                                struct btrfs_delayed_ref_node *node,
2090                                struct btrfs_delayed_extent_op *extent_op,
2091                                int insert_reserved)
2092{
2093        int ret = 0;
2094        struct btrfs_delayed_tree_ref *ref;
2095        struct btrfs_key ins;
2096        u64 parent = 0;
2097        u64 ref_root = 0;
2098
2099        ins.objectid = node->bytenr;
2100        ins.offset = node->num_bytes;
2101        ins.type = BTRFS_EXTENT_ITEM_KEY;
2102
2103        ref = btrfs_delayed_node_to_tree_ref(node);
2104        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2105                parent = ref->parent;
2106        else
2107                ref_root = ref->root;
2108
2109        BUG_ON(node->ref_mod != 1);
2110        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2111                BUG_ON(!extent_op || !extent_op->update_flags ||
2112                       !extent_op->update_key);
2113                ret = alloc_reserved_tree_block(trans, root,
2114                                                parent, ref_root,
2115                                                extent_op->flags_to_set,
2116                                                &extent_op->key,
2117                                                ref->level, &ins);
2118        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2119                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2120                                             node->num_bytes, parent, ref_root,
2121                                             ref->level, 0, 1, extent_op);
2122        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2123                ret = __btrfs_free_extent(trans, root, node->bytenr,
2124                                          node->num_bytes, parent, ref_root,
2125                                          ref->level, 0, 1, extent_op);
2126        } else {
2127                BUG();
2128        }
2129        return ret;
2130}
2131
2132/* helper function to actually process a single delayed ref entry */
2133static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2134                               struct btrfs_root *root,
2135                               struct btrfs_delayed_ref_node *node,
2136                               struct btrfs_delayed_extent_op *extent_op,
2137                               int insert_reserved)
2138{
2139        int ret = 0;
2140
2141        if (trans->aborted)
2142                return 0;
2143
2144        if (btrfs_delayed_ref_is_head(node)) {
2145                struct btrfs_delayed_ref_head *head;
2146                /*
2147                 * We've hit the end of the chain and we were supposed
2148                 * to insert this extent into the tree.  But it got
2149                 * deleted before we ever needed to insert it, so all
2150                 * we have to do is clean up the accounting.
2151                 */
2152                BUG_ON(extent_op);
2153                head = btrfs_delayed_node_to_head(node);
2154                if (insert_reserved) {
2155                        btrfs_pin_extent(root, node->bytenr,
2156                                         node->num_bytes, 1);
2157                        if (head->is_data) {
2158                                ret = btrfs_del_csums(trans, root,
2159                                                      node->bytenr,
2160                                                      node->num_bytes);
2161                        }
2162                }
2163                mutex_unlock(&head->mutex);
2164                return ret;
2165        }
2166
2167        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2168            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2169                ret = run_delayed_tree_ref(trans, root, node, extent_op,
2170                                           insert_reserved);
2171        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2172                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2173                ret = run_delayed_data_ref(trans, root, node, extent_op,
2174                                           insert_reserved);
2175        else
2176                BUG();
2177        return ret;
2178}
2179
2180static noinline struct btrfs_delayed_ref_node *
2181select_delayed_ref(struct btrfs_delayed_ref_head *head)
2182{
2183        struct rb_node *node;
2184        struct btrfs_delayed_ref_node *ref;
2185        int action = BTRFS_ADD_DELAYED_REF;
2186again:
2187        /*
2188         * Select delayed refs of type BTRFS_ADD_DELAYED_REF first.
2189         * This prevents the ref count from going down to zero while
2190         * there are still pending delayed refs.
2191         */
2192        node = rb_prev(&head->node.rb_node);
2193        while (1) {
2194                if (!node)
2195                        break;
2196                ref = rb_entry(node, struct btrfs_delayed_ref_node,
2197                                rb_node);
2198                if (ref->bytenr != head->node.bytenr)
2199                        break;
2200                if (ref->action == action)
2201                        return ref;
2202                node = rb_prev(node);
2203        }
2204        if (action == BTRFS_ADD_DELAYED_REF) {
2205                action = BTRFS_DROP_DELAYED_REF;
2206                goto again;
2207        }
2208        return NULL;
2209}
2210
2211/*
2212 * Returns the number of refs processed, or a negative errno (-ENOMEM or
2213 * -EIO) on failure, after which the transaction is aborted.
2214 */
2215static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2216                                       struct btrfs_root *root,
2217                                       struct list_head *cluster)
2218{
2219        struct btrfs_delayed_ref_root *delayed_refs;
2220        struct btrfs_delayed_ref_node *ref;
2221        struct btrfs_delayed_ref_head *locked_ref = NULL;
2222        struct btrfs_delayed_extent_op *extent_op;
2223        struct btrfs_fs_info *fs_info = root->fs_info;
2224        int ret;
2225        int count = 0;
2226        int must_insert_reserved = 0;
2227
2228        delayed_refs = &trans->transaction->delayed_refs;
2229        while (1) {
2230                if (!locked_ref) {
2231                        /* pick a new head ref from the cluster list */
2232                        if (list_empty(cluster))
2233                                break;
2234
2235                        locked_ref = list_entry(cluster->next,
2236                                     struct btrfs_delayed_ref_head, cluster);
2237
2238                        /* grab the lock that says we are going to process
2239                         * all the refs for this head */
2240                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2241
2242                        /*
2243                         * we may have dropped the spin lock to get the head
2244                         * mutex lock, and that might have given someone else
2245                         * time to free the head.  If that's true, it has been
2246                         * removed from our list and we can move on.
2247                         */
2248                        if (ret == -EAGAIN) {
2249                                locked_ref = NULL;
2250                                count++;
2251                                continue;
2252                        }
2253                }
2254
2255                /*
2256                 * We need to try and merge add/drops of the same ref since we
2257                 * can run into issues with relocate dropping the implicit ref
2258                 * and then it being added back again before the drop can
2259                 * finish.  If we merged anything we need to re-loop so we can
2260                 * get a good ref.
2261                 */
2262                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2263                                         locked_ref);
2264
2265                /*
2266                 * locked_ref is the head node, so we have to go one
2267                 * node back for any delayed ref updates
2268                 */
2269                ref = select_delayed_ref(locked_ref);
2270
2271                if (ref && ref->seq &&
2272                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2273                        /*
2274                         * there are still refs with lower seq numbers in the
2275                         * process of being added. Don't run this ref yet.
2276                         */
2277                        list_del_init(&locked_ref->cluster);
2278                        mutex_unlock(&locked_ref->mutex);
2279                        locked_ref = NULL;
2280                        delayed_refs->num_heads_ready++;
2281                        spin_unlock(&delayed_refs->lock);
2282                        cond_resched();
2283                        spin_lock(&delayed_refs->lock);
2284                        continue;
2285                }
2286
2287                /*
2288                 * record the must insert reserved flag before we
2289                 * drop the spin lock.
2290                 */
2291                must_insert_reserved = locked_ref->must_insert_reserved;
2292                locked_ref->must_insert_reserved = 0;
2293
2294                extent_op = locked_ref->extent_op;
2295                locked_ref->extent_op = NULL;
2296
2297                if (!ref) {
2298                        /* All delayed refs have been processed; go ahead
2299                         * and send the head node to run_one_delayed_ref,
2300                         * so that any accounting fixes can happen
2301                         */
2302                        ref = &locked_ref->node;
2303
2304                        if (extent_op && must_insert_reserved) {
2305                                kfree(extent_op);
2306                                extent_op = NULL;
2307                        }
2308
2309                        if (extent_op) {
2310                                spin_unlock(&delayed_refs->lock);
2311
2312                                ret = run_delayed_extent_op(trans, root,
2313                                                            ref, extent_op);
2314                                kfree(extent_op);
2315
2316                                if (ret) {
2317                                        printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318                                        spin_lock(&delayed_refs->lock);
2319                                        return ret;
2320                                }
2321
2322                                goto next;
2323                        }
2324
2325                        list_del_init(&locked_ref->cluster);
2326                        locked_ref = NULL;
2327                }
2328
2329                ref->in_tree = 0;
2330                rb_erase(&ref->rb_node, &delayed_refs->root);
2331                delayed_refs->num_entries--;
2332                if (locked_ref) {
2333                        /*
2334                         * when we play the delayed ref, also correct the
2335                         * ref_mod on head
2336                         */
2337                        switch (ref->action) {
2338                        case BTRFS_ADD_DELAYED_REF:
2339                        case BTRFS_ADD_DELAYED_EXTENT:
2340                                locked_ref->node.ref_mod -= ref->ref_mod;
2341                                break;
2342                        case BTRFS_DROP_DELAYED_REF:
2343                                locked_ref->node.ref_mod += ref->ref_mod;
2344                                break;
2345                        default:
2346                                WARN_ON(1);
2347                        }
2348                }
2349                spin_unlock(&delayed_refs->lock);
2350
2351                ret = run_one_delayed_ref(trans, root, ref, extent_op,
2352                                          must_insert_reserved);
2353
2354                btrfs_put_delayed_ref(ref);
2355                kfree(extent_op);
2356                count++;
2357
2358                if (ret) {
2359                        printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360                        spin_lock(&delayed_refs->lock);
2361                        return ret;
2362                }
2363
2364next:
2365                cond_resched();
2366                spin_lock(&delayed_refs->lock);
2367        }
2368        return count;
2369}
2370
2371#ifdef SCRAMBLE_DELAYED_REFS
2372/*
2373 * Normally delayed refs get processed in ascending bytenr order. This
2374 * correlates in most cases to the order added. To expose dependencies on this
2375 * order, we start to process the tree in the middle instead of the beginning
2376 */
2377static u64 find_middle(struct rb_root *root)
2378{
2379        struct rb_node *n = root->rb_node;
2380        struct btrfs_delayed_ref_node *entry;
2381        int alt = 1;
2382        u64 middle;
2383        u64 first = 0, last = 0;
2384
2385        n = rb_first(root);
2386        if (n) {
2387                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2388                first = entry->bytenr;
2389        }
2390        n = rb_last(root);
2391        if (n) {
2392                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2393                last = entry->bytenr;
2394        }
2395        n = root->rb_node;
2396
2397        while (n) {
2398                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2399                WARN_ON(!entry->in_tree);
2400
2401                middle = entry->bytenr;
2402
2403                if (alt)
2404                        n = n->rb_left;
2405                else
2406                        n = n->rb_right;
2407
2408                alt = 1 - alt;
2409        }
2410        return middle;
2411}
2412#endif
2413
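/*
 * Drain the per-transaction qgroup_ref_list: each queued qgroup_update is
 * passed to btrfs_qgroup_account_ref() (until the first error) and freed,
 * and the tree mod log sequence element held by this transaction is
 * released at the end.  The check at the top insists that the list and the
 * sequence element are either both present or both absent.
 */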
2414int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2415                                         struct btrfs_fs_info *fs_info)
2416{
2417        struct qgroup_update *qgroup_update;
2418        int ret = 0;
2419
2420        if (list_empty(&trans->qgroup_ref_list) !=
2421            !trans->delayed_ref_elem.seq) {
2422                /* list without seq or seq without list */
2423                printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2424                        list_empty(&trans->qgroup_ref_list) ? "" : " not",
2425                        trans->delayed_ref_elem.seq);
2426                BUG();
2427        }
2428
2429        if (!trans->delayed_ref_elem.seq)
2430                return 0;
2431
2432        while (!list_empty(&trans->qgroup_ref_list)) {
2433                qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2434                                                 struct qgroup_update, list);
2435                list_del(&qgroup_update->list);
2436                if (!ret)
2437                        ret = btrfs_qgroup_account_ref(
2438                                        trans, fs_info, qgroup_update->node,
2439                                        qgroup_update->extent_op);
2440                kfree(qgroup_update);
2441        }
2442
2443        btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2444
2445        return ret;
2446}
2447
2448/*
2449 * this starts processing the delayed reference count updates and
2450 * extent insertions we have queued up so far.  count can be
2451 * 0, which means to process everything in the tree at the start
2452 * of the run (but not newly added entries), or it can be some target
2453 * number you'd like to process.
2454 *
2455 * Returns 0 on success or if called with an aborted transaction
2456 * Returns <0 on error and aborts the transaction
2457 */
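/*
 * Illustrative call forms:
 *
 *	btrfs_run_delayed_refs(trans, root, 0);
 *		process roughly what is queued right now
 *	btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 *		flush everything, waiting on contended heads
 *
 * count == 0 is translated below into twice the current number of entries,
 * while (unsigned long)-1 sets run_all and keeps looping until the delayed
 * ref tree is empty.
 */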
2458int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2459                           struct btrfs_root *root, unsigned long count)
2460{
2461        struct rb_node *node;
2462        struct btrfs_delayed_ref_root *delayed_refs;
2463        struct btrfs_delayed_ref_node *ref;
2464        struct list_head cluster;
2465        int ret;
2466        u64 delayed_start;
2467        int run_all = count == (unsigned long)-1;
2468        int run_most = 0;
2469        int loops;
2470
2471        /* We'll clean this up in btrfs_cleanup_transaction */
2472        if (trans->aborted)
2473                return 0;
2474
2475        if (root == root->fs_info->extent_root)
2476                root = root->fs_info->tree_root;
2477
2478        btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2479
2480        delayed_refs = &trans->transaction->delayed_refs;
2481        INIT_LIST_HEAD(&cluster);
2482again:
2483        loops = 0;
2484        spin_lock(&delayed_refs->lock);
2485
2486#ifdef SCRAMBLE_DELAYED_REFS
2487        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2488#endif
2489
2490        if (count == 0) {
2491                count = delayed_refs->num_entries * 2;
2492                run_most = 1;
2493        }
2494        while (1) {
2495                if (!(run_all || run_most) &&
2496                    delayed_refs->num_heads_ready < 64)
2497                        break;
2498
2499                /*
2500                 * go find something we can process in the rbtree.  We start at
2501                 * the beginning of the tree, and then build a cluster
2502                 * of refs to process starting at the first one we are able to
2503                 * lock
2504                 */
2505                delayed_start = delayed_refs->run_delayed_start;
2506                ret = btrfs_find_ref_cluster(trans, &cluster,
2507                                             delayed_refs->run_delayed_start);
2508                if (ret)
2509                        break;
2510
2511                ret = run_clustered_refs(trans, root, &cluster);
2512                if (ret < 0) {
2513                        spin_unlock(&delayed_refs->lock);
2514                        btrfs_abort_transaction(trans, root, ret);
2515                        return ret;
2516                }
2517
2518                count -= min_t(unsigned long, ret, count);
2519
2520                if (count == 0)
2521                        break;
2522
2523                if (delayed_start >= delayed_refs->run_delayed_start) {
2524                        if (loops == 0) {
2525                                /*
2526                                 * btrfs_find_ref_cluster looped.  Let's do one
2527                                 * more cycle.  If we don't run any delayed refs
2528                                 * during that cycle (because all of them are
2529                                 * blocked), bail out.
2530                                 */
2531                                loops = 1;
2532                        } else {
2533                                /*
2534                                 * no runnable refs left, stop trying
2535                                 */
2536                                BUG_ON(run_all);
2537                                break;
2538                        }
2539                }
2540                if (ret) {
2541                        /* refs were run, let's reset staleness detection */
2542                        loops = 0;
2543                }
2544        }
2545
2546        if (run_all) {
2547                if (!list_empty(&trans->new_bgs)) {
2548                        spin_unlock(&delayed_refs->lock);
2549                        btrfs_create_pending_block_groups(trans, root);
2550                        spin_lock(&delayed_refs->lock);
2551                }
2552
2553                node = rb_first(&delayed_refs->root);
2554                if (!node)
2555                        goto out;
2556                count = (unsigned long)-1;
2557
2558                while (node) {
2559                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
2560                                       rb_node);
2561                        if (btrfs_delayed_ref_is_head(ref)) {
2562                                struct btrfs_delayed_ref_head *head;
2563
2564                                head = btrfs_delayed_node_to_head(ref);
2565                                atomic_inc(&ref->refs);
2566
2567                                spin_unlock(&delayed_refs->lock);
2568                                /*
2569                                 * Mutex was contended, block until it's
2570                                 * released and try again
2571                                 */
2572                                mutex_lock(&head->mutex);
2573                                mutex_unlock(&head->mutex);
2574
2575                                btrfs_put_delayed_ref(ref);
2576                                cond_resched();
2577                                goto again;
2578                        }
2579                        node = rb_next(node);
2580                }
2581                spin_unlock(&delayed_refs->lock);
2582                schedule_timeout(1);
2583                goto again;
2584        }
2585out:
2586        spin_unlock(&delayed_refs->lock);
2587        assert_qgroups_uptodate(trans);
2588        return 0;
2589}
2590
2591int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2592                                struct btrfs_root *root,
2593                                u64 bytenr, u64 num_bytes, u64 flags,
2594                                int is_data)
2595{
2596        struct btrfs_delayed_extent_op *extent_op;
2597        int ret;
2598
2599        extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2600        if (!extent_op)
2601                return -ENOMEM;
2602
2603        extent_op->flags_to_set = flags;
2604        extent_op->update_flags = 1;
2605        extent_op->update_key = 0;
2606        extent_op->is_data = is_data ? 1 : 0;
2607
2608        ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2609                                          num_bytes, extent_op);
2610        if (ret)
2611                kfree(extent_op);
2612        return ret;
2613}
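/*
 * The flag update is not applied to the extent item here; it rides along as
 * a delayed extent op on the ref head and is applied by
 * run_delayed_extent_op()/__run_delayed_extent_op() when the delayed refs
 * are run.
 */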
2614
2615static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2616                                      struct btrfs_root *root,
2617                                      struct btrfs_path *path,
2618                                      u64 objectid, u64 offset, u64 bytenr)
2619{
2620        struct btrfs_delayed_ref_head *head;
2621        struct btrfs_delayed_ref_node *ref;
2622        struct btrfs_delayed_data_ref *data_ref;
2623        struct btrfs_delayed_ref_root *delayed_refs;
2624        struct rb_node *node;
2625        int ret = 0;
2626
2627        ret = -ENOENT;
2628        delayed_refs = &trans->transaction->delayed_refs;
2629        spin_lock(&delayed_refs->lock);
2630        head = btrfs_find_delayed_ref_head(trans, bytenr);
2631        if (!head)
2632                goto out;
2633
2634        if (!mutex_trylock(&head->mutex)) {
2635                atomic_inc(&head->node.refs);
2636                spin_unlock(&delayed_refs->lock);
2637
2638                btrfs_release_path(path);
2639
2640                /*
2641                 * Mutex was contended, block until it's released and let
2642                 * caller try again
2643                 */
2644                mutex_lock(&head->mutex);
2645                mutex_unlock(&head->mutex);
2646                btrfs_put_delayed_ref(&head->node);
2647                return -EAGAIN;
2648        }
2649
2650        node = rb_prev(&head->node.rb_node);
2651        if (!node)
2652                goto out_unlock;
2653
2654        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2655
2656        if (ref->bytenr != bytenr)
2657                goto out_unlock;
2658
2659        ret = 1;
2660        if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2661                goto out_unlock;
2662
2663        data_ref = btrfs_delayed_node_to_data_ref(ref);
2664
2665        node = rb_prev(node);
2666        if (node) {
2667                int seq = ref->seq;
2668
2669                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2670                if (ref->bytenr == bytenr && ref->seq == seq)
2671                        goto out_unlock;
2672        }
2673
2674        if (data_ref->root != root->root_key.objectid ||
2675            data_ref->objectid != objectid || data_ref->offset != offset)
2676                goto out_unlock;
2677
2678        ret = 0;
2679out_unlock:
2680        mutex_unlock(&head->mutex);
2681out:
2682        spin_unlock(&delayed_refs->lock);
2683        return ret;
2684}
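/*
 * Return values of check_delayed_ref() above: -ENOENT when there are no
 * pending delayed refs for the extent, -EAGAIN when the head mutex was
 * contended and the caller should retry, 0 when the pending ref it examines
 * is the caller's own data ref for (root, objectid, offset), and 1 when it
 * finds evidence of some other reference.
 */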
2685
2686static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2687                                        struct btrfs_root *root,
2688                                        struct btrfs_path *path,
2689                                        u64 objectid, u64 offset, u64 bytenr)
2690{
2691        struct btrfs_root *extent_root = root->fs_info->extent_root;
2692        struct extent_buffer *leaf;
2693        struct btrfs_extent_data_ref *ref;
2694        struct btrfs_extent_inline_ref *iref;
2695        struct btrfs_extent_item *ei;
2696        struct btrfs_key key;
2697        u32 item_size;
2698        int ret;
2699
2700        key.objectid = bytenr;
2701        key.offset = (u64)-1;
2702        key.type = BTRFS_EXTENT_ITEM_KEY;
2703
2704        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2705        if (ret < 0)
2706                goto out;
2707        BUG_ON(ret == 0); /* Corruption */
2708
2709        ret = -ENOENT;
2710        if (path->slots[0] == 0)
2711                goto out;
2712
2713        path->slots[0]--;
2714        leaf = path->nodes[0];
2715        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2716
2717        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2718                goto out;
2719
2720        ret = 1;
2721        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2722#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2723        if (item_size < sizeof(*ei)) {
2724                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2725                goto out;
2726        }
2727#endif
2728        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2729
2730        if (item_size != sizeof(*ei) +
2731            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2732                goto out;
2733
2734        if (btrfs_extent_generation(leaf, ei) <=
2735            btrfs_root_last_snapshot(&root->root_item))
2736                goto out;
2737
2738        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2739        if (btrfs_extent_inline_ref_type(leaf, iref) !=
2740            BTRFS_EXTENT_DATA_REF_KEY)
2741                goto out;
2742
2743        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2744        if (btrfs_extent_refs(leaf, ei) !=
2745            btrfs_extent_data_ref_count(leaf, ref) ||
2746            btrfs_extent_data_ref_root(leaf, ref) !=
2747            root->root_key.objectid ||
2748            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2749            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2750                goto out;
2751
2752        ret = 0;
2753out:
2754        return ret;
2755}
2756
2757int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2758                          struct btrfs_root *root,
2759                          u64 objectid, u64 offset, u64 bytenr)
2760{
2761        struct btrfs_path *path;
2762        int ret;
2763        int ret2;
2764
2765        path = btrfs_alloc_path();
2766        if (!path)
2767                return -ENOENT;
2768
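            /*
             * Check the extent tree first, then the delayed refs.  The
             * delayed ref check returns -EAGAIN if the ref head mutex was
             * contended, in which case both checks are retried.  0 is
             * returned only when neither check finds evidence that anyone
             * else references this extent.
             */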
2769        do {
2770                ret = check_committed_ref(trans, root, path, objectid,
2771                                          offset, bytenr);
2772                if (ret && ret != -ENOENT)
2773                        goto out;
2774
2775                ret2 = check_delayed_ref(trans, root, path, objectid,
2776                                         offset, bytenr);
2777        } while (ret2 == -EAGAIN);
2778
2779        if (ret2 && ret2 != -ENOENT) {
2780                ret = ret2;
2781                goto out;
2782        }
2783
2784        if (ret != -ENOENT || ret2 != -ENOENT)
2785                ret = 0;
2786out:
2787        btrfs_free_path(path);
2788        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2789                WARN_ON(ret > 0);
2790        return ret;
2791}
2792
2793static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2794                           struct btrfs_root *root,
2795                           struct extent_buffer *buf,
2796                           int full_backref, int inc, int for_cow)
2797{
2798        u64 bytenr;
2799        u64 num_bytes;
2800        u64 parent;
2801        u64 ref_root;
2802        u32 nritems;
2803        struct btrfs_key key;
2804        struct btrfs_file_extent_item *fi;
2805        int i;
2806        int level;
2807        int ret = 0;
2808        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2809                            u64, u64, u64, u64, u64, u64, int);
2810
2811        ref_root = btrfs_header_owner(buf);
2812        nritems = btrfs_header_nritems(buf);
2813        level = btrfs_header_level(buf);
2814
2815        if (!root->ref_cows && level == 0)
2816                return 0;
2817
2818        if (inc)
2819                process_func = btrfs_inc_extent_ref;
2820        else
2821                process_func = btrfs_free_extent;
2822
2823        if (full_backref)
2824                parent = buf->start;
2825        else
2826                parent = 0;
2827
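            /*
             * Walk every slot in the buffer: in a leaf, each regular file
             * extent adds/drops a data extent ref keyed by the original
             * file offset; in a node, each pointer adds/drops a ref on the
             * child block at level - 1.  With full_backref the refs are
             * recorded against this block (parent == buf->start) instead
             * of against the owning root.
             */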
2828        for (i = 0; i < nritems; i++) {
2829                if (level == 0) {
2830                        btrfs_item_key_to_cpu(buf, &key, i);
2831                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2832                                continue;
2833                        fi = btrfs_item_ptr(buf, i,
2834                                            struct btrfs_file_extent_item);
2835                        if (btrfs_file_extent_type(buf, fi) ==
2836                            BTRFS_FILE_EXTENT_INLINE)
2837                                continue;
2838                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2839                        if (bytenr == 0)
2840                                continue;
2841
2842                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2843                        key.offset -= btrfs_file_extent_offset(buf, fi);
2844                        ret = process_func(trans, root, bytenr, num_bytes,
2845                                           parent, ref_root, key.objectid,
2846                                           key.offset, for_cow);
2847                        if (ret)
2848                                goto fail;
2849                } else {
2850                        bytenr = btrfs_node_blockptr(buf, i);
2851                        num_bytes = btrfs_level_size(root, level - 1);
2852                        ret = process_func(trans, root, bytenr, num_bytes,
2853                                           parent, ref_root, level - 1, 0,
2854                                           for_cow);
2855                        if (ret)
2856                                goto fail;
2857                }
2858        }
2859        return 0;
2860fail:
2861        return ret;
2862}
2863
2864int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2865                  struct extent_buffer *buf, int full_backref, int for_cow)
2866{
2867        return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2868}
2869
2870int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2871                  struct extent_buffer *buf, int full_backref, int for_cow)
2872{
2873        return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2874}
2875
2876static int write_one_cache_group(struct btrfs_trans_handle *trans,
2877                                 struct btrfs_root *root,
2878                                 struct btrfs_path *path,
2879                                 struct btrfs_block_group_cache *cache)
2880{
2881        int ret;
2882        struct btrfs_root *extent_root = root->fs_info->extent_root;
2883        unsigned long bi;
2884        struct extent_buffer *leaf;
2885
2886        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2887        if (ret < 0)
2888                goto fail;
2889        BUG_ON(ret); /* Corruption */
2890
2891        leaf = path->nodes[0];
2892        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2893        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2894        btrfs_mark_buffer_dirty(leaf);
2895        btrfs_release_path(path);
2896fail:
2897        if (ret) {
2898                btrfs_abort_transaction(trans, root, ret);
2899                return ret;
2900        }
2901        return 0;
2902
2903}
2904
2905static struct btrfs_block_group_cache *
2906next_block_group(struct btrfs_root *root,
2907                 struct btrfs_block_group_cache *cache)
2908{
2909        struct rb_node *node;
2910        spin_lock(&root->fs_info->block_group_cache_lock);
2911        node = rb_next(&cache->cache_node);
2912        btrfs_put_block_group(cache);
2913        if (node) {
2914                cache = rb_entry(node, struct btrfs_block_group_cache,
2915                                 cache_node);
2916                btrfs_get_block_group(cache);
2917        } else
2918                cache = NULL;
2919        spin_unlock(&root->fs_info->block_group_cache_lock);
2920        return cache;
2921}
2922
2923static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2924                            struct btrfs_trans_handle *trans,
2925                            struct btrfs_path *path)
2926{
2927        struct btrfs_root *root = block_group->fs_info->tree_root;
2928        struct inode *inode = NULL;
2929        u64 alloc_hint = 0;
2930        int dcs = BTRFS_DC_ERROR;
2931        int num_pages = 0;
2932        int retries = 0;
2933        int ret = 0;
2934
2935        /*
2936         * If this block group is smaller than 100 megs don't bother caching the
2937         * block group.
2938         */
2939        if (block_group->key.offset < (100 * 1024 * 1024)) {
2940                spin_lock(&block_group->lock);
2941                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2942                spin_unlock(&block_group->lock);
2943                return 0;
2944        }
2945
2946again:
2947        inode = lookup_free_space_inode(root, block_group, path);
2948        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2949                ret = PTR_ERR(inode);
2950                btrfs_release_path(path);
2951                goto out;
2952        }
2953
2954        if (IS_ERR(inode)) {
2955                BUG_ON(retries);
2956                retries++;
2957
2958                if (block_group->ro)
2959                        goto out_free;
2960
2961                ret = create_free_space_inode(root, trans, block_group, path);
2962                if (ret)
2963                        goto out_free;
2964                goto again;
2965        }
2966
2967        /* We've already set up this transaction, go ahead and exit */
2968        if (block_group->cache_generation == trans->transid &&
2969            i_size_read(inode)) {
2970                dcs = BTRFS_DC_SETUP;
2971                goto out_put;
2972        }
2973
2974        /*
2975         * We want to set the generation to 0, that way if anything goes wrong
2976         * from here on out we know not to trust this cache when we load up next
2977         * time.
2978         */
2979        BTRFS_I(inode)->generation = 0;
2980        ret = btrfs_update_inode(trans, root, inode);
2981        WARN_ON(ret);
2982
2983        if (i_size_read(inode) > 0) {
2984                ret = btrfs_truncate_free_space_cache(root, trans, path,
2985                                                      inode);
2986                if (ret)
2987                        goto out_put;
2988        }
2989
2990        spin_lock(&block_group->lock);
2991        if (block_group->cached != BTRFS_CACHE_FINISHED ||
2992            !btrfs_test_opt(root, SPACE_CACHE)) {
2993                /*
2994                 * don't bother trying to write stuff out if
2995                 * a) we're not cached, or
2996                 * b) the nospace_cache mount option is set.
2997                 */
2998                dcs = BTRFS_DC_WRITTEN;
2999                spin_unlock(&block_group->lock);
3000                goto out_put;
3001        }
3002        spin_unlock(&block_group->lock);
3003
3004        /*
3005         * Try to preallocate enough space based on how big the block group is.
3006         * Keep in mind this has to include any pinned space which could end up
3007         * taking up quite a bit since it's not folded into the other space
3008         * cache.
3009         */
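            /*
             * e.g. a 1GB block group: div64_u64(1GB, 256MB) = 4, times 16
             * gives 64 pages, roughly 256K of prealloc assuming 4K pages.
             */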
3010        num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3011        if (!num_pages)
3012                num_pages = 1;
3013
3014        num_pages *= 16;
3015        num_pages *= PAGE_CACHE_SIZE;
3016
3017        ret = btrfs_check_data_free_space(inode, num_pages);
3018        if (ret)
3019                goto out_put;
3020
3021        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3022                                              num_pages, num_pages,
3023                                              &alloc_hint);
3024        if (!ret)
3025                dcs = BTRFS_DC_SETUP;
3026        btrfs_free_reserved_data_space(inode, num_pages);
3027
3028out_put:
3029        iput(inode);
3030out_free:
3031        btrfs_release_path(path);
3032out:
3033        spin_lock(&block_group->lock);
3034        if (!ret && dcs == BTRFS_DC_SETUP)
3035                block_group->cache_generation = trans->transid;
3036        block_group->disk_cache_state = dcs;
3037        spin_unlock(&block_group->lock);
3038
3039        return ret;
3040}
3041
3042int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3043                                   struct btrfs_root *root)
3044{
3045        struct btrfs_block_group_cache *cache;
3046        int err = 0;
3047        struct btrfs_path *path;
3048        u64 last = 0;
3049
3050        path = btrfs_alloc_path();
3051        if (!path)
3052                return -ENOMEM;
3053
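            /*
             * Three passes: set up the free space cache inode for every
             * block group still marked BTRFS_DC_CLEAR, then write out every
             * dirty block group item (flipping SETUP caches to NEED_WRITE),
             * and finally write out the free space cache itself for every
             * NEED_WRITE block group.
             */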
3054again:
3055        while (1) {
3056                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3057                while (cache) {
3058                        if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3059                                break;
3060                        cache = next_block_group(root, cache);
3061                }
3062                if (!cache) {
3063                        if (last == 0)
3064                                break;
3065                        last = 0;
3066                        continue;
3067                }
3068                err = cache_save_setup(cache, trans, path);
3069                last = cache->key.objectid + cache->key.offset;
3070                btrfs_put_block_group(cache);
3071        }
3072
3073        while (1) {
3074                if (last == 0) {
3075                        err = btrfs_run_delayed_refs(trans, root,
3076                                                     (unsigned long)-1);
3077                        if (err) /* File system offline */
3078                                goto out;
3079                }
3080
3081                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3082                while (cache) {
3083                        if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3084                                btrfs_put_block_group(cache);
3085                                goto again;
3086                        }
3087
3088                        if (cache->dirty)
3089                                break;
3090                        cache = next_block_group(root, cache);
3091                }
3092                if (!cache) {
3093                        if (last == 0)
3094                                break;
3095                        last = 0;
3096                        continue;
3097                }
3098
3099                if (cache->disk_cache_state == BTRFS_DC_SETUP)
3100                        cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3101                cache->dirty = 0;
3102                last = cache->key.objectid + cache->key.offset;
3103
3104                err = write_one_cache_group(trans, root, path, cache);
3105                if (err) /* File system offline */
3106                        goto out;
3107
3108                btrfs_put_block_group(cache);
3109        }
3110
3111        while (1) {
3112                /*
3113                 * I don't think this is needed since we're just marking our
3114                 * preallocated extent as written, but just in case, it can't
3115                 * hurt.
3116                 */
3117                if (last == 0) {
3118                        err = btrfs_run_delayed_refs(trans, root,
3119                                                     (unsigned long)-1);
3120                        if (err) /* File system offline */
3121                                goto out;
3122                }
3123
3124                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3125                while (cache) {
3126                        /*
3127                         * Really this shouldn't happen, but it could if we
3128                         * couldn't write the entire preallocated extent and
3129                         * splitting the extent resulted in a new block.
3130                         */
3131                        if (cache->dirty) {
3132                                btrfs_put_block_group(cache);
3133                                goto again;
3134                        }
3135                        if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3136                                break;
3137                        cache = next_block_group(root, cache);
3138                }
3139                if (!cache) {
3140                        if (last == 0)
3141                                break;
3142                        last = 0;
3143                        continue;
3144                }
3145
3146                err = btrfs_write_out_cache(root, trans, cache, path);
3147
3148                /*
3149                 * If we didn't have an error then the cache state is still
3150                 * NEED_WRITE, so we can set it to WRITTEN.
3151                 */
3152                if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3153                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
3154                last = cache->key.objectid + cache->key.offset;
3155                btrfs_put_block_group(cache);
3156        }
3157out:
3158
3159        btrfs_free_path(path);
3160        return err;
3161}
3162
3163int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3164{
3165        struct btrfs_block_group_cache *block_group;
3166        int readonly = 0;
3167
3168        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3169        if (!block_group || block_group->ro)
3170                readonly = 1;
3171        if (block_group)
3172                btrfs_put_block_group(block_group);
3173        return readonly;
3174}
3175
3176static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3177                             u64 total_bytes, u64 bytes_used,
3178                             struct btrfs_space_info **space_info)
3179{
3180        struct btrfs_space_info *found;
3181        int i;
3182        int factor;
3183
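            /*
             * 'factor' converts logical bytes into raw disk bytes: the
             * mirrored profiles (DUP/RAID1/RAID10) store two copies, so
             * disk_total and disk_used are doubled for them.
             */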
3184        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3185                     BTRFS_BLOCK_GROUP_RAID10))
3186                factor = 2;
3187        else
3188                factor = 1;
3189
3190        found = __find_space_info(info, flags);
3191        if (found) {
3192                spin_lock(&found->lock);
3193                found->total_bytes += total_bytes;
3194                found->disk_total += total_bytes * factor;
3195                found->bytes_used += bytes_used;
3196                found->disk_used += bytes_used * factor;
3197                found->full = 0;
3198                spin_unlock(&found->lock);
3199                *space_info = found;
3200                return 0;
3201        }
3202        found = kzalloc(sizeof(*found), GFP_NOFS);
3203        if (!found)
3204                return -ENOMEM;
3205
3206        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3207                INIT_LIST_HEAD(&found->block_groups[i]);
3208        init_rwsem(&found->groups_sem);
3209        spin_lock_init(&found->lock);
3210        found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3211        found->total_bytes = total_bytes;
3212        found->disk_total = total_bytes * factor;
3213        found->bytes_used = bytes_used;
3214        found->disk_used = bytes_used * factor;
3215        found->bytes_pinned = 0;
3216        found->bytes_reserved = 0;
3217        found->bytes_readonly = 0;
3218        found->bytes_may_use = 0;
3219        found->full = 0;
3220        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3221        found->chunk_alloc = 0;
3222        found->flush = 0;
3223        init_waitqueue_head(&found->wait);
3224        *space_info = found;
3225        list_add_rcu(&found->list, &info->space_info);
3226        if (flags & BTRFS_BLOCK_GROUP_DATA)
3227                info->data_sinfo = found;
3228        return 0;
3229}
3230
3231static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3232{
3233        u64 extra_flags = chunk_to_extended(flags) &
3234                                BTRFS_EXTENDED_PROFILE_MASK;
3235
3236        if (flags & BTRFS_BLOCK_GROUP_DATA)
3237                fs_info->avail_data_alloc_bits |= extra_flags;
3238        if (flags & BTRFS_BLOCK_GROUP_METADATA)
3239                fs_info->avail_metadata_alloc_bits |= extra_flags;
3240        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3241                fs_info->avail_system_alloc_bits |= extra_flags;
3242}
3243
3244/*
3245 * returns target flags in extended format or 0 if restripe for this
3246 * chunk_type is not in progress
3247 *
3248 * should be called with either volume_mutex or balance_lock held
3249 */
3250static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3251{
3252        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3253        u64 target = 0;
3254
3255        if (!bctl)
3256                return 0;
3257
3258        if (flags & BTRFS_BLOCK_GROUP_DATA &&
3259            bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3260                target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3261        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3262                   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3263                target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3264        } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3265                   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3266                target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3267        }
3268
3269        return target;
3270}
3271
3272/*
3273 * @flags: available profiles in extended format (see ctree.h)
3274 *
3275 * Returns reduced profile in chunk format.  If profile changing is in
3276 * progress (either running or paused) picks the target profile (if it's
3277 * already available), otherwise falls back to plain reducing.
3278 */
3279u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3280{
3281        /*
3282         * we add in the count of missing devices because we want
3283         * to make sure that any RAID levels on a degraded FS
3284         * continue to be honored.
3285         */
3286        u64 num_devices = root->fs_info->fs_devices->rw_devices +
3287                root->fs_info->fs_devices->missing_devices;
3288        u64 target;
3289
3290        /*
3291         * see if restripe for this chunk_type is in progress, if so
3292         * try to reduce to the target profile
3293         */
3294        spin_lock(&root->fs_info->balance_lock);
3295        target = get_restripe_target(root->fs_info, flags);
3296        if (target) {
3297                /* pick target profile only if it's already available */
3298                if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3299                        spin_unlock(&root->fs_info->balance_lock);
3300                        return extended_to_chunk(target);
3301                }
3302        }
3303        spin_unlock(&root->fs_info->balance_lock);
3304
3305        if (num_devices == 1)
3306                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3307        if (num_devices < 4)
3308                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3309
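            /*
             * Reduce the remaining bits to a single profile, preferring
             * RAID10 over RAID1 over DUP over RAID0 over single.
             */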
3310        if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3311            (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3312                      BTRFS_BLOCK_GROUP_RAID10))) {
3313                flags &= ~BTRFS_BLOCK_GROUP_DUP;
3314        }
3315
3316        if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3317            (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3318                flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3319        }
3320
3321        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3322            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3323             (flags & BTRFS_BLOCK_GROUP_RAID10) |
3324             (flags & BTRFS_BLOCK_GROUP_DUP))) {
3325                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3326        }
3327
3328        return extended_to_chunk(flags);
3329}
3330
3331static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3332{
3333        if (flags & BTRFS_BLOCK_GROUP_DATA)
3334                flags |= root->fs_info->avail_data_alloc_bits;
3335        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3336                flags |= root->fs_info->avail_system_alloc_bits;
3337        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3338                flags |= root->fs_info->avail_metadata_alloc_bits;
3339
3340        return btrfs_reduce_alloc_profile(root, flags);
3341}
3342
3343u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3344{
3345        u64 flags;
3346
3347        if (data)
3348                flags = BTRFS_BLOCK_GROUP_DATA;
3349        else if (root == root->fs_info->chunk_root)
3350                flags = BTRFS_BLOCK_GROUP_SYSTEM;
3351        else
3352                flags = BTRFS_BLOCK_GROUP_METADATA;
3353
3354        return get_alloc_profile(root, flags);
3355}
3356
3357/*
3358 * This will check the space that the inode allocates from to make sure we have
3359 * enough space for bytes.
3360 */
3361int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3362{
3363        struct btrfs_space_info *data_sinfo;
3364        struct btrfs_root *root = BTRFS_I(inode)->root;
3365        struct btrfs_fs_info *fs_info = root->fs_info;
3366        u64 used;
3367        int ret = 0, committed = 0, alloc_chunk = 1;
3368
3369        /* make sure bytes are sectorsize aligned */
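            /* e.g. 5000 bytes with a 4096 byte sectorsize rounds up to 8192 */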
3370        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3371
3372        if (root == root->fs_info->tree_root ||
3373            BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3374                alloc_chunk = 0;
3375                committed = 1;
3376        }
3377
3378        data_sinfo = fs_info->data_sinfo;
3379        if (!data_sinfo)
3380                goto alloc;
3381
3382again:
3383        /* make sure we have enough space to handle the data first */
3384        spin_lock(&data_sinfo->lock);
3385        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3386                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3387                data_sinfo->bytes_may_use;
3388
3389        if (used + bytes > data_sinfo->total_bytes) {
3390                struct btrfs_trans_handle *trans;
3391
3392                /*
3393                 * if we don't have enough free bytes in this space then we need
3394                 * to alloc a new chunk.
3395                 */
3396                if (!data_sinfo->full && alloc_chunk) {
3397                        u64 alloc_target;
3398
3399                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3400                        spin_unlock(&data_sinfo->lock);
3401alloc:
3402                        alloc_target = btrfs_get_alloc_profile(root, 1);
3403                        trans = btrfs_join_transaction(root);
3404                        if (IS_ERR(trans))
3405                                return PTR_ERR(trans);
3406
3407                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3408                                             alloc_target,
3409                                             CHUNK_ALLOC_NO_FORCE);
3410                        btrfs_end_transaction(trans, root);
3411                        if (ret < 0) {
3412                                if (ret != -ENOSPC)
3413                                        return ret;
3414                                else
3415                                        goto commit_trans;
3416                        }
3417
3418                        if (!data_sinfo)
3419                                data_sinfo = fs_info->data_sinfo;
3420
3421                        goto again;
3422                }
3423
3424                /*
3425                 * If we have less pinned bytes than we want to allocate then
3426                 * don't bother committing the transaction, it won't help us.
3427                 */
3428                if (data_sinfo->bytes_pinned < bytes)
3429                        committed = 1;
3430                spin_unlock(&data_sinfo->lock);
3431
3432                /* commit the current transaction and try again */
3433commit_trans:
3434                if (!committed &&
3435                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3436                        committed = 1;
3437                        trans = btrfs_join_transaction(root);
3438                        if (IS_ERR(trans))
3439                                return PTR_ERR(trans);
3440                        ret = btrfs_commit_transaction(trans, root);
3441                        if (ret)
3442                                return ret;
3443                        goto again;
3444                }
3445
3446                return -ENOSPC;
3447        }
3448        data_sinfo->bytes_may_use += bytes;
3449        trace_btrfs_space_reservation(root->fs_info, "space_info",
3450                                      data_sinfo->flags, bytes, 1);
3451        spin_unlock(&data_sinfo->lock);
3452
3453        return 0;
3454}
3455
3456/*
3457 * Called if we need to clear a data reservation for this inode.
3458 */
3459void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3460{
3461        struct btrfs_root *root = BTRFS_I(inode)->root;
3462        struct btrfs_space_info *data_sinfo;
3463
3464        /* make sure bytes are sectorsize aligned */
3465        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3466
3467        data_sinfo = root->fs_info->data_sinfo;
3468        spin_lock(&data_sinfo->lock);
3469        data_sinfo->bytes_may_use -= bytes;
3470        trace_btrfs_space_reservation(root->fs_info, "space_info",
3471                                      data_sinfo->flags, bytes, 0);
3472        spin_unlock(&data_sinfo->lock);
3473}
3474
3475static void force_metadata_allocation(struct btrfs_fs_info *info)
3476{
3477        struct list_head *head = &info->space_info;
3478        struct btrfs_space_info *found;
3479
3480        rcu_read_lock();
3481        list_for_each_entry_rcu(found, head, list) {
3482                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3483                        found->force_alloc = CHUNK_ALLOC_FORCE;
3484        }
3485        rcu_read_unlock();
3486}
3487
3488static int should_alloc_chunk(struct btrfs_root *root,
3489                              struct btrfs_space_info *sinfo, int force)
3490{
3491        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3492        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3493        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3494        u64 thresh;
3495
3496        if (force == CHUNK_ALLOC_FORCE)
3497                return 1;
3498
3499        /*
3500         * We need to take into account the global rsv because for all intents
3501         * and purposes it's used space.  Don't worry about locking the
3502         * global_rsv, it doesn't change except when the transaction commits.
3503         */
3504        if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3505                num_allocated += global_rsv->size;
3506
3507        /*
3508         * in limited mode, we want to have some free space up to
3509         * about 1% of the FS size.
3510         */
3511        if (force == CHUNK_ALLOC_LIMITED) {
3512                thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3513                thresh = max_t(u64, 64 * 1024 * 1024,
3514                               div_factor_fine(thresh, 1));
3515
3516                if (num_bytes - num_allocated < thresh)
3517                        return 1;
3518        }
3519
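            /*
             * Otherwise only allocate once the allocated bytes (plus a 2MB
             * fudge) reach 80% of the writable space in this space_info.
             */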
3520        if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3521                return 0;
3522        return 1;
3523}
3524
3525static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3526{
3527        u64 num_dev;
3528
3529        if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3530            type & BTRFS_BLOCK_GROUP_RAID0)
3531                num_dev = root->fs_info->fs_devices->rw_devices;
3532        else if (type & BTRFS_BLOCK_GROUP_RAID1)
3533                num_dev = 2;
3534        else
3535                num_dev = 1;    /* DUP or single */
3536
3537        /* metadata for updating devices and chunk tree */
3538        return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3539}
3540
3541static void check_system_chunk(struct btrfs_trans_handle *trans,
3542                               struct btrfs_root *root, u64 type)
3543{
3544        struct btrfs_space_info *info;
3545        u64 left;
3546        u64 thresh;
3547
3548        info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3549        spin_lock(&info->lock);
3550        left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3551                info->bytes_reserved - info->bytes_readonly;
3552        spin_unlock(&info->lock);
3553
3554        thresh = get_system_chunk_thresh(root, type);
3555        if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3556                printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3557                       left, thresh, type);
3558                dump_space_info(info, 0, 0);
3559        }
3560
3561        if (left < thresh) {
3562                u64 flags;
3563
3564                flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3565                btrfs_alloc_chunk(trans, root, flags);
3566        }
3567}
3568
3569static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3570                          struct btrfs_root *extent_root, u64 flags, int force)
3571{
3572        struct btrfs_space_info *space_info;
3573        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3574        int wait_for_alloc = 0;
3575        int ret = 0;
3576
3577        space_info = __find_space_info(extent_root->fs_info, flags);
3578        if (!space_info) {
3579                ret = update_space_info(extent_root->fs_info, flags,
3580                                        0, 0, &space_info);
3581                BUG_ON(ret); /* -ENOMEM */
3582        }
3583        BUG_ON(!space_info); /* Logic error */
3584
3585again:
3586        spin_lock(&space_info->lock);
3587        if (force < space_info->force_alloc)
3588                force = space_info->force_alloc;
3589        if (space_info->full) {
3590                spin_unlock(&space_info->lock);
3591                return 0;
3592        }
3593
3594        if (!should_alloc_chunk(extent_root, space_info, force)) {
3595                spin_unlock(&space_info->lock);
3596                return 0;
3597        } else if (space_info->chunk_alloc) {
3598                wait_for_alloc = 1;
3599        } else {
3600                space_info->chunk_alloc = 1;
3601        }
3602
3603        spin_unlock(&space_info->lock);
3604
3605        mutex_lock(&fs_info->chunk_mutex);
3606
3607        /*
3608         * The chunk_mutex is held throughout the entirety of a chunk
3609         * allocation, so once we've acquired the chunk_mutex we know that the
3610         * other guy is done and we need to recheck and see if we should
3611         * allocate.
3612         */
3613        if (wait_for_alloc) {
3614                mutex_unlock(&fs_info->chunk_mutex);
3615                wait_for_alloc = 0;
3616                goto again;
3617        }
3618
3619        /*
3620         * If we have mixed data/metadata chunks we want to make sure we keep
3621         * allocating mixed chunks instead of individual chunks.
3622         */
3623        if (btrfs_mixed_space_info(space_info))
3624                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3625
3626        /*
3627         * if we're doing a data chunk, go ahead and make sure that
3628         * we keep a reasonable number of metadata chunks allocated in the
3629         * FS as well.
3630         */
3631        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3632                fs_info->data_chunk_allocations++;
3633                if (!(fs_info->data_chunk_allocations %
3634                      fs_info->metadata_ratio))
3635                        force_metadata_allocation(fs_info);
3636        }
3637
3638        /*
3639         * Check if we have enough space in SYSTEM chunk because we may need
3640         * to update devices.
3641         */
3642        check_system_chunk(trans, extent_root, flags);
3643
3644        ret = btrfs_alloc_chunk(trans, extent_root, flags);
3645        if (ret < 0 && ret != -ENOSPC)
3646                goto out;
3647
3648        spin_lock(&space_info->lock);
3649        if (ret)
3650                space_info->full = 1;
3651        else
3652                ret = 1;
3653
3654        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3655        space_info->chunk_alloc = 0;
3656        spin_unlock(&space_info->lock);
3657out:
3658        mutex_unlock(&fs_info->chunk_mutex);
3659        return ret;
3660}
3661
3662static int can_overcommit(struct btrfs_root *root,
3663                          struct btrfs_space_info *space_info, u64 bytes,
3664                          int flush)
3665{
3666        u64 profile = btrfs_get_alloc_profile(root, 0);
3667        u64 avail;
3668        u64 used;
3669
3670        used = space_info->bytes_used + space_info->bytes_reserved +
3671                space_info->bytes_pinned + space_info->bytes_readonly +
3672                space_info->bytes_may_use;
3673
3674        spin_lock(&root->fs_info->free_chunk_lock);
3675        avail = root->fs_info->free_chunk_space;
3676        spin_unlock(&root->fs_info->free_chunk_lock);
3677
3678        /*
3679         * If we have dup, raid1 or raid10 then only half of the free
3680         * space is actually usable.
3681         */
3682        if (profile & (BTRFS_BLOCK_GROUP_DUP |
3683                       BTRFS_BLOCK_GROUP_RAID1 |
3684                       BTRFS_BLOCK_GROUP_RAID10))
3685                avail >>= 1;
3686
3687        /*
3688         * Allow overcommitting part of the unallocated space: 1/8th of it
3689         * if we are allowed to flush to reclaim space, 1/2 of it
3690         * otherwise.
3691         */
3692        if (flush)
3693                avail >>= 3;
3694        else
3695                avail >>= 1;
3696
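            /*
             * e.g. 8GB of unallocated chunk space on a RAID1 profile leaves
             * avail at 4GB, so the headroom is 512MB when flushing is
             * allowed and 2GB when it is not.
             */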
3697        if (used + bytes < space_info->total_bytes + avail)
3698                return 1;
3699        return 0;
3700}
3701
3702/*
3703 * shrink metadata reservation for delalloc
3704 */
3705static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3706                            bool wait_ordered)
3707{
3708        struct btrfs_block_rsv *block_rsv;
3709        struct btrfs_space_info *space_info;
3710        struct btrfs_trans_handle *trans;
3711        u64 delalloc_bytes;
3712        u64 max_reclaim;
3713        long time_left;
3714        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715        int loops = 0;
3716
3717        trans = (struct btrfs_trans_handle *)current->journal_info;
3718        block_rsv = &root->fs_info->delalloc_block_rsv;
3719        space_info = block_rsv->space_info;
3720
3721        smp_mb();
3722        delalloc_bytes = root->fs_info->delalloc_bytes;
3723        if (delalloc_bytes == 0) {
3724                if (trans)
3725                        return;
3726                btrfs_wait_ordered_extents(root, 0);
3727                return;
3728        }
3729
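            /*
             * Up to three passes: kick writeback on enough pages to cover
             * the reclaim target, wait for the async delalloc submissions
             * to drain, and stop early once the original reservation would
             * fit via overcommit; otherwise wait on ordered extents (or
             * sleep briefly) and try again.
             */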
3730        while (delalloc_bytes && loops < 3) {
3731                max_reclaim = min(delalloc_bytes, to_reclaim);
3732                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3734                                               WB_REASON_FS_FREE_SPACE);
3735
3736                /*
3737                 * We need to wait for the async pages to actually start before
3738                 * we do anything.
3739                 */
3740                wait_event(root->fs_info->async_submit_wait,
3741                           !atomic_read(&root->fs_info->async_delalloc_pages));
3742
3743                spin_lock(&space_info->lock);
3744                if (can_overcommit(root, space_info, orig, !trans)) {
3745                        spin_unlock(&space_info->lock);
3746                        break;
3747                }
3748                spin_unlock(&space_info->lock);
3749
3750                loops++;
3751                if (wait_ordered && !trans) {
3752                        btrfs_wait_ordered_extents(root, 0);
3753                } else {
3754                        time_left = schedule_timeout_killable(1);
3755                        if (time_left)
3756                                break;
3757                }
3758                smp_mb();
3759                delalloc_bytes = root->fs_info->delalloc_bytes;
3760        }
3761}
3762
3763/**
3764 * may_commit_transaction - possibly commit the transaction if it's ok to
3765 * @root - the root we're allocating for
3766 * @bytes - the number of bytes we want to reserve
3767 * @force - force the commit
3768 *
3769 * This will check to make sure that committing the transaction will actually
3770 * get us somewhere and then commit the transaction if it does.  Otherwise it
3771 * will return -ENOSPC.
3772 */
3773static int may_commit_transaction(struct btrfs_root *root,
3774                                  struct btrfs_space_info *space_info,
3775                                  u64 bytes, int force)
3776{
3777        struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3778        struct btrfs_trans_handle *trans;
3779
3780        trans = (struct btrfs_trans_handle *)current->journal_info;
3781        if (trans)
3782                return -EAGAIN;
3783
3784        if (force)
3785                goto commit;
3786
3787        /* See if there is enough pinned space to make this reservation */
3788        spin_lock(&space_info->lock);
3789        if (space_info->bytes_pinned >= bytes) {
3790                spin_unlock(&space_info->lock);
3791                goto commit;
3792        }
3793        spin_unlock(&space_info->lock);
3794
3795        /*
3796         * See if there is some space in the delayed insertion reservation for
3797         * this reservation.
3798         */
3799        if (space_info != delayed_rsv->space_info)
3800                return -ENOSPC;
3801
3802        spin_lock(&space_info->lock);
3803        spin_lock(&delayed_rsv->lock);
3804        if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3805                spin_unlock(&delayed_rsv->lock);
3806                spin_unlock(&space_info->lock);
3807                return -ENOSPC;
3808        }
3809        spin_unlock(&delayed_rsv->lock);
3810        spin_unlock(&space_info->lock);
3811
3812commit:
3813        trans = btrfs_join_transaction(root);
3814        if (IS_ERR(trans))
3815                return -ENOSPC;
3816
3817        return btrfs_commit_transaction(trans, root);
3818}
3819
3820enum flush_state {
3821        FLUSH_DELAYED_ITEMS_NR  =       1,
3822        FLUSH_DELAYED_ITEMS     =       2,
3823        FLUSH_DELALLOC          =       3,
3824        FLUSH_DELALLOC_WAIT     =       4,
3825        ALLOC_CHUNK             =       5,
3826        COMMIT_TRANS            =       6,
3827};
3828
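    /*
     * The states above are tried in order, cheapest first;
     * reserve_metadata_bytes() advances one state per failed attempt until
     * the reservation succeeds or COMMIT_TRANS has been tried.
     */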
3829static int flush_space(struct btrfs_root *root,
3830                       struct btrfs_space_info *space_info, u64 num_bytes,
3831                       u64 orig_bytes, int state)
3832{
3833        struct btrfs_trans_handle *trans;
3834        int nr;
3835        int ret = 0;
3836
3837        switch (state) {
3838        case FLUSH_DELAYED_ITEMS_NR:
3839        case FLUSH_DELAYED_ITEMS:
3840                if (state == FLUSH_DELAYED_ITEMS_NR) {
3841                        u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3842
3843                        nr = (int)div64_u64(num_bytes, bytes);
3844                        if (!nr)
3845                                nr = 1;
3846                        nr *= 2;
3847                } else {
3848                        nr = -1;
3849                }
3850                trans = btrfs_join_transaction(root);
3851                if (IS_ERR(trans)) {
3852                        ret = PTR_ERR(trans);
3853                        break;
3854                }
3855                ret = btrfs_run_delayed_items_nr(trans, root, nr);
3856                btrfs_end_transaction(trans, root);
3857                break;
3858        case FLUSH_DELALLOC:
3859        case FLUSH_DELALLOC_WAIT:
3860                shrink_delalloc(root, num_bytes, orig_bytes,
3861                                state == FLUSH_DELALLOC_WAIT);
3862                break;
3863        case ALLOC_CHUNK:
3864                trans = btrfs_join_transaction(root);
3865                if (IS_ERR(trans)) {
3866                        ret = PTR_ERR(trans);
3867                        break;
3868                }
3869                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3870                                     btrfs_get_alloc_profile(root, 0),
3871                                     CHUNK_ALLOC_NO_FORCE);
3872                btrfs_end_transaction(trans, root);
3873                if (ret == -ENOSPC)
3874                        ret = 0;
3875                break;
3876        case COMMIT_TRANS:
3877                ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3878                break;
3879        default:
3880                ret = -ENOSPC;
3881                break;
3882        }
3883
3884        return ret;
3885}
3886/**
3887 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3888 * @root - the root we're allocating for
3889 * @block_rsv - the block_rsv we're allocating for
3890 * @orig_bytes - the number of bytes we want
3891 * @flush - whether or not we can flush to make our reservation
3892 *
3893 * This will reserve orig_bytes number of bytes from the space info associated
3894 * with the block_rsv.  If there is not enough space it will make an attempt to
3895 * flush out space to make room.  It will do this by flushing delalloc if
3896 * possible or committing the transaction.  If flush is 0 then no attempts to
3897 * regain reservations will be made and this will fail if there is not enough
3898 * space already.
3899 */
3900static int reserve_metadata_bytes(struct btrfs_root *root,
3901                                  struct btrfs_block_rsv *block_rsv,
3902                                  u64 orig_bytes, int flush)
3903{
3904        struct btrfs_space_info *space_info = block_rsv->space_info;
3905        u64 used;
3906        u64 num_bytes = orig_bytes;
3907        int flush_state = FLUSH_DELAYED_ITEMS_NR;
3908        int ret = 0;
3909        bool flushing = false;
3910
3911again:
3912        ret = 0;
3913        spin_lock(&space_info->lock);
3914        /*
3915         * We only want to wait if somebody other than us is flushing and we are
3916         * actually allowed to flush.
3917         */
3918        while (flush && !flushing && space_info->flush) {
3919                spin_unlock(&space_info->lock);
3920                /*
3921                 * If we have a trans handle we can't wait because the flusher
3922                 * may have to commit the transaction, which would mean we would
3923                 * deadlock since we are waiting for the flusher to finish, but
3924                 * hold the current transaction open.
3925                 */
3926                if (current->journal_info)
3927                        return -EAGAIN;
3928                ret = wait_event_killable(space_info->wait, !space_info->flush);
3929                /* Must have been killed, return */
3930                if (ret)
3931                        return -EINTR;
3932
3933                spin_lock(&space_info->lock);
3934        }
3935
3936        ret = -ENOSPC;
3937        used = space_info->bytes_used + space_info->bytes_reserved +
3938                space_info->bytes_pinned + space_info->bytes_readonly +
3939                space_info->bytes_may_use;
3940
3941        /*
3942         * The idea here is that if we haven't already over-reserved the space
3943         * then we can go ahead and save our reservation first and then start
3944         * flushing if we need to.  Otherwise, if we've already overcommitted,
3945         * let's start flushing stuff first and then come back and try to make
3946         * our reservation.
3947         */
3948        if (used <= space_info->total_bytes) {
3949                if (used + orig_bytes <= space_info->total_bytes) {
3950                        space_info->bytes_may_use += orig_bytes;
3951                        trace_btrfs_space_reservation(root->fs_info,
3952                                "space_info", space_info->flags, orig_bytes, 1);
3953                        ret = 0;
3954                } else {
3955                        /*
3956                         * Ok set num_bytes to orig_bytes since we aren't
3957                         * overcommitted, this way we only try and reclaim what
3958                         * we need.
3959                         */
3960                        num_bytes = orig_bytes;
3961                }
3962        } else {
3963                /*
3964                 * Ok we're over committed, set num_bytes to the overcommitted
3965                 * amount plus the amount of bytes that we need for this
3966                 * reservation.
3967                 */
3968                num_bytes = used - space_info->total_bytes +
3969                        (orig_bytes * 2);
3970        }
3971
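            /*
             * Even if the space_info itself looks full, the reservation may
             * still fit by overcommitting against unallocated chunk space,
             * see can_overcommit().
             */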
3972        if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
3973                space_info->bytes_may_use += orig_bytes;
3974                trace_btrfs_space_reservation(root->fs_info, "space_info",
3975                                              space_info->flags, orig_bytes,
3976                                              1);
3977                ret = 0;
3978        }
3979
3980        /*
3981         * Couldn't make our reservation, save our place so while we're trying
3982         * to reclaim space we can actually use it instead of somebody else
3983         * stealing it from us.
3984         */
3985        if (ret && flush) {
3986                flushing = true;
3987                space_info->flush = 1;
3988        }
3989
3990        spin_unlock(&space_info->lock);
3991
3992        if (!ret || !flush)
3993                goto out;
3994
3995        ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996                          flush_state);
3997        flush_state++;
3998        if (!ret)
3999                goto again;
4000        else if (flush_state <= COMMIT_TRANS)
4001                goto again;
4002
4003out:
4004        if (flushing) {
4005                spin_lock(&space_info->lock);
4006                space_info->flush = 0;
4007                wake_up_all(&space_info->wait);
4008                spin_unlock(&space_info->lock);
4009        }
4010        return ret;
4011}
4012
4013static struct btrfs_block_rsv *get_block_rsv(
4014                                        const struct btrfs_trans_handle *trans,
4015                                        const struct btrfs_root *root)
4016{
4017        struct btrfs_block_rsv *block_rsv = NULL;
4018
4019        if (root->ref_cows)
4020                block_rsv = trans->block_rsv;
4021
4022        if (root == root->fs_info->csum_root && trans->adding_csums)
4023                block_rsv = trans->block_rsv;
4024
4025        if (!block_rsv)
4026                block_rsv = root->block_rsv;
4027
4028        if (!block_rsv)
4029                block_rsv = &root->fs_info->empty_block_rsv;
4030
4031        return block_rsv;
4032}
4033
4034static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4035                               u64 num_bytes)
4036{
4037        int ret = -ENOSPC;
4038        spin_lock(&block_rsv->lock);
4039        if (block_rsv->reserved >= num_bytes) {
4040                block_rsv->reserved -= num_bytes;
4041                if (block_rsv->reserved < block_rsv->size)
4042                        block_rsv->full = 0;
4043                ret = 0;
4044        }
4045        spin_unlock(&block_rsv->lock);
4046        return ret;
4047}
4048
4049static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4050                                u64 num_bytes, int update_size)
4051{
4052        spin_lock(&block_rsv->lock);
4053        block_rsv->reserved += num_bytes;
4054        if (update_size)
4055                block_rsv->size += num_bytes;
4056        else if (block_rsv->reserved >= block_rsv->size)
4057                block_rsv->full = 1;
4058        spin_unlock(&block_rsv->lock);
4059}
4060
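    /*
     * Shrink block_rsv->size by num_bytes ((u64)-1 drops the whole
     * reservation).  Reserved bytes that now exceed the new size are moved
     * into 'dest' (e.g. the global rsv) until it is full; anything left
     * over is released back to the space_info's bytes_may_use.
     */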
4061static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4062                                    struct btrfs_block_rsv *block_rsv,
4063                                    struct btrfs_block_rsv *dest, u64 num_bytes)
4064{
4065        struct btrfs_space_info *space_info = block_rsv->space_info;
4066
4067        spin_lock(&block_rsv->lock);
4068        if (num_bytes == (u64)-1)
4069                num_bytes = block_rsv->size;
4070        block_rsv->size -= num_bytes;
4071        if (block_rsv->reserved >= block_rsv->size) {
4072                num_bytes = block_rsv->reserved - block_rsv->size;
4073                block_rsv->reserved = block_rsv->size;
4074                block_rsv->full = 1;
4075        } else {
4076                num_bytes = 0;
4077        }
4078        spin_unlock(&block_rsv->lock);
4079
4080        if (num_bytes > 0) {
4081                if (dest) {
4082                        spin_lock(&dest->lock);
4083                        if (!dest->full) {
4084                                u64 bytes_to_add;
4085
4086                                bytes_to_add = dest->size - dest->reserved;
4087                                bytes_to_add = min(num_bytes, bytes_to_add);
4088                                dest->reserved += bytes_to_add;
4089                                if (dest->reserved >= dest->size)
4090                                        dest->full = 1;
4091                                num_bytes -= bytes_to_add;
4092                        }
4093                        spin_unlock(&dest->lock);
4094                }
4095                if (num_bytes) {
4096                        spin_lock(&space_info->lock);
4097                        space_info->bytes_may_use -= num_bytes;
4098                        trace_btrfs_space_reservation(fs_info, "space_info",
4099                                        space_info->flags, num_bytes, 0);
4100                        space_info->reservation_progress++;
4101                        spin_unlock(&space_info->lock);
4102                }
4103        }
4104}
4105
4106static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4107                                   struct btrfs_block_rsv *dst, u64 num_bytes)
4108{
4109        int ret;
4110
4111        ret = block_rsv_use_bytes(src, num_bytes);
4112        if (ret)
4113                return ret;
4114
4115        block_rsv_add_bytes(dst, num_bytes, 1);
4116        return 0;
4117}
4118
4119void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4120{
4121        memset(rsv, 0, sizeof(*rsv));
4122        spin_lock_init(&rsv->lock);
4123        rsv->type = type;
4124}
4125
4126struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4127                                              unsigned short type)
4128{
4129        struct btrfs_block_rsv *block_rsv;
4130        struct btrfs_fs_info *fs_info = root->fs_info;
4131
4132        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4133        if (!block_rsv)
4134                return NULL;
4135
4136        btrfs_init_block_rsv(block_rsv, type);
4137        block_rsv->space_info = __find_space_info(fs_info,
4138                                                  BTRFS_BLOCK_GROUP_METADATA);
4139        return block_rsv;
4140}
4141
4142void btrfs_free_block_rsv(struct btrfs_root *root,
4143                          struct btrfs_block_rsv *rsv)
4144{
4145        if (!rsv)
4146                return;
4147        btrfs_block_rsv_release(root, rsv, (u64)-1);
4148        kfree(rsv);
4149}
4150
4151static inline int __block_rsv_add(struct btrfs_root *root,
4152                                  struct btrfs_block_rsv *block_rsv,
4153                                  u64 num_bytes, int flush)
4154{
4155        int ret;
4156
4157        if (num_bytes == 0)
4158                return 0;
4159
4160        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4161        if (!ret) {
4162                block_rsv_add_bytes(block_rsv, num_bytes, 1);
4163                return 0;
4164        }
4165
4166        return ret;
4167}
4168
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170                        struct btrfs_block_rsv *block_rsv,
4171                        u64 num_bytes)
4172{
4173        return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177                                struct btrfs_block_rsv *block_rsv,
4178                                u64 num_bytes)
4179{
4180        return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
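/*
 * Check whether a block_rsv is at least min_factor/10 full.  For example,
 * assuming div_factor() scales by factor/10 as elsewhere in btrfs,
 * min_factor == 5 asks whether at least 50% of ->size is currently
 * reserved: 0 is returned if so, -ENOSPC if not.
 */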
4183int btrfs_block_rsv_check(struct btrfs_root *root,
4184                          struct btrfs_block_rsv *block_rsv, int min_factor)
4185{
4186        u64 num_bytes = 0;
4187        int ret = -ENOSPC;
4188
4189        if (!block_rsv)
4190                return 0;
4191
4192        spin_lock(&block_rsv->lock);
4193        num_bytes = div_factor(block_rsv->size, min_factor);
4194        if (block_rsv->reserved >= num_bytes)
4195                ret = 0;
4196        spin_unlock(&block_rsv->lock);
4197
4198        return ret;
4199}
4200
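/*
 * Unlike __block_rsv_add(), which reserves num_bytes on top of whatever the
 * rsv already holds, the refill helpers below only top the rsv up to
 * min_reserved: if ->reserved already covers min_reserved nothing is done,
 * otherwise just the shortfall is requested from the space_info.
 */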
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4202                                           struct btrfs_block_rsv *block_rsv,
4203                                           u64 min_reserved, int flush)
4204{
4205        u64 num_bytes = 0;
4206        int ret = -ENOSPC;
4207
4208        if (!block_rsv)
4209                return 0;
4210
4211        spin_lock(&block_rsv->lock);
4212        num_bytes = min_reserved;
4213        if (block_rsv->reserved >= num_bytes)
4214                ret = 0;
4215        else
4216                num_bytes -= block_rsv->reserved;
4217        spin_unlock(&block_rsv->lock);
4218
4219        if (!ret)
4220                return 0;
4221
4222        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4223        if (!ret) {
4224                block_rsv_add_bytes(block_rsv, num_bytes, 0);
4225                return 0;
4226        }
4227
4228        return ret;
4229}
4230
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232                           struct btrfs_block_rsv *block_rsv,
4233                           u64 min_reserved)
4234{
4235        return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239                                   struct btrfs_block_rsv *block_rsv,
4240                                   u64 min_reserved)
4241{
4242        return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246                            struct btrfs_block_rsv *dst_rsv,
4247                            u64 num_bytes)
4248{
4249        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4250}
4251
4252void btrfs_block_rsv_release(struct btrfs_root *root,
4253                             struct btrfs_block_rsv *block_rsv,
4254                             u64 num_bytes)
4255{
4256        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4257        if (global_rsv->full || global_rsv == block_rsv ||
4258            block_rsv->space_info != global_rsv->space_info)
4259                global_rsv = NULL;
4260        block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4261                                num_bytes);
4262}
4263
4264/*
4265 * Helper to calculate the size of the global block reservation.
4266 * The desired value is the sum of the space used by the extent tree,
4267 * the checksum tree and the root tree.
4268 */
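/*
 * Rough numbers, for illustration only (assuming 4 KiB blocks and 4-byte
 * crc32c checksums): every GiB of used data contributes
 * 2 * 4 * (1 GiB / 4 KiB) = 2 MiB for checksum items, plus 2% of
 * (data_used + meta_used) for the other trees, with the result capped at
 * one third of the metadata space actually used.
 */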
4269static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4270{
4271        struct btrfs_space_info *sinfo;
4272        u64 num_bytes;
4273        u64 meta_used;
4274        u64 data_used;
4275        int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4276
4277        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4278        spin_lock(&sinfo->lock);
4279        data_used = sinfo->bytes_used;
4280        spin_unlock(&sinfo->lock);
4281
4282        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4283        spin_lock(&sinfo->lock);
4284        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4285                data_used = 0;
4286        meta_used = sinfo->bytes_used;
4287        spin_unlock(&sinfo->lock);
4288
4289        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4290                    csum_size * 2;
4291        num_bytes += div64_u64(data_used + meta_used, 50);
4292
4293        if (num_bytes * 3 > meta_used)
4294                num_bytes = div64_u64(meta_used, 3);
4295
4296        return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4297}
4298
4299static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4300{
4301        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4302        struct btrfs_space_info *sinfo = block_rsv->space_info;
4303        u64 num_bytes;
4304
4305        num_bytes = calc_global_metadata_size(fs_info);
4306
4307        spin_lock(&sinfo->lock);
4308        spin_lock(&block_rsv->lock);
4309
4310        block_rsv->size = num_bytes;
4311
4312        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4313                    sinfo->bytes_reserved + sinfo->bytes_readonly +
4314                    sinfo->bytes_may_use;
4315
4316        if (sinfo->total_bytes > num_bytes) {
4317                num_bytes = sinfo->total_bytes - num_bytes;
4318                block_rsv->reserved += num_bytes;
4319                sinfo->bytes_may_use += num_bytes;
4320                trace_btrfs_space_reservation(fs_info, "space_info",
4321                                      sinfo->flags, num_bytes, 1);
4322        }
4323
4324        if (block_rsv->reserved >= block_rsv->size) {
4325                num_bytes = block_rsv->reserved - block_rsv->size;
4326                sinfo->bytes_may_use -= num_bytes;
4327                trace_btrfs_space_reservation(fs_info, "space_info",
4328                                      sinfo->flags, num_bytes, 0);
4329                sinfo->reservation_progress++;
4330                block_rsv->reserved = block_rsv->size;
4331                block_rsv->full = 1;
4332        }
4333
4334        spin_unlock(&block_rsv->lock);
4335        spin_unlock(&sinfo->lock);
4336}
4337
4338static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4339{
4340        struct btrfs_space_info *space_info;
4341
4342        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4343        fs_info->chunk_block_rsv.space_info = space_info;
4344
4345        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4346        fs_info->global_block_rsv.space_info = space_info;
4347        fs_info->delalloc_block_rsv.space_info = space_info;
4348        fs_info->trans_block_rsv.space_info = space_info;
4349        fs_info->empty_block_rsv.space_info = space_info;
4350        fs_info->delayed_block_rsv.space_info = space_info;
4351
4352        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4353        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4354        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4355        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4356        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4357
4358        update_global_block_rsv(fs_info);
4359}
4360
4361static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4362{
4363        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4364                                (u64)-1);
4365        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4366        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4367        WARN_ON(fs_info->trans_block_rsv.size > 0);
4368        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4369        WARN_ON(fs_info->chunk_block_rsv.size > 0);
4370        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4371        WARN_ON(fs_info->delayed_block_rsv.size > 0);
4372        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4373}
4374
4375void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4376                                  struct btrfs_root *root)
4377{
4378        if (!trans->block_rsv)
4379                return;
4380
4381        if (!trans->bytes_reserved)
4382                return;
4383
4384        trace_btrfs_space_reservation(root->fs_info, "transaction",
4385                                      trans->transid, trans->bytes_reserved, 0);
4386        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4387        trans->bytes_reserved = 0;
4388}
4389
4390/* Can only return 0 or -ENOSPC */
4391int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4392                                  struct inode *inode)
4393{
4394        struct btrfs_root *root = BTRFS_I(inode)->root;
4395        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4396        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4397
4398        /*
4399         * We need to hold space in order to delete our orphan item once we've
4400         * added it; this takes the reservation now so it can be released
4401         * later, when we are truly done with the orphan item.
4402         */
4403        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4404        trace_btrfs_space_reservation(root->fs_info, "orphan",
4405                                      btrfs_ino(inode), num_bytes, 1);
4406        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4407}
4408
4409void btrfs_orphan_release_metadata(struct inode *inode)
4410{
4411        struct btrfs_root *root = BTRFS_I(inode)->root;
4412        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4413        trace_btrfs_space_reservation(root->fs_info, "orphan",
4414                                      btrfs_ino(inode), num_bytes, 0);
4415        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4416}
4417
4418int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4419                                struct btrfs_pending_snapshot *pending)
4420{
4421        struct btrfs_root *root = pending->root;
4422        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4423        struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4424        /*
4425         * two for root back/forward refs, two for directory entries,
4426         * one for root of the snapshot and one for parent inode.
4427         */
4428        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4429        dst_rsv->space_info = src_rsv->space_info;
4430        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4431}
4432
4433/**
4434 * drop_outstanding_extent - drop an outstanding extent
4435 * @inode: the inode we're dropping the extent for
4436 *
4437 * This is called when we are freeing up an outstanding extent, either after
4438 * an error or after an extent is written.  This will return the number of
4439 * reserved extents that need to be freed.  This must be called with
4440 * BTRFS_I(inode)->lock held.
4441 */
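/*
 * Illustrative example (hypothetical numbers): with 3 reserved extents and
 * a single outstanding extent, dropping that last extent leaves 0
 * outstanding, so all 3 reserved extents are returned to be freed, plus 1
 * more if the inode-update reservation (DELALLOC_META_RESERVED) was still
 * held.
 */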
4442static unsigned drop_outstanding_extent(struct inode *inode)
4443{
4444        unsigned drop_inode_space = 0;
4445        unsigned dropped_extents = 0;
4446
4447        BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4448        BTRFS_I(inode)->outstanding_extents--;
4449
4450        if (BTRFS_I(inode)->outstanding_extents == 0 &&
4451            test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4452                               &BTRFS_I(inode)->runtime_flags))
4453                drop_inode_space = 1;
4454
4455        /*
4456         * If we have at least as many outstanding extents as we have
4457         * reserved, then we need to leave the reserved extents count alone.
4458         */
4459        if (BTRFS_I(inode)->outstanding_extents >=
4460            BTRFS_I(inode)->reserved_extents)
4461                return drop_inode_space;
4462
4463        dropped_extents = BTRFS_I(inode)->reserved_extents -
4464                BTRFS_I(inode)->outstanding_extents;
4465        BTRFS_I(inode)->reserved_extents -= dropped_extents;
4466        return dropped_extents + drop_inode_space;
4467}
4468
4469/**
4470 * calc_csum_metadata_size - return the amount of metadata space that must be
4471 *      reserved/freed for the given bytes.
4472 * @inode: the inode we're manipulating
4473 * @num_bytes: the number of bytes in question
4474 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4475 *
4476 * This adjusts the number of csum_bytes in the inode and then returns the
4477 * correct amount of metadata that must either be reserved or freed.  We
4478 * calculate how many checksums we can fit into one leaf and then divide the
4479 * number of bytes that will need to be checksummed by this value to figure out
4480 * how many checksums will be required.  If we are adding bytes then the number
4481 * may go up and we will return the number of additional bytes that must be
4482 * reserved.  If it is going down we will return the number of bytes that must
4483 * be freed.
4484 *
4485 * This must be called with BTRFS_I(inode)->lock held.
4486 */
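/*
 * A rough worked example (hypothetical sizes): with 4 KiB sectors,
 * reserving 1 MiB of data means 256 sector checksums.  num_csums_per_leaf
 * below is a deliberately conservative estimate of how many of those fit
 * in one leaf; the number of whole leaves needed before and after the
 * change is compared and metadata is reserved (or freed) only for the
 * difference.
 */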
4487static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4488                                   int reserve)
4489{
4490        struct btrfs_root *root = BTRFS_I(inode)->root;
4491        u64 csum_size;
4492        int num_csums_per_leaf;
4493        int num_csums;
4494        int old_csums;
4495
4496        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4497            BTRFS_I(inode)->csum_bytes == 0)
4498                return 0;
4499
4500        old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4501        if (reserve)
4502                BTRFS_I(inode)->csum_bytes += num_bytes;
4503        else
4504                BTRFS_I(inode)->csum_bytes -= num_bytes;
4505        csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4506        num_csums_per_leaf = (int)div64_u64(csum_size,
4507                                            sizeof(struct btrfs_csum_item) +
4508                                            sizeof(struct btrfs_disk_key));
4509        num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4510        num_csums = num_csums + num_csums_per_leaf - 1;
4511        num_csums = num_csums / num_csums_per_leaf;
4512
4513        old_csums = old_csums + num_csums_per_leaf - 1;
4514        old_csums = old_csums / num_csums_per_leaf;
4515
4516        /* No change, no need to reserve more */
4517        if (old_csums == num_csums)
4518                return 0;
4519
4520        if (reserve)
4521                return btrfs_calc_trans_metadata_size(root,
4522                                                      num_csums - old_csums);
4523
4524        return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4525}
4526
4527int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4528{
4529        struct btrfs_root *root = BTRFS_I(inode)->root;
4530        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4531        u64 to_reserve = 0;
4532        u64 csum_bytes;
4533        unsigned nr_extents = 0;
4534        int extra_reserve = 0;
4535        int flush = 1;
4536        int ret;
4537
4538        /* Need to be holding the i_mutex here if this isn't the free space cache inode */
4539        if (btrfs_is_free_space_inode(inode))
4540                flush = 0;
4541
4542        if (flush && btrfs_transaction_in_commit(root->fs_info))
4543                schedule_timeout(1);
4544
4545        mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4546        num_bytes = ALIGN(num_bytes, root->sectorsize);
4547
4548        spin_lock(&BTRFS_I(inode)->lock);
4549        BTRFS_I(inode)->outstanding_extents++;
4550
4551        if (BTRFS_I(inode)->outstanding_extents >
4552            BTRFS_I(inode)->reserved_extents)
4553                nr_extents = BTRFS_I(inode)->outstanding_extents -
4554                        BTRFS_I(inode)->reserved_extents;
4555
4556        /*
4557         * Add an item to reserve for updating the inode when we complete the
4558         * delalloc io.
4559         */
4560        if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4561                      &BTRFS_I(inode)->runtime_flags)) {
4562                nr_extents++;
4563                extra_reserve = 1;
4564        }
4565
4566        to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4567        to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4568        csum_bytes = BTRFS_I(inode)->csum_bytes;
4569        spin_unlock(&BTRFS_I(inode)->lock);
4570
4571        if (root->fs_info->quota_enabled) {
4572                ret = btrfs_qgroup_reserve(root, num_bytes +
4573                                           nr_extents * root->leafsize);
4574                if (ret) {
4575                        mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576                        return ret;
4577                }
4578        }
4579
4580        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4581        if (ret) {
4582                u64 to_free = 0;
4583                unsigned dropped;
4584
4585                spin_lock(&BTRFS_I(inode)->lock);
4586                dropped = drop_outstanding_extent(inode);
4587                /*
4588                 * If the inode's csum_bytes is the same as the original
4589                 * csum_bytes then we know we haven't raced with any free()ers
4590                 * so we can just reduce our inode's csum_bytes and carry on.
4591                 * Otherwise we have to do the normal free thing to account for
4592                 * the case that the free side didn't free up its reserve
4593                 * because of this outstanding reservation.
4594                 */
4595                if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4596                        calc_csum_metadata_size(inode, num_bytes, 0);
4597                else
4598                        to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4599                spin_unlock(&BTRFS_I(inode)->lock);
4600                if (dropped)
4601                        to_free += btrfs_calc_trans_metadata_size(root, dropped);
4602
4603                if (to_free) {
4604                        btrfs_block_rsv_release(root, block_rsv, to_free);
4605                        trace_btrfs_space_reservation(root->fs_info,
4606                                                      "delalloc",
4607                                                      btrfs_ino(inode),
4608                                                      to_free, 0);
4609                }
4610                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611                return ret;
4612        }
4613
4614        spin_lock(&BTRFS_I(inode)->lock);
4615        if (extra_reserve) {
4616                set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4617                        &BTRFS_I(inode)->runtime_flags);
4618                nr_extents--;
4619        }
4620        BTRFS_I(inode)->reserved_extents += nr_extents;
4621        spin_unlock(&BTRFS_I(inode)->lock);
4622        mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623
4624        if (to_reserve)
4625                trace_btrfs_space_reservation(root->fs_info, "delalloc",
4626                                              btrfs_ino(inode), to_reserve, 1);
4627        block_rsv_add_bytes(block_rsv, to_reserve, 1);
4628
4629        return 0;
4630}
4631
4632/**
4633 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4634 * @inode: the inode to release the reservation for
4635 * @num_bytes: the number of bytes we're releasing
4636 *
4637 * This will release the metadata reservation for an inode.  This can be called
4638 * once we complete IO for a given set of bytes to release their metadata
4639 * reservations.
4640 */
4641void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4642{
4643        struct btrfs_root *root = BTRFS_I(inode)->root;
4644        u64 to_free = 0;
4645        unsigned dropped;
4646
4647        num_bytes = ALIGN(num_bytes, root->sectorsize);
4648        spin_lock(&BTRFS_I(inode)->lock);
4649        dropped = drop_outstanding_extent(inode);
4650
4651        to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4652        spin_unlock(&BTRFS_I(inode)->lock);
4653        if (dropped > 0)
4654                to_free += btrfs_calc_trans_metadata_size(root, dropped);
4655
4656        trace_btrfs_space_reservation(root->fs_info, "delalloc",
4657                                      btrfs_ino(inode), to_free, 0);
4658        if (root->fs_info->quota_enabled) {
4659                btrfs_qgroup_free(root, num_bytes +
4660                                        dropped * root->leafsize);
4661        }
4662
4663        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4664                                to_free);
4665}
4666
4667/**
4668 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4669 * @inode: inode we're writing to
4670 * @num_bytes: the number of bytes we want to allocate
4671 *
4672 * This will do the following things
4673 *
4674 * o reserve space in the data space info for num_bytes
4675 * o reserve space in the metadata space info based on number of outstanding
4676 *   extents and how much csums will be needed
4677 * o add to the inode's ->delalloc_bytes
4678 * o add it to the fs_info's delalloc inodes list.
4679 *
4680 * This will return 0 for success and -ENOSPC if there is no space left.
4681 */
4682int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4683{
4684        int ret;
4685
4686        ret = btrfs_check_data_free_space(inode, num_bytes);
4687        if (ret)
4688                return ret;
4689
4690        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4691        if (ret) {
4692                btrfs_free_reserved_data_space(inode, num_bytes);
4693                return ret;
4694        }
4695
4696        return 0;
4697}
4698
4699/**
4700 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4701 * @inode: inode we're releasing space for
4702 * @num_bytes: the number of bytes we want to free up
4703 *
4704 * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4705 * called in the case that we don't need the metadata AND data reservations
4706 * anymore, for example if there is an error or we insert an inline extent.
4707 *
4708 * This function will release the metadata space that was not used and will
4709 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4710 * list if there are no delalloc bytes left.
4711 */
4712void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4713{
4714        btrfs_delalloc_release_metadata(inode, num_bytes);
4715        btrfs_free_reserved_data_space(inode, num_bytes);
4716}
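
/*
 * Sketch of the usual pairing, for illustration only; "failed_before_io"
 * is a hypothetical stand-in for whatever error handling a real caller in
 * the write path does:
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	...dirty the pages covering num_bytes...
 *	if (failed_before_io)
 *		btrfs_delalloc_release_space(inode, num_bytes);
 *	...otherwise the metadata half is released later, through
 *	btrfs_delalloc_release_metadata(), once IO for those bytes completes.
 */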
4717
4718static int update_block_group(struct btrfs_trans_handle *trans,
4719                              struct btrfs_root *root,
4720                              u64 bytenr, u64 num_bytes, int alloc)
4721{
4722        struct btrfs_block_group_cache *cache = NULL;
4723        struct btrfs_fs_info *info = root->fs_info;
4724        u64 total = num_bytes;
4725        u64 old_val;
4726        u64 byte_in_group;
4727        int factor;
4728
4729        /* block accounting for super block */
4730        spin_lock(&info->delalloc_lock);
4731        old_val = btrfs_super_bytes_used(info->super_copy);
4732        if (alloc)
4733                old_val += num_bytes;
4734        else
4735                old_val -= num_bytes;
4736        btrfs_set_super_bytes_used(info->super_copy, old_val);
4737        spin_unlock(&info->delalloc_lock);
4738
4739        while (total) {
4740                cache = btrfs_lookup_block_group(info, bytenr);
4741                if (!cache)
4742                        return -ENOENT;
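                /*
                 * DUP/RAID1/RAID10 keep two copies of every byte, so
                 * disk_used below moves by twice the logical byte count
                 * for those profiles.
                 */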
4743                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4744                                    BTRFS_BLOCK_GROUP_RAID1 |
4745                                    BTRFS_BLOCK_GROUP_RAID10))
4746                        factor = 2;
4747                else
4748                        factor = 1;
4749                /*
4750                 * If this block group has free space cache written out, we
4751                 * need to make sure to load it if we are removing space.  This
4752                 * is because we need the unpinning stage to actually add the
4753                 * space back to the block group, otherwise we will leak space.
4754                 */
4755                if (!alloc && cache->cached == BTRFS_CACHE_NO)
4756                        cache_block_group(cache, trans, NULL, 1);
4757
4758                byte_in_group = bytenr - cache->key.objectid;
4759                WARN_ON(byte_in_group > cache->key.offset);
4760
4761                spin_lock(&cache->space_info->lock);
4762                spin_lock(&cache->lock);
4763
4764                if (btrfs_test_opt(root, SPACE_CACHE) &&
4765                    cache->disk_cache_state < BTRFS_DC_CLEAR)
4766                        cache->disk_cache_state = BTRFS_DC_CLEAR;
4767
4768                cache->dirty = 1;
4769                old_val = btrfs_block_group_used(&cache->item);
4770                num_bytes = min(total, cache->key.offset - byte_in_group);
4771                if (alloc) {
4772                        old_val += num_bytes;
4773                        btrfs_set_block_group_used(&cache->item, old_val);
4774                        cache->reserved -= num_bytes;
4775                        cache->space_info->bytes_reserved -= num_bytes;
4776                        cache->space_info->bytes_used += num_bytes;
4777                        cache->space_info->disk_used += num_bytes * factor;
4778                        spin_unlock(&cache->lock);
4779                        spin_unlock(&cache->space_info->lock);
4780                } else {
4781                        old_val -= num_bytes;
4782                        btrfs_set_block_group_used(&cache->item, old_val);
4783                        cache->pinned += num_bytes;
4784                        cache->space_info->bytes_pinned += num_bytes;
4785                        cache->space_info->bytes_used -= num_bytes;
4786                        cache->space_info->disk_used -= num_bytes * factor;
4787                        spin_unlock(&cache->lock);
4788                        spin_unlock(&cache->space_info->lock);
4789
4790                        set_extent_dirty(info->pinned_extents,
4791                                         bytenr, bytenr + num_bytes - 1,
4792                                         GFP_NOFS | __GFP_NOFAIL);
4793                }
4794                btrfs_put_block_group(cache);
4795                total -= num_bytes;
4796                bytenr += num_bytes;
4797        }
4798        return 0;
4799}
4800
4801static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4802{
4803        struct btrfs_block_group_cache *cache;
4804        u64 bytenr;
4805
4806        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4807        if (!cache)
4808                return 0;
4809
4810        bytenr = cache->key.objectid;
4811        btrfs_put_block_group(cache);
4812
4813        return bytenr;
4814}
4815
4816static int pin_down_extent(struct btrfs_root *root,
4817                           struct btrfs_block_group_cache *cache,
4818                           u64 bytenr, u64 num_bytes, int reserved)
4819{
4820        spin_lock(&cache->space_info->lock);
4821        spin_lock(&cache->lock);
4822        cache->pinned += num_bytes;
4823        cache->space_info->bytes_pinned += num_bytes;
4824        if (reserved) {
4825                cache->reserved -= num_bytes;
4826                cache->space_info->bytes_reserved -= num_bytes;
4827        }
4828        spin_unlock(&cache->lock);
4829        spin_unlock(&cache->space_info->lock);
4830
4831        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4832                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4833        return 0;
4834}
4835
4836/*
4837 * this function must be called within a transaction
4838 */
4839int btrfs_pin_extent(struct btrfs_root *root,
4840                     u64 bytenr, u64 num_bytes, int reserved)
4841{
4842        struct btrfs_block_group_cache *cache;
4843
4844        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4845        BUG_ON(!cache); /* Logic error */
4846
4847        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4848
4849        btrfs_put_block_group(cache);
4850        return 0;
4851}
4852
4853/*
4854 * this function must be called within a transaction
4855 */
4856int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4857                                    struct btrfs_root *root,
4858                                    u64 bytenr, u64 num_bytes)
4859{
4860        struct btrfs_block_group_cache *cache;
4861
4862        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4863        BUG_ON(!cache); /* Logic error */
4864
4865        /*
4866         * pull in the free space cache (if any) so that our pin
4867         * removes the free space from the cache.  We have load_only set
4868         * to one because the slow code to read in the free extents does check
4869         * the pinned extents.
4870         */
4871        cache_block_group(cache, trans, root, 1);
4872
4873        pin_down_extent(root, cache, bytenr, num_bytes, 0);
4874
4875        /* remove us from the free space cache (if we're there at all) */
4876        btrfs_remove_free_space(cache, bytenr, num_bytes);
4877        btrfs_put_block_group(cache);
4878        return 0;
4879}
4880
4881/**
4882 * btrfs_update_reserved_bytes - update the block_group and space info counters
4883 * @cache:      The cache we are manipulating
4884 * @num_bytes:  The number of bytes in question
4885 * @reserve:    One of the reservation enums
4886 *
4887 * This is called by the allocator when it reserves space, or by somebody who is
4888 * freeing space that was never actually used on disk.  For example if you
4889 * reserve some space for a new leaf in transaction A and before transaction A
4890 * commits you free that leaf, you call this with RESERVE_FREE in order to
4891 * clear the reservation.
4892 *
4893 * Metadata reservations should be made with RESERVE_ALLOC so we do the proper
4894 * ENOSPC accounting.  For data we handle the reservation through clearing the
4895 * delalloc bits in the io_tree.  We have to do this since we could end up
4896 * allocating less disk space for the amount of data we have reserved in the
4897 * case of compression.
4898 *
4899 * If this is a reservation and the block group has become read only we cannot
4900 * make the reservation and return -EAGAIN, otherwise this function always
4901 * succeeds.
4902 */
4903static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4904                                       u64 num_bytes, int reserve)
4905{
4906        struct btrfs_space_info *space_info = cache->space_info;
4907        int ret = 0;
4908
4909        spin_lock(&space_info->lock);
4910        spin_lock(&cache->lock);
4911        if (reserve != RESERVE_FREE) {
4912                if (cache->ro) {
4913                        ret = -EAGAIN;
4914                } else {
4915                        cache->reserved += num_bytes;
4916                        space_info->bytes_reserved += num_bytes;
4917                        if (reserve == RESERVE_ALLOC) {
4918                                trace_btrfs_space_reservation(cache->fs_info,
4919                                                "space_info", space_info->flags,
4920                                                num_bytes, 0);
4921                                space_info->bytes_may_use -= num_bytes;
4922                        }
4923                }
4924        } else {
4925                if (cache->ro)
4926                        space_info->bytes_readonly += num_bytes;
4927                cache->reserved -= num_bytes;
4928                space_info->bytes_reserved -= num_bytes;
4929                space_info->reservation_progress++;
4930        }
4931        spin_unlock(&cache->lock);
4932        spin_unlock(&space_info->lock);
4933        return ret;
4934}
4935
4936void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4937                                struct btrfs_root *root)
4938{
4939        struct btrfs_fs_info *fs_info = root->fs_info;
4940        struct btrfs_caching_control *next;
4941        struct btrfs_caching_control *caching_ctl;
4942        struct btrfs_block_group_cache *cache;
4943
4944        down_write(&fs_info->extent_commit_sem);
4945
4946        list_for_each_entry_safe(caching_ctl, next,
4947                                 &fs_info->caching_block_groups, list) {
4948                cache = caching_ctl->block_group;
4949                if (block_group_cache_done(cache)) {
4950                        cache->last_byte_to_unpin = (u64)-1;
4951                        list_del_init(&caching_ctl->list);
4952                        put_caching_control(caching_ctl);
4953                } else {
4954                        cache->last_byte_to_unpin = caching_ctl->progress;
4955                }
4956        }
4957
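        /*
         * Swap which extent_io_tree records pins: the tree filled during the
         * transaction that is committing will be unpinned by
         * btrfs_finish_extent_commit(), while pins made from here on go into
         * the other tree and survive until the next commit.
         */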
4958        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4959                fs_info->pinned_extents = &fs_info->freed_extents[1];
4960        else
4961                fs_info->pinned_extents = &fs_info->freed_extents[0];
4962
4963        up_write(&fs_info->extent_commit_sem);
4964
4965        update_global_block_rsv(fs_info);
4966}
4967
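/*
 * Walk [start, end], give any bytes below each block group's
 * ->last_byte_to_unpin back to the free space cache, and drop the pinned
 * counters for the whole range; bytes in read-only groups are counted as
 * readonly again.
 */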
4968static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4969{
4970        struct btrfs_fs_info *fs_info = root->fs_info;
4971        struct btrfs_block_group_cache *cache = NULL;
4972        u64 len;
4973
4974        while (start <= end) {
4975                if (!cache ||
4976                    start >= cache->key.objectid + cache->key.offset) {
4977                        if (cache)
4978                                btrfs_put_block_group(cache);
4979                        cache = btrfs_lookup_block_group(fs_info, start);
4980                        BUG_ON(!cache); /* Logic error */
4981                }
4982
4983                len = cache->key.objectid + cache->key.offset - start;
4984                len = min(len, end + 1 - start);
4985
4986                if (start < cache->last_byte_to_unpin) {
4987                        len = min(len, cache->last_byte_to_unpin - start);
4988                        btrfs_add_free_space(cache, start, len);
4989                }
4990
4991                start += len;
4992
4993                spin_lock(&cache->space_info->lock);
4994                spin_lock(&cache->lock);
4995                cache->pinned -= len;
4996                cache->space_info->bytes_pinned -= len;
4997                if (cache->ro)
4998                        cache->space_info->bytes_readonly += len;
4999                spin_unlock(&cache->lock);
5000                spin_unlock(&cache->space_info->lock);
5001        }
5002
5003        if (cache)
5004                btrfs_put_block_group(cache);
5005        return 0;
5006}
5007
5008int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5009                               struct btrfs_root *root)
5010{
5011        struct btrfs_fs_info *fs_info = root->fs_info;
5012        struct extent_io_tree *unpin;
5013        u64 start;
5014        u64 end;
5015        int ret;
5016
5017        if (trans->aborted)
5018                return 0;
5019
5020        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5021                unpin = &fs_info->freed_extents[1];
5022        else
5023                unpin = &fs_info->freed_extents[0];
5024
5025        while (1) {
5026                ret = find_first_extent_bit(unpin, 0, &start, &end,
5027                                            EXTENT_DIRTY, NULL);
5028                if (ret)
5029                        break;
5030
5031                if (btrfs_test_opt(root, DISCARD))
5032                        ret = btrfs_discard_extent(root, start,
5033                                                   end + 1 - start, NULL);
5034
5035                clear_extent_dirty(unpin, start, end, GFP_NOFS);
5036                unpin_extent_range(root, start, end);
5037                cond_resched();
5038        }
5039
5040        return 0;
5041}
5042
5043static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5044                                struct btrfs_root *root,
5045                                u64 bytenr, u64 num_bytes, u64 parent,
5046                                u64 root_objectid, u64 owner_objectid,
5047                                u64 owner_offset, int refs_to_drop,
5048                                struct btrfs_delayed_extent_op *extent_op)
5049{
5050        struct btrfs_key key;
5051        struct btrfs_path *path;
5052        struct btrfs_fs_info *info = root->fs_info;
5053        stru