linux/fs/btrfs/extent-tree.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/pagemap.h>
  20#include <linux/writeback.h>
  21#include <linux/blkdev.h>
  22#include <linux/sort.h>
  23#include <linux/rcupdate.h>
  24#include <linux/kthread.h>
  25#include <linux/slab.h>
  26#include <linux/ratelimit.h>
  27#include "compat.h"
  28#include "hash.h"
  29#include "ctree.h"
  30#include "disk-io.h"
  31#include "print-tree.h"
  32#include "transaction.h"
  33#include "volumes.h"
  34#include "locking.h"
  35#include "free-space-cache.h"
  36
  37/* control flags for do_chunk_alloc's force field
  38 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  39 * if we really need one.
  40 *
  41 * CHUNK_ALLOC_FORCE means it must try to allocate one
  42 *
  43 * CHUNK_ALLOC_LIMITED means to only try and allocate one
  44 * if we have very few chunks already allocated.  This is
  45 * used as part of the clustering code to help make sure
  46 * we have a good pool of storage to cluster in, without
  47 * filling the FS with empty chunks
  48 *
  49 */
  50enum {
  51        CHUNK_ALLOC_NO_FORCE = 0,
  52        CHUNK_ALLOC_FORCE = 1,
  53        CHUNK_ALLOC_LIMITED = 2,
  54};
  55
  56/*
  57 * Control how reservations are dealt with.
  58 *
  59 * RESERVE_FREE - freeing a reservation.
  60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
  61 *   ENOSPC accounting
  62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
  63 *   bytes_may_use as the ENOSPC accounting is done elsewhere
  64 */
  65enum {
  66        RESERVE_FREE = 0,
  67        RESERVE_ALLOC = 1,
  68        RESERVE_ALLOC_NO_ACCOUNT = 2,
  69};
  70
  71static int update_block_group(struct btrfs_trans_handle *trans,
  72                              struct btrfs_root *root,
  73                              u64 bytenr, u64 num_bytes, int alloc);
  74static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  75                                struct btrfs_root *root,
  76                                u64 bytenr, u64 num_bytes, u64 parent,
  77                                u64 root_objectid, u64 owner_objectid,
  78                                u64 owner_offset, int refs_to_drop,
  79                                struct btrfs_delayed_extent_op *extra_op);
  80static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  81                                    struct extent_buffer *leaf,
  82                                    struct btrfs_extent_item *ei);
  83static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  84                                      struct btrfs_root *root,
  85                                      u64 parent, u64 root_objectid,
  86                                      u64 flags, u64 owner, u64 offset,
  87                                      struct btrfs_key *ins, int ref_mod);
  88static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  89                                     struct btrfs_root *root,
  90                                     u64 parent, u64 root_objectid,
  91                                     u64 flags, struct btrfs_disk_key *key,
  92                                     int level, struct btrfs_key *ins);
  93static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  94                          struct btrfs_root *extent_root, u64 alloc_bytes,
  95                          u64 flags, int force);
  96static int find_next_key(struct btrfs_path *path, int level,
  97                         struct btrfs_key *key);
  98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
  99                            int dump_block_groups);
 100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 101                                       u64 num_bytes, int reserve);
 102
 103static noinline int
 104block_group_cache_done(struct btrfs_block_group_cache *cache)
 105{
 106        smp_mb();
 107        return cache->cached == BTRFS_CACHE_FINISHED;
 108}
 109
 110static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 111{
 112        return (cache->flags & bits) == bits;
 113}
 114
 115static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 116{
 117        atomic_inc(&cache->count);
 118}
 119
 120void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 121{
 122        if (atomic_dec_and_test(&cache->count)) {
 123                WARN_ON(cache->pinned > 0);
 124                WARN_ON(cache->reserved > 0);
 125                kfree(cache->free_space_ctl);
 126                kfree(cache);
 127        }
 128}
 129
 130/*
 131 * this adds the block group to the fs_info rb tree for the block group
 132 * cache
 133 */
 134static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 135                                struct btrfs_block_group_cache *block_group)
 136{
 137        struct rb_node **p;
 138        struct rb_node *parent = NULL;
 139        struct btrfs_block_group_cache *cache;
 140
 141        spin_lock(&info->block_group_cache_lock);
 142        p = &info->block_group_cache_tree.rb_node;
 143
 144        while (*p) {
 145                parent = *p;
 146                cache = rb_entry(parent, struct btrfs_block_group_cache,
 147                                 cache_node);
 148                if (block_group->key.objectid < cache->key.objectid) {
 149                        p = &(*p)->rb_left;
 150                } else if (block_group->key.objectid > cache->key.objectid) {
 151                        p = &(*p)->rb_right;
 152                } else {
 153                        spin_unlock(&info->block_group_cache_lock);
 154                        return -EEXIST;
 155                }
 156        }
 157
 158        rb_link_node(&block_group->cache_node, parent, p);
 159        rb_insert_color(&block_group->cache_node,
 160                        &info->block_group_cache_tree);
 161        spin_unlock(&info->block_group_cache_lock);
 162
 163        return 0;
 164}
 165
 166/*
 167 * This will return the block group at or after bytenr if contains is 0, else
 168 * it will return the block group that contains the bytenr
 169 */
 170static struct btrfs_block_group_cache *
 171block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 172                              int contains)
 173{
 174        struct btrfs_block_group_cache *cache, *ret = NULL;
 175        struct rb_node *n;
 176        u64 end, start;
 177
 178        spin_lock(&info->block_group_cache_lock);
 179        n = info->block_group_cache_tree.rb_node;
 180
 181        while (n) {
 182                cache = rb_entry(n, struct btrfs_block_group_cache,
 183                                 cache_node);
 184                end = cache->key.objectid + cache->key.offset - 1;
 185                start = cache->key.objectid;
 186
 187                if (bytenr < start) {
 188                        if (!contains && (!ret || start < ret->key.objectid))
 189                                ret = cache;
 190                        n = n->rb_left;
 191                } else if (bytenr > start) {
 192                        if (contains && bytenr <= end) {
 193                                ret = cache;
 194                                break;
 195                        }
 196                        n = n->rb_right;
 197                } else {
 198                        ret = cache;
 199                        break;
 200                }
 201        }
 202        if (ret)
 203                btrfs_get_block_group(ret);
 204        spin_unlock(&info->block_group_cache_lock);
 205
 206        return ret;
 207}
 208
 209static int add_excluded_extent(struct btrfs_root *root,
 210                               u64 start, u64 num_bytes)
 211{
 212        u64 end = start + num_bytes - 1;
 213        set_extent_bits(&root->fs_info->freed_extents[0],
 214                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 215        set_extent_bits(&root->fs_info->freed_extents[1],
 216                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 217        return 0;
 218}
 219
 220static void free_excluded_extents(struct btrfs_root *root,
 221                                  struct btrfs_block_group_cache *cache)
 222{
 223        u64 start, end;
 224
 225        start = cache->key.objectid;
 226        end = start + cache->key.offset - 1;
 227
 228        clear_extent_bits(&root->fs_info->freed_extents[0],
 229                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 230        clear_extent_bits(&root->fs_info->freed_extents[1],
 231                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 232}
 233
 234static int exclude_super_stripes(struct btrfs_root *root,
 235                                 struct btrfs_block_group_cache *cache)
 236{
 237        u64 bytenr;
 238        u64 *logical;
 239        int stripe_len;
 240        int i, nr, ret;
 241
 242        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 243                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 244                cache->bytes_super += stripe_len;
 245                ret = add_excluded_extent(root, cache->key.objectid,
 246                                          stripe_len);
 247                BUG_ON(ret);
 248        }
 249
 250        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 251                bytenr = btrfs_sb_offset(i);
 252                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 253                                       cache->key.objectid, bytenr,
 254                                       0, &logical, &nr, &stripe_len);
 255                BUG_ON(ret);
 256
 257                while (nr--) {
 258                        cache->bytes_super += stripe_len;
 259                        ret = add_excluded_extent(root, logical[nr],
 260                                                  stripe_len);
 261                        BUG_ON(ret);
 262                }
 263
 264                kfree(logical);
 265        }
 266        return 0;
 267}
 268
 269static struct btrfs_caching_control *
 270get_caching_control(struct btrfs_block_group_cache *cache)
 271{
 272        struct btrfs_caching_control *ctl;
 273
 274        spin_lock(&cache->lock);
 275        if (cache->cached != BTRFS_CACHE_STARTED) {
 276                spin_unlock(&cache->lock);
 277                return NULL;
 278        }
 279
 280        /* We're loading it the fast way, so we don't have a caching_ctl. */
 281        if (!cache->caching_ctl) {
 282                spin_unlock(&cache->lock);
 283                return NULL;
 284        }
 285
 286        ctl = cache->caching_ctl;
 287        atomic_inc(&ctl->count);
 288        spin_unlock(&cache->lock);
 289        return ctl;
 290}
 291
 292static void put_caching_control(struct btrfs_caching_control *ctl)
 293{
 294        if (atomic_dec_and_test(&ctl->count))
 295                kfree(ctl);
 296}
 297
 298/*
 299 * this is only called by cache_block_group, since we could have freed extents
 300 * we need to check the pinned_extents for any extents that can't be used yet
 301 * since their free space will be released as soon as the transaction commits.
 302 */
 303static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 304                              struct btrfs_fs_info *info, u64 start, u64 end)
 305{
 306        u64 extent_start, extent_end, size, total_added = 0;
 307        int ret;
 308
 309        while (start < end) {
 310                ret = find_first_extent_bit(info->pinned_extents, start,
 311                                            &extent_start, &extent_end,
 312                                            EXTENT_DIRTY | EXTENT_UPTODATE);
 313                if (ret)
 314                        break;
 315
 316                if (extent_start <= start) {
 317                        start = extent_end + 1;
 318                } else if (extent_start > start && extent_start < end) {
 319                        size = extent_start - start;
 320                        total_added += size;
 321                        ret = btrfs_add_free_space(block_group, start,
 322                                                   size);
 323                        BUG_ON(ret);
 324                        start = extent_end + 1;
 325                } else {
 326                        break;
 327                }
 328        }
 329
 330        if (start < end) {
 331                size = end - start;
 332                total_added += size;
 333                ret = btrfs_add_free_space(block_group, start, size);
 334                BUG_ON(ret);
 335        }
 336
 337        return total_added;
 338}
 339
 340static noinline void caching_thread(struct btrfs_work *work)
 341{
 342        struct btrfs_block_group_cache *block_group;
 343        struct btrfs_fs_info *fs_info;
 344        struct btrfs_caching_control *caching_ctl;
 345        struct btrfs_root *extent_root;
 346        struct btrfs_path *path;
 347        struct extent_buffer *leaf;
 348        struct btrfs_key key;
 349        u64 total_found = 0;
 350        u64 last = 0;
 351        u32 nritems;
 352        int ret = 0;
 353
 354        caching_ctl = container_of(work, struct btrfs_caching_control, work);
 355        block_group = caching_ctl->block_group;
 356        fs_info = block_group->fs_info;
 357        extent_root = fs_info->extent_root;
 358
 359        path = btrfs_alloc_path();
 360        if (!path)
 361                goto out;
 362
 363        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 364
 365        /*
 366         * We don't want to deadlock with somebody trying to allocate a new
 367         * extent for the extent root while also trying to search the extent
 368         * root to add free space.  So we skip locking and search the commit
 369         * root, since its read-only
 370         */
 371        path->skip_locking = 1;
 372        path->search_commit_root = 1;
 373        path->reada = 1;
 374
 375        key.objectid = last;
 376        key.offset = 0;
 377        key.type = BTRFS_EXTENT_ITEM_KEY;
 378again:
 379        mutex_lock(&caching_ctl->mutex);
 380        /* need to make sure the commit_root doesn't disappear */
 381        down_read(&fs_info->extent_commit_sem);
 382
 383        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 384        if (ret < 0)
 385                goto err;
 386
 387        leaf = path->nodes[0];
 388        nritems = btrfs_header_nritems(leaf);
 389
 390        while (1) {
 391                if (btrfs_fs_closing(fs_info) > 1) {
 392                        last = (u64)-1;
 393                        break;
 394                }
 395
 396                if (path->slots[0] < nritems) {
 397                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 398                } else {
 399                        ret = find_next_key(path, 0, &key);
 400                        if (ret)
 401                                break;
 402
 403                        if (need_resched() ||
 404                            btrfs_next_leaf(extent_root, path)) {
 405                                caching_ctl->progress = last;
 406                                btrfs_release_path(path);
 407                                up_read(&fs_info->extent_commit_sem);
 408                                mutex_unlock(&caching_ctl->mutex);
 409                                cond_resched();
 410                                goto again;
 411                        }
 412                        leaf = path->nodes[0];
 413                        nritems = btrfs_header_nritems(leaf);
 414                        continue;
 415                }
 416
 417                if (key.objectid < block_group->key.objectid) {
 418                        path->slots[0]++;
 419                        continue;
 420                }
 421
 422                if (key.objectid >= block_group->key.objectid +
 423                    block_group->key.offset)
 424                        break;
 425
 426                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 427                        total_found += add_new_free_space(block_group,
 428                                                          fs_info, last,
 429                                                          key.objectid);
 430                        last = key.objectid + key.offset;
 431
 432                        if (total_found > (1024 * 1024 * 2)) {
 433                                total_found = 0;
 434                                wake_up(&caching_ctl->wait);
 435                        }
 436                }
 437                path->slots[0]++;
 438        }
 439        ret = 0;
 440
 441        total_found += add_new_free_space(block_group, fs_info, last,
 442                                          block_group->key.objectid +
 443                                          block_group->key.offset);
 444        caching_ctl->progress = (u64)-1;
 445
 446        spin_lock(&block_group->lock);
 447        block_group->caching_ctl = NULL;
 448        block_group->cached = BTRFS_CACHE_FINISHED;
 449        spin_unlock(&block_group->lock);
 450
 451err:
 452        btrfs_free_path(path);
 453        up_read(&fs_info->extent_commit_sem);
 454
 455        free_excluded_extents(extent_root, block_group);
 456
 457        mutex_unlock(&caching_ctl->mutex);
 458out:
 459        wake_up(&caching_ctl->wait);
 460
 461        put_caching_control(caching_ctl);
 462        btrfs_put_block_group(block_group);
 463}
 464
 465static int cache_block_group(struct btrfs_block_group_cache *cache,
 466                             struct btrfs_trans_handle *trans,
 467                             struct btrfs_root *root,
 468                             int load_cache_only)
 469{
 470        DEFINE_WAIT(wait);
 471        struct btrfs_fs_info *fs_info = cache->fs_info;
 472        struct btrfs_caching_control *caching_ctl;
 473        int ret = 0;
 474
 475        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 476        BUG_ON(!caching_ctl);
 477
 478        INIT_LIST_HEAD(&caching_ctl->list);
 479        mutex_init(&caching_ctl->mutex);
 480        init_waitqueue_head(&caching_ctl->wait);
 481        caching_ctl->block_group = cache;
 482        caching_ctl->progress = cache->key.objectid;
 483        atomic_set(&caching_ctl->count, 1);
 484        caching_ctl->work.func = caching_thread;
 485
 486        spin_lock(&cache->lock);
 487        /*
 488         * This should be a rare occasion, but this could happen I think in the
 489         * case where one thread starts to load the space cache info, and then
 490         * some other thread starts a transaction commit which tries to do an
 491         * allocation while the other thread is still loading the space cache
 492         * info.  The previous loop should have kept us from choosing this block
 493         * group, but if we've moved to the state where we will wait on caching
 494         * block groups we need to first check if we're doing a fast load here,
 495         * so we can wait for it to finish, otherwise we could end up allocating
 496         * from a block group who's cache gets evicted for one reason or
 497         * another.
 498         */
 499        while (cache->cached == BTRFS_CACHE_FAST) {
 500                struct btrfs_caching_control *ctl;
 501
 502                ctl = cache->caching_ctl;
 503                atomic_inc(&ctl->count);
 504                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 505                spin_unlock(&cache->lock);
 506
 507                schedule();
 508
 509                finish_wait(&ctl->wait, &wait);
 510                put_caching_control(ctl);
 511                spin_lock(&cache->lock);
 512        }
 513
 514        if (cache->cached != BTRFS_CACHE_NO) {
 515                spin_unlock(&cache->lock);
 516                kfree(caching_ctl);
 517                return 0;
 518        }
 519        WARN_ON(cache->caching_ctl);
 520        cache->caching_ctl = caching_ctl;
 521        cache->cached = BTRFS_CACHE_FAST;
 522        spin_unlock(&cache->lock);
 523
 524        /*
 525         * We can't do the read from on-disk cache during a commit since we need
 526         * to have the normal tree locking.  Also if we are currently trying to
 527         * allocate blocks for the tree root we can't do the fast caching since
 528         * we likely hold important locks.
 529         */
 530        if (trans && (!trans->transaction->in_commit) &&
 531            (root && root != root->fs_info->tree_root) &&
 532            btrfs_test_opt(root, SPACE_CACHE)) {
 533                ret = load_free_space_cache(fs_info, cache);
 534
 535                spin_lock(&cache->lock);
 536                if (ret == 1) {
 537                        cache->caching_ctl = NULL;
 538                        cache->cached = BTRFS_CACHE_FINISHED;
 539                        cache->last_byte_to_unpin = (u64)-1;
 540                } else {
 541                        if (load_cache_only) {
 542                                cache->caching_ctl = NULL;
 543                                cache->cached = BTRFS_CACHE_NO;
 544                        } else {
 545                                cache->cached = BTRFS_CACHE_STARTED;
 546                        }
 547                }
 548                spin_unlock(&cache->lock);
 549                wake_up(&caching_ctl->wait);
 550                if (ret == 1) {
 551                        put_caching_control(caching_ctl);
 552                        free_excluded_extents(fs_info->extent_root, cache);
 553                        return 0;
 554                }
 555        } else {
 556                /*
 557                 * We are not going to do the fast caching, set cached to the
 558                 * appropriate value and wakeup any waiters.
 559                 */
 560                spin_lock(&cache->lock);
 561                if (load_cache_only) {
 562                        cache->caching_ctl = NULL;
 563                        cache->cached = BTRFS_CACHE_NO;
 564                } else {
 565                        cache->cached = BTRFS_CACHE_STARTED;
 566                }
 567                spin_unlock(&cache->lock);
 568                wake_up(&caching_ctl->wait);
 569        }
 570
 571        if (load_cache_only) {
 572                put_caching_control(caching_ctl);
 573                return 0;
 574        }
 575
 576        down_write(&fs_info->extent_commit_sem);
 577        atomic_inc(&caching_ctl->count);
 578        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 579        up_write(&fs_info->extent_commit_sem);
 580
 581        btrfs_get_block_group(cache);
 582
 583        btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
 584
 585        return ret;
 586}
 587
 588/*
 589 * return the block group that starts at or after bytenr
 590 */
 591static struct btrfs_block_group_cache *
 592btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 593{
 594        struct btrfs_block_group_cache *cache;
 595
 596        cache = block_group_cache_tree_search(info, bytenr, 0);
 597
 598        return cache;
 599}
 600
 601/*
 602 * return the block group that contains the given bytenr
 603 */
 604struct btrfs_block_group_cache *btrfs_lookup_block_group(
 605                                                 struct btrfs_fs_info *info,
 606                                                 u64 bytenr)
 607{
 608        struct btrfs_block_group_cache *cache;
 609
 610        cache = block_group_cache_tree_search(info, bytenr, 1);
 611
 612        return cache;
 613}
 614
 615static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 616                                                  u64 flags)
 617{
 618        struct list_head *head = &info->space_info;
 619        struct btrfs_space_info *found;
 620
 621        flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
 622                 BTRFS_BLOCK_GROUP_METADATA;
 623
 624        rcu_read_lock();
 625        list_for_each_entry_rcu(found, head, list) {
 626                if (found->flags & flags) {
 627                        rcu_read_unlock();
 628                        return found;
 629                }
 630        }
 631        rcu_read_unlock();
 632        return NULL;
 633}
 634
 635/*
 636 * after adding space to the filesystem, we need to clear the full flags
 637 * on all the space infos.
 638 */
 639void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 640{
 641        struct list_head *head = &info->space_info;
 642        struct btrfs_space_info *found;
 643
 644        rcu_read_lock();
 645        list_for_each_entry_rcu(found, head, list)
 646                found->full = 0;
 647        rcu_read_unlock();
 648}
 649
 650static u64 div_factor(u64 num, int factor)
 651{
 652        if (factor == 10)
 653                return num;
 654        num *= factor;
 655        do_div(num, 10);
 656        return num;
 657}
 658
 659static u64 div_factor_fine(u64 num, int factor)
 660{
 661        if (factor == 100)
 662                return num;
 663        num *= factor;
 664        do_div(num, 100);
 665        return num;
 666}
 667
 668u64 btrfs_find_block_group(struct btrfs_root *root,
 669                           u64 search_start, u64 search_hint, int owner)
 670{
 671        struct btrfs_block_group_cache *cache;
 672        u64 used;
 673        u64 last = max(search_hint, search_start);
 674        u64 group_start = 0;
 675        int full_search = 0;
 676        int factor = 9;
 677        int wrapped = 0;
 678again:
 679        while (1) {
 680                cache = btrfs_lookup_first_block_group(root->fs_info, last);
 681                if (!cache)
 682                        break;
 683
 684                spin_lock(&cache->lock);
 685                last = cache->key.objectid + cache->key.offset;
 686                used = btrfs_block_group_used(&cache->item);
 687
 688                if ((full_search || !cache->ro) &&
 689                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
 690                        if (used + cache->pinned + cache->reserved <
 691                            div_factor(cache->key.offset, factor)) {
 692                                group_start = cache->key.objectid;
 693                                spin_unlock(&cache->lock);
 694                                btrfs_put_block_group(cache);
 695                                goto found;
 696                        }
 697                }
 698                spin_unlock(&cache->lock);
 699                btrfs_put_block_group(cache);
 700                cond_resched();
 701        }
 702        if (!wrapped) {
 703                last = search_start;
 704                wrapped = 1;
 705                goto again;
 706        }
 707        if (!full_search && factor < 10) {
 708                last = search_start;
 709                full_search = 1;
 710                factor = 10;
 711                goto again;
 712        }
 713found:
 714        return group_start;
 715}
 716
 717/* simple helper to search for an existing extent at a given offset */
 718int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 719{
 720        int ret;
 721        struct btrfs_key key;
 722        struct btrfs_path *path;
 723
 724        path = btrfs_alloc_path();
 725        if (!path)
 726                return -ENOMEM;
 727
 728        key.objectid = start;
 729        key.offset = len;
 730        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 731        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 732                                0, 0);
 733        btrfs_free_path(path);
 734        return ret;
 735}
 736
 737/*
 738 * helper function to lookup reference count and flags of extent.
 739 *
 740 * the head node for delayed ref is used to store the sum of all the
 741 * reference count modifications queued up in the rbtree. the head
 742 * node may also store the extent flags to set. This way you can check
 743 * to see what the reference count and extent flags would be if all of
 744 * the delayed refs are not processed.
 745 */
 746int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 747                             struct btrfs_root *root, u64 bytenr,
 748                             u64 num_bytes, u64 *refs, u64 *flags)
 749{
 750        struct btrfs_delayed_ref_head *head;
 751        struct btrfs_delayed_ref_root *delayed_refs;
 752        struct btrfs_path *path;
 753        struct btrfs_extent_item *ei;
 754        struct extent_buffer *leaf;
 755        struct btrfs_key key;
 756        u32 item_size;
 757        u64 num_refs;
 758        u64 extent_flags;
 759        int ret;
 760
 761        path = btrfs_alloc_path();
 762        if (!path)
 763                return -ENOMEM;
 764
 765        key.objectid = bytenr;
 766        key.type = BTRFS_EXTENT_ITEM_KEY;
 767        key.offset = num_bytes;
 768        if (!trans) {
 769                path->skip_locking = 1;
 770                path->search_commit_root = 1;
 771        }
 772again:
 773        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 774                                &key, path, 0, 0);
 775        if (ret < 0)
 776                goto out_free;
 777
 778        if (ret == 0) {
 779                leaf = path->nodes[0];
 780                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 781                if (item_size >= sizeof(*ei)) {
 782                        ei = btrfs_item_ptr(leaf, path->slots[0],
 783                                            struct btrfs_extent_item);
 784                        num_refs = btrfs_extent_refs(leaf, ei);
 785                        extent_flags = btrfs_extent_flags(leaf, ei);
 786                } else {
 787#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 788                        struct btrfs_extent_item_v0 *ei0;
 789                        BUG_ON(item_size != sizeof(*ei0));
 790                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 791                                             struct btrfs_extent_item_v0);
 792                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
 793                        /* FIXME: this isn't correct for data */
 794                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 795#else
 796                        BUG();
 797#endif
 798                }
 799                BUG_ON(num_refs == 0);
 800        } else {
 801                num_refs = 0;
 802                extent_flags = 0;
 803                ret = 0;
 804        }
 805
 806        if (!trans)
 807                goto out;
 808
 809        delayed_refs = &trans->transaction->delayed_refs;
 810        spin_lock(&delayed_refs->lock);
 811        head = btrfs_find_delayed_ref_head(trans, bytenr);
 812        if (head) {
 813                if (!mutex_trylock(&head->mutex)) {
 814                        atomic_inc(&head->node.refs);
 815                        spin_unlock(&delayed_refs->lock);
 816
 817                        btrfs_release_path(path);
 818
 819                        /*
 820                         * Mutex was contended, block until it's released and try
 821                         * again
 822                         */
 823                        mutex_lock(&head->mutex);
 824                        mutex_unlock(&head->mutex);
 825                        btrfs_put_delayed_ref(&head->node);
 826                        goto again;
 827                }
 828                if (head->extent_op && head->extent_op->update_flags)
 829                        extent_flags |= head->extent_op->flags_to_set;
 830                else
 831                        BUG_ON(num_refs == 0);
 832
 833                num_refs += head->node.ref_mod;
 834                mutex_unlock(&head->mutex);
 835        }
 836        spin_unlock(&delayed_refs->lock);
 837out:
 838        WARN_ON(num_refs == 0);
 839        if (refs)
 840                *refs = num_refs;
 841        if (flags)
 842                *flags = extent_flags;
 843out_free:
 844        btrfs_free_path(path);
 845        return ret;
 846}
 847
 848/*
 849 * Back reference rules.  Back refs have three main goals:
 850 *
 851 * 1) differentiate between all holders of references to an extent so that
 852 *    when a reference is dropped we can make sure it was a valid reference
 853 *    before freeing the extent.
 854 *
 855 * 2) Provide enough information to quickly find the holders of an extent
 856 *    if we notice a given block is corrupted or bad.
 857 *
 858 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 859 *    maintenance.  This is actually the same as #2, but with a slightly
 860 *    different use case.
 861 *
 862 * There are two kinds of back refs. The implicit back refs is optimized
 863 * for pointers in non-shared tree blocks. For a given pointer in a block,
 864 * back refs of this kind provide information about the block's owner tree
 865 * and the pointer's key. These information allow us to find the block by
 866 * b-tree searching. The full back refs is for pointers in tree blocks not
 867 * referenced by their owner trees. The location of tree block is recorded
 868 * in the back refs. Actually the full back refs is generic, and can be
 869 * used in all cases the implicit back refs is used. The major shortcoming
 870 * of the full back refs is its overhead. Every time a tree block gets
 871 * COWed, we have to update back refs entry for all pointers in it.
 872 *
 873 * For a newly allocated tree block, we use implicit back refs for
 874 * pointers in it. This means most tree related operations only involve
 875 * implicit back refs. For a tree block created in old transaction, the
 876 * only way to drop a reference to it is COW it. So we can detect the
 877 * event that tree block loses its owner tree's reference and do the
 878 * back refs conversion.
 879 *
 880 * When a tree block is COW'd through a tree, there are four cases:
 881 *
 882 * The reference count of the block is one and the tree is the block's
 883 * owner tree. Nothing to do in this case.
 884 *
 885 * The reference count of the block is one and the tree is not the
 886 * block's owner tree. In this case, full back refs is used for pointers
 887 * in the block. Remove these full back refs, add implicit back refs for
 888 * every pointers in the new block.
 889 *
 890 * The reference count of the block is greater than one and the tree is
 891 * the block's owner tree. In this case, implicit back refs is used for
 892 * pointers in the block. Add full back refs for every pointers in the
 893 * block, increase lower level extents' reference counts. The original
 894 * implicit back refs are entailed to the new block.
 895 *
 896 * The reference count of the block is greater than one and the tree is
 897 * not the block's owner tree. Add implicit back refs for every pointer in
 898 * the new block, increase lower level extents' reference count.
 899 *
 900 * Back Reference Key composing:
 901 *
 902 * The key objectid corresponds to the first byte in the extent,
 903 * The key type is used to differentiate between types of back refs.
 904 * There are different meanings of the key offset for different types
 905 * of back refs.
 906 *
 907 * File extents can be referenced by:
 908 *
 909 * - multiple snapshots, subvolumes, or different generations in one subvol
 910 * - different files inside a single subvolume
 911 * - different offsets inside a file (bookend extents in file.c)
 912 *
 913 * The extent ref structure for the implicit back refs has fields for:
 914 *
 915 * - Objectid of the subvolume root
 916 * - objectid of the file holding the reference
 917 * - original offset in the file
 918 * - how many bookend extents
 919 *
 920 * The key offset for the implicit back refs is hash of the first
 921 * three fields.
 922 *
 923 * The extent ref structure for the full back refs has field for:
 924 *
 925 * - number of pointers in the tree leaf
 926 *
 927 * The key offset for the implicit back refs is the first byte of
 928 * the tree leaf
 929 *
 930 * When a file extent is allocated, The implicit back refs is used.
 931 * the fields are filled in:
 932 *
 933 *     (root_key.objectid, inode objectid, offset in file, 1)
 934 *
 935 * When a file extent is removed file truncation, we find the
 936 * corresponding implicit back refs and check the following fields:
 937 *
 938 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 939 *
 940 * Btree extents can be referenced by:
 941 *
 942 * - Different subvolumes
 943 *
 944 * Both the implicit back refs and the full back refs for tree blocks
 945 * only consist of key. The key offset for the implicit back refs is
 946 * objectid of block's owner tree. The key offset for the full back refs
 947 * is the first byte of parent block.
 948 *
 949 * When implicit back refs is used, information about the lowest key and
 950 * level of the tree block are required. These information are stored in
 951 * tree block info structure.
 952 */
 953
 954#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 955static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 956                                  struct btrfs_root *root,
 957                                  struct btrfs_path *path,
 958                                  u64 owner, u32 extra_size)
 959{
 960        struct btrfs_extent_item *item;
 961        struct btrfs_extent_item_v0 *ei0;
 962        struct btrfs_extent_ref_v0 *ref0;
 963        struct btrfs_tree_block_info *bi;
 964        struct extent_buffer *leaf;
 965        struct btrfs_key key;
 966        struct btrfs_key found_key;
 967        u32 new_size = sizeof(*item);
 968        u64 refs;
 969        int ret;
 970
 971        leaf = path->nodes[0];
 972        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
 973
 974        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 975        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 976                             struct btrfs_extent_item_v0);
 977        refs = btrfs_extent_refs_v0(leaf, ei0);
 978
 979        if (owner == (u64)-1) {
 980                while (1) {
 981                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 982                                ret = btrfs_next_leaf(root, path);
 983                                if (ret < 0)
 984                                        return ret;
 985                                BUG_ON(ret > 0);
 986                                leaf = path->nodes[0];
 987                        }
 988                        btrfs_item_key_to_cpu(leaf, &found_key,
 989                                              path->slots[0]);
 990                        BUG_ON(key.objectid != found_key.objectid);
 991                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
 992                                path->slots[0]++;
 993                                continue;
 994                        }
 995                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
 996                                              struct btrfs_extent_ref_v0);
 997                        owner = btrfs_ref_objectid_v0(leaf, ref0);
 998                        break;
 999                }
1000        }
1001        btrfs_release_path(path);
1002
1003        if (owner < BTRFS_FIRST_FREE_OBJECTID)
1004                new_size += sizeof(*bi);
1005
1006        new_size -= sizeof(*ei0);
1007        ret = btrfs_search_slot(trans, root, &key, path,
1008                                new_size + extra_size, 1);
1009        if (ret < 0)
1010                return ret;
1011        BUG_ON(ret);
1012
1013        ret = btrfs_extend_item(trans, root, path, new_size);
1014
1015        leaf = path->nodes[0];
1016        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1017        btrfs_set_extent_refs(leaf, item, refs);
1018        /* FIXME: get real generation */
1019        btrfs_set_extent_generation(leaf, item, 0);
1020        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1021                btrfs_set_extent_flags(leaf, item,
1022                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1023                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1024                bi = (struct btrfs_tree_block_info *)(item + 1);
1025                /* FIXME: get first key of the block */
1026                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1027                btrfs_set_tree_block_level(leaf, bi, (int)owner);
1028        } else {
1029                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1030        }
1031        btrfs_mark_buffer_dirty(leaf);
1032        return 0;
1033}
1034#endif
1035
1036static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1037{
1038        u32 high_crc = ~(u32)0;
1039        u32 low_crc = ~(u32)0;
1040        __le64 lenum;
1041
1042        lenum = cpu_to_le64(root_objectid);
1043        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1044        lenum = cpu_to_le64(owner);
1045        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1046        lenum = cpu_to_le64(offset);
1047        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1048
1049        return ((u64)high_crc << 31) ^ (u64)low_crc;
1050}
1051
1052static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1053                                     struct btrfs_extent_data_ref *ref)
1054{
1055        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1056                                    btrfs_extent_data_ref_objectid(leaf, ref),
1057                                    btrfs_extent_data_ref_offset(leaf, ref));
1058}
1059
1060static int match_extent_data_ref(struct extent_buffer *leaf,
1061                                 struct btrfs_extent_data_ref *ref,
1062                                 u64 root_objectid, u64 owner, u64 offset)
1063{
1064        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1065            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1066            btrfs_extent_data_ref_offset(leaf, ref) != offset)
1067                return 0;
1068        return 1;
1069}
1070
1071static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1072                                           struct btrfs_root *root,
1073                                           struct btrfs_path *path,
1074                                           u64 bytenr, u64 parent,
1075                                           u64 root_objectid,
1076                                           u64 owner, u64 offset)
1077{
1078        struct btrfs_key key;
1079        struct btrfs_extent_data_ref *ref;
1080        struct extent_buffer *leaf;
1081        u32 nritems;
1082        int ret;
1083        int recow;
1084        int err = -ENOENT;
1085
1086        key.objectid = bytenr;
1087        if (parent) {
1088                key.type = BTRFS_SHARED_DATA_REF_KEY;
1089                key.offset = parent;
1090        } else {
1091                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1092                key.offset = hash_extent_data_ref(root_objectid,
1093                                                  owner, offset);
1094        }
1095again:
1096        recow = 0;
1097        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1098        if (ret < 0) {
1099                err = ret;
1100                goto fail;
1101        }
1102
1103        if (parent) {
1104                if (!ret)
1105                        return 0;
1106#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1107                key.type = BTRFS_EXTENT_REF_V0_KEY;
1108                btrfs_release_path(path);
1109                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1110                if (ret < 0) {
1111                        err = ret;
1112                        goto fail;
1113                }
1114                if (!ret)
1115                        return 0;
1116#endif
1117                goto fail;
1118        }
1119
1120        leaf = path->nodes[0];
1121        nritems = btrfs_header_nritems(leaf);
1122        while (1) {
1123                if (path->slots[0] >= nritems) {
1124                        ret = btrfs_next_leaf(root, path);
1125                        if (ret < 0)
1126                                err = ret;
1127                        if (ret)
1128                                goto fail;
1129
1130                        leaf = path->nodes[0];
1131                        nritems = btrfs_header_nritems(leaf);
1132                        recow = 1;
1133                }
1134
1135                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1136                if (key.objectid != bytenr ||
1137                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1138                        goto fail;
1139
1140                ref = btrfs_item_ptr(leaf, path->slots[0],
1141                                     struct btrfs_extent_data_ref);
1142
1143                if (match_extent_data_ref(leaf, ref, root_objectid,
1144                                          owner, offset)) {
1145                        if (recow) {
1146                                btrfs_release_path(path);
1147                                goto again;
1148                        }
1149                        err = 0;
1150                        break;
1151                }
1152                path->slots[0]++;
1153        }
1154fail:
1155        return err;
1156}
1157
1158static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1159                                           struct btrfs_root *root,
1160                                           struct btrfs_path *path,
1161                                           u64 bytenr, u64 parent,
1162                                           u64 root_objectid, u64 owner,
1163                                           u64 offset, int refs_to_add)
1164{
1165        struct btrfs_key key;
1166        struct extent_buffer *leaf;
1167        u32 size;
1168        u32 num_refs;
1169        int ret;
1170
1171        key.objectid = bytenr;
1172        if (parent) {
1173                key.type = BTRFS_SHARED_DATA_REF_KEY;
1174                key.offset = parent;
1175                size = sizeof(struct btrfs_shared_data_ref);
1176        } else {
1177                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1178                key.offset = hash_extent_data_ref(root_objectid,
1179                                                  owner, offset);
1180                size = sizeof(struct btrfs_extent_data_ref);
1181        }
1182
1183        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1184        if (ret && ret != -EEXIST)
1185                goto fail;
1186
1187        leaf = path->nodes[0];
1188        if (parent) {
1189                struct btrfs_shared_data_ref *ref;
1190                ref = btrfs_item_ptr(leaf, path->slots[0],
1191                                     struct btrfs_shared_data_ref);
1192                if (ret == 0) {
1193                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1194                } else {
1195                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1196                        num_refs += refs_to_add;
1197                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1198                }
1199        } else {
1200                struct btrfs_extent_data_ref *ref;
1201                while (ret == -EEXIST) {
1202                        ref = btrfs_item_ptr(leaf, path->slots[0],
1203                                             struct btrfs_extent_data_ref);
1204                        if (match_extent_data_ref(leaf, ref, root_objectid,
1205                                                  owner, offset))
1206                                break;
1207                        btrfs_release_path(path);
1208                        key.offset++;
1209                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1210                                                      size);
1211                        if (ret && ret != -EEXIST)
1212                                goto fail;
1213
1214                        leaf = path->nodes[0];
1215                }
1216                ref = btrfs_item_ptr(leaf, path->slots[0],
1217                                     struct btrfs_extent_data_ref);
1218                if (ret == 0) {
1219                        btrfs_set_extent_data_ref_root(leaf, ref,
1220                                                       root_objectid);
1221                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1222                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1223                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1224                } else {
1225                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1226                        num_refs += refs_to_add;
1227                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1228                }
1229        }
1230        btrfs_mark_buffer_dirty(leaf);
1231        ret = 0;
1232fail:
1233        btrfs_release_path(path);
1234        return ret;
1235}
1236
1237static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1238                                           struct btrfs_root *root,
1239                                           struct btrfs_path *path,
1240                                           int refs_to_drop)
1241{
1242        struct btrfs_key key;
1243        struct btrfs_extent_data_ref *ref1 = NULL;
1244        struct btrfs_shared_data_ref *ref2 = NULL;
1245        struct extent_buffer *leaf;
1246        u32 num_refs = 0;
1247        int ret = 0;
1248
1249        leaf = path->nodes[0];
1250        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1251
1252        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1253                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1254                                      struct btrfs_extent_data_ref);
1255                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1256        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1257                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1258                                      struct btrfs_shared_data_ref);
1259                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1260#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1261        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1262                struct btrfs_extent_ref_v0 *ref0;
1263                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1264                                      struct btrfs_extent_ref_v0);
1265                num_refs = btrfs_ref_count_v0(leaf, ref0);
1266#endif
1267        } else {
1268                BUG();
1269        }
1270
1271        BUG_ON(num_refs < refs_to_drop);
1272        num_refs -= refs_to_drop;
1273
1274        if (num_refs == 0) {
1275                ret = btrfs_del_item(trans, root, path);
1276        } else {
1277                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1278                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1279                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1280                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1281#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1282                else {
1283                        struct btrfs_extent_ref_v0 *ref0;
1284                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1285                                        struct btrfs_extent_ref_v0);
1286                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1287                }
1288#endif
1289                btrfs_mark_buffer_dirty(leaf);
1290        }
1291        return ret;
1292}
1293
1294static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1295                                          struct btrfs_path *path,
1296                                          struct btrfs_extent_inline_ref *iref)
1297{
1298        struct btrfs_key key;
1299        struct extent_buffer *leaf;
1300        struct btrfs_extent_data_ref *ref1;
1301        struct btrfs_shared_data_ref *ref2;
1302        u32 num_refs = 0;
1303
1304        leaf = path->nodes[0];
1305        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1306        if (iref) {
1307                if (btrfs_extent_inline_ref_type(leaf, iref) ==
1308                    BTRFS_EXTENT_DATA_REF_KEY) {
1309                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1310                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1311                } else {
1312                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1313                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1314                }
1315        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1316                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1317                                      struct btrfs_extent_data_ref);
1318                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1319        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1320                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1321                                      struct btrfs_shared_data_ref);
1322                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1323#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1324        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1325                struct btrfs_extent_ref_v0 *ref0;
1326                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1327                                      struct btrfs_extent_ref_v0);
1328                num_refs = btrfs_ref_count_v0(leaf, ref0);
1329#endif
1330        } else {
1331                WARN_ON(1);
1332        }
1333        return num_refs;
1334}
1335
1336static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1337                                          struct btrfs_root *root,
1338                                          struct btrfs_path *path,
1339                                          u64 bytenr, u64 parent,
1340                                          u64 root_objectid)
1341{
1342        struct btrfs_key key;
1343        int ret;
1344
1345        key.objectid = bytenr;
1346        if (parent) {
1347                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1348                key.offset = parent;
1349        } else {
1350                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1351                key.offset = root_objectid;
1352        }
1353
1354        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1355        if (ret > 0)
1356                ret = -ENOENT;
1357#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1358        if (ret == -ENOENT && parent) {
1359                btrfs_release_path(path);
1360                key.type = BTRFS_EXTENT_REF_V0_KEY;
1361                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1362                if (ret > 0)
1363                        ret = -ENOENT;
1364        }
1365#endif
1366        return ret;
1367}
1368
1369static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1370                                          struct btrfs_root *root,
1371                                          struct btrfs_path *path,
1372                                          u64 bytenr, u64 parent,
1373                                          u64 root_objectid)
1374{
1375        struct btrfs_key key;
1376        int ret;
1377
1378        key.objectid = bytenr;
1379        if (parent) {
1380                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1381                key.offset = parent;
1382        } else {
1383                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1384                key.offset = root_objectid;
1385        }
1386
1387        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1388        btrfs_release_path(path);
1389        return ret;
1390}
1391
1392static inline int extent_ref_type(u64 parent, u64 owner)
1393{
1394        int type;
1395        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1396                if (parent > 0)
1397                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1398                else
1399                        type = BTRFS_TREE_BLOCK_REF_KEY;
1400        } else {
1401                if (parent > 0)
1402                        type = BTRFS_SHARED_DATA_REF_KEY;
1403                else
1404                        type = BTRFS_EXTENT_DATA_REF_KEY;
1405        }
1406        return type;
1407}
1408
1409static int find_next_key(struct btrfs_path *path, int level,
1410                         struct btrfs_key *key)
1411
1412{
1413        for (; level < BTRFS_MAX_LEVEL; level++) {
1414                if (!path->nodes[level])
1415                        break;
1416                if (path->slots[level] + 1 >=
1417                    btrfs_header_nritems(path->nodes[level]))
1418                        continue;
1419                if (level == 0)
1420                        btrfs_item_key_to_cpu(path->nodes[level], key,
1421                                              path->slots[level] + 1);
1422                else
1423                        btrfs_node_key_to_cpu(path->nodes[level], key,
1424                                              path->slots[level] + 1);
1425                return 0;
1426        }
1427        return 1;
1428}
1429
1430/*
1431 * look for inline back ref. if back ref is found, *ref_ret is set
1432 * to the address of inline back ref, and 0 is returned.
1433 *
1434 * if back ref isn't found, *ref_ret is set to the address where it
1435 * should be inserted, and -ENOENT is returned.
1436 *
1437 * if insert is true and there are too many inline back refs, the path
1438 * points to the extent item, and -EAGAIN is returned.
1439 *
1440 * NOTE: inline back refs are ordered in the same way that back ref
1441 *       items in the tree are ordered.
1442 */
1443static noinline_for_stack
1444int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1445                                 struct btrfs_root *root,
1446                                 struct btrfs_path *path,
1447                                 struct btrfs_extent_inline_ref **ref_ret,
1448                                 u64 bytenr, u64 num_bytes,
1449                                 u64 parent, u64 root_objectid,
1450                                 u64 owner, u64 offset, int insert)
1451{
1452        struct btrfs_key key;
1453        struct extent_buffer *leaf;
1454        struct btrfs_extent_item *ei;
1455        struct btrfs_extent_inline_ref *iref;
1456        u64 flags;
1457        u64 item_size;
1458        unsigned long ptr;
1459        unsigned long end;
1460        int extra_size;
1461        int type;
1462        int want;
1463        int ret;
1464        int err = 0;
1465
1466        key.objectid = bytenr;
1467        key.type = BTRFS_EXTENT_ITEM_KEY;
1468        key.offset = num_bytes;
1469
1470        want = extent_ref_type(parent, owner);
1471        if (insert) {
1472                extra_size = btrfs_extent_inline_ref_size(want);
1473                path->keep_locks = 1;
1474        } else
1475                extra_size = -1;
1476        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1477        if (ret < 0) {
1478                err = ret;
1479                goto out;
1480        }
1481        BUG_ON(ret);
1482
1483        leaf = path->nodes[0];
1484        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1485#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1486        if (item_size < sizeof(*ei)) {
1487                if (!insert) {
1488                        err = -ENOENT;
1489                        goto out;
1490                }
1491                ret = convert_extent_item_v0(trans, root, path, owner,
1492                                             extra_size);
1493                if (ret < 0) {
1494                        err = ret;
1495                        goto out;
1496                }
1497                leaf = path->nodes[0];
1498                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1499        }
1500#endif
1501        BUG_ON(item_size < sizeof(*ei));
1502
1503        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1504        flags = btrfs_extent_flags(leaf, ei);
1505
1506        ptr = (unsigned long)(ei + 1);
1507        end = (unsigned long)ei + item_size;
1508
1509        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1510                ptr += sizeof(struct btrfs_tree_block_info);
1511                BUG_ON(ptr > end);
1512        } else {
1513                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1514        }
1515
1516        err = -ENOENT;
1517        while (1) {
1518                if (ptr >= end) {
1519                        WARN_ON(ptr > end);
1520                        break;
1521                }
1522                iref = (struct btrfs_extent_inline_ref *)ptr;
1523                type = btrfs_extent_inline_ref_type(leaf, iref);
1524                if (want < type)
1525                        break;
1526                if (want > type) {
1527                        ptr += btrfs_extent_inline_ref_size(type);
1528                        continue;
1529                }
1530
1531                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1532                        struct btrfs_extent_data_ref *dref;
1533                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1534                        if (match_extent_data_ref(leaf, dref, root_objectid,
1535                                                  owner, offset)) {
1536                                err = 0;
1537                                break;
1538                        }
1539                        if (hash_extent_data_ref_item(leaf, dref) <
1540                            hash_extent_data_ref(root_objectid, owner, offset))
1541                                break;
1542                } else {
1543                        u64 ref_offset;
1544                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1545                        if (parent > 0) {
1546                                if (parent == ref_offset) {
1547                                        err = 0;
1548                                        break;
1549                                }
1550                                if (ref_offset < parent)
1551                                        break;
1552                        } else {
1553                                if (root_objectid == ref_offset) {
1554                                        err = 0;
1555                                        break;
1556                                }
1557                                if (ref_offset < root_objectid)
1558                                        break;
1559                        }
1560                }
1561                ptr += btrfs_extent_inline_ref_size(type);
1562        }
1563        if (err == -ENOENT && insert) {
1564                if (item_size + extra_size >=
1565                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1566                        err = -EAGAIN;
1567                        goto out;
1568                }
1569                /*
1570                 * To add new inline back ref, we have to make sure
1571                 * there is no corresponding back ref item.
1572                 * For simplicity, we just do not add new inline back
1573                 * ref if there is any kind of item for this block
1574                 */
1575                if (find_next_key(path, 0, &key) == 0 &&
1576                    key.objectid == bytenr &&
1577                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1578                        err = -EAGAIN;
1579                        goto out;
1580                }
1581        }
1582        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1583out:
1584        if (insert) {
1585                path->keep_locks = 0;
1586                btrfs_unlock_up_safe(path, 1);
1587        }
1588        return err;
1589}
1590
1591/*
1592 * helper to add new inline back ref
1593 */
1594static noinline_for_stack
1595int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1596                                struct btrfs_root *root,
1597                                struct btrfs_path *path,
1598                                struct btrfs_extent_inline_ref *iref,
1599                                u64 parent, u64 root_objectid,
1600                                u64 owner, u64 offset, int refs_to_add,
1601                                struct btrfs_delayed_extent_op *extent_op)
1602{
1603        struct extent_buffer *leaf;
1604        struct btrfs_extent_item *ei;
1605        unsigned long ptr;
1606        unsigned long end;
1607        unsigned long item_offset;
1608        u64 refs;
1609        int size;
1610        int type;
1611        int ret;
1612
1613        leaf = path->nodes[0];
1614        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1615        item_offset = (unsigned long)iref - (unsigned long)ei;
1616
1617        type = extent_ref_type(parent, owner);
1618        size = btrfs_extent_inline_ref_size(type);
1619
1620        ret = btrfs_extend_item(trans, root, path, size);
1621
1622        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1623        refs = btrfs_extent_refs(leaf, ei);
1624        refs += refs_to_add;
1625        btrfs_set_extent_refs(leaf, ei, refs);
1626        if (extent_op)
1627                __run_delayed_extent_op(extent_op, leaf, ei);
1628
1629        ptr = (unsigned long)ei + item_offset;
1630        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1631        if (ptr < end - size)
1632                memmove_extent_buffer(leaf, ptr + size, ptr,
1633                                      end - size - ptr);
1634
1635        iref = (struct btrfs_extent_inline_ref *)ptr;
1636        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1637        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1638                struct btrfs_extent_data_ref *dref;
1639                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1640                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1641                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1642                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1643                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1644        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1645                struct btrfs_shared_data_ref *sref;
1646                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1647                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1648                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1649        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1650                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1651        } else {
1652                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1653        }
1654        btrfs_mark_buffer_dirty(leaf);
1655        return 0;
1656}
1657
1658static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1659                                 struct btrfs_root *root,
1660                                 struct btrfs_path *path,
1661                                 struct btrfs_extent_inline_ref **ref_ret,
1662                                 u64 bytenr, u64 num_bytes, u64 parent,
1663                                 u64 root_objectid, u64 owner, u64 offset)
1664{
1665        int ret;
1666
1667        ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1668                                           bytenr, num_bytes, parent,
1669                                           root_objectid, owner, offset, 0);
1670        if (ret != -ENOENT)
1671                return ret;
1672
1673        btrfs_release_path(path);
1674        *ref_ret = NULL;
1675
1676        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1677                ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1678                                            root_objectid);
1679        } else {
1680                ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1681                                             root_objectid, owner, offset);
1682        }
1683        return ret;
1684}
1685
1686/*
1687 * helper to update/remove inline back ref
1688 */
1689static noinline_for_stack
1690int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1691                                 struct btrfs_root *root,
1692                                 struct btrfs_path *path,
1693                                 struct btrfs_extent_inline_ref *iref,
1694                                 int refs_to_mod,
1695                                 struct btrfs_delayed_extent_op *extent_op)
1696{
1697        struct extent_buffer *leaf;
1698        struct btrfs_extent_item *ei;
1699        struct btrfs_extent_data_ref *dref = NULL;
1700        struct btrfs_shared_data_ref *sref = NULL;
1701        unsigned long ptr;
1702        unsigned long end;
1703        u32 item_size;
1704        int size;
1705        int type;
1706        int ret;
1707        u64 refs;
1708
1709        leaf = path->nodes[0];
1710        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1711        refs = btrfs_extent_refs(leaf, ei);
1712        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1713        refs += refs_to_mod;
1714        btrfs_set_extent_refs(leaf, ei, refs);
1715        if (extent_op)
1716                __run_delayed_extent_op(extent_op, leaf, ei);
1717
1718        type = btrfs_extent_inline_ref_type(leaf, iref);
1719
1720        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1721                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1722                refs = btrfs_extent_data_ref_count(leaf, dref);
1723        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1724                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1725                refs = btrfs_shared_data_ref_count(leaf, sref);
1726        } else {
1727                refs = 1;
1728                BUG_ON(refs_to_mod != -1);
1729        }
1730
1731        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1732        refs += refs_to_mod;
1733
1734        if (refs > 0) {
1735                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1736                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1737                else
1738                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1739        } else {
1740                size =  btrfs_extent_inline_ref_size(type);
1741                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1742                ptr = (unsigned long)iref;
1743                end = (unsigned long)ei + item_size;
1744                if (ptr + size < end)
1745                        memmove_extent_buffer(leaf, ptr, ptr + size,
1746                                              end - ptr - size);
1747                item_size -= size;
1748                ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1749        }
1750        btrfs_mark_buffer_dirty(leaf);
1751        return 0;
1752}
1753
1754static noinline_for_stack
1755int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1756                                 struct btrfs_root *root,
1757                                 struct btrfs_path *path,
1758                                 u64 bytenr, u64 num_bytes, u64 parent,
1759                                 u64 root_objectid, u64 owner,
1760                                 u64 offset, int refs_to_add,
1761                                 struct btrfs_delayed_extent_op *extent_op)
1762{
1763        struct btrfs_extent_inline_ref *iref;
1764        int ret;
1765
1766        ret = lookup_inline_extent_backref(trans, root, path, &iref,
1767                                           bytenr, num_bytes, parent,
1768                                           root_objectid, owner, offset, 1);
1769        if (ret == 0) {
1770                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1771                ret = update_inline_extent_backref(trans, root, path, iref,
1772                                                   refs_to_add, extent_op);
1773        } else if (ret == -ENOENT) {
1774                ret = setup_inline_extent_backref(trans, root, path, iref,
1775                                                  parent, root_objectid,
1776                                                  owner, offset, refs_to_add,
1777                                                  extent_op);
1778        }
1779        return ret;
1780}
1781
1782static int insert_extent_backref(struct btrfs_trans_handle *trans,
1783                                 struct btrfs_root *root,
1784                                 struct btrfs_path *path,
1785                                 u64 bytenr, u64 parent, u64 root_objectid,
1786                                 u64 owner, u64 offset, int refs_to_add)
1787{
1788        int ret;
1789        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1790                BUG_ON(refs_to_add != 1);
1791                ret = insert_tree_block_ref(trans, root, path, bytenr,
1792                                            parent, root_objectid);
1793        } else {
1794                ret = insert_extent_data_ref(trans, root, path, bytenr,
1795                                             parent, root_objectid,
1796                                             owner, offset, refs_to_add);
1797        }
1798        return ret;
1799}
1800
1801static int remove_extent_backref(struct btrfs_trans_handle *trans,
1802                                 struct btrfs_root *root,
1803                                 struct btrfs_path *path,
1804                                 struct btrfs_extent_inline_ref *iref,
1805                                 int refs_to_drop, int is_data)
1806{
1807        int ret;
1808
1809        BUG_ON(!is_data && refs_to_drop != 1);
1810        if (iref) {
1811                ret = update_inline_extent_backref(trans, root, path, iref,
1812                                                   -refs_to_drop, NULL);
1813        } else if (is_data) {
1814                ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1815        } else {
1816                ret = btrfs_del_item(trans, root, path);
1817        }
1818        return ret;
1819}
1820
1821static int btrfs_issue_discard(struct block_device *bdev,
1822                                u64 start, u64 len)
1823{
1824        return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1825}
1826
1827static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1828                                u64 num_bytes, u64 *actual_bytes)
1829{
1830        int ret;
1831        u64 discarded_bytes = 0;
1832        struct btrfs_bio *bbio = NULL;
1833
1834
1835        /* Tell the block device(s) that the sectors can be discarded */
1836        ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1837                              bytenr, &num_bytes, &bbio, 0);
1838        if (!ret) {
1839                struct btrfs_bio_stripe *stripe = bbio->stripes;
1840                int i;
1841
1842
1843                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1844                        if (!stripe->dev->can_discard)
1845                                continue;
1846
1847                        ret = btrfs_issue_discard(stripe->dev->bdev,
1848                                                  stripe->physical,
1849                                                  stripe->length);
1850                        if (!ret)
1851                                discarded_bytes += stripe->length;
1852                        else if (ret != -EOPNOTSUPP)
1853                                break;
1854
1855                        /*
1856                         * Just in case we get back EOPNOTSUPP for some reason,
1857                         * just ignore the return value so we don't screw up
1858                         * people calling discard_extent.
1859                         */
1860                        ret = 0;
1861                }
1862                kfree(bbio);
1863        }
1864
1865        if (actual_bytes)
1866                *actual_bytes = discarded_bytes;
1867
1868
1869        return ret;
1870}
1871
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873                         struct btrfs_root *root,
1874                         u64 bytenr, u64 num_bytes, u64 parent,
1875                         u64 root_objectid, u64 owner, u64 offset)
1876{
1877        int ret;
1878        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879               root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880
1881        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882                ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1883                                        parent, root_objectid, (int)owner,
1884                                        BTRFS_ADD_DELAYED_REF, NULL);
1885        } else {
1886                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1887                                        parent, root_objectid, owner, offset,
1888                                        BTRFS_ADD_DELAYED_REF, NULL);
1889        }
1890        return ret;
1891}
1892
1893static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1894                                  struct btrfs_root *root,
1895                                  u64 bytenr, u64 num_bytes,
1896                                  u64 parent, u64 root_objectid,
1897                                  u64 owner, u64 offset, int refs_to_add,
1898                                  struct btrfs_delayed_extent_op *extent_op)
1899{
1900        struct btrfs_path *path;
1901        struct extent_buffer *leaf;
1902        struct btrfs_extent_item *item;
1903        u64 refs;
1904        int ret;
1905        int err = 0;
1906
1907        path = btrfs_alloc_path();
1908        if (!path)
1909                return -ENOMEM;
1910
1911        path->reada = 1;
1912        path->leave_spinning = 1;
1913        /* this will setup the path even if it fails to insert the back ref */
1914        ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1915                                           path, bytenr, num_bytes, parent,
1916                                           root_objectid, owner, offset,
1917                                           refs_to_add, extent_op);
1918        if (ret == 0)
1919                goto out;
1920
1921        if (ret != -EAGAIN) {
1922                err = ret;
1923                goto out;
1924        }
1925
1926        leaf = path->nodes[0];
1927        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1928        refs = btrfs_extent_refs(leaf, item);
1929        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1930        if (extent_op)
1931                __run_delayed_extent_op(extent_op, leaf, item);
1932
1933        btrfs_mark_buffer_dirty(leaf);
1934        btrfs_release_path(path);
1935
1936        path->reada = 1;
1937        path->leave_spinning = 1;
1938
1939        /* now insert the actual backref */
1940        ret = insert_extent_backref(trans, root->fs_info->extent_root,
1941                                    path, bytenr, parent, root_objectid,
1942                                    owner, offset, refs_to_add);
1943        BUG_ON(ret);
1944out:
1945        btrfs_free_path(path);
1946        return err;
1947}
1948
1949static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1950                                struct btrfs_root *root,
1951                                struct btrfs_delayed_ref_node *node,
1952                                struct btrfs_delayed_extent_op *extent_op,
1953                                int insert_reserved)
1954{
1955        int ret = 0;
1956        struct btrfs_delayed_data_ref *ref;
1957        struct btrfs_key ins;
1958        u64 parent = 0;
1959        u64 ref_root = 0;
1960        u64 flags = 0;
1961
1962        ins.objectid = node->bytenr;
1963        ins.offset = node->num_bytes;
1964        ins.type = BTRFS_EXTENT_ITEM_KEY;
1965
1966        ref = btrfs_delayed_node_to_data_ref(node);
1967        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1968                parent = ref->parent;
1969        else
1970                ref_root = ref->root;
1971
1972        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1973                if (extent_op) {
1974                        BUG_ON(extent_op->update_key);
1975                        flags |= extent_op->flags_to_set;
1976                }
1977                ret = alloc_reserved_file_extent(trans, root,
1978                                                 parent, ref_root, flags,
1979                                                 ref->objectid, ref->offset,
1980                                                 &ins, node->ref_mod);
1981        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1982                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1983                                             node->num_bytes, parent,
1984                                             ref_root, ref->objectid,
1985                                             ref->offset, node->ref_mod,
1986                                             extent_op);
1987        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1988                ret = __btrfs_free_extent(trans, root, node->bytenr,
1989                                          node->num_bytes, parent,
1990                                          ref_root, ref->objectid,
1991                                          ref->offset, node->ref_mod,
1992                                          extent_op);
1993        } else {
1994                BUG();
1995        }
1996        return ret;
1997}
1998
1999static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2000                                    struct extent_buffer *leaf,
2001                                    struct btrfs_extent_item *ei)
2002{
2003        u64 flags = btrfs_extent_flags(leaf, ei);
2004        if (extent_op->update_flags) {
2005                flags |= extent_op->flags_to_set;
2006                btrfs_set_extent_flags(leaf, ei, flags);
2007        }
2008
2009        if (extent_op->update_key) {
2010                struct btrfs_tree_block_info *bi;
2011                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2012                bi = (struct btrfs_tree_block_info *)(ei + 1);
2013                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2014        }
2015}
2016
2017static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2018                                 struct btrfs_root *root,
2019                                 struct btrfs_delayed_ref_node *node,
2020                                 struct btrfs_delayed_extent_op *extent_op)
2021{
2022        struct btrfs_key key;
2023        struct btrfs_path *path;
2024        struct btrfs_extent_item *ei;
2025        struct extent_buffer *leaf;
2026        u32 item_size;
2027        int ret;
2028        int err = 0;
2029
2030        path = btrfs_alloc_path();
2031        if (!path)
2032                return -ENOMEM;
2033
2034        key.objectid = node->bytenr;
2035        key.type = BTRFS_EXTENT_ITEM_KEY;
2036        key.offset = node->num_bytes;
2037
2038        path->reada = 1;
2039        path->leave_spinning = 1;
2040        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2041                                path, 0, 1);
2042        if (ret < 0) {
2043                err = ret;
2044                goto out;
2045        }
2046        if (ret > 0) {
2047                err = -EIO;
2048                goto out;
2049        }
2050
2051        leaf = path->nodes[0];
2052        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2053#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2054        if (item_size < sizeof(*ei)) {
2055                ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2056                                             path, (u64)-1, 0);
2057                if (ret < 0) {
2058                        err = ret;
2059                        goto out;
2060                }
2061                leaf = path->nodes[0];
2062                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2063        }
2064#endif
2065        BUG_ON(item_size < sizeof(*ei));
2066        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2067        __run_delayed_extent_op(extent_op, leaf, ei);
2068
2069        btrfs_mark_buffer_dirty(leaf);
2070out:
2071        btrfs_free_path(path);
2072        return err;
2073}
2074
2075static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2076                                struct btrfs_root *root,
2077                                struct btrfs_delayed_ref_node *node,
2078                                struct btrfs_delayed_extent_op *extent_op,
2079                                int insert_reserved)
2080{
2081        int ret = 0;
2082        struct btrfs_delayed_tree_ref *ref;
2083        struct btrfs_key ins;
2084        u64 parent = 0;
2085        u64 ref_root = 0;
2086
2087        ins.objectid = node->bytenr;
2088        ins.offset = node->num_bytes;
2089        ins.type = BTRFS_EXTENT_ITEM_KEY;
2090
2091        ref = btrfs_delayed_node_to_tree_ref(node);
2092        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2093                parent = ref->parent;
2094        else
2095                ref_root = ref->root;
2096
2097        BUG_ON(node->ref_mod != 1);
2098        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2099                BUG_ON(!extent_op || !extent_op->update_flags ||
2100                       !extent_op->update_key);
2101                ret = alloc_reserved_tree_block(trans, root,
2102                                                parent, ref_root,
2103                                                extent_op->flags_to_set,
2104                                                &extent_op->key,
2105                                                ref->level, &ins);
2106        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2107                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2108                                             node->num_bytes, parent, ref_root,
2109                                             ref->level, 0, 1, extent_op);
2110        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2111                ret = __btrfs_free_extent(trans, root, node->bytenr,
2112                                          node->num_bytes, parent, ref_root,
2113                                          ref->level, 0, 1, extent_op);
2114        } else {
2115                BUG();
2116        }
2117        return ret;
2118}
2119
2120/* helper function to actually process a single delayed ref entry */
2121static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2122                               struct btrfs_root *root,
2123                               struct btrfs_delayed_ref_node *node,
2124                               struct btrfs_delayed_extent_op *extent_op,
2125                               int insert_reserved)
2126{
2127        int ret;
2128        if (btrfs_delayed_ref_is_head(node)) {
2129                struct btrfs_delayed_ref_head *head;
2130                /*
2131                 * we've hit the end of the chain and we were supposed
2132                 * to insert this extent into the tree.  But, it got
2133                 * deleted before we ever needed to insert it, so all
2134                 * we have to do is clean up the accounting
2135                 */
2136                BUG_ON(extent_op);
2137                head = btrfs_delayed_node_to_head(node);
2138                if (insert_reserved) {
2139                        btrfs_pin_extent(root, node->bytenr,
2140                                         node->num_bytes, 1);
2141                        if (head->is_data) {
2142                                ret = btrfs_del_csums(trans, root,
2143                                                      node->bytenr,
2144                                                      node->num_bytes);
2145                                BUG_ON(ret);
2146                        }
2147                }
2148                mutex_unlock(&head->mutex);
2149                return 0;
2150        }
2151
2152        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2153            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2154                ret = run_delayed_tree_ref(trans, root, node, extent_op,
2155                                           insert_reserved);
2156        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2157                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2158                ret = run_delayed_data_ref(trans, root, node, extent_op,
2159                                           insert_reserved);
2160        else
2161                BUG();
2162        return ret;
2163}
2164
2165static noinline struct btrfs_delayed_ref_node *
2166select_delayed_ref(struct btrfs_delayed_ref_head *head)
2167{
2168        struct rb_node *node;
2169        struct btrfs_delayed_ref_node *ref;
2170        int action = BTRFS_ADD_DELAYED_REF;
2171again:
2172        /*
2173         * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2174         * this prevents ref count from going down to zero when
2175         * there still are pending delayed ref.
2176         */
2177        node = rb_prev(&head->node.rb_node);
2178        while (1) {
2179                if (!node)
2180                        break;
2181                ref = rb_entry(node, struct btrfs_delayed_ref_node,
2182                                rb_node);
2183                if (ref->bytenr != head->node.bytenr)
2184                        break;
2185                if (ref->action == action)
2186                        return ref;
2187                node = rb_prev(node);
2188        }
2189        if (action == BTRFS_ADD_DELAYED_REF) {
2190                action = BTRFS_DROP_DELAYED_REF;
2191                goto again;
2192        }
2193        return NULL;
2194}
2195
2196static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2197                                       struct btrfs_root *root,
2198                                       struct list_head *cluster)
2199{
2200        struct btrfs_delayed_ref_root *delayed_refs;
2201        struct btrfs_delayed_ref_node *ref;
2202        struct btrfs_delayed_ref_head *locked_ref = NULL;
2203        struct btrfs_delayed_extent_op *extent_op;
2204        int ret;
2205        int count = 0;
2206        int must_insert_reserved = 0;
2207
2208        delayed_refs = &trans->transaction->delayed_refs;
2209        while (1) {
2210                if (!locked_ref) {
2211                        /* pick a new head ref from the cluster list */
2212                        if (list_empty(cluster))
2213                                break;
2214
2215                        locked_ref = list_entry(cluster->next,
2216                                     struct btrfs_delayed_ref_head, cluster);
2217
2218                        /* grab the lock that says we are going to process
2219                         * all the refs for this head */
2220                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2221
2222                        /*
2223                         * we may have dropped the spin lock to get the head
2224                         * mutex lock, and that might have given someone else
2225                         * time to free the head.  If that's true, it has been
2226                         * removed from our list and we can move on.
2227                         */
2228                        if (ret == -EAGAIN) {
2229                                locked_ref = NULL;
2230                                count++;
2231                                continue;
2232                        }
2233                }
2234
2235                /*
2236                 * record the must insert reserved flag before we
2237                 * drop the spin lock.
2238                 */
2239                must_insert_reserved = locked_ref->must_insert_reserved;
2240                locked_ref->must_insert_reserved = 0;
2241
2242                extent_op = locked_ref->extent_op;
2243                locked_ref->extent_op = NULL;
2244
2245                /*
2246                 * locked_ref is the head node, so we have to go one
2247                 * node back for any delayed ref updates
2248                 */
2249                ref = select_delayed_ref(locked_ref);
2250                if (!ref) {
2251                        /* All delayed refs have been processed, Go ahead
2252                         * and send the head node to run_one_delayed_ref,
2253                         * so that any accounting fixes can happen
2254                         */
2255                        ref = &locked_ref->node;
2256
2257                        if (extent_op && must_insert_reserved) {
2258                                kfree(extent_op);
2259                                extent_op = NULL;
2260                        }
2261
2262                        if (extent_op) {
2263                                spin_unlock(&delayed_refs->lock);
2264
2265                                ret = run_delayed_extent_op(trans, root,
2266                                                            ref, extent_op);
2267                                BUG_ON(ret);
2268                                kfree(extent_op);
2269
2270                                cond_resched();
2271                                spin_lock(&delayed_refs->lock);
2272                                continue;
2273                        }
2274
2275                        list_del_init(&locked_ref->cluster);
2276                        locked_ref = NULL;
2277                }
2278
2279                ref->in_tree = 0;
2280                rb_erase(&ref->rb_node, &delayed_refs->root);
2281                delayed_refs->num_entries--;
2282
2283                spin_unlock(&delayed_refs->lock);
2284
2285                ret = run_one_delayed_ref(trans, root, ref, extent_op,
2286                                          must_insert_reserved);
2287                BUG_ON(ret);
2288
2289                btrfs_put_delayed_ref(ref);
2290                kfree(extent_op);
2291                count++;
2292
2293                cond_resched();
2294                spin_lock(&delayed_refs->lock);
2295        }
2296        return count;
2297}
2298
2299/*
2300 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far.  count can be
2302 * 0, which means to process everything in the tree at the start
2303 * of the run (but not newly added entries), or it can be some target
2304 * number you'd like to process.
2305 */
2306int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2307                           struct btrfs_root *root, unsigned long count)
2308{
2309        struct rb_node *node;
2310        struct btrfs_delayed_ref_root *delayed_refs;
2311        struct btrfs_delayed_ref_node *ref;
2312        struct list_head cluster;
2313        int ret;
2314        int run_all = count == (unsigned long)-1;
2315        int run_most = 0;
2316
2317        if (root == root->fs_info->extent_root)
2318                root = root->fs_info->tree_root;
2319
2320        delayed_refs = &trans->transaction->delayed_refs;
2321        INIT_LIST_HEAD(&cluster);
2322again:
2323        spin_lock(&delayed_refs->lock);
2324        if (count == 0) {
2325                count = delayed_refs->num_entries * 2;
2326                run_most = 1;
2327        }
2328        while (1) {
2329                if (!(run_all || run_most) &&
2330                    delayed_refs->num_heads_ready < 64)
2331                        break;
2332
2333                /*
2334                 * go find something we can process in the rbtree.  We start at
2335                 * the beginning of the tree, and then build a cluster
2336                 * of refs to process starting at the first one we are able to
2337                 * lock
2338                 */
2339                ret = btrfs_find_ref_cluster(trans, &cluster,
2340                                             delayed_refs->run_delayed_start);
2341                if (ret)
2342                        break;
2343
2344                ret = run_clustered_refs(trans, root, &cluster);
2345                BUG_ON(ret < 0);
2346
2347                count -= min_t(unsigned long, ret, count);
2348
2349                if (count == 0)
2350                        break;
2351        }
2352
2353        if (run_all) {
2354                node = rb_first(&delayed_refs->root);
2355                if (!node)
2356                        goto out;
2357                count = (unsigned long)-1;
2358
2359                while (node) {
2360                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
2361                                       rb_node);
2362                        if (btrfs_delayed_ref_is_head(ref)) {
2363                                struct btrfs_delayed_ref_head *head;
2364
2365                                head = btrfs_delayed_node_to_head(ref);
2366                                atomic_inc(&ref->refs);
2367
2368                                spin_unlock(&delayed_refs->lock);
2369                                /*
2370                                 * Mutex was contended, block until it's
2371                                 * released and try again
2372                                 */
2373                                mutex_lock(&head->mutex);
2374                                mutex_unlock(&head->mutex);
2375
2376                                btrfs_put_delayed_ref(ref);
2377                                cond_resched();
2378                                goto again;
2379                        }
2380                        node = rb_next(node);
2381                }
2382                spin_unlock(&delayed_refs->lock);
2383                schedule_timeout(1);
2384                goto again;
2385        }
2386out:
2387        spin_unlock(&delayed_refs->lock);
2388        return 0;
2389}
2390
2391int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2392                                struct btrfs_root *root,
2393                                u64 bytenr, u64 num_bytes, u64 flags,
2394                                int is_data)
2395{
2396        struct btrfs_delayed_extent_op *extent_op;
2397        int ret;
2398
2399        extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2400        if (!extent_op)
2401                return -ENOMEM;
2402
2403        extent_op->flags_to_set = flags;
2404        extent_op->update_flags = 1;
2405        extent_op->update_key = 0;
2406        extent_op->is_data = is_data ? 1 : 0;
2407
2408        ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2409        if (ret)
2410                kfree(extent_op);
2411        return ret;
2412}
2413
2414static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2415                                      struct btrfs_root *root,
2416                                      struct btrfs_path *path,
2417                                      u64 objectid, u64 offset, u64 bytenr)
2418{
2419        struct btrfs_delayed_ref_head *head;
2420        struct btrfs_delayed_ref_node *ref;
2421        struct btrfs_delayed_data_ref *data_ref;
2422        struct btrfs_delayed_ref_root *delayed_refs;
2423        struct rb_node *node;
2424        int ret = 0;
2425
2426        ret = -ENOENT;
2427        delayed_refs = &trans->transaction->delayed_refs;
2428        spin_lock(&delayed_refs->lock);
2429        head = btrfs_find_delayed_ref_head(trans, bytenr);
2430        if (!head)
2431                goto out;
2432
2433        if (!mutex_trylock(&head->mutex)) {
2434                atomic_inc(&head->node.refs);
2435                spin_unlock(&delayed_refs->lock);
2436
2437                btrfs_release_path(path);
2438
2439                /*
2440                 * Mutex was contended, block until it's released and let
2441                 * caller try again
2442                 */
2443                mutex_lock(&head->mutex);
2444                mutex_unlock(&head->mutex);
2445                btrfs_put_delayed_ref(&head->node);
2446                return -EAGAIN;
2447        }
2448
2449        node = rb_prev(&head->node.rb_node);
2450        if (!node)
2451                goto out_unlock;
2452
2453        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2454
2455        if (ref->bytenr != bytenr)
2456                goto out_unlock;
2457
2458        ret = 1;
2459        if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2460                goto out_unlock;
2461
2462        data_ref = btrfs_delayed_node_to_data_ref(ref);
2463
2464        node = rb_prev(node);
2465        if (node) {
2466                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2467                if (ref->bytenr == bytenr)
2468                        goto out_unlock;
2469        }
2470
2471        if (data_ref->root != root->root_key.objectid ||
2472            data_ref->objectid != objectid || data_ref->offset != offset)
2473                goto out_unlock;
2474
2475        ret = 0;
2476out_unlock:
2477        mutex_unlock(&head->mutex);
2478out:
2479        spin_unlock(&delayed_refs->lock);
2480        return ret;
2481}
2482
2483static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2484                                        struct btrfs_root *root,
2485                                        struct btrfs_path *path,
2486                                        u64 objectid, u64 offset, u64 bytenr)
2487{
2488        struct btrfs_root *extent_root = root->fs_info->extent_root;
2489        struct extent_buffer *leaf;
2490        struct btrfs_extent_data_ref *ref;
2491        struct btrfs_extent_inline_ref *iref;
2492        struct btrfs_extent_item *ei;
2493        struct btrfs_key key;
2494        u32 item_size;
2495        int ret;
2496
2497        key.objectid = bytenr;
2498        key.offset = (u64)-1;
2499        key.type = BTRFS_EXTENT_ITEM_KEY;
2500
2501        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2502        if (ret < 0)
2503                goto out;
2504        BUG_ON(ret == 0);
2505
2506        ret = -ENOENT;
2507        if (path->slots[0] == 0)
2508                goto out;
2509
2510        path->slots[0]--;
2511        leaf = path->nodes[0];
2512        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2513
2514        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2515                goto out;
2516
2517        ret = 1;
2518        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2519#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2520        if (item_size < sizeof(*ei)) {
2521                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2522                goto out;
2523        }
2524#endif
2525        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2526
2527        if (item_size != sizeof(*ei) +
2528            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2529                goto out;
2530
2531        if (btrfs_extent_generation(leaf, ei) <=
2532            btrfs_root_last_snapshot(&root->root_item))
2533                goto out;
2534
2535        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2536        if (btrfs_extent_inline_ref_type(leaf, iref) !=
2537            BTRFS_EXTENT_DATA_REF_KEY)
2538                goto out;
2539
2540        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2541        if (btrfs_extent_refs(leaf, ei) !=
2542            btrfs_extent_data_ref_count(leaf, ref) ||
2543            btrfs_extent_data_ref_root(leaf, ref) !=
2544            root->root_key.objectid ||
2545            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2546            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2547                goto out;
2548
2549        ret = 0;
2550out:
2551        return ret;
2552}
2553
2554int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2555                          struct btrfs_root *root,
2556                          u64 objectid, u64 offset, u64 bytenr)
2557{
2558        struct btrfs_path *path;
2559        int ret;
2560        int ret2;
2561
2562        path = btrfs_alloc_path();
2563        if (!path)
2564                return -ENOENT;
2565
2566        do {
2567                ret = check_committed_ref(trans, root, path, objectid,
2568                                          offset, bytenr);
2569                if (ret && ret != -ENOENT)
2570                        goto out;
2571
2572                ret2 = check_delayed_ref(trans, root, path, objectid,
2573                                         offset, bytenr);
2574        } while (ret2 == -EAGAIN);
2575
2576        if (ret2 && ret2 != -ENOENT) {
2577                ret = ret2;
2578                goto out;
2579        }
2580
2581        if (ret != -ENOENT || ret2 != -ENOENT)
2582                ret = 0;
2583out:
2584        btrfs_free_path(path);
2585        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2586                WARN_ON(ret > 0);
2587        return ret;
2588}
2589
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591                           struct btrfs_root *root,
2592                           struct extent_buffer *buf,
2593                           int full_backref, int inc)
2594{
2595        u64 bytenr;
2596        u64 num_bytes;
2597        u64 parent;
2598        u64 ref_root;
2599        u32 nritems;
2600        struct btrfs_key key;
2601        struct btrfs_file_extent_item *fi;
2602        int i;
2603        int level;
2604        int ret = 0;
2605        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606                            u64, u64, u64, u64, u64, u64);
2607
2608        ref_root = btrfs_header_owner(buf);
2609        nritems = btrfs_header_nritems(buf);
2610        level = btrfs_header_level(buf);
2611
2612        if (!root->ref_cows && level == 0)
2613                return 0;
2614
2615        if (inc)
2616                process_func = btrfs_inc_extent_ref;
2617        else
2618                process_func = btrfs_free_extent;
2619
2620        if (full_backref)
2621                parent = buf->start;
2622        else
2623                parent = 0;
2624
2625        for (i = 0; i < nritems; i++) {
2626                if (level == 0) {
2627                        btrfs_item_key_to_cpu(buf, &key, i);
2628                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2629                                continue;
2630                        fi = btrfs_item_ptr(buf, i,
2631                                            struct btrfs_file_extent_item);
2632                        if (btrfs_file_extent_type(buf, fi) ==
2633                            BTRFS_FILE_EXTENT_INLINE)
2634                                continue;
2635                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2636                        if (bytenr == 0)
2637                                continue;
2638
2639                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2640                        key.offset -= btrfs_file_extent_offset(buf, fi);
2641                        ret = process_func(trans, root, bytenr, num_bytes,
2642                                           parent, ref_root, key.objectid,
2643                                           key.offset);
2644                        if (ret)
2645                                goto fail;
2646                } else {
2647                        bytenr = btrfs_node_blockptr(buf, i);
2648                        num_bytes = btrfs_level_size(root, level - 1);
2649                        ret = process_func(trans, root, bytenr, num_bytes,
2650                                           parent, ref_root, level - 1, 0);
2651                        if (ret)
2652                                goto fail;
2653                }
2654        }
2655        return 0;
2656fail:
2657        BUG();
2658        return ret;
2659}
2660
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662                  struct extent_buffer *buf, int full_backref)
2663{
2664        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2665}
2666
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668                  struct extent_buffer *buf, int full_backref)
2669{
2670        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2671}
2672
2673static int write_one_cache_group(struct btrfs_trans_handle *trans,
2674                                 struct btrfs_root *root,
2675                                 struct btrfs_path *path,
2676                                 struct btrfs_block_group_cache *cache)
2677{
2678        int ret;
2679        struct btrfs_root *extent_root = root->fs_info->extent_root;
2680        unsigned long bi;
2681        struct extent_buffer *leaf;
2682
2683        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2684        if (ret < 0)
2685                goto fail;
2686        BUG_ON(ret);
2687
2688        leaf = path->nodes[0];
2689        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2690        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2691        btrfs_mark_buffer_dirty(leaf);
2692        btrfs_release_path(path);
2693fail:
2694        if (ret)
2695                return ret;
2696        return 0;
2697
2698}
2699
2700static struct btrfs_block_group_cache *
2701next_block_group(struct btrfs_root *root,
2702                 struct btrfs_block_group_cache *cache)
2703{
2704        struct rb_node *node;
2705        spin_lock(&root->fs_info->block_group_cache_lock);
2706        node = rb_next(&cache->cache_node);
2707        btrfs_put_block_group(cache);
2708        if (node) {
2709                cache = rb_entry(node, struct btrfs_block_group_cache,
2710                                 cache_node);
2711                btrfs_get_block_group(cache);
2712        } else
2713                cache = NULL;
2714        spin_unlock(&root->fs_info->block_group_cache_lock);
2715        return cache;
2716}
2717
2718static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2719                            struct btrfs_trans_handle *trans,
2720                            struct btrfs_path *path)
2721{
2722        struct btrfs_root *root = block_group->fs_info->tree_root;
2723        struct inode *inode = NULL;
2724        u64 alloc_hint = 0;
2725        int dcs = BTRFS_DC_ERROR;
2726        int num_pages = 0;
2727        int retries = 0;
2728        int ret = 0;
2729
2730        /*
2731         * If this block group is smaller than 100 megs don't bother caching the
2732         * block group.
2733         */
2734        if (block_group->key.offset < (100 * 1024 * 1024)) {
2735                spin_lock(&block_group->lock);
2736                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2737                spin_unlock(&block_group->lock);
2738                return 0;
2739        }
2740
2741again:
2742        inode = lookup_free_space_inode(root, block_group, path);
2743        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2744                ret = PTR_ERR(inode);
2745                btrfs_release_path(path);
2746                goto out;
2747        }
2748
2749        if (IS_ERR(inode)) {
2750                BUG_ON(retries);
2751                retries++;
2752
2753                if (block_group->ro)
2754                        goto out_free;
2755
2756                ret = create_free_space_inode(root, trans, block_group, path);
2757                if (ret)
2758                        goto out_free;
2759                goto again;
2760        }
2761
2762        /* We've already setup this transaction, go ahead and exit */
2763        if (block_group->cache_generation == trans->transid &&
2764            i_size_read(inode)) {
2765                dcs = BTRFS_DC_SETUP;
2766                goto out_put;
2767        }
2768
2769        /*
2770         * We want to set the generation to 0, that way if anything goes wrong
2771         * from here on out we know not to trust this cache when we load up next
2772         * time.
2773         */
2774        BTRFS_I(inode)->generation = 0;
2775        ret = btrfs_update_inode(trans, root, inode);
2776        WARN_ON(ret);
2777
2778        if (i_size_read(inode) > 0) {
2779                ret = btrfs_truncate_free_space_cache(root, trans, path,
2780                                                      inode);
2781                if (ret)
2782                        goto out_put;
2783        }
2784
2785        spin_lock(&block_group->lock);
2786        if (block_group->cached != BTRFS_CACHE_FINISHED) {
2787                /* We're not cached, don't bother trying to write stuff out */
2788                dcs = BTRFS_DC_WRITTEN;
2789                spin_unlock(&block_group->lock);
2790                goto out_put;
2791        }
2792        spin_unlock(&block_group->lock);
2793
2794        num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2795        if (!num_pages)
2796                num_pages = 1;
2797
2798        /*
2799         * Just to make absolutely sure we have enough space, we're going to
2800         * preallocate 12 pages worth of space for each block group.  In
2801         * practice we ought to use at most 8, but we need extra space so we can
2802         * add our header and have a terminator between the extents and the
2803         * bitmaps.
2804         */
2805        num_pages *= 16;
2806        num_pages *= PAGE_CACHE_SIZE;
2807
2808        ret = btrfs_check_data_free_space(inode, num_pages);
2809        if (ret)
2810                goto out_put;
2811
2812        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2813                                              num_pages, num_pages,
2814                                              &alloc_hint);
2815        if (!ret)
2816                dcs = BTRFS_DC_SETUP;
2817        btrfs_free_reserved_data_space(inode, num_pages);
2818
2819out_put:
2820        iput(inode);
2821out_free:
2822        btrfs_release_path(path);
2823out:
2824        spin_lock(&block_group->lock);
2825        if (!ret && dcs == BTRFS_DC_SETUP)
2826                block_group->cache_generation = trans->transid;
2827        block_group->disk_cache_state = dcs;
2828        spin_unlock(&block_group->lock);
2829
2830        return ret;
2831}
2832
2833int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2834                                   struct btrfs_root *root)
2835{
2836        struct btrfs_block_group_cache *cache;
2837        int err = 0;
2838        struct btrfs_path *path;
2839        u64 last = 0;
2840
2841        path = btrfs_alloc_path();
2842        if (!path)
2843                return -ENOMEM;
2844
2845again:
2846        while (1) {
2847                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2848                while (cache) {
2849                        if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2850                                break;
2851                        cache = next_block_group(root, cache);
2852                }
2853                if (!cache) {
2854                        if (last == 0)
2855                                break;
2856                        last = 0;
2857                        continue;
2858                }
2859                err = cache_save_setup(cache, trans, path);
2860                last = cache->key.objectid + cache->key.offset;
2861                btrfs_put_block_group(cache);
2862        }
2863
2864        while (1) {
2865                if (last == 0) {
2866                        err = btrfs_run_delayed_refs(trans, root,
2867                                                     (unsigned long)-1);
2868                        BUG_ON(err);
2869                }
2870
2871                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2872                while (cache) {
2873                        if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2874                                btrfs_put_block_group(cache);
2875                                goto again;
2876                        }
2877
2878                        if (cache->dirty)
2879                                break;
2880                        cache = next_block_group(root, cache);
2881                }
2882                if (!cache) {
2883                        if (last == 0)
2884                                break;
2885                        last = 0;
2886                        continue;
2887                }
2888
2889                if (cache->disk_cache_state == BTRFS_DC_SETUP)
2890                        cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2891                cache->dirty = 0;
2892                last = cache->key.objectid + cache->key.offset;
2893
2894                err = write_one_cache_group(trans, root, path, cache);
2895                BUG_ON(err);
2896                btrfs_put_block_group(cache);
2897        }
2898
2899        while (1) {
2900                /*
2901                 * I don't think this is needed since we're just marking our
2902                 * preallocated extent as written, but just in case it can't
2903                 * hurt.
2904                 */
2905                if (last == 0) {
2906                        err = btrfs_run_delayed_refs(trans, root,
2907                                                     (unsigned long)-1);
2908                        BUG_ON(err);
2909                }
2910
2911                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2912                while (cache) {
2913                        /*
2914                         * Really this shouldn't happen, but it could if we
2915                         * couldn't write the entire preallocated extent and
2916                         * splitting the extent resulted in a new block.
2917                         */
2918                        if (cache->dirty) {
2919                                btrfs_put_block_group(cache);
2920                                goto again;
2921                        }
2922                        if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2923                                break;
2924                        cache = next_block_group(root, cache);
2925                }
2926                if (!cache) {
2927                        if (last == 0)
2928                                break;
2929                        last = 0;
2930                        continue;
2931                }
2932
2933                btrfs_write_out_cache(root, trans, cache, path);
2934
2935                /*
2936                 * If we didn't have an error then the cache state is still
2937                 * NEED_WRITE, so we can set it to WRITTEN.
2938                 */
2939                if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2940                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
2941                last = cache->key.objectid + cache->key.offset;
2942                btrfs_put_block_group(cache);
2943        }
2944
2945        btrfs_free_path(path);
2946        return 0;
2947}
2948
2949int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2950{
2951        struct btrfs_block_group_cache *block_group;
2952        int readonly = 0;
2953
2954        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2955        if (!block_group || block_group->ro)
2956                readonly = 1;
2957        if (block_group)
2958                btrfs_put_block_group(block_group);
2959        return readonly;
2960}
2961
2962static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2963                             u64 total_bytes, u64 bytes_used,
2964                             struct btrfs_space_info **space_info)
2965{
2966        struct btrfs_space_info *found;
2967        int i;
2968        int factor;
2969
2970        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2971                     BTRFS_BLOCK_GROUP_RAID10))
2972                factor = 2;
2973        else
2974                factor = 1;
2975
2976        found = __find_space_info(info, flags);
2977        if (found) {
2978                spin_lock(&found->lock);
2979                found->total_bytes += total_bytes;
2980                found->disk_total += total_bytes * factor;
2981                found->bytes_used += bytes_used;
2982                found->disk_used += bytes_used * factor;
2983                found->full = 0;
2984                spin_unlock(&found->lock);
2985                *space_info = found;
2986                return 0;
2987        }
2988        found = kzalloc(sizeof(*found), GFP_NOFS);
2989        if (!found)
2990                return -ENOMEM;
2991
2992        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2993                INIT_LIST_HEAD(&found->block_groups[i]);
2994        init_rwsem(&found->groups_sem);
2995        spin_lock_init(&found->lock);
2996        found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2997                                BTRFS_BLOCK_GROUP_SYSTEM |
2998                                BTRFS_BLOCK_GROUP_METADATA);
2999        found->total_bytes = total_bytes;
3000        found->disk_total = total_bytes * factor;
3001        found->bytes_used = bytes_used;
3002        found->disk_used = bytes_used * factor;
3003        found->bytes_pinned = 0;
3004        found->bytes_reserved = 0;
3005        found->bytes_readonly = 0;
3006        found->bytes_may_use = 0;
3007        found->full = 0;
3008        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3009        found->chunk_alloc = 0;
3010        found->flush = 0;
3011        init_waitqueue_head(&found->wait);
3012        *space_info = found;
3013        list_add_rcu(&found->list, &info->space_info);
3014        return 0;
3015}
3016
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{
3019        u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
3020                                   BTRFS_BLOCK_GROUP_RAID1 |
3021                                   BTRFS_BLOCK_GROUP_RAID10 |
3022                                   BTRFS_BLOCK_GROUP_DUP);
3023        if (extra_flags) {
3024                if (flags & BTRFS_BLOCK_GROUP_DATA)
3025                        fs_info->avail_data_alloc_bits |= extra_flags;
3026                if (flags & BTRFS_BLOCK_GROUP_METADATA)
3027                        fs_info->avail_metadata_alloc_bits |= extra_flags;
3028                if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3029                        fs_info->avail_system_alloc_bits |= extra_flags;
3030        }
3031}
3032
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{
3035        /*
3036         * we add in the count of missing devices because we want
3037         * to make sure that any RAID levels on a degraded FS
3038         * continue to be honored.
3039         */
3040        u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041                root->fs_info->fs_devices->missing_devices;
3042
3043        if (num_devices == 1)
3044                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045        if (num_devices < 4)
3046                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3047
3048        if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3049            (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3050                      BTRFS_BLOCK_GROUP_RAID10))) {
3051                flags &= ~BTRFS_BLOCK_GROUP_DUP;
3052        }
3053
3054        if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3055            (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3056                flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3057        }
3058
3059        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061             (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062             (flags & BTRFS_BLOCK_GROUP_DUP)))
3063                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3064        return flags;
3065}
3066
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{
3069        if (flags & BTRFS_BLOCK_GROUP_DATA)
3070                flags |= root->fs_info->avail_data_alloc_bits &
3071                         root->fs_info->data_alloc_profile;
3072        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073                flags |= root->fs_info->avail_system_alloc_bits &
3074                         root->fs_info->system_alloc_profile;
3075        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076                flags |= root->fs_info->avail_metadata_alloc_bits &
3077                         root->fs_info->metadata_alloc_profile;
3078        return btrfs_reduce_alloc_profile(root, flags);
3079}
3080
3081u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3082{
3083        u64 flags;
3084
3085        if (data)
3086                flags = BTRFS_BLOCK_GROUP_DATA;
3087        else if (root == root->fs_info->chunk_root)
3088                flags = BTRFS_BLOCK_GROUP_SYSTEM;
3089        else
3090                flags = BTRFS_BLOCK_GROUP_METADATA;
3091
3092        return get_alloc_profile(root, flags);
3093}
3094
3095void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3096{
3097        BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3098                                                       BTRFS_BLOCK_GROUP_DATA);
3099}
3100
3101/*
3102 * This will check the space that the inode allocates from to make sure we have
3103 * enough space for bytes.
3104 */
3105int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3106{
3107        struct btrfs_space_info *data_sinfo;
3108        struct btrfs_root *root = BTRFS_I(inode)->root;
3109        u64 used;
3110        int ret = 0, committed = 0, alloc_chunk = 1;
3111
3112        /* make sure bytes are sectorsize aligned */
3113        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3114
3115        if (root == root->fs_info->tree_root ||
3116            BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3117                alloc_chunk = 0;
3118                committed = 1;
3119        }
3120
3121        data_sinfo = BTRFS_I(inode)->space_info;
3122        if (!data_sinfo)
3123                goto alloc;
3124
3125again:
3126        /* make sure we have enough space to handle the data first */
3127        spin_lock(&data_sinfo->lock);
3128        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3129                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3130                data_sinfo->bytes_may_use;
3131
3132        if (used + bytes > data_sinfo->total_bytes) {
3133                struct btrfs_trans_handle *trans;
3134
3135                /*
3136                 * if we don't have enough free bytes in this space then we need
3137                 * to alloc a new chunk.
3138                 */
3139                if (!data_sinfo->full && alloc_chunk) {
3140                        u64 alloc_target;
3141
3142                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3143                        spin_unlock(&data_sinfo->lock);
3144alloc:
3145                        alloc_target = btrfs_get_alloc_profile(root, 1);
3146                        trans = btrfs_join_transaction(root);
3147                        if (IS_ERR(trans))
3148                                return PTR_ERR(trans);
3149
3150                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3151                                             bytes + 2 * 1024 * 1024,
3152                                             alloc_target,
3153                                             CHUNK_ALLOC_NO_FORCE);
3154                        btrfs_end_transaction(trans, root);
3155                        if (ret < 0) {
3156                                if (ret != -ENOSPC)
3157                                        return ret;
3158                                else
3159                                        goto commit_trans;
3160                        }
3161
3162                        if (!data_sinfo) {
3163                                btrfs_set_inode_space_info(root, inode);
3164                                data_sinfo = BTRFS_I(inode)->space_info;
3165                        }
3166                        goto again;
3167                }
3168
3169                /*
3170                 * If we have less pinned bytes than we want to allocate then
3171                 * don't bother committing the transaction, it won't help us.
3172                 */
3173                if (data_sinfo->bytes_pinned < bytes)
3174                        committed = 1;
3175                spin_unlock(&data_sinfo->lock);
3176
3177                /* commit the current transaction and try again */
3178commit_trans:
3179                if (!committed &&
3180                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3181                        committed = 1;
3182                        trans = btrfs_join_transaction(root);
3183                        if (IS_ERR(trans))
3184                                return PTR_ERR(trans);
3185                        ret = btrfs_commit_transaction(trans, root);
3186                        if (ret)
3187                                return ret;
3188                        goto again;
3189                }
3190
3191                return -ENOSPC;
3192        }
3193        data_sinfo->bytes_may_use += bytes;
3194        spin_unlock(&data_sinfo->lock);
3195
3196        return 0;
3197}
3198
3199/*
3200 * Called if we need to clear a data reservation for this inode.
3201 */
3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3203{
3204        struct btrfs_root *root = BTRFS_I(inode)->root;
3205        struct btrfs_space_info *data_sinfo;
3206
3207        /* make sure bytes are sectorsize aligned */
3208        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3209
3210        data_sinfo = BTRFS_I(inode)->space_info;
3211        spin_lock(&data_sinfo->lock);
3212        data_sinfo->bytes_may_use -= bytes;
3213        spin_unlock(&data_sinfo->lock);
3214}
3215
3216static void force_metadata_allocation(struct btrfs_fs_info *info)
3217{
3218        struct list_head *head = &info->space_info;
3219        struct btrfs_space_info *found;
3220
3221        rcu_read_lock();
3222        list_for_each_entry_rcu(found, head, list) {
3223                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3224                        found->force_alloc = CHUNK_ALLOC_FORCE;
3225        }
3226        rcu_read_unlock();
3227}
3228
3229static int should_alloc_chunk(struct btrfs_root *root,
3230                              struct btrfs_space_info *sinfo, u64 alloc_bytes,
3231                              int force)
3232{
3233        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3234        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3235        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3236        u64 thresh;
3237
3238        if (force == CHUNK_ALLOC_FORCE)
3239                return 1;
3240
3241        /*
3242         * We need to take into account the global rsv because for all intents
3243         * and purposes it's used space.  Don't worry about locking the
3244         * global_rsv, it doesn't change except when the transaction commits.
3245         */
3246        num_allocated += global_rsv->size;
3247
3248        /*
3249         * in limited mode, we want to have some free space up to
3250         * about 1% of the FS size.
3251         */
3252        if (force == CHUNK_ALLOC_LIMITED) {
3253                thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3254                thresh = max_t(u64, 64 * 1024 * 1024,
3255                               div_factor_fine(thresh, 1));
3256
3257                if (num_bytes - num_allocated < thresh)
3258                        return 1;
3259        }
3260
3261        /*
3262         * we have two similar checks here, one based on percentage
3263         * and once based on a hard number of 256MB.  The idea
3264         * is that if we have a good amount of free
3265         * room, don't allocate a chunk.  A good mount is
3266         * less than 80% utilized of the chunks we have allocated,
3267         * or more than 256MB free
3268         */
3269        if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270                return 0;
3271
3272        if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273                return 0;
3274
3275        thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276
3277        /* 256MB or 5% of the FS */
3278        thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3279
3280        if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3281                return 0;
3282        return 1;
3283}
3284
3285static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3286                          struct btrfs_root *extent_root, u64 alloc_bytes,
3287                          u64 flags, int force)
3288{
3289        struct btrfs_space_info *space_info;
3290        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3291        int wait_for_alloc = 0;
3292        int ret = 0;
3293
3294        flags = btrfs_reduce_alloc_profile(extent_root, flags);
3295
3296        space_info = __find_space_info(extent_root->fs_info, flags);
3297        if (!space_info) {
3298                ret = update_space_info(extent_root->fs_info, flags,
3299                                        0, 0, &space_info);
3300                BUG_ON(ret);
3301        }
3302        BUG_ON(!space_info);
3303
3304again:
3305        spin_lock(&space_info->lock);
3306        if (space_info->force_alloc)
3307                force = space_info->force_alloc;
3308        if (space_info->full) {
3309                spin_unlock(&space_info->lock);
3310                return 0;
3311        }
3312
3313        if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3314                spin_unlock(&space_info->lock);
3315                return 0;
3316        } else if (space_info->chunk_alloc) {
3317                wait_for_alloc = 1;
3318        } else {
3319                space_info->chunk_alloc = 1;
3320        }
3321
3322        spin_unlock(&space_info->lock);
3323
3324        mutex_lock(&fs_info->chunk_mutex);
3325
3326        /*
3327         * The chunk_mutex is held throughout the entirety of a chunk
3328         * allocation, so once we've acquired the chunk_mutex we know that the
3329         * other guy is done and we need to recheck and see if we should
3330         * allocate.
3331         */
3332        if (wait_for_alloc) {
3333                mutex_unlock(&fs_info->chunk_mutex);
3334                wait_for_alloc = 0;
3335                goto again;
3336        }
3337
3338        /*
3339         * If we have mixed data/metadata chunks we want to make sure we keep
3340         * allocating mixed chunks instead of individual chunks.
3341         */
3342        if (btrfs_mixed_space_info(space_info))
3343                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3344
3345        /*
3346         * if we're doing a data chunk, go ahead and make sure that
3347         * we keep a reasonable number of metadata chunks allocated in the
3348         * FS as well.
3349         */
3350        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3351                fs_info->data_chunk_allocations++;
3352                if (!(fs_info->data_chunk_allocations %
3353                      fs_info->metadata_ratio))
3354                        force_metadata_allocation(fs_info);
3355        }
3356
3357        ret = btrfs_alloc_chunk(trans, extent_root, flags);
3358        if (ret < 0 && ret != -ENOSPC)
3359                goto out;
3360
3361        spin_lock(&space_info->lock);
3362        if (ret)
3363                space_info->full = 1;
3364        else
3365                ret = 1;
3366
3367        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3368        space_info->chunk_alloc = 0;
3369        spin_unlock(&space_info->lock);
3370out:
3371        mutex_unlock(&extent_root->fs_info->chunk_mutex);
3372        return ret;
3373}
3374
3375/*
3376 * shrink metadata reservation for delalloc
3377 */
3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3379                           bool wait_ordered)
3380{
3381        struct btrfs_block_rsv *block_rsv;
3382        struct btrfs_space_info *space_info;
3383        struct btrfs_trans_handle *trans;
3384        u64 reserved;
3385        u64 max_reclaim;
3386        u64 reclaimed = 0;
3387        long time_left;
3388        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3389        int loops = 0;
3390        unsigned long progress;
3391
3392        trans = (struct btrfs_trans_handle *)current->journal_info;
3393        block_rsv = &root->fs_info->delalloc_block_rsv;
3394        space_info = block_rsv->space_info;
3395
3396        smp_mb();
3397        reserved = space_info->bytes_may_use;
3398        progress = space_info->reservation_progress;
3399
3400        if (reserved == 0)
3401                return 0;
3402
3403        smp_mb();
3404        if (root->fs_info->delalloc_bytes == 0) {
3405                if (trans)
3406                        return 0;
3407                btrfs_wait_ordered_extents(root, 0, 0);
3408                return 0;
3409        }
3410
3411        max_reclaim = min(reserved, to_reclaim);
3412        nr_pages = max_t(unsigned long, nr_pages,
3413                         max_reclaim >> PAGE_CACHE_SHIFT);
3414        while (loops < 1024) {
3415                /* have the flusher threads jump in and do some IO */
3416                smp_mb();
3417                nr_pages = min_t(unsigned long, nr_pages,
3418                       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3419                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3420                                                WB_REASON_FS_FREE_SPACE);
3421
3422                spin_lock(&space_info->lock);
3423                if (reserved > space_info->bytes_may_use)
3424                        reclaimed += reserved - space_info->bytes_may_use;
3425                reserved = space_info->bytes_may_use;
3426                spin_unlock(&space_info->lock);
3427
3428                loops++;
3429
3430                if (reserved == 0 || reclaimed >= max_reclaim)
3431                        break;
3432
3433                if (trans && trans->transaction->blocked)
3434                        return -EAGAIN;
3435
3436                if (wait_ordered && !trans) {
3437                        btrfs_wait_ordered_extents(root, 0, 0);
3438                } else {
3439                        time_left = schedule_timeout_interruptible(1);
3440
3441                        /* We were interrupted, exit */
3442                        if (time_left)
3443                                break;
3444                }
3445
3446                /* we've kicked the IO a few times, if anything has been freed,
3447                 * exit.  There is no sense in looping here for a long time
3448                 * when we really need to commit the transaction, or there are
3449                 * just too many writers without enough free space
3450                 */
3451
3452                if (loops > 3) {
3453                        smp_mb();
3454                        if (progress != space_info->reservation_progress)
3455                                break;
3456                }
3457
3458        }
3459
3460        return reclaimed >= to_reclaim;
3461}
3462
3463/**
3464 * maybe_commit_transaction - possibly commit the transaction if its ok to
3465 * @root - the root we're allocating for
3466 * @bytes - the number of bytes we want to reserve
3467 * @force - force the commit
3468 *
3469 * This will check to make sure that committing the transaction will actually
3470 * get us somewhere and then commit the transaction if it does.  Otherwise it
3471 * will return -ENOSPC.
3472 */
3473static int may_commit_transaction(struct btrfs_root *root,
3474                                  struct btrfs_space_info *space_info,
3475                                  u64 bytes, int force)
3476{
3477        struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3478        struct btrfs_trans_handle *trans;
3479
3480        trans = (struct btrfs_trans_handle *)current->journal_info;
3481        if (trans)
3482                return -EAGAIN;
3483
3484        if (force)
3485                goto commit;
3486
3487        /* See if there is enough pinned space to make this reservation */
3488        spin_lock(&space_info->lock);
3489        if (space_info->bytes_pinned >= bytes) {
3490                spin_unlock(&space_info->lock);
3491                goto commit;
3492        }
3493        spin_unlock(&space_info->lock);
3494
3495        /*
3496         * See if there is some space in the delayed insertion reservation for
3497         * this reservation.
3498         */
3499        if (space_info != delayed_rsv->space_info)
3500                return -ENOSPC;
3501
3502        spin_lock(&delayed_rsv->lock);
3503        if (delayed_rsv->size < bytes) {
3504                spin_unlock(&delayed_rsv->lock);
3505                return -ENOSPC;
3506        }
3507        spin_unlock(&delayed_rsv->lock);
3508
3509commit:
3510        trans = btrfs_join_transaction(root);
3511        if (IS_ERR(trans))
3512                return -ENOSPC;
3513
3514        return btrfs_commit_transaction(trans, root);
3515}
3516
3517/**
3518 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3519 * @root - the root we're allocating for
3520 * @block_rsv - the block_rsv we're allocating for
3521 * @orig_bytes - the number of bytes we want
3522 * @flush - wether or not we can flush to make our reservation
3523 *
3524 * This will reserve orgi_bytes number of bytes from the space info associated
3525 * with the block_rsv.  If there is not enough space it will make an attempt to
3526 * flush out space to make room.  It will do this by flushing delalloc if
3527 * possible or committing the transaction.  If flush is 0 then no attempts to
3528 * regain reservations will be made and this will fail if there is not enough
3529 * space already.
3530 */
3531static int reserve_metadata_bytes(struct btrfs_root *root,
3532                                  struct btrfs_block_rsv *block_rsv,
3533                                  u64 orig_bytes, int flush)
3534{
3535        struct btrfs_space_info *space_info = block_rsv->space_info;
3536        u64 used;
3537        u64 num_bytes = orig_bytes;
3538        int retries = 0;
3539        int ret = 0;
3540        bool committed = false;
3541        bool flushing = false;
3542        bool wait_ordered = false;
3543
3544again:
3545        ret = 0;
3546        spin_lock(&space_info->lock);
3547        /*
3548         * We only want to wait if somebody other than us is flushing and we are
3549         * actually alloed to flush.
3550         */
3551        while (flush && !flushing && space_info->flush) {
3552                spin_unlock(&space_info->lock);
3553                /*
3554                 * If we have a trans handle we can't wait because the flusher
3555                 * may have to commit the transaction, which would mean we would
3556                 * deadlock since we are waiting for the flusher to finish, but
3557                 * hold the current transaction open.
3558                 */
3559                if (current->journal_info)
3560                        return -EAGAIN;
3561                ret = wait_event_interruptible(space_info->wait,
3562                                               !space_info->flush);
3563                /* Must have been interrupted, return */
3564                if (ret)
3565                        return -EINTR;
3566
3567                spin_lock(&space_info->lock);
3568        }
3569
3570        ret = -ENOSPC;
3571        used = space_info->bytes_used + space_info->bytes_reserved +
3572                space_info->bytes_pinned + space_info->bytes_readonly +
3573                space_info->bytes_may_use;
3574
3575        /*
3576         * The idea here is that we've not already over-reserved the block group
3577         * then we can go ahead and save our reservation first and then start
3578         * flushing if we need to.  Otherwise if we've already overcommitted
3579         * lets start flushing stuff first and then come back and try to make
3580         * our reservation.
3581         */
3582        if (used <= space_info->total_bytes) {
3583                if (used + orig_bytes <= space_info->total_bytes) {
3584                        space_info->bytes_may_use += orig_bytes;
3585                        ret = 0;
3586                } else {
3587                        /*
3588                         * Ok set num_bytes to orig_bytes since we aren't
3589                         * overocmmitted, this way we only try and reclaim what
3590                         * we need.
3591                         */
3592                        num_bytes = orig_bytes;
3593                }
3594        } else {
3595                /*
3596                 * Ok we're over committed, set num_bytes to the overcommitted
3597                 * amount plus the amount of bytes that we need for this
3598                 * reservation.
3599                 */
3600                wait_ordered = true;
3601                num_bytes = used - space_info->total_bytes +
3602                        (orig_bytes * (retries + 1));
3603        }
3604
3605        if (ret) {
3606                u64 profile = btrfs_get_alloc_profile(root, 0);
3607                u64 avail;
3608
3609                /*
3610                 * If we have a lot of space that's pinned, don't bother doing
3611                 * the overcommit dance yet and just commit the transaction.
3612                 */
3613                avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3614                do_div(avail, 10);
3615                if (space_info->bytes_pinned >= avail && flush && !committed) {
3616                        space_info->flush = 1;
3617                        flushing = true;
3618                        spin_unlock(&space_info->lock);
3619                        ret = may_commit_transaction(root, space_info,
3620                                                     orig_bytes, 1);
3621                        if (ret)
3622                                goto out;
3623                        committed = true;
3624                        goto again;
3625                }
3626
3627                spin_lock(&root->fs_info->free_chunk_lock);
3628                avail = root->fs_info->free_chunk_space;
3629
3630                /*
3631                 * If we have dup, raid1 or raid10 then only half of the free
3632                 * space is actually useable.
3633                 */
3634                if (profile & (BTRFS_BLOCK_GROUP_DUP |
3635                               BTRFS_BLOCK_GROUP_RAID1 |
3636                               BTRFS_BLOCK_GROUP_RAID10))
3637                        avail >>= 1;
3638
3639                /*
3640                 * If we aren't flushing don't let us overcommit too much, say
3641                 * 1/8th of the space.  If we can flush, let it overcommit up to
3642                 * 1/2 of the space.
3643                 */
3644                if (flush)
3645                        avail >>= 3;
3646                else
3647                        avail >>= 1;
3648                 spin_unlock(&root->fs_info->free_chunk_lock);
3649
3650                if (used + num_bytes < space_info->total_bytes + avail) {
3651                        space_info->bytes_may_use += orig_bytes;
3652                        ret = 0;
3653                } else {
3654                        wait_ordered = true;
3655                }
3656        }
3657
3658        /*
3659         * Couldn't make our reservation, save our place so while we're trying
3660         * to reclaim space we can actually use it instead of somebody else
3661         * stealing it from us.
3662         */
3663        if (ret && flush) {
3664                flushing = true;
3665                space_info->flush = 1;
3666        }
3667
3668        spin_unlock(&space_info->lock);
3669
3670        if (!ret || !flush)
3671                goto out;
3672
3673        /*
3674         * We do synchronous shrinking since we don't actually unreserve
3675         * metadata until after the IO is completed.
3676         */
3677        ret = shrink_delalloc(root, num_bytes, wait_ordered);
3678        if (ret < 0)
3679                goto out;
3680
3681        ret = 0;
3682
3683        /*
3684         * So if we were overcommitted it's possible that somebody else flushed
3685         * out enough space and we simply didn't have enough space to reclaim,
3686         * so go back around and try again.
3687         */
3688        if (retries < 2) {
3689                wait_ordered = true;
3690                retries++;
3691                goto again;
3692        }
3693
3694        ret = -ENOSPC;
3695        if (committed)
3696                goto out;
3697
3698        ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3699        if (!ret) {
3700                committed = true;
3701                goto again;
3702        }
3703
3704out:
3705        if (flushing) {
3706                spin_lock(&space_info->lock);
3707                space_info->flush = 0;
3708                wake_up_all(&space_info->wait);
3709                spin_unlock(&space_info->lock);
3710        }
3711        return ret;
3712}
3713
3714static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3715                                             struct btrfs_root *root)
3716{
3717        struct btrfs_block_rsv *block_rsv = NULL;
3718
3719        if (root->ref_cows || root == root->fs_info->csum_root)
3720                block_rsv = trans->block_rsv;
3721
3722        if (!block_rsv)
3723                block_rsv = root->block_rsv;
3724
3725        if (!block_rsv)
3726                block_rsv = &root->fs_info->empty_block_rsv;
3727
3728        return block_rsv;
3729}
3730
3731static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3732                               u64 num_bytes)
3733{
3734        int ret = -ENOSPC;
3735        spin_lock(&block_rsv->lock);
3736        if (block_rsv->reserved >= num_bytes) {
3737                block_rsv->reserved -= num_bytes;
3738                if (block_rsv->reserved < block_rsv->size)
3739                        block_rsv->full = 0;
3740                ret = 0;
3741        }
3742        spin_unlock(&block_rsv->lock);
3743        return ret;
3744}
3745
3746static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3747                                u64 num_bytes, int update_size)
3748{
3749        spin_lock(&block_rsv->lock);
3750        block_rsv->reserved += num_bytes;
3751        if (update_size)
3752                block_rsv->size += num_bytes;
3753        else if (block_rsv->reserved >= block_rsv->size)
3754                block_rsv->full = 1;
3755        spin_unlock(&block_rsv->lock);
3756}
3757
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3759                                    struct btrfs_block_rsv *dest, u64 num_bytes)
3760{
3761        struct btrfs_space_info *space_info = block_rsv->space_info;
3762
3763        spin_lock(&block_rsv->lock);
3764        if (num_bytes == (u64)-1)
3765                num_bytes = block_rsv->size;
3766        block_rsv->size -= num_bytes;
3767        if (block_rsv->reserved >= block_rsv->size) {
3768                num_bytes = block_rsv->reserved - block_rsv->size;
3769                block_rsv->reserved = block_rsv->size;
3770                block_rsv->full = 1;
3771        } else {
3772                num_bytes = 0;
3773        }
3774        spin_unlock(&block_rsv->lock);
3775
3776        if (num_bytes > 0) {
3777                if (dest) {
3778                        spin_lock(&dest->lock);
3779                        if (!dest->full) {
3780                                u64 bytes_to_add;
3781
3782                                bytes_to_add = dest->size - dest->reserved;
3783                                bytes_to_add = min(num_bytes, bytes_to_add);
3784                                dest->reserved += bytes_to_add;
3785                                if (dest->reserved >= dest->size)
3786                                        dest->full = 1;
3787                                num_bytes -= bytes_to_add;
3788                        }
3789                        spin_unlock(&dest->lock);
3790                }
3791                if (num_bytes) {
3792                        spin_lock(&space_info->lock);
3793                        space_info->bytes_may_use -= num_bytes;
3794                        space_info->reservation_progress++;
3795                        spin_unlock(&space_info->lock);
3796                }
3797        }
3798}
3799
3800static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3801                                   struct btrfs_block_rsv *dst, u64 num_bytes)
3802{
3803        int ret;
3804
3805        ret = block_rsv_use_bytes(src, num_bytes);
3806        if (ret)
3807                return ret;
3808
3809        block_rsv_add_bytes(dst, num_bytes, 1);
3810        return 0;
3811}
3812
3813void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3814{
3815        memset(rsv, 0, sizeof(*rsv));
3816        spin_lock_init(&rsv->lock);
3817}
3818
3819struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3820{
3821        struct btrfs_block_rsv *block_rsv;
3822        struct btrfs_fs_info *fs_info = root->fs_info;
3823
3824        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3825        if (!block_rsv)
3826                return NULL;
3827
3828        btrfs_init_block_rsv(block_rsv);
3829        block_rsv->space_info = __find_space_info(fs_info,
3830                                                  BTRFS_BLOCK_GROUP_METADATA);
3831        return block_rsv;
3832}
3833
3834void btrfs_free_block_rsv(struct btrfs_root *root,
3835                          struct btrfs_block_rsv *rsv)
3836{
3837        btrfs_block_rsv_release(root, rsv, (u64)-1);
3838        kfree(rsv);
3839}
3840
3841static inline int __block_rsv_add(struct btrfs_root *root,
3842                                  struct btrfs_block_rsv *block_rsv,
3843                                  u64 num_bytes, int flush)
3844{
3845        int ret;
3846
3847        if (num_bytes == 0)
3848                return 0;
3849
3850        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3851        if (!ret) {
3852                block_rsv_add_bytes(block_rsv, num_bytes, 1);
3853                return 0;
3854        }
3855
3856        return ret;
3857}
3858
3859int btrfs_block_rsv_add(struct btrfs_root *root,
3860                        struct btrfs_block_rsv *block_rsv,
3861                        u64 num_bytes)
3862{
3863        return __block_rsv_add(root, block_rsv, num_bytes, 1);
3864}
3865
3866int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3867                                struct btrfs_block_rsv *block_rsv,
3868                                u64 num_bytes)
3869{
3870        return __block_rsv_add(root, block_rsv, num_bytes, 0);
3871}
3872
3873int btrfs_block_rsv_check(struct btrfs_root *root,
3874                          struct btrfs_block_rsv *block_rsv, int min_factor)
3875{
3876        u64 num_bytes = 0;
3877        int ret = -ENOSPC;
3878
3879        if (!block_rsv)
3880                return 0;
3881
3882        spin_lock(&block_rsv->lock);
3883        num_bytes = div_factor(block_rsv->size, min_factor);
3884        if (block_rsv->reserved >= num_bytes)
3885                ret = 0;
3886        spin_unlock(&block_rsv->lock);
3887
3888        return ret;
3889}
3890
3891static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3892                                           struct btrfs_block_rsv *block_rsv,
3893                                           u64 min_reserved, int flush)
3894{
3895        u64 num_bytes = 0;
3896        int ret = -ENOSPC;
3897
3898        if (!block_rsv)
3899                return 0;
3900
3901        spin_lock(&block_rsv->lock);
3902        num_bytes = min_reserved;
3903        if (block_rsv->reserved >= num_bytes)
3904                ret = 0;
3905        else
3906                num_bytes -= block_rsv->reserved;
3907        spin_unlock(&block_rsv->lock);
3908
3909        if (!ret)
3910                return 0;
3911
3912        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3913        if (!ret) {
3914                block_rsv_add_bytes(block_rsv, num_bytes, 0);
3915                return 0;
3916        }
3917
3918        return ret;
3919}
3920
3921int btrfs_block_rsv_refill(struct btrfs_root *root,
3922                           struct btrfs_block_rsv *block_rsv,
3923                           u64 min_reserved)
3924{
3925        return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3926}
3927
3928int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3929                                   struct btrfs_block_rsv *block_rsv,
3930                                   u64 min_reserved)
3931{
3932        return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3933}
3934
3935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3936                            struct btrfs_block_rsv *dst_rsv,
3937                            u64 num_bytes)
3938{
3939        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3940}
3941
3942void btrfs_block_rsv_release(struct btrfs_root *root,
3943                             struct btrfs_block_rsv *block_rsv,
3944                             u64 num_bytes)
3945{
3946        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3947        if (global_rsv->full || global_rsv == block_rsv ||
3948            block_rsv->space_info != global_rsv->space_info)
3949                global_rsv = NULL;
3950        block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3951}
3952
3953/*
3954 * helper to calculate size of global block reservation.
3955 * the desired value is sum of space used by extent tree,
3956 * checksum tree and root tree
3957 */
3958static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3959{
3960        struct btrfs_space_info *sinfo;
3961        u64 num_bytes;
3962        u64 meta_used;
3963        u64 data_used;
3964        int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3965
3966        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3967        spin_lock(&sinfo->lock);
3968        data_used = sinfo->bytes_used;
3969        spin_unlock(&sinfo->lock);
3970
3971        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3972        spin_lock(&sinfo->lock);
3973        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3974                data_used = 0;
3975        meta_used = sinfo->bytes_used;
3976        spin_unlock(&sinfo->lock);
3977
3978        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3979                    csum_size * 2;
3980        num_bytes += div64_u64(data_used + meta_used, 50);
3981
3982        if (num_bytes * 3 > meta_used)
3983                num_bytes = div64_u64(meta_used, 3);
3984
3985        return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3986}
3987
3988static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3989{
3990        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3991        struct btrfs_space_info *sinfo = block_rsv->space_info;
3992        u64 num_bytes;
3993
3994        num_bytes = calc_global_metadata_size(fs_info);
3995
3996        spin_lock(&block_rsv->lock);
3997        spin_lock(&sinfo->lock);
3998
3999        block_rsv->size = num_bytes;
4000
4001        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4002                    sinfo->bytes_reserved + sinfo->bytes_readonly +
4003                    sinfo->bytes_may_use;
4004
4005        if (sinfo->total_bytes > num_bytes) {
4006                num_bytes = sinfo->total_bytes - num_bytes;
4007                block_rsv->reserved += num_bytes;
4008                sinfo->bytes_may_use += num_bytes;
4009        }
4010
4011        if (block_rsv->reserved >= block_rsv->size) {
4012                num_bytes = block_rsv->reserved - block_rsv->size;
4013                sinfo->bytes_may_use -= num_bytes;
4014                sinfo->reservation_progress++;
4015                block_rsv->reserved = block_rsv->size;
4016                block_rsv->full = 1;
4017        }
4018
4019        spin_unlock(&sinfo->lock);
4020        spin_unlock(&block_rsv->lock);
4021}
4022
4023static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4024{
4025        struct btrfs_space_info *space_info;
4026
4027        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4028        fs_info->chunk_block_rsv.space_info = space_info;
4029
4030        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4031        fs_info->global_block_rsv.space_info = space_info;
4032        fs_info->delalloc_block_rsv.space_info = space_info;
4033        fs_info->trans_block_rsv.space_info = space_info;
4034        fs_info->empty_block_rsv.space_info = space_info;
4035        fs_info->delayed_block_rsv.space_info = space_info;
4036
4037        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4038        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4039        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4040        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4041        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4042
4043        update_global_block_rsv(fs_info);
4044}
4045
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{
4048        block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
4049        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051        WARN_ON(fs_info->trans_block_rsv.size > 0);
4052        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4053        WARN_ON(fs_info->chunk_block_rsv.size > 0);
4054        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4055        WARN_ON(fs_info->delayed_block_rsv.size > 0);
4056        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4057}
4058
4059void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4060                                  struct btrfs_root *root)
4061{
4062        if (!trans->bytes_reserved)
4063                return;
4064
4065        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066        trans->bytes_reserved = 0;
4067}
4068
4069int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4070                                  struct inode *inode)
4071{
4072        struct btrfs_root *root = BTRFS_I(inode)->root;
4073        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4074        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4075
4076        /*
4077         * We need to hold space in order to delete our orphan item once we've
4078         * added it, so this takes the reservation so we can release it later
4079         * when we are truly done with the orphan item.
4080         */
4081        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4082        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083}
4084
4085void btrfs_orphan_release_metadata(struct inode *inode)
4086{
4087        struct btrfs_root *root = BTRFS_I(inode)->root;
4088        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4089        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090}
4091
4092int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4093                                struct btrfs_pending_snapshot *pending)
4094{
4095        struct btrfs_root *root = pending->root;
4096        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4097        struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4098        /*
4099         * two for root back/forward refs, two for directory entries
4100         * and one for root of the snapshot.
4101         */
4102        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4103        dst_rsv->space_info = src_rsv->space_info;
4104        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4105}
4106
4107/**
4108 * drop_outstanding_extent - drop an outstanding extent
4109 * @inode: the inode we're dropping the extent for
4110 *
4111 * This is called when we are freeing up an outstanding extent, either called
4112 * after an error or after an extent is written.  This will return the number of
4113 * reserved extents that need to be freed.  This must be called with
4114 * BTRFS_I(inode)->lock held.
4115 */
4116static unsigned drop_outstanding_extent(struct inode *inode)
4117{
4118        unsigned drop_inode_space = 0;
4119        unsigned dropped_extents = 0;
4120
4121        BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4122        BTRFS_I(inode)->outstanding_extents--;
4123
4124        if (BTRFS_I(inode)->outstanding_extents == 0 &&
4125            BTRFS_I(inode)->delalloc_meta_reserved) {
4126                drop_inode_space = 1;
4127                BTRFS_I(inode)->delalloc_meta_reserved = 0;
4128        }
4129
4130        /*
4131         * If we have more or the same amount of outsanding extents than we have
4132         * reserved then we need to leave the reserved extents count alone.
4133         */
4134        if (BTRFS_I(inode)->outstanding_extents >=
4135            BTRFS_I(inode)->reserved_extents)
4136                return drop_inode_space;
4137
4138        dropped_extents = BTRFS_I(inode)->reserved_extents -
4139                BTRFS_I(inode)->outstanding_extents;
4140        BTRFS_I(inode)->reserved_extents -= dropped_extents;
4141        return dropped_extents + drop_inode_space;
4142}
4143
4144/**
4145 * calc_csum_metadata_size - return the amount of metada space that must be
4146 *      reserved/free'd for the given bytes.
4147 * @inode: the inode we're manipulating
4148 * @num_bytes: the number of bytes in question
4149 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4150 *
4151 * This adjusts the number of csum_bytes in the inode and then returns the
4152 * correct amount of metadata that must either be reserved or freed.  We
4153 * calculate how many checksums we can fit into one leaf and then divide the
4154 * number of bytes that will need to be checksumed by this value to figure out
4155 * how many checksums will be required.  If we are adding bytes then the number
4156 * may go up and we will return the number of additional bytes that must be
4157 * reserved.  If it is going down we will return the number of bytes that must
4158 * be freed.
4159 *
4160 * This must be called with BTRFS_I(inode)->lock held.
4161 */
4162static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4163                                   int reserve)
4164{
4165        struct btrfs_root *root = BTRFS_I(inode)->root;
4166        u64 csum_size;
4167        int num_csums_per_leaf;
4168        int num_csums;
4169        int old_csums;
4170
4171        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4172            BTRFS_I(inode)->csum_bytes == 0)
4173                return 0;
4174
4175        old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4176        if (reserve)
4177                BTRFS_I(inode)->csum_bytes += num_bytes;
4178        else
4179                BTRFS_I(inode)->csum_bytes -= num_bytes;
4180        csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4181        num_csums_per_leaf = (int)div64_u64(csum_size,
4182                                            sizeof(struct btrfs_csum_item) +
4183                                            sizeof(struct btrfs_disk_key));
4184        num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4185        num_csums = num_csums + num_csums_per_leaf - 1;
4186        num_csums = num_csums / num_csums_per_leaf;
4187
4188        old_csums = old_csums + num_csums_per_leaf - 1;
4189        old_csums = old_csums / num_csums_per_leaf;
4190
4191        /* No change, no need to reserve more */
4192        if (old_csums == num_csums)
4193                return 0;
4194
4195        if (reserve)
4196                return btrfs_calc_trans_metadata_size(root,
4197                                                      num_csums - old_csums);
4198
4199        return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4200}
4201
4202int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4203{
4204        struct btrfs_root *root = BTRFS_I(inode)->root;
4205        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4206        u64 to_reserve = 0;
4207        u64 csum_bytes;
4208        unsigned nr_extents = 0;
4209        int extra_reserve = 0;
4210        int flush = 1;
4211        int ret;
4212
4213        /* Need to be holding the i_mutex here if we aren't free space cache */
4214        if (btrfs_is_free_space_inode(root, inode))
4215                flush = 0;
4216        else
4217                WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218
4219        if (flush && btrfs_transaction_in_commit(root->fs_info))
4220                schedule_timeout(1);
4221
4222        num_bytes = ALIGN(num_bytes, root->sectorsize);
4223
4224        spin_lock(&BTRFS_I(inode)->lock);
4225        BTRFS_I(inode)->outstanding_extents++;
4226
4227        if (BTRFS_I(inode)->outstanding_extents >
4228            BTRFS_I(inode)->reserved_extents)
4229                nr_extents = BTRFS_I(inode)->outstanding_extents -
4230                        BTRFS_I(inode)->reserved_extents;
4231
4232        /*
4233         * Add an item to reserve for updating the inode when we complete the
4234         * delalloc io.
4235         */
4236        if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4237                nr_extents++;
4238                extra_reserve = 1;
4239        }
4240
4241        to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4242        to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4243        csum_bytes = BTRFS_I(inode)->csum_bytes;
4244        spin_unlock(&BTRFS_I(inode)->lock);
4245
4246        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4247        if (ret) {
4248                u64 to_free = 0;
4249                unsigned dropped;
4250
4251                spin_lock(&BTRFS_I(inode)->lock);
4252                dropped = drop_outstanding_extent(inode);
4253                /*
4254                 * If the inodes csum_bytes is the same as the original
4255                 * csum_bytes then we know we haven't raced with any free()ers
4256                 * so we can just reduce our inodes csum bytes and carry on.
4257                 * Otherwise we have to do the normal free thing to account for
4258                 * the case that the free side didn't free up its reserve
4259                 * because of this outstanding reservation.
4260                 */
4261                if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4262                        calc_csum_metadata_size(inode, num_bytes, 0);
4263                else
4264                        to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4265                spin_unlock(&BTRFS_I(inode)->lock);
4266                if (dropped)
4267                        to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268
4269                if (to_free)
4270                        btrfs_block_rsv_release(root, block_rsv, to_free);
4271                return ret;
4272        }
4273
4274        spin_lock(&BTRFS_I(inode)->lock);
4275        if (extra_reserve) {
4276                BTRFS_I(inode)->delalloc_meta_reserved = 1;
4277                nr_extents--;
4278        }
4279        BTRFS_I(inode)->reserved_extents += nr_extents;
4280        spin_unlock(&BTRFS_I(inode)->lock);
4281
4282        block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283
4284        return 0;
4285}
4286
4287/**
4288 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4289 * @inode: the inode to release the reservation for
4290 * @num_bytes: the number of bytes we're releasing
4291 *
4292 * This will release the metadata reservation for an inode.  This can be called
4293 * once we complete IO for a given set of bytes to release their metadata
4294 * reservations.
4295 */
4296void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4297{
4298        struct btrfs_root *root = BTRFS_I(inode)->root;
4299        u64 to_free = 0;
4300        unsigned dropped;
4301
4302        num_bytes = ALIGN(num_bytes, root->sectorsize);
4303        spin_lock(&BTRFS_I(inode)->lock);
4304        dropped = drop_outstanding_extent(inode);
4305
4306        to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4307        spin_unlock(&BTRFS_I(inode)->lock);
4308        if (dropped > 0)
4309                to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310
4311        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312                                to_free);
4313}
4314
4315/**
4316 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4317 * @inode: inode we're writing to
4318 * @num_bytes: the number of bytes we want to allocate
4319 *
4320 * This will do the following things
4321 *
4322 * o reserve space in the data space info for num_bytes
4323 * o reserve space in the metadata space info based on number of outstanding
4324 *   extents and how much csums will be needed
4325 * o add to the inodes ->delalloc_bytes
4326 * o add it to the fs_info's delalloc inodes list.
4327 *
4328 * This will return 0 for success and -ENOSPC if there is no space left.
4329 */
4330int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4331{
4332        int ret;
4333
4334        ret = btrfs_check_data_free_space(inode, num_bytes);
4335        if (ret)
4336                return ret;
4337
4338        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4339        if (ret) {
4340                btrfs_free_reserved_data_space(inode, num_bytes);
4341                return ret;
4342        }
4343
4344        return 0;
4345}
4346
4347/**
4348 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4349 * @inode: inode we're releasing space for
4350 * @num_bytes: the number of bytes we want to free up
4351 *
4352 * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4353 * called in the case that we don't need the metadata AND data reservations
4354 * anymore.  So if there is an error or we insert an inline extent.
4355 *
4356 * This function will release the metadata space that was not used and will
4357 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4358 * list if there are no delalloc bytes left.
4359 */
4360void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4361{
4362        btrfs_delalloc_release_metadata(inode, num_bytes);
4363        btrfs_free_reserved_data_space(inode, num_bytes);
4364}
4365
4366static int update_block_group(struct btrfs_trans_handle *trans,
4367                              struct btrfs_root *root,
4368                              u64 bytenr, u64 num_bytes, int alloc)
4369{
4370        struct btrfs_block_group_cache *cache = NULL;
4371        struct btrfs_fs_info *info = root->fs_info;
4372        u64 total = num_bytes;
4373        u64 old_val;
4374        u64 byte_in_group;
4375        int factor;
4376
4377        /* block accounting for super block */
4378        spin_lock(&info->delalloc_lock);
4379        old_val = btrfs_super_bytes_used(info->super_copy);
4380        if (alloc)
4381                old_val += num_bytes;
4382        else
4383                old_val -= num_bytes;
4384        btrfs_set_super_bytes_used(info->super_copy, old_val);
4385        spin_unlock(&info->delalloc_lock);
4386
4387        while (total) {
4388                cache = btrfs_lookup_block_group(info, bytenr);
4389                if (!cache)
4390                        return -1;
4391                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4392                                    BTRFS_BLOCK_GROUP_RAID1 |
4393                                    BTRFS_BLOCK_GROUP_RAID10))
4394                        factor = 2;
4395                else
4396                        factor = 1;
4397                /*
4398                 * If this block group has free space cache written out, we
4399                 * need to make sure to load it if we are removing space.  This
4400                 * is because we need the unpinning stage to actually add the
4401                 * space back to the block group, otherwise we will leak space.
4402                 */
4403                if (!alloc && cache->cached == BTRFS_CACHE_NO)
4404                        cache_block_group(cache, trans, NULL, 1);
4405
4406                byte_in_group = bytenr - cache->key.objectid;
4407                WARN_ON(byte_in_group > cache->key.offset);
4408
4409                spin_lock(&cache->space_info->lock);
4410                spin_lock(&cache->lock);
4411
4412                if (btrfs_test_opt(root, SPACE_CACHE) &&
4413                    cache->disk_cache_state < BTRFS_DC_CLEAR)
4414                        cache->disk_cache_state = BTRFS_DC_CLEAR;
4415
4416                cache->dirty = 1;
4417                old_val = btrfs_block_group_used(&cache->item);
4418                num_bytes = min(total, cache->key.offset - byte_in_group);
4419                if (alloc) {
4420                        old_val += num_bytes;
4421                        btrfs_set_block_group_used(&cache->item, old_val);
4422                        cache->reserved -= num_bytes;
4423                        cache->space_info->bytes_reserved -= num_bytes;
4424                        cache->space_info->bytes_used += num_bytes;
4425                        cache->space_info->disk_used += num_bytes * factor;
4426                        spin_unlock(&cache->lock);
4427                        spin_unlock(&cache->space_info->lock);
4428                } else {
4429                        old_val -= num_bytes;
4430                        btrfs_set_block_group_used(&cache->item, old_val);
4431                        cache->pinned += num_bytes;
4432                        cache->space_info->bytes_pinned += num_bytes;
4433                        cache->space_info->bytes_used -= num_bytes;
4434                        cache->space_info->disk_used -= num_bytes * factor;
4435                        spin_unlock(&cache->lock);
4436                        spin_unlock(&cache->space_info->lock);
4437
4438                        set_extent_dirty(info->pinned_extents,
4439                                         bytenr, bytenr + num_bytes - 1,
4440                                         GFP_NOFS | __GFP_NOFAIL);
4441                }
4442                btrfs_put_block_group(cache);
4443                total -= num_bytes;
4444                bytenr += num_bytes;
4445        }
4446        return 0;
4447}
4448
4449static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4450{
4451        struct btrfs_block_group_cache *cache;
4452        u64 bytenr;
4453
4454        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4455        if (!cache)
4456                return 0;
4457
4458        bytenr = cache->key.objectid;
4459        btrfs_put_block_group(cache);
4460
4461        return bytenr;
4462}
4463
4464static int pin_down_extent(struct btrfs_root *root,
4465                           struct btrfs_block_group_cache *cache,
4466                           u64 bytenr, u64 num_bytes, int reserved)
4467{
4468        spin_lock(&cache->space_info->lock);
4469        spin_lock(&cache->lock);
4470        cache->pinned += num_bytes;
4471        cache->space_info->bytes_pinned += num_bytes;
4472        if (reserved) {
4473                cache->reserved -= num_bytes;
4474                cache->space_info->bytes_reserved -= num_bytes;
4475        }
4476        spin_unlock(&cache->lock);
4477        spin_unlock(&cache->space_info->lock);
4478
4479        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4480                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4481        return 0;
4482}
4483
4484/*
4485 * this function must be called within transaction
4486 */
4487int btrfs_pin_extent(struct btrfs_root *root,
4488                     u64 bytenr, u64 num_bytes, int reserved)
4489{
4490        struct btrfs_block_group_cache *cache;
4491
4492        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4493        BUG_ON(!cache);
4494
4495        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4496
4497        btrfs_put_block_group(cache);
4498        return 0;
4499}
4500
4501/*
4502 * this function must be called within transaction
4503 */
4504int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4505                                    struct btrfs_root *root,
4506                                    u64 bytenr, u64 num_bytes)
4507{
4508        struct btrfs_block_group_cache *cache;
4509
4510        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4511        BUG_ON(!cache);
4512
4513        /*
4514         * pull in the free space cache (if any) so that our pin
4515         * removes the free space from the cache.  We have load_only set
4516         * to one because the slow code to read in the free extents does check
4517         * the pinned extents.
4518         */
4519        cache_block_group(cache, trans, root, 1);
4520
4521        pin_down_extent(root, cache, bytenr, num_bytes, 0);
4522
4523        /* remove us from the free space cache (if we're there at all) */
4524        btrfs_remove_free_space(cache, bytenr, num_bytes);
4525        btrfs_put_block_group(cache);
4526        return 0;
4527}
4528
4529/**
4530 * btrfs_update_reserved_bytes - update the block_group and space info counters
4531 * @cache:      The cache we are manipulating
4532 * @num_bytes:  The number of bytes in question
4533 * @reserve:    One of the reservation enums
4534 *
4535 * This is called by the allocator when it reserves space, or by somebody who is
4536 * freeing space that was never actually used on disk.  For example if you
4537 * reserve some space for a new leaf in transaction A and before transaction A
4538 * commits you free that leaf, you call this with reserve set to 0 in order to
4539 * clear the reservation.
4540 *
4541 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4542 * ENOSPC accounting.  For data we handle the reservation through clearing the
4543 * delalloc bits in the io_tree.  We have to do this since we could end up
4544 * allocating less disk space for the amount of data we have reserved in the
4545 * case of compression.
4546 *
4547 * If this is a reservation and the block group has become read only we cannot
4548 * make the reservation and return -EAGAIN, otherwise this function always
4549 * succeeds.
4550 */
4551static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4552                                       u64 num_bytes, int reserve)
4553{
4554        struct btrfs_space_info *space_info = cache->space_info;
4555        int ret = 0;
4556        spin_lock(&space_info->lock);
4557        spin_lock(&cache->lock);
4558        if (reserve != RESERVE_FREE) {
4559                if (cache->ro) {
4560                        ret = -EAGAIN;
4561                } else {
4562                        cache->reserved += num_bytes;
4563                        space_info->bytes_reserved += num_bytes;
4564                        if (reserve == RESERVE_ALLOC) {
4565                                BUG_ON(space_info->bytes_may_use < num_bytes);
4566                                space_info->bytes_may_use -= num_bytes;
4567                        }
4568                }
4569        } else {
4570                if (cache->ro)
4571                        space_info->bytes_readonly += num_bytes;
4572                cache->reserved -= num_bytes;
4573                space_info->bytes_reserved -= num_bytes;
4574                space_info->reservation_progress++;
4575        }
4576        spin_unlock(&cache->lock);
4577        spin_unlock(&space_info->lock);
4578        return ret;
4579}
4580
4581int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4582                                struct btrfs_root *root)
4583{
4584        struct btrfs_fs_info *fs_info = root->fs_info;
4585        struct btrfs_caching_control *next;
4586        struct btrfs_caching_control *caching_ctl;
4587        struct btrfs_block_group_cache *cache;
4588
4589        down_write(&fs_info->extent_commit_sem);
4590
4591        list_for_each_entry_safe(caching_ctl, next,
4592                                 &fs_info->caching_block_groups, list) {
4593                cache = caching_ctl->block_group;
4594                if (block_group_cache_done(cache)) {
4595                        cache->last_byte_to_unpin = (u64)-1;
4596                        list_del_init(&caching_ctl->list);
4597                        put_caching_control(caching_ctl);
4598                } else {
4599                        cache->last_byte_to_unpin = caching_ctl->progress;
4600                }
4601        }
4602
4603        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4604                fs_info->pinned_extents = &fs_info->freed_extents[1];
4605        else
4606                fs_info->pinned_extents = &fs_info->freed_extents[0];
4607
4608        up_write(&fs_info->extent_commit_sem);
4609
4610        update_global_block_rsv(fs_info);
4611        return 0;
4612}
4613
4614static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4615{
4616        struct btrfs_fs_info *fs_info = root->fs_info;
4617        struct btrfs_block_group_cache *cache = NULL;
4618        u64 len;
4619
4620        while (start <= end) {
4621                if (!cache ||
4622                    start >= cache->key.objectid + cache->key.offset) {
4623                        if (cache)
4624                                btrfs_put_block_group(cache);
4625                        cache = btrfs_lookup_block_group(fs_info, start);
4626                        BUG_ON(!cache);
4627                }
4628
4629                len = cache->key.objectid + cache->key.offset - start;
4630                len = min(len, end + 1 - start);
4631
4632                if (start < cache->last_byte_to_unpin) {
4633                        len = min(len, cache->last_byte_to_unpin - start);
4634                        btrfs_add_free_space(cache, start, len);
4635                }
4636
4637                start += len;
4638
4639                spin_lock(&cache->space_info->lock);
4640                spin_lock(&cache->lock);
4641                cache->pinned -= len;
4642                cache->space_info->bytes_pinned -= len;
4643                if (cache->ro)
4644                        cache->space_info->bytes_readonly += len;
4645                spin_unlock(&cache->lock);
4646                spin_unlock(&cache->space_info->lock);
4647        }
4648
4649        if (cache)
4650                btrfs_put_block_group(cache);
4651        return 0;
4652}
4653
4654int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4655                               struct btrfs_root *root)
4656{
4657        struct btrfs_fs_info *fs_info = root->fs_info;
4658        struct extent_io_tree *unpin;
4659        u64 start;
4660        u64 end;
4661        int ret;
4662
4663        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4664                unpin = &fs_info->freed_extents[1];
4665        else
4666                unpin = &fs_info->freed_extents[0];
4667
4668        while (1) {
4669                ret = find_first_extent_bit(unpin, 0, &start, &end,
4670                                            EXTENT_DIRTY);
4671                if (ret)
4672                        break;
4673
4674                if (btrfs_test_opt(root, DISCARD))
4675                        ret = btrfs_discard_extent(root, start,
4676                                                   end + 1 - start, NULL);
4677
4678                clear_extent_dirty(unpin, start, end, GFP_NOFS);
4679                unpin_extent_range(root, start, end);
4680                cond_resched();
4681        }
4682
4683        return 0;
4684}
4685
4686static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4687                                struct btrfs_root *root,
4688                                u64 bytenr, u64 num_bytes, u64 parent,
4689                                u64 root_objectid, u64 owner_objectid,
4690                                u64 owner_offset, int refs_to_drop,
4691                                struct btrfs_delayed_extent_op *extent_op)
4692{
4693        struct btrfs_key key;
4694        struct btrfs_path *path;
4695        struct btrfs_fs_info *info = root->fs_info;
4696        struct btrfs_root *extent_root = info->extent_root;
4697        struct extent_buffer *leaf;
4698        struct btrfs_extent_item *ei;
4699        struct btrfs_extent_inline_ref *iref;
4700        int ret;
4701        int is_data;
4702        int extent_slot = 0;
4703        int found_extent = 0;
4704        int num_to_del = 1;
4705        u32 item_size;
4706        u64 refs;
4707
4708        path = btrfs_alloc_path();
4709        if (!path)
4710                return -ENOMEM;
4711
4712        path->reada = 1;
4713        path->leave_spinning = 1;
4714
4715        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4716        BUG_ON(!is_data && refs_to_drop != 1);
4717
4718        ret = lookup_extent_backref(trans, extent_root, path, &iref,
4719                                    bytenr, num_bytes, parent,
4720                                    root_objectid, owner_objectid,
4721                                    owner_offset);
4722        if (ret == 0) {
4723                extent_slot = path->slots[0];
4724                while (extent_slot >= 0) {
4725                        btrfs_item_key_to_cpu(path->nodes[0], &key,
4726                                              extent_slot);
4727                        if (key.objectid != bytenr)
4728                                break;
4729                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4730                            key.offset == num_bytes) {
4731                                found_extent = 1;
4732                                break;
4733                        }
4734                        if (path->slots[0] - extent_slot > 5)
4735                                break;
4736                        extent_slot--;
4737                }
4738#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4739                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4740                if (found_extent && item_size < sizeof(*ei))
4741                        found_extent = 0;
4742#endif
4743                if (!found_extent) {
4744                        BUG_ON(iref);
4745                        ret = remove_extent_backref(trans, extent_root, path,
4746                                                    NULL, refs_to_drop,
4747                                                    is_data);
4748                        BUG_ON(ret);
4749                        btrfs_release_path(path);
4750                        path->leave_spinning = 1;
4751
4752                        key.objectid = bytenr;
4753                        key.type = BTRFS_EXTENT_ITEM_KEY;
4754                        key.offset = num_bytes;
4755
4756                        ret = btrfs_search_slot(trans, extent_root,
4757                                                &key, path, -1, 1);
4758                        if (ret) {
4759                                printk(KERN_ERR "umm, got %d back from search"
4760                                       ", was looking for %llu\n", ret,
4761                                       (unsigned long long)bytenr);
4762                                if (ret > 0)
4763                                        btrfs_print_leaf(extent_root,
4764                                                         path->nodes[0]);
4765                        }
4766                        BUG_ON(ret);
4767                        extent_slot = path->slots[0];
4768                }
4769        } else {
4770                btrfs_print_leaf(extent_root, path->nodes[0]);
4771                WARN_ON(1);
4772                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4773                       "parent %llu root %llu  owner %llu offset %llu\n",
4774                       (unsigned long long)bytenr,
4775                       (unsigned long long)parent,
4776                       (unsigned long long)root_objectid,
4777                       (unsigned long long)owner_objectid,
4778                       (unsigned long long)owner_offset);
4779        }
4780
4781        leaf = path->nodes[0];
4782        item_size = btrfs_item_size_nr(leaf, extent_slot);
4783#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4784        if (item_size < sizeof(*ei)) {
4785                BUG_ON(found_extent || extent_slot != path->slots[0]);
4786                ret = convert_extent_item_v0(trans, extent_root, path,
4787                                             owner_objectid, 0);
4788                BUG_ON(ret < 0);
4789
4790                btrfs_release_path(path);
4791                path->leave_spinning = 1;
4792
4793                key.objectid = bytenr;
4794                key.type = BTRFS_EXTENT_ITEM_KEY;
4795                key.offset = num_bytes;
4796
4797                ret = btrfs_search_slot(trans, extent_root, &key, path,
4798                                        -1, 1);
4799                if (ret) {
4800                        printk(KERN_ERR "umm, got %d back from search"
4801                               ", was looking for %llu\n", ret,
4802                               (unsigned long long)bytenr);
4803                        btrfs_print_leaf(extent_root, path->nodes[0]);
4804                }
4805                BUG_ON(ret);
4806                extent_slot = path->slots[0];
4807                leaf = path->nodes[0];
4808                item_size = btrfs_item_size_nr(leaf, extent_slot);
4809        }
4810#endif
4811        BUG_ON(item_size < sizeof(*ei));
4812        ei = btrfs_item_ptr(leaf, extent_slot,
4813                            struct btrfs_extent_item);
4814        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4815                struct btrfs_tree_block_info *bi;
4816                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4817                bi = (struct btrfs_tree_block_info *)(ei + 1);
4818                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4819        }
4820
4821        refs = btrfs_extent_refs(leaf, ei);
4822        BUG_ON(refs < refs_to_drop);
4823        refs -= refs_to_drop;
4824
4825        if (refs > 0) {
4826                if (extent_op)
4827                        __run_delayed_extent_op(extent_op, leaf, ei);
4828                /*
4829                 * In the case of inline back ref, reference count will
4830                 * be updated by remove_extent_backref
4831                 */
4832                if (iref) {
4833                        BUG_ON(!found_extent);
4834                } else {
4835                        btrfs_set_extent_refs(leaf, ei, refs);
4836                        btrfs_mark_buffer_dirty(leaf);
4837                }
4838                if (found_extent) {
4839                        ret = remove_extent_backref(trans, extent_root, path,
4840                                                    iref, refs_to_drop,
4841                                                    is_data);
4842                        BUG_ON(ret);
4843                }
4844        } else {
4845                if (found_extent) {
4846                        BUG_ON(is_data && refs_to_drop !=
4847                               extent_data_ref_count(root, path, iref));
4848                        if (iref) {
4849                                BUG_ON(path->slots[0] != extent_slot);
4850                        } else {
4851                                BUG_ON(path->slots[0] != extent_slot + 1);
4852                                path->slots[0] = extent_slot;
4853                                num_to_del = 2;
4854                        }
4855                }
4856
4857                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4858                                      num_to_del);
4859                BUG_ON(ret);
4860                btrfs_release_path(path);
4861
4862                if (is_data) {
4863                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4864                        BUG_ON(ret);
4865                } else {
4866                        invalidate_mapping_pages(info->btree_inode->i_mapping,
4867                             bytenr >> PAGE_CACHE_SHIFT,
4868                             (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4869                }
4870
4871                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4872                BUG_ON(ret);
4873        }
4874        btrfs_free_path(path);
4875        return ret;
4876}
4877
4878/*
4879 * when we free an block, it is possible (and likely) that we free the last
4880 * delayed ref for that extent as well.  This searches the delayed ref tree for
4881 * a given extent, and if there are no other delayed refs to be processed, it
4882 * removes it from the tree.
4883 */
4884static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4885                                      struct btrfs_root *root, u64 bytenr)
4886{
4887        struct btrfs_delayed_ref_head *head;
4888        struct btrfs_delayed_ref_root *delayed_refs;
4889        struct btrfs_delayed_ref_node *ref;
4890        struct rb_node *node;
4891        int ret = 0;
4892
4893        delayed_refs = &trans->transaction->delayed_refs;
4894        spin_lock(&delayed_refs->lock);
4895        head = btrfs_find_delayed_ref_head(trans, bytenr);
4896        if (!head)
4897                goto out;
4898
4899        node = rb_prev(&head->node.rb_node);
4900        if (!node)
4901                goto out;
4902
4903        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4904
4905        /* there are still entries for this ref, we can't drop it */
4906        if (ref->bytenr == bytenr)
4907                goto out;
4908
4909        if (head->extent_op) {
4910                if (!head->must_insert_reserved)
4911                        goto out;
4912                kfree(head->extent_op);
4913                head->extent_op = NULL;
4914        }
4915
4916        /*
4917         * waiting for the lock here would deadlock.  If someone else has it
4918         * locked they are already in the process of dropping it anyway
4919         */
4920        if (!mutex_trylock(&head->mutex))
4921                goto out;
4922
4923        /*
4924         * at this point we have a head with no other entries.  Go
4925         * ahead and process it.
4926         */
4927        head->node.in_tree = 0;
4928        rb_erase(&head->node.rb_node, &delayed_refs->root);
4929
4930        delayed_refs->num_entries--;
4931
4932        /*
4933         * we don't take a ref on the node because we're removing it from the
4934         * tree, so we just steal the ref the tree was holding.
4935         */
4936        delayed_refs->num_heads--;
4937        if (list_empty(&head->cluster))
4938                delayed_refs->num_heads_ready--;
4939
4940        list_del_init(&head->cluster);
4941        spin_unlock(&delayed_refs->lock);
4942
4943        BUG_ON(head->extent_op);
4944        if (head->must_insert_reserved)
4945                ret = 1;
4946
4947        mutex_unlock(&head->mutex);
4948        btrfs_put_delayed_ref(&head->node);
4949        return ret;
4950out:
4951        spin_unlock(&delayed_refs->lock);
4952        return 0;
4953}
4954
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956                           struct btrfs_root *root,
4957                           struct extent_buffer *buf,
4958                           u64 parent, int last_ref)
4959{
4960        struct btrfs_block_group_cache *cache = NULL;
4961        int ret;
4962
4963        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964                ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4965                                                parent, root->root_key.objectid,
4966                                                btrfs_header_level(buf),
4967                                                BTRFS_DROP_DELAYED_REF, NULL);
4968                BUG_ON(ret);
4969        }
4970
4971        if (!last_ref)
4972                return;
4973
4974        cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4975
4976        if (btrfs_header_generation(buf) == trans->transid) {
4977                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4978                        ret = check_ref_cleanup(trans, root, buf->start);
4979                        if (!ret)
4980                                goto out;
4981                }
4982
4983                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4984                        pin_down_extent(root, cache, buf->start, buf->len, 1);
4985                        goto out;
4986                }
4987
4988                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4989
4990                btrfs_add_free_space(cache, buf->start, buf->len);
4991                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4992        }
4993out:
4994        /*
4995         * Deleting the buffer, clear the corrupt flag since it doesn't matter
4996         * anymore.
4997         */
4998        clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4999        btrfs_put_block_group(cache);
5000}
5001
5002int btrfs_free_extent(struct btrfs_trans_handle *trans,
5003                      struct btrfs_root *root,
5004                      u64 bytenr, u64 num_bytes, u64 parent,
5005                      u64 root_objectid, u64 owner, u64 offset)
5006{
5007        int ret;
5008
5009        /*
5010         * tree log blocks never actually go into the extent allocation
5011         * tree, just update pinning info and exit early.
5012         */
5013        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5014                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5015                /* unlocks the pinned mutex */
5016                btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017                ret = 0;
5018        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019                ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
5020                                        parent, root_objectid, (int)owner,
5021                                        BTRFS_DROP_DELAYED_REF, NULL);
5022                BUG_ON(ret);
5023        } else {
5024                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
5025                                        parent, root_objectid, owner,
5026                                        offset, BTRFS_DROP_DELAYED_REF, NULL);
5027                BUG_ON(ret);
5028        }
5029        return ret;
5030}
5031
5032static u64 stripe_align(struct btrfs_root *root, u64 val)
5033{
5034        u64 mask = ((u64)root->stripesize - 1);
5035        u64 ret = (val + mask) & ~mask;
5036        return ret;
5037}
5038
5039/*
5040 * when we wait for progress in the block group caching, its because
5041 * our allocation attempt failed at least once.  So, we must sleep
5042 * and let some progress happen before we try again.
5043 *
5044 * This function will sleep at least once waiting for new free space to
5045 * show up, and then it will check the block group free space numbers
5046 * for our min num_bytes.  Another option is to have it go ahead
5047 * and look in the rbtree for a free extent of a given size, but this
5048 * is a good start.
5049 */
5050static noinline int
5051wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5052                                u64 num_bytes)
5053{
5054        struct btrfs_caching_control *caching_ctl;
5055        DEFINE_WAIT(wait);
5056
5057        caching_ctl = get_caching_control(cache);
5058        if (!caching_ctl)
5059                return 0;
5060
5061        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5062                   (cache->free_space_ctl->free_space >= num_bytes));
5063
5064        put_caching_control(caching_ctl);
5065        return 0;
5066}
5067
5068static noinline int
5069wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5070{
5071        struct btrfs_caching_control *caching_ctl;
5072        DEFINE_WAIT(wait);
5073
5074        caching_ctl = get_caching_control(cache);
5075        if (!caching_ctl)
5076                return 0;
5077
5078        wait_event(caching_ctl->wait, block_group_cache_done(cache));
5079
5080        put_caching_control(caching_ctl);
5081        return 0;
5082}
5083
5084static int get_block_group_index(struct btrfs_block_group_cache *cache)
5085{
5086        int index;
5087        if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
5088                index = 0;
5089        else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
5090                index = 1;
5091        else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
5092                index = 2;
5093        else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
5094                index = 3;
5095        else
5096                index = 4;
5097        return index;
5098}
5099
5100enum btrfs_loop_type {
5101        LOOP_FIND_IDEAL = 0,
5102        LOOP_CACHING_NOWAIT = 1,
5103        LOOP_CACHING_WAIT = 2,
5104        LOOP_ALLOC_CHUNK = 3,
5105        LOOP_NO_EMPTY_SIZE = 4,
5106};
5107
5108/*
5109 * walks the btree of allocated extents and find a hole of a given size.
5110 * The key ins is changed to record the hole:
5111 * ins->objectid == block start
5112 * ins->flags = BTRFS_EXTENT_ITEM_KEY
5113 * ins->offset == number of blocks
5114 * Any available blocks before search_start are skipped.
5115 */
5116static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5117                                     struct btrfs_root *orig_root,
5118                                     u64 num_bytes, u64 empty_size,
5119                                     u64 search_start, u64 search_end,
5120                                     u64 hint_byte, struct btrfs_key *ins,
5121                                     u64 data)
5122{
5123        int ret = 0;
5124        struct btrfs_root *root = orig_root->fs_info->extent_root;
5125        struct btrfs_free_cluster *last_ptr = NULL;
5126        struct btrfs_block_group_cache *block_group = NULL;
5127        struct btrfs_block_group_cache *used_block_group;
5128        int empty_cluster = 2 * 1024 * 1024;
5129        int allowed_chunk_alloc = 0;
5130        int done_chunk_alloc = 0;
5131        struct btrfs_space_info *space_info;
5132        int loop = 0;
5133        int index = 0;
5134        int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5135                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5136        bool found_uncached_bg = false;
5137        bool failed_cluster_refill = false;
5138        bool failed_alloc = false;
5139        bool use_cluster = true;
5140        bool have_caching_bg = false;
5141        u64 ideal_cache_percent = 0;
5142        u64 ideal_cache_offset = 0;
5143
5144        WARN_ON(num_bytes < root->sectorsize);
5145        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5146        ins->objectid = 0;
5147        ins->offset = 0;
5148
5149        space_info = __find_space_info(root->fs_info, data);
5150        if (!space_info) {
5151                printk(