linux/fs/btrfs/disk-io.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/fs.h>
  20#include <linux/blkdev.h>
  21#include <linux/scatterlist.h>
  22#include <linux/swap.h>
  23#include <linux/radix-tree.h>
  24#include <linux/writeback.h>
  25#include <linux/buffer_head.h>
  26#include <linux/workqueue.h>
  27#include <linux/kthread.h>
  28#include <linux/freezer.h>
  29#include <linux/crc32c.h>
  30#include <linux/slab.h>
  31#include <linux/migrate.h>
  32#include <linux/ratelimit.h>
  33#include <asm/unaligned.h>
  34#include "compat.h"
  35#include "ctree.h"
  36#include "disk-io.h"
  37#include "transaction.h"
  38#include "btrfs_inode.h"
  39#include "volumes.h"
  40#include "print-tree.h"
  41#include "async-thread.h"
  42#include "locking.h"
  43#include "tree-log.h"
  44#include "free-space-cache.h"
  45#include "inode-map.h"
  46
  47static struct extent_io_ops btree_extent_io_ops;
  48static void end_workqueue_fn(struct btrfs_work *work);
  49static void free_fs_root(struct btrfs_root *root);
  50static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
  51                                    int read_only);
  52static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
  53static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
  54static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
  55                                      struct btrfs_root *root);
  56static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
  57static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
  58static int btrfs_destroy_marked_extents(struct btrfs_root *root,
  59                                        struct extent_io_tree *dirty_pages,
  60                                        int mark);
  61static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
  62                                       struct extent_io_tree *pinned_extents);
  63static int btrfs_cleanup_transaction(struct btrfs_root *root);
  64
  65/*
  66 * end_io_wq structs are used to do processing in task context when an IO is
  67 * complete.  This is used during reads to verify checksums, and it is used
  68 * by writes to insert metadata for new file extents after IO is complete.
  69 */
  70struct end_io_wq {
  71        struct bio *bio;
  72        bio_end_io_t *end_io;
  73        void *private;
  74        struct btrfs_fs_info *info;
  75        int error;
  76        int metadata;
  77        struct list_head list;
  78        struct btrfs_work work;
  79};
  80
  81/*
  82 * async submit bios are used to offload expensive checksumming
  83 * onto the worker threads.  They checksum file and metadata bios
  84 * just before they are sent down the IO stack.
  85 */
  86struct async_submit_bio {
  87        struct inode *inode;
  88        struct bio *bio;
  89        struct list_head list;
  90        extent_submit_bio_hook_t *submit_bio_start;
  91        extent_submit_bio_hook_t *submit_bio_done;
  92        int rw;
  93        int mirror_num;
  94        unsigned long bio_flags;
  95        /*
  96         * bio_offset is optional, can be used if the pages in the bio
  97         * can't tell us where in the file the bio should go
  98         */
  99        u64 bio_offset;
 100        struct btrfs_work work;
 101};
 102
 103/*
 104 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 105 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 106 * the level the eb occupies in the tree.
 107 *
 108 * Different roots are used for different purposes and may nest inside each
 109 * other and they require separate keysets.  As lockdep keys should be
 110 * static, assign keysets according to the purpose of the root as indicated
 111 * by btrfs_root->objectid.  This ensures that all special purpose roots
 112 * have separate keysets.
 113 *
 114 * Lock-nesting across peer nodes is always done with the immediate parent
 115 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 116 * subclass to avoid triggering lockdep warning in such cases.
 117 *
 118 * The key is set by the readpage_end_io_hook after the buffer has passed
 119 * csum validation but before the pages are unlocked.  It is also set by
 120 * btrfs_init_new_buffer on freshly allocated blocks.
 121 *
 122 * We also add a check to make sure the highest level of the tree is the
 123 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 124 * needs update as well.
 125 */
 126#ifdef CONFIG_DEBUG_LOCK_ALLOC
 127# if BTRFS_MAX_LEVEL != 8
 128#  error
 129# endif
 130
 131static struct btrfs_lockdep_keyset {
 132        u64                     id;             /* root objectid */
 133        const char              *name_stem;     /* lock name stem */
 134        char                    names[BTRFS_MAX_LEVEL + 1][20];
 135        struct lock_class_key   keys[BTRFS_MAX_LEVEL + 1];
 136} btrfs_lockdep_keysets[] = {
 137        { .id = BTRFS_ROOT_TREE_OBJECTID,       .name_stem = "root"     },
 138        { .id = BTRFS_EXTENT_TREE_OBJECTID,     .name_stem = "extent"   },
 139        { .id = BTRFS_CHUNK_TREE_OBJECTID,      .name_stem = "chunk"    },
 140        { .id = BTRFS_DEV_TREE_OBJECTID,        .name_stem = "dev"      },
 141        { .id = BTRFS_FS_TREE_OBJECTID,         .name_stem = "fs"       },
 142        { .id = BTRFS_CSUM_TREE_OBJECTID,       .name_stem = "csum"     },
 143        { .id = BTRFS_ORPHAN_OBJECTID,          .name_stem = "orphan"   },
 144        { .id = BTRFS_TREE_LOG_OBJECTID,        .name_stem = "log"      },
 145        { .id = BTRFS_TREE_RELOC_OBJECTID,      .name_stem = "treloc"   },
 146        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc"   },
 147        { .id = 0,                              .name_stem = "tree"     },
 148};
 149
 150void __init btrfs_init_lockdep(void)
 151{
 152        int i, j;
 153
 154        /* initialize lockdep class names */
 155        for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
 156                struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
 157
 158                for (j = 0; j < ARRAY_SIZE(ks->names); j++)
 159                        snprintf(ks->names[j], sizeof(ks->names[j]),
 160                                 "btrfs-%s-%02d", ks->name_stem, j);
 161        }
 162}
 163
 164void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
 165                                    int level)
 166{
 167        struct btrfs_lockdep_keyset *ks;
 168
 169        BUG_ON(level >= ARRAY_SIZE(ks->keys));
 170
 171        /* find the matching keyset, id 0 is the default entry */
 172        for (ks = btrfs_lockdep_keysets; ks->id; ks++)
 173                if (ks->id == objectid)
 174                        break;
 175
 176        lockdep_set_class_and_name(&eb->lock,
 177                                   &ks->keys[level], ks->names[level]);
 178}
 179
 180#endif
 181
 182/*
 183 * extents on the btree inode are pretty simple, there's one extent
 184 * that covers the entire device
 185 */
 186static struct extent_map *btree_get_extent(struct inode *inode,
 187                struct page *page, size_t pg_offset, u64 start, u64 len,
 188                int create)
 189{
 190        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 191        struct extent_map *em;
 192        int ret;
 193
 194        read_lock(&em_tree->lock);
 195        em = lookup_extent_mapping(em_tree, start, len);
 196        if (em) {
 197                em->bdev =
 198                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 199                read_unlock(&em_tree->lock);
 200                goto out;
 201        }
 202        read_unlock(&em_tree->lock);
 203
 204        em = alloc_extent_map();
 205        if (!em) {
 206                em = ERR_PTR(-ENOMEM);
 207                goto out;
 208        }
 209        em->start = 0;
 210        em->len = (u64)-1;
 211        em->block_len = (u64)-1;
 212        em->block_start = 0;
 213        em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 214
 215        write_lock(&em_tree->lock);
 216        ret = add_extent_mapping(em_tree, em);
 217        if (ret == -EEXIST) {
 218                u64 failed_start = em->start;
 219                u64 failed_len = em->len;
 220
 221                free_extent_map(em);
 222                em = lookup_extent_mapping(em_tree, start, len);
 223                if (em) {
 224                        ret = 0;
 225                } else {
 226                        em = lookup_extent_mapping(em_tree, failed_start,
 227                                                   failed_len);
 228                        ret = -EIO;
 229                }
 230        } else if (ret) {
 231                free_extent_map(em);
 232                em = NULL;
 233        }
 234        write_unlock(&em_tree->lock);
 235
 236        if (ret)
 237                em = ERR_PTR(ret);
 238out:
 239        return em;
 240}
 241
 242u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 243{
 244        return crc32c(seed, data, len);
 245}
 246
 247void btrfs_csum_final(u32 crc, char *result)
 248{
 249        put_unaligned_le32(~crc, result);
 250}
 251
 252/*
 253 * compute the csum for a btree block, and either verify it or write it
 254 * into the csum field of the block.
 255 */
 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 257                           int verify)
 258{
 259        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 260        char *result = NULL;
 261        unsigned long len;
 262        unsigned long cur_len;
 263        unsigned long offset = BTRFS_CSUM_SIZE;
 264        char *kaddr;
 265        unsigned long map_start;
 266        unsigned long map_len;
 267        int err;
 268        u32 crc = ~(u32)0;
 269        unsigned long inline_result;
 270
 271        len = buf->len - offset;
 272        while (len > 0) {
 273                err = map_private_extent_buffer(buf, offset, 32,
 274                                        &kaddr, &map_start, &map_len);
 275                if (err)
 276                        return 1;
 277                cur_len = min(len, map_len - (offset - map_start));
 278                crc = btrfs_csum_data(root, kaddr + offset - map_start,
 279                                      crc, cur_len);
 280                len -= cur_len;
 281                offset += cur_len;
 282        }
 283        if (csum_size > sizeof(inline_result)) {
 284                result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
 285                if (!result)
 286                        return 1;
 287        } else {
 288                result = (char *)&inline_result;
 289        }
 290
 291        btrfs_csum_final(crc, result);
 292
 293        if (verify) {
 294                if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 295                        u32 val;
 296                        u32 found = 0;
 297                        memcpy(&found, result, csum_size);
 298
 299                        read_extent_buffer(buf, &val, 0, csum_size);
 300                        printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
 301                                       "failed on %llu wanted %X found %X "
 302                                       "level %d\n",
 303                                       root->fs_info->sb->s_id,
 304                                       (unsigned long long)buf->start, val, found,
 305                                       btrfs_header_level(buf));
 306                        if (result != (char *)&inline_result)
 307                                kfree(result);
 308                        return 1;
 309                }
 310        } else {
 311                write_extent_buffer(buf, result, 0, csum_size);
 312        }
 313        if (result != (char *)&inline_result)
 314                kfree(result);
 315        return 0;
 316}
 317
 318/*
 319 * we can't consider a given block up to date unless the transid of the
 320 * block matches the transid in the parent node's pointer.  This is how we
 321 * detect blocks that either didn't get written at all or got written
 322 * in the wrong place.
 323 */
 324static int verify_parent_transid(struct extent_io_tree *io_tree,
 325                                 struct extent_buffer *eb, u64 parent_transid)
 326{
 327        struct extent_state *cached_state = NULL;
 328        int ret;
 329
 330        if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 331                return 0;
 332
 333        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
 334                         0, &cached_state, GFP_NOFS);
 335        if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
 336            btrfs_header_generation(eb) == parent_transid) {
 337                ret = 0;
 338                goto out;
 339        }
 340        printk_ratelimited("parent transid verify failed on %llu wanted %llu "
 341                       "found %llu\n",
 342                       (unsigned long long)eb->start,
 343                       (unsigned long long)parent_transid,
 344                       (unsigned long long)btrfs_header_generation(eb));
 345        ret = 1;
 346        clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 347out:
 348        unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
 349                             &cached_state, GFP_NOFS);
 350        return ret;
 351}
 352
 353/*
 354 * helper to read a given tree block, doing retries as required when
 355 * the checksums don't match and we have alternate mirrors to try.
 356 */
 357static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 358                                          struct extent_buffer *eb,
 359                                          u64 start, u64 parent_transid)
 360{
 361        struct extent_io_tree *io_tree;
 362        int ret;
 363        int num_copies = 0;
 364        int mirror_num = 0;
 365
 366        clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 367        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 368        while (1) {
 369                ret = read_extent_buffer_pages(io_tree, eb, start,
 370                                               WAIT_COMPLETE,
 371                                               btree_get_extent, mirror_num);
 372                if (!ret &&
 373                    !verify_parent_transid(io_tree, eb, parent_transid))
 374                        return ret;
 375
 376                /*
 377                 * This buffer's crc is fine, but its contents are corrupted, so
 378                 * there is no reason to read the other copies, they won't be
 379                 * any less wrong.
 380                 */
 381                if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
 382                        return ret;
 383
 384                num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 385                                              eb->start, eb->len);
 386                if (num_copies == 1)
 387                        return ret;
 388
 389                mirror_num++;
 390                if (mirror_num > num_copies)
 391                        return ret;
 392        }
 393        return -EIO;
 394}
 395
 396/*
 397 * checksum a dirty tree block before IO.  This has extra checks to make sure
 398 * we only fill in the checksum field in the first page of a multi-page block
 399 */
 400
 401static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 402{
 403        struct extent_io_tree *tree;
 404        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 405        u64 found_start;
 406        unsigned long len;
 407        struct extent_buffer *eb;
 408        int ret;
 409
 410        tree = &BTRFS_I(page->mapping->host)->io_tree;
 411
 412        if (page->private == EXTENT_PAGE_PRIVATE) {
 413                WARN_ON(1);
 414                goto out;
 415        }
 416        if (!page->private) {
 417                WARN_ON(1);
 418                goto out;
 419        }
 420        len = page->private >> 2;
 421        WARN_ON(len == 0);
 422
 423        eb = alloc_extent_buffer(tree, start, len, page);
 424        if (eb == NULL) {
 425                WARN_ON(1);
 426                goto out;
 427        }
 428        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 429                                             btrfs_header_generation(eb));
 430        BUG_ON(ret);
 431        WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
 432
 433        found_start = btrfs_header_bytenr(eb);
 434        if (found_start != start) {
 435                WARN_ON(1);
 436                goto err;
 437        }
 438        if (eb->first_page != page) {
 439                WARN_ON(1);
 440                goto err;
 441        }
 442        if (!PageUptodate(page)) {
 443                WARN_ON(1);
 444                goto err;
 445        }
 446        csum_tree_block(root, eb, 0);
 447err:
 448        free_extent_buffer(eb);
 449out:
 450        return 0;
 451}
 452
 453static int check_tree_block_fsid(struct btrfs_root *root,
 454                                 struct extent_buffer *eb)
 455{
 456        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 457        u8 fsid[BTRFS_UUID_SIZE];
 458        int ret = 1;
 459
 460        read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
 461                           BTRFS_FSID_SIZE);
 462        while (fs_devices) {
 463                if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
 464                        ret = 0;
 465                        break;
 466                }
 467                fs_devices = fs_devices->seed;
 468        }
 469        return ret;
 470}
 471
 472#define CORRUPT(reason, eb, root, slot)                         \
 473        printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
 474               "root=%llu, slot=%d\n", reason,                  \
 475               (unsigned long long)btrfs_header_bytenr(eb),     \
 476               (unsigned long long)root->objectid, slot)
 477
 478static noinline int check_leaf(struct btrfs_root *root,
 479                               struct extent_buffer *leaf)
 480{
 481        struct btrfs_key key;
 482        struct btrfs_key leaf_key;
 483        u32 nritems = btrfs_header_nritems(leaf);
 484        int slot;
 485
 486        if (nritems == 0)
 487                return 0;
 488
 489        /* Check the 0 item */
 490        if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
 491            BTRFS_LEAF_DATA_SIZE(root)) {
 492                CORRUPT("invalid item offset size pair", leaf, root, 0);
 493                return -EIO;
 494        }
 495
 496        /*
 497         * Check to make sure each items keys are in the correct order and their
 498         * offsets make sense.  We only have to loop through nritems-1 because
 499         * we check the current slot against the next slot, which verifies the
 500         * next slot's offset+size makes sense and that the current's slot
 501         * offset is correct.
 502         */
 503        for (slot = 0; slot < nritems - 1; slot++) {
 504                btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
 505                btrfs_item_key_to_cpu(leaf, &key, slot + 1);
 506
 507                /* Make sure the keys are in the right order */
 508                if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
 509                        CORRUPT("bad key order", leaf, root, slot);
 510                        return -EIO;
 511                }
 512
 513                /*
 514                 * Make sure the offset and ends are right, remember that the
 515                 * item data starts at the end of the leaf and grows towards the
 516                 * front.
 517                 */
 518                if (btrfs_item_offset_nr(leaf, slot) !=
 519                        btrfs_item_end_nr(leaf, slot + 1)) {
 520                        CORRUPT("slot offset bad", leaf, root, slot);
 521                        return -EIO;
 522                }
 523
 524                /*
 525                 * Check to make sure that we don't point outside of the leaf,
 526                 * just incase all the items are consistent to eachother, but
 527                 * all point outside of the leaf.
 528                 */
 529                if (btrfs_item_end_nr(leaf, slot) >
 530                    BTRFS_LEAF_DATA_SIZE(root)) {
 531                        CORRUPT("slot end outside of leaf", leaf, root, slot);
 532                        return -EIO;
 533                }
 534        }
 535
 536        return 0;
 537}
 538
 539static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 540                               struct extent_state *state)
 541{
 542        struct extent_io_tree *tree;
 543        u64 found_start;
 544        int found_level;
 545        unsigned long len;
 546        struct extent_buffer *eb;
 547        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 548        int ret = 0;
 549
 550        tree = &BTRFS_I(page->mapping->host)->io_tree;
 551        if (page->private == EXTENT_PAGE_PRIVATE)
 552                goto out;
 553        if (!page->private)
 554                goto out;
 555
 556        len = page->private >> 2;
 557        WARN_ON(len == 0);
 558
 559        eb = alloc_extent_buffer(tree, start, len, page);
 560        if (eb == NULL) {
 561                ret = -EIO;
 562                goto out;
 563        }
 564
 565        found_start = btrfs_header_bytenr(eb);
 566        if (found_start != start) {
 567                printk_ratelimited(KERN_INFO "btrfs bad tree block start "
 568                               "%llu %llu\n",
 569                               (unsigned long long)found_start,
 570                               (unsigned long long)eb->start);
 571                ret = -EIO;
 572                goto err;
 573        }
 574        if (eb->first_page != page) {
 575                printk(KERN_INFO "btrfs bad first page %lu %lu\n",
 576                       eb->first_page->index, page->index);
 577                WARN_ON(1);
 578                ret = -EIO;
 579                goto err;
 580        }
 581        if (check_tree_block_fsid(root, eb)) {
 582                printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
 583                               (unsigned long long)eb->start);
 584                ret = -EIO;
 585                goto err;
 586        }
 587        found_level = btrfs_header_level(eb);
 588
 589        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
 590                                       eb, found_level);
 591
 592        ret = csum_tree_block(root, eb, 1);
 593        if (ret) {
 594                ret = -EIO;
 595                goto err;
 596        }
 597
 598        /*
 599         * If this is a leaf block and it is corrupt, set the corrupt bit so
 600         * that we don't try and read the other copies of this block, just
 601         * return -EIO.
 602         */
 603        if (found_level == 0 && check_leaf(root, eb)) {
 604                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 605                ret = -EIO;
 606        }
 607
 608        end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 609        end = eb->start + end - 1;
 610err:
 611        if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
 612                clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
 613                btree_readahead_hook(root, eb, eb->start, ret);
 614        }
 615
 616        free_extent_buffer(eb);
 617out:
 618        return ret;
 619}
 620
 621static int btree_io_failed_hook(struct bio *failed_bio,
 622                         struct page *page, u64 start, u64 end,
 623                         int mirror_num, struct extent_state *state)
 624{
 625        struct extent_io_tree *tree;
 626        unsigned long len;
 627        struct extent_buffer *eb;
 628        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 629
 630        tree = &BTRFS_I(page->mapping->host)->io_tree;
 631        if (page->private == EXTENT_PAGE_PRIVATE)
 632                goto out;
 633        if (!page->private)
 634                goto out;
 635
 636        len = page->private >> 2;
 637        WARN_ON(len == 0);
 638
 639        eb = alloc_extent_buffer(tree, start, len, page);
 640        if (eb == NULL)
 641                goto out;
 642
 643        if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
 644                clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
 645                btree_readahead_hook(root, eb, eb->start, -EIO);
 646        }
 647        free_extent_buffer(eb);
 648
 649out:
 650        return -EIO;    /* we fixed nothing */
 651}
 652
 653static void end_workqueue_bio(struct bio *bio, int err)
 654{
 655        struct end_io_wq *end_io_wq = bio->bi_private;
 656        struct btrfs_fs_info *fs_info;
 657
 658        fs_info = end_io_wq->info;
 659        end_io_wq->error = err;
 660        end_io_wq->work.func = end_workqueue_fn;
 661        end_io_wq->work.flags = 0;
 662
 663        if (bio->bi_rw & REQ_WRITE) {
 664                if (end_io_wq->metadata == 1)
 665                        btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 666                                           &end_io_wq->work);
 667                else if (end_io_wq->metadata == 2)
 668                        btrfs_queue_worker(&fs_info->endio_freespace_worker,
 669                                           &end_io_wq->work);
 670                else
 671                        btrfs_queue_worker(&fs_info->endio_write_workers,
 672                                           &end_io_wq->work);
 673        } else {
 674                if (end_io_wq->metadata)
 675                        btrfs_queue_worker(&fs_info->endio_meta_workers,
 676                                           &end_io_wq->work);
 677                else
 678                        btrfs_queue_worker(&fs_info->endio_workers,
 679                                           &end_io_wq->work);
 680        }
 681}
 682
 683/*
 684 * For the metadata arg you want
 685 *
 686 * 0 - if data
 687 * 1 - if normal metadta
 688 * 2 - if writing to the free space cache area
 689 */
 690int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 691                        int metadata)
 692{
 693        struct end_io_wq *end_io_wq;
 694        end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
 695        if (!end_io_wq)
 696                return -ENOMEM;
 697
 698        end_io_wq->private = bio->bi_private;
 699        end_io_wq->end_io = bio->bi_end_io;
 700        end_io_wq->info = info;
 701        end_io_wq->error = 0;
 702        end_io_wq->bio = bio;
 703        end_io_wq->metadata = metadata;
 704
 705        bio->bi_private = end_io_wq;
 706        bio->bi_end_io = end_workqueue_bio;
 707        return 0;
 708}
 709
 710unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
 711{
 712        unsigned long limit = min_t(unsigned long,
 713                                    info->workers.max_workers,
 714                                    info->fs_devices->open_devices);
 715        return 256 * limit;
 716}
 717
 718static void run_one_async_start(struct btrfs_work *work)
 719{
 720        struct async_submit_bio *async;
 721
 722        async = container_of(work, struct  async_submit_bio, work);
 723        async->submit_bio_start(async->inode, async->rw, async->bio,
 724                               async->mirror_num, async->bio_flags,
 725                               async->bio_offset);
 726}
 727
 728static void run_one_async_done(struct btrfs_work *work)
 729{
 730        struct btrfs_fs_info *fs_info;
 731        struct async_submit_bio *async;
 732        int limit;
 733
 734        async = container_of(work, struct  async_submit_bio, work);
 735        fs_info = BTRFS_I(async->inode)->root->fs_info;
 736
 737        limit = btrfs_async_submit_limit(fs_info);
 738        limit = limit * 2 / 3;
 739
 740        atomic_dec(&fs_info->nr_async_submits);
 741
 742        if (atomic_read(&fs_info->nr_async_submits) < limit &&
 743            waitqueue_active(&fs_info->async_submit_wait))
 744                wake_up(&fs_info->async_submit_wait);
 745
 746        async->submit_bio_done(async->inode, async->rw, async->bio,
 747                               async->mirror_num, async->bio_flags,
 748                               async->bio_offset);
 749}
 750
 751static void run_one_async_free(struct btrfs_work *work)
 752{
 753        struct async_submit_bio *async;
 754
 755        async = container_of(work, struct  async_submit_bio, work);
 756        kfree(async);
 757}
 758
 759int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 760                        int rw, struct bio *bio, int mirror_num,
 761                        unsigned long bio_flags,
 762                        u64 bio_offset,
 763                        extent_submit_bio_hook_t *submit_bio_start,
 764                        extent_submit_bio_hook_t *submit_bio_done)
 765{
 766        struct async_submit_bio *async;
 767
 768        async = kmalloc(sizeof(*async), GFP_NOFS);
 769        if (!async)
 770                return -ENOMEM;
 771
 772        async->inode = inode;
 773        async->rw = rw;
 774        async->bio = bio;
 775        async->mirror_num = mirror_num;
 776        async->submit_bio_start = submit_bio_start;
 777        async->submit_bio_done = submit_bio_done;
 778
 779        async->work.func = run_one_async_start;
 780        async->work.ordered_func = run_one_async_done;
 781        async->work.ordered_free = run_one_async_free;
 782
 783        async->work.flags = 0;
 784        async->bio_flags = bio_flags;
 785        async->bio_offset = bio_offset;
 786
 787        atomic_inc(&fs_info->nr_async_submits);
 788
 789        if (rw & REQ_SYNC)
 790                btrfs_set_work_high_prio(&async->work);
 791
 792        btrfs_queue_worker(&fs_info->workers, &async->work);
 793
 794        while (atomic_read(&fs_info->async_submit_draining) &&
 795              atomic_read(&fs_info->nr_async_submits)) {
 796                wait_event(fs_info->async_submit_wait,
 797                           (atomic_read(&fs_info->nr_async_submits) == 0));
 798        }
 799
 800        return 0;
 801}
 802
 803static int btree_csum_one_bio(struct bio *bio)
 804{
 805        struct bio_vec *bvec = bio->bi_io_vec;
 806        int bio_index = 0;
 807        struct btrfs_root *root;
 808
 809        WARN_ON(bio->bi_vcnt <= 0);
 810        while (bio_index < bio->bi_vcnt) {
 811                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 812                csum_dirty_buffer(root, bvec->bv_page);
 813                bio_index++;
 814                bvec++;
 815        }
 816        return 0;
 817}
 818
 819static int __btree_submit_bio_start(struct inode *inode, int rw,
 820                                    struct bio *bio, int mirror_num,
 821                                    unsigned long bio_flags,
 822                                    u64 bio_offset)
 823{
 824        /*
 825         * when we're called for a write, we're already in the async
 826         * submission context.  Just jump into btrfs_map_bio
 827         */
 828        btree_csum_one_bio(bio);
 829        return 0;
 830}
 831
 832static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 833                                 int mirror_num, unsigned long bio_flags,
 834                                 u64 bio_offset)
 835{
 836        /*
 837         * when we're called for a write, we're already in the async
 838         * submission context.  Just jump into btrfs_map_bio
 839         */
 840        return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 841}
 842
 843static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 844                                 int mirror_num, unsigned long bio_flags,
 845                                 u64 bio_offset)
 846{
 847        int ret;
 848
 849        ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
 850                                          bio, 1);
 851        BUG_ON(ret);
 852
 853        if (!(rw & REQ_WRITE)) {
 854                /*
 855                 * called for a read, do the setup so that checksum validation
 856                 * can happen in the async kernel threads
 857                 */
 858                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 859                                     mirror_num, 0);
 860        }
 861
 862        /*
 863         * kthread helpers are used to submit writes so that checksumming
 864         * can happen in parallel across all CPUs
 865         */
 866        return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 867                                   inode, rw, bio, mirror_num, 0,
 868                                   bio_offset,
 869                                   __btree_submit_bio_start,
 870                                   __btree_submit_bio_done);
 871}
 872
 873#ifdef CONFIG_MIGRATION
 874static int btree_migratepage(struct address_space *mapping,
 875                        struct page *newpage, struct page *page)
 876{
 877        /*
 878         * we can't safely write a btree page from here,
 879         * we haven't done the locking hook
 880         */
 881        if (PageDirty(page))
 882                return -EAGAIN;
 883        /*
 884         * Buffers may be managed in a filesystem specific way.
 885         * We must have no buffers or drop them.
 886         */
 887        if (page_has_private(page) &&
 888            !try_to_release_page(page, GFP_KERNEL))
 889                return -EAGAIN;
 890        return migrate_page(mapping, newpage, page);
 891}
 892#endif
 893
 894static int btree_writepage(struct page *page, struct writeback_control *wbc)
 895{
 896        struct extent_io_tree *tree;
 897        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 898        struct extent_buffer *eb;
 899        int was_dirty;
 900
 901        tree = &BTRFS_I(page->mapping->host)->io_tree;
 902        if (!(current->flags & PF_MEMALLOC)) {
 903                return extent_write_full_page(tree, page,
 904                                              btree_get_extent, wbc);
 905        }
 906
 907        redirty_page_for_writepage(wbc, page);
 908        eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
 909        WARN_ON(!eb);
 910
 911        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
 912        if (!was_dirty) {
 913                spin_lock(&root->fs_info->delalloc_lock);
 914                root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
 915                spin_unlock(&root->fs_info->delalloc_lock);
 916        }
 917        free_extent_buffer(eb);
 918
 919        unlock_page(page);
 920        return 0;
 921}
 922
 923static int btree_writepages(struct address_space *mapping,
 924                            struct writeback_control *wbc)
 925{
 926        struct extent_io_tree *tree;
 927        tree = &BTRFS_I(mapping->host)->io_tree;
 928        if (wbc->sync_mode == WB_SYNC_NONE) {
 929                struct btrfs_root *root = BTRFS_I(mapping->host)->root;
 930                u64 num_dirty;
 931                unsigned long thresh = 32 * 1024 * 1024;
 932
 933                if (wbc->for_kupdate)
 934                        return 0;
 935
 936                /* this is a bit racy, but that's ok */
 937                num_dirty = root->fs_info->dirty_metadata_bytes;
 938                if (num_dirty < thresh)
 939                        return 0;
 940        }
 941        return extent_writepages(tree, mapping, btree_get_extent, wbc);
 942}
 943
 944static int btree_readpage(struct file *file, struct page *page)
 945{
 946        struct extent_io_tree *tree;
 947        tree = &BTRFS_I(page->mapping->host)->io_tree;
 948        return extent_read_full_page(tree, page, btree_get_extent, 0);
 949}
 950
 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 952{
 953        struct extent_io_tree *tree;
 954        struct extent_map_tree *map;
 955        int ret;
 956
 957        if (PageWriteback(page) || PageDirty(page))
 958                return 0;
 959
 960        tree = &BTRFS_I(page->mapping->host)->io_tree;
 961        map = &BTRFS_I(page->mapping->host)->extent_tree;
 962
 963        ret = try_release_extent_state(map, tree, page, gfp_flags);
 964        if (!ret)
 965                return 0;
 966
 967        ret = try_release_extent_buffer(tree, page);
 968        if (ret == 1) {
 969                ClearPagePrivate(page);
 970                set_page_private(page, 0);
 971                page_cache_release(page);
 972        }
 973
 974        return ret;
 975}
 976
 977static void btree_invalidatepage(struct page *page, unsigned long offset)
 978{
 979        struct extent_io_tree *tree;
 980        tree = &BTRFS_I(page->mapping->host)->io_tree;
 981        extent_invalidatepage(tree, page, offset);
 982        btree_releasepage(page, GFP_NOFS);
 983        if (PagePrivate(page)) {
 984                printk(KERN_WARNING "btrfs warning page private not zero "
 985                       "on page %llu\n", (unsigned long long)page_offset(page));
 986                ClearPagePrivate(page);
 987                set_page_private(page, 0);
 988                page_cache_release(page);
 989        }
 990}
 991
 992static const struct address_space_operations btree_aops = {
 993        .readpage       = btree_readpage,
 994        .writepage      = btree_writepage,
 995        .writepages     = btree_writepages,
 996        .releasepage    = btree_releasepage,
 997        .invalidatepage = btree_invalidatepage,
 998#ifdef CONFIG_MIGRATION
 999        .migratepage    = btree_migratepage,
1000#endif
1001};
1002
1003int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1004                         u64 parent_transid)
1005{
1006        struct extent_buffer *buf = NULL;
1007        struct inode *btree_inode = root->fs_info->btree_inode;
1008        int ret = 0;
1009
1010        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1011        if (!buf)
1012                return 0;
1013        read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
1014                                 buf, 0, WAIT_NONE, btree_get_extent, 0);
1015        free_extent_buffer(buf);
1016        return ret;
1017}
1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020                         int mirror_num, struct extent_buffer **eb)
1021{
1022        struct extent_buffer *buf = NULL;
1023        struct inode *btree_inode = root->fs_info->btree_inode;
1024        struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025        int ret;
1026
1027        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028        if (!buf)
1029                return 0;
1030
1031        set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033        ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034                                       btree_get_extent, mirror_num);
1035        if (ret) {
1036                free_extent_buffer(buf);
1037                return ret;
1038        }
1039
1040        if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041                free_extent_buffer(buf);
1042                return -EIO;
1043        } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044                *eb = buf;
1045        } else {
1046                free_extent_buffer(buf);
1047        }
1048        return 0;
1049}
1050
1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1052                                            u64 bytenr, u32 blocksize)
1053{
1054        struct inode *btree_inode = root->fs_info->btree_inode;
1055        struct extent_buffer *eb;
1056        eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
1057                                bytenr, blocksize);
1058        return eb;
1059}
1060
1061struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1062                                                 u64 bytenr, u32 blocksize)
1063{
1064        struct inode *btree_inode = root->fs_info->btree_inode;
1065        struct extent_buffer *eb;
1066
1067        eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
1068                                 bytenr, blocksize, NULL);
1069        return eb;
1070}
1071
1072
1073int btrfs_write_tree_block(struct extent_buffer *buf)
1074{
1075        return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
1076                                        buf->start + buf->len - 1);
1077}
1078
1079int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1080{
1081        return filemap_fdatawait_range(buf->first_page->mapping,
1082                                       buf->start, buf->start + buf->len - 1);
1083}
1084
1085struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1086                                      u32 blocksize, u64 parent_transid)
1087{
1088        struct extent_buffer *buf = NULL;
1089        int ret;
1090
1091        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1092        if (!buf)
1093                return NULL;
1094
1095        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
1096
1097        if (ret == 0)
1098                set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
1099        return buf;
1100
1101}
1102
1103int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1104                     struct extent_buffer *buf)
1105{
1106        struct inode *btree_inode = root->fs_info->btree_inode;
1107        if (btrfs_header_generation(buf) ==
1108            root->fs_info->running_transaction->transid) {
1109                btrfs_assert_tree_locked(buf);
1110
1111                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1112                        spin_lock(&root->fs_info->delalloc_lock);
1113                        if (root->fs_info->dirty_metadata_bytes >= buf->len)
1114                                root->fs_info->dirty_metadata_bytes -= buf->len;
1115                        else
1116                                WARN_ON(1);
1117                        spin_unlock(&root->fs_info->delalloc_lock);
1118                }
1119
1120                /* ugh, clear_extent_buffer_dirty needs to lock the page */
1121                btrfs_set_lock_blocking(buf);
1122                clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
1123                                          buf);
1124        }
1125        return 0;
1126}
1127
1128static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1129                        u32 stripesize, struct btrfs_root *root,
1130                        struct btrfs_fs_info *fs_info,
1131                        u64 objectid)
1132{
1133        root->node = NULL;
1134        root->commit_root = NULL;
1135        root->sectorsize = sectorsize;
1136        root->nodesize = nodesize;
1137        root->leafsize = leafsize;
1138        root->stripesize = stripesize;
1139        root->ref_cows = 0;
1140        root->track_dirty = 0;
1141        root->in_radix = 0;
1142        root->orphan_item_inserted = 0;
1143        root->orphan_cleanup_state = 0;
1144
1145        root->fs_info = fs_info;
1146        root->objectid = objectid;
1147        root->last_trans = 0;
1148        root->highest_objectid = 0;
1149        root->name = NULL;
1150        root->inode_tree = RB_ROOT;
1151        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1152        root->block_rsv = NULL;
1153        root->orphan_block_rsv = NULL;
1154
1155        INIT_LIST_HEAD(&root->dirty_list);
1156        INIT_LIST_HEAD(&root->orphan_list);
1157        INIT_LIST_HEAD(&root->root_list);
1158        spin_lock_init(&root->orphan_lock);
1159        spin_lock_init(&root->inode_lock);
1160        spin_lock_init(&root->accounting_lock);
1161        mutex_init(&root->objectid_mutex);
1162        mutex_init(&root->log_mutex);
1163        init_waitqueue_head(&root->log_writer_wait);
1164        init_waitqueue_head(&root->log_commit_wait[0]);
1165        init_waitqueue_head(&root->log_commit_wait[1]);
1166        atomic_set(&root->log_commit[0], 0);
1167        atomic_set(&root->log_commit[1], 0);
1168        atomic_set(&root->log_writers, 0);
1169        root->log_batch = 0;
1170        root->log_transid = 0;
1171        root->last_log_commit = 0;
1172        extent_io_tree_init(&root->dirty_log_pages,
1173                             fs_info->btree_inode->i_mapping);
1174
1175        memset(&root->root_key, 0, sizeof(root->root_key));
1176        memset(&root->root_item, 0, sizeof(root->root_item));
1177        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1178        memset(&root->root_kobj, 0, sizeof(root->root_kobj));
1179        root->defrag_trans_start = fs_info->generation;
1180        init_completion(&root->kobj_unregister);
1181        root->defrag_running = 0;
1182        root->root_key.objectid = objectid;
1183        root->anon_dev = 0;
1184        return 0;
1185}
1186
1187static int find_and_setup_root(struct btrfs_root *tree_root,
1188                               struct btrfs_fs_info *fs_info,
1189                               u64 objectid,
1190                               struct btrfs_root *root)
1191{
1192        int ret;
1193        u32 blocksize;
1194        u64 generation;
1195
1196        __setup_root(tree_root->nodesize, tree_root->leafsize,
1197                     tree_root->sectorsize, tree_root->stripesize,
1198                     root, fs_info, objectid);
1199        ret = btrfs_find_last_root(tree_root, objectid,
1200                                   &root->root_item, &root->root_key);
1201        if (ret > 0)
1202                return -ENOENT;
1203        BUG_ON(ret);
1204
1205        generation = btrfs_root_generation(&root->root_item);
1206        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207        root->commit_root = NULL;
1208        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1209                                     blocksize, generation);
1210        if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1211                free_extent_buffer(root->node);
1212                root->node = NULL;
1213                return -EIO;
1214        }
1215        root->commit_root = btrfs_root_node(root);
1216        return 0;
1217}
1218
1219static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1220                                         struct btrfs_fs_info *fs_info)
1221{
1222        struct btrfs_root *root;
1223        struct btrfs_root *tree_root = fs_info->tree_root;
1224        struct extent_buffer *leaf;
1225
1226        root = kzalloc(sizeof(*root), GFP_NOFS);
1227        if (!root)
1228                return ERR_PTR(-ENOMEM);
1229
1230        __setup_root(tree_root->nodesize, tree_root->leafsize,
1231                     tree_root->sectorsize, tree_root->stripesize,
1232                     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1233
1234        root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1235        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1236        root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1237        /*
1238         * log trees do not get reference counted because they go away
1239         * before a real commit is actually done.  They do store pointers
1240         * to file data extents, and those reference counts still get
1241         * updated (along with back refs to the log tree).
1242         */
1243        root->ref_cows = 0;
1244
1245        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1246                                      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
1247        if (IS_ERR(leaf)) {
1248                kfree(root);
1249                return ERR_CAST(leaf);
1250        }
1251
1252        memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1253        btrfs_set_header_bytenr(leaf, leaf->start);
1254        btrfs_set_header_generation(leaf, trans->transid);
1255        btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1256        btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
1257        root->node = leaf;
1258
1259        write_extent_buffer(root->node, root->fs_info->fsid,
1260                            (unsigned long)btrfs_header_fsid(root->node),
1261                            BTRFS_FSID_SIZE);
1262        btrfs_mark_buffer_dirty(root->node);
1263        btrfs_tree_unlock(root->node);
1264        return root;
1265}
1266
1267int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1268                             struct btrfs_fs_info *fs_info)
1269{
1270        struct btrfs_root *log_root;
1271
1272        log_root = alloc_log_tree(trans, fs_info);
1273        if (IS_ERR(log_root))
1274                return PTR_ERR(log_root);
1275        WARN_ON(fs_info->log_root_tree);
1276        fs_info->log_root_tree = log_root;
1277        return 0;
1278}
1279
1280int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1281                       struct btrfs_root *root)
1282{
1283        struct btrfs_root *log_root;
1284        struct btrfs_inode_item *inode_item;
1285
1286        log_root = alloc_log_tree(trans, root->fs_info);
1287        if (IS_ERR(log_root))
1288                return PTR_ERR(log_root);
1289
1290        log_root->last_trans = trans->transid;
1291        log_root->root_key.offset = root->root_key.objectid;
1292
1293        inode_item = &log_root->root_item.inode;
1294        inode_item->generation = cpu_to_le64(1);
1295        inode_item->size = cpu_to_le64(3);
1296        inode_item->nlink = cpu_to_le32(1);
1297        inode_item->nbytes = cpu_to_le64(root->leafsize);
1298        inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1299
1300        btrfs_set_root_node(&log_root->root_item, log_root->node);
1301
1302        WARN_ON(root->log_root);
1303        root->log_root = log_root;
1304        root->log_transid = 0;
1305        root->last_log_commit = 0;
1306        return 0;
1307}
1308
1309struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1310                                               struct btrfs_key *location)
1311{
1312        struct btrfs_root *root;
1313        struct btrfs_fs_info *fs_info = tree_root->fs_info;
1314        struct btrfs_path *path;
1315        struct extent_buffer *l;
1316        u64 generation;
1317        u32 blocksize;
1318        int ret = 0;
1319
1320        root = kzalloc(sizeof(*root), GFP_NOFS);
1321        if (!root)
1322                return ERR_PTR(-ENOMEM);
1323        if (location->offset == (u64)-1) {
1324                ret = find_and_setup_root(tree_root, fs_info,
1325                                          location->objectid, root);
1326                if (ret) {
1327                        kfree(root);
1328                        return ERR_PTR(ret);
1329                }
1330                goto out;
1331        }
1332
1333        __setup_root(tree_root->nodesize, tree_root->leafsize,
1334                     tree_root->sectorsize, tree_root->stripesize,
1335                     root, fs_info, location->objectid);
1336
1337        path = btrfs_alloc_path();
1338        if (!path) {
1339                kfree(root);
1340                return ERR_PTR(-ENOMEM);
1341        }
1342        ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1343        if (ret == 0) {
1344                l = path->nodes[0];
1345                read_extent_buffer(l, &root->root_item,
1346                                btrfs_item_ptr_offset(l, path->slots[0]),
1347                                sizeof(root->root_item));
1348                memcpy(&root->root_key, location, sizeof(*location));
1349        }
1350        btrfs_free_path(path);
1351        if (ret) {
1352                kfree(root);
1353                if (ret > 0)
1354                        ret = -ENOENT;
1355                return ERR_PTR(ret);
1356        }
1357
1358        generation = btrfs_root_generation(&root->root_item);
1359        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1360        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1361                                     blocksize, generation);
1362        root->commit_root = btrfs_root_node(root);
1363        BUG_ON(!root->node);
1364out:
1365        if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1366                root->ref_cows = 1;
1367                btrfs_check_and_init_root_item(&root->root_item);
1368        }
1369
1370        return root;
1371}
1372
1373struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1374                                              struct btrfs_key *location)
1375{
1376        struct btrfs_root *root;
1377        int ret;
1378
1379        if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1380                return fs_info->tree_root;
1381        if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1382                return fs_info->extent_root;
1383        if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1384                return fs_info->chunk_root;
1385        if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1386                return fs_info->dev_root;
1387        if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1388                return fs_info->csum_root;
1389again:
1390        spin_lock(&fs_info->fs_roots_radix_lock);
1391        root = radix_tree_lookup(&fs_info->fs_roots_radix,
1392                                 (unsigned long)location->objectid);
1393        spin_unlock(&fs_info->fs_roots_radix_lock);
1394        if (root)
1395                return root;
1396
1397        root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1398        if (IS_ERR(root))
1399                return root;
1400
1401        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1402        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1403                                        GFP_NOFS);
1404        if (!root->free_ino_pinned || !root->free_ino_ctl) {
1405                ret = -ENOMEM;
1406                goto fail;
1407        }
1408
1409        btrfs_init_free_ino_ctl(root);
1410        mutex_init(&root->fs_commit_mutex);
1411        spin_lock_init(&root->cache_lock);
1412        init_waitqueue_head(&root->cache_wait);
1413
1414        ret = get_anon_bdev(&root->anon_dev);
1415        if (ret)
1416                goto fail;
1417
1418        if (btrfs_root_refs(&root->root_item) == 0) {
1419                ret = -ENOENT;
1420                goto fail;
1421        }
1422
1423        ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1424        if (ret < 0)
1425                goto fail;
1426        if (ret == 0)
1427                root->orphan_item_inserted = 1;
1428
1429        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1430        if (ret)
1431                goto fail;
1432
1433        spin_lock(&fs_info->fs_roots_radix_lock);
1434        ret = radix_tree_insert(&fs_info->fs_roots_radix,
1435                                (unsigned long)root->root_key.objectid,
1436                                root);
1437        if (ret == 0)
1438                root->in_radix = 1;
1439
1440        spin_unlock(&fs_info->fs_roots_radix_lock);
1441        radix_tree_preload_end();
1442        if (ret) {
1443                if (ret == -EEXIST) {
1444                        free_fs_root(root);
1445                        goto again;
1446                }
1447                goto fail;
1448        }
1449
1450        ret = btrfs_find_dead_roots(fs_info->tree_root,
1451                                    root->root_key.objectid);
1452        WARN_ON(ret);
1453        return root;
1454fail:
1455        free_fs_root(root);
1456        return ERR_PTR(ret);
1457}
1458
1459static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1460{
1461        struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1462        int ret = 0;
1463        struct btrfs_device *device;
1464        struct backing_dev_info *bdi;
1465
1466        rcu_read_lock();
1467        list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1468                if (!device->bdev)
1469                        continue;
1470                bdi = blk_get_backing_dev_info(device->bdev);
1471                if (bdi && bdi_congested(bdi, bdi_bits)) {
1472                        ret = 1;
1473                        break;
1474                }
1475        }
1476        rcu_read_unlock();
1477        return ret;
1478}
1479
1480/*
1481 * If this fails, caller must call bdi_destroy() to get rid of the
1482 * bdi again.
1483 */
1484static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1485{
1486        int err;
1487
1488        bdi->capabilities = BDI_CAP_MAP_COPY;
1489        err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1490        if (err)
1491                return err;
1492
1493        bdi->ra_pages   = default_backing_dev_info.ra_pages;
1494        bdi->congested_fn       = btrfs_congested_fn;
1495        bdi->congested_data     = info;
1496        return 0;
1497}
1498
1499static int bio_ready_for_csum(struct bio *bio)
1500{
1501        u64 length = 0;
1502        u64 buf_len = 0;
1503        u64 start = 0;
1504        struct page *page;
1505        struct extent_io_tree *io_tree = NULL;
1506        struct bio_vec *bvec;
1507        int i;
1508        int ret;
1509
1510        bio_for_each_segment(bvec, bio, i) {
1511                page = bvec->bv_page;
1512                if (page->private == EXTENT_PAGE_PRIVATE) {
1513                        length += bvec->bv_len;
1514                        continue;
1515                }
1516                if (!page->private) {
1517                        length += bvec->bv_len;
1518                        continue;
1519                }
1520                length = bvec->bv_len;
1521                buf_len = page->private >> 2;
1522                start = page_offset(page) + bvec->bv_offset;
1523                io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1524        }
1525        /* are we fully contained in this bio? */
1526        if (buf_len <= length)
1527                return 1;
1528
1529        ret = extent_range_uptodate(io_tree, start + length,
1530                                    start + buf_len - 1);
1531        return ret;
1532}
1533
1534/*
1535 * called by the kthread helper functions to finally call the bio end_io
1536 * functions.  This is where read checksum verification actually happens
1537 */
1538static void end_workqueue_fn(struct btrfs_work *work)
1539{
1540        struct bio *bio;
1541        struct end_io_wq *end_io_wq;
1542        struct btrfs_fs_info *fs_info;
1543        int error;
1544
1545        end_io_wq = container_of(work, struct end_io_wq, work);
1546        bio = end_io_wq->bio;
1547        fs_info = end_io_wq->info;
1548
1549        /* metadata bio reads are special because the whole tree block must
1550         * be checksummed at once.  This makes sure the entire block is in
1551         * ram and up to date before trying to verify things.  For
1552         * blocksize <= pagesize, it is basically a noop
1553         */
1554        if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
1555            !bio_ready_for_csum(bio)) {
1556                btrfs_queue_worker(&fs_info->endio_meta_workers,
1557                                   &end_io_wq->work);
1558                return;
1559        }
1560        error = end_io_wq->error;
1561        bio->bi_private = end_io_wq->private;
1562        bio->bi_end_io = end_io_wq->end_io;
1563        kfree(end_io_wq);
1564        bio_endio(bio, error);
1565}
1566
1567static int cleaner_kthread(void *arg)
1568{
1569        struct btrfs_root *root = arg;
1570
1571        do {
1572                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1573
1574                if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1575                    mutex_trylock(&root->fs_info->cleaner_mutex)) {
1576                        btrfs_run_delayed_iputs(root);
1577                        btrfs_clean_old_snapshots(root);
1578                        mutex_unlock(&root->fs_info->cleaner_mutex);
1579                        btrfs_run_defrag_inodes(root->fs_info);
1580                }
1581
1582                if (freezing(current)) {
1583                        refrigerator();
1584                } else {
1585                        set_current_state(TASK_INTERRUPTIBLE);
1586                        if (!kthread_should_stop())
1587                                schedule();
1588                        __set_current_state(TASK_RUNNING);
1589                }
1590        } while (!kthread_should_stop());
1591        return 0;
1592}
1593
1594static int transaction_kthread(void *arg)
1595{
1596        struct btrfs_root *root = arg;
1597        struct btrfs_trans_handle *trans;
1598        struct btrfs_transaction *cur;
1599        u64 transid;
1600        unsigned long now;
1601        unsigned long delay;
1602        int ret;
1603
1604        do {
1605                delay = HZ * 30;
1606                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1607                mutex_lock(&root->fs_info->transaction_kthread_mutex);
1608
1609                spin_lock(&root->fs_info->trans_lock);
1610                cur = root->fs_info->running_transaction;
1611                if (!cur) {
1612                        spin_unlock(&root->fs_info->trans_lock);
1613                        goto sleep;
1614                }
1615
1616                now = get_seconds();
1617                if (!cur->blocked &&
1618                    (now < cur->start_time || now - cur->start_time < 30)) {
1619                        spin_unlock(&root->fs_info->trans_lock);
1620                        delay = HZ * 5;
1621                        goto sleep;
1622                }
1623                transid = cur->transid;
1624                spin_unlock(&root->fs_info->trans_lock);
1625
1626                trans = btrfs_join_transaction(root);
1627                BUG_ON(IS_ERR(trans));
1628                if (transid == trans->transid) {
1629                        ret = btrfs_commit_transaction(trans, root);
1630                        BUG_ON(ret);
1631                } else {
1632                        btrfs_end_transaction(trans, root);
1633                }
1634sleep:
1635                wake_up_process(root->fs_info->cleaner_kthread);
1636                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1637
1638                if (freezing(current)) {
1639                        refrigerator();
1640                } else {
1641                        set_current_state(TASK_INTERRUPTIBLE);
1642                        if (!kthread_should_stop() &&
1643                            !btrfs_transaction_blocked(root->fs_info))
1644                                schedule_timeout(delay);
1645                        __set_current_state(TASK_RUNNING);
1646                }
1647        } while (!kthread_should_stop());
1648        return 0;
1649}
1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups.  The index of the highest array is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest  root in the array with the generation
1658 * in the super block.  If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662        u64 cur;
1663        int newest_index = -1;
1664        struct btrfs_root_backup *root_backup;
1665        int i;
1666
1667        for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668                root_backup = info->super_copy->super_roots + i;
1669                cur = btrfs_backup_tree_root_gen(root_backup);
1670                if (cur == newest_gen)
1671                        newest_index = i;
1672        }
1673
1674        /* check to see if we actually wrapped around */
1675        if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676                root_backup = info->super_copy->super_roots;
1677                cur = btrfs_backup_tree_root_gen(root_backup);
1678                if (cur == newest_gen)
1679                        newest_index = 0;
1680        }
1681        return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array.  This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691                                     u64 newest_gen)
1692{
1693        int newest_index = -1;
1694
1695        newest_index = find_newest_super_backup(info, newest_gen);
1696        /* if there was garbage in there, just move along */
1697        if (newest_index == -1) {
1698                info->backup_root_index = 0;
1699        } else {
1700                info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701        }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711        int next_backup;
1712        struct btrfs_root_backup *root_backup;
1713        int last_backup;
1714
1715        next_backup = info->backup_root_index;
1716        last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717                BTRFS_NUM_BACKUP_ROOTS;
1718
1719        /*
1720         * just overwrite the last backup if we're at the same generation
1721         * this happens only at umount
1722         */
1723        root_backup = info->super_for_commit->super_roots + last_backup;
1724        if (btrfs_backup_tree_root_gen(root_backup) ==
1725            btrfs_header_generation(info->tree_root->node))
1726                next_backup = last_backup;
1727
1728        root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730        /*
1731         * make sure all of our padding and empty slots get zero filled
1732         * regardless of which ones we use today
1733         */
1734        memset(root_backup, 0, sizeof(*root_backup));
1735
1736        info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738        btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739        btrfs_set_backup_tree_root_gen(root_backup,
1740                               btrfs_header_generation(info->tree_root->node));
1741
1742        btrfs_set_backup_tree_root_level(root_backup,
1743                               btrfs_header_level(info->tree_root->node));
1744
1745        btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746        btrfs_set_backup_chunk_root_gen(root_backup,
1747                               btrfs_header_generation(info->chunk_root->node));
1748        btrfs_set_backup_chunk_root_level(root_backup,
1749                               btrfs_header_level(info->chunk_root->node));
1750
1751        btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752        btrfs_set_backup_extent_root_gen(root_backup,
1753                               btrfs_header_generation(info->extent_root->node));
1754        btrfs_set_backup_extent_root_level(root_backup,
1755                               btrfs_header_level(info->extent_root->node));
1756
1757        /*
1758         * we might commit during log recovery, which happens before we set
1759         * the fs_root.  Make sure it is valid before we fill it in.
1760         */
1761        if (info->fs_root && info->fs_root->node) {
1762                btrfs_set_backup_fs_root(root_backup,
1763                                         info->fs_root->node->start);
1764                btrfs_set_backup_fs_root_gen(root_backup,
1765                               btrfs_header_generation(info->fs_root->node));
1766                btrfs_set_backup_fs_root_level(root_backup,
1767                               btrfs_header_level(info->fs_root->node));
1768        }
1769
1770        btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771        btrfs_set_backup_dev_root_gen(root_backup,
1772                               btrfs_header_generation(info->dev_root->node));
1773        btrfs_set_backup_dev_root_level(root_backup,
1774                                       btrfs_header_level(info->dev_root->node));
1775
1776        btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777        btrfs_set_backup_csum_root_gen(root_backup,
1778                               btrfs_header_generation(info->csum_root->node));
1779        btrfs_set_backup_csum_root_level(root_backup,
1780                               btrfs_header_level(info->csum_root->node));
1781
1782        btrfs_set_backup_total_bytes(root_backup,
1783                             btrfs_super_total_bytes(info->super_copy));
1784        btrfs_set_backup_bytes_used(root_backup,
1785                             btrfs_super_bytes_used(info->super_copy));
1786        btrfs_set_backup_num_devices(root_backup,
1787                             btrfs_super_num_devices(info->super_copy));
1788
1789        /*
1790         * if we don't copy this out to the super_copy, it won't get remembered
1791         * for the next commit
1792         */
1793        memcpy(&info->super_copy->super_roots,
1794               &info->super_for_commit->super_roots,
1795               sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block.  It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807                                     struct btrfs_super_block *super,
1808                                     int *num_backups_tried, int *backup_index)
1809{
1810        struct btrfs_root_backup *root_backup;
1811        int newest = *backup_index;
1812
1813        if (*num_backups_tried == 0) {
1814                u64 gen = btrfs_super_generation(super);
1815
1816                newest = find_newest_super_backup(info, gen);
1817                if (newest == -1)
1818                        return -1;
1819
1820                *backup_index = newest;
1821                *num_backups_tried = 1;
1822        } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823                /* we've tried all the backups, all done */
1824                return -1;
1825        } else {
1826                /* jump to the next oldest backup */
1827                newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828                        BTRFS_NUM_BACKUP_ROOTS;
1829                *backup_index = newest;
1830                *num_backups_tried += 1;
1831        }
1832        root_backup = super->super_roots + newest;
1833
1834        btrfs_set_super_generation(super,
1835                                   btrfs_backup_tree_root_gen(root_backup));
1836        btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837        btrfs_set_super_root_level(super,
1838                                   btrfs_backup_tree_root_level(root_backup));
1839        btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841        /*
1842         * fixme: the total bytes and num_devices need to match or we should
1843         * need a fsck
1844         */
1845        btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846        btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847        return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853        free_extent_buffer(info->tree_root->node);
1854        free_extent_buffer(info->tree_root->commit_root);
1855        free_extent_buffer(info->dev_root->node);
1856        free_extent_buffer(info->dev_root->commit_root);
1857        free_extent_buffer(info->extent_root->node);
1858        free_extent_buffer(info->extent_root->commit_root);
1859        free_extent_buffer(info->csum_root->node);
1860        free_extent_buffer(info->csum_root->commit_root);
1861
1862        info->tree_root->node = NULL;
1863        info->tree_root->commit_root = NULL;
1864        info->dev_root->node = NULL;
1865        info->dev_root->commit_root = NULL;
1866        info->extent_root->node = NULL;
1867        info->extent_root->commit_root = NULL;
1868        info->csum_root->node = NULL;
1869        info->csum_root->commit_root = NULL;
1870
1871        if (chunk_root) {
1872                free_extent_buffer(info->chunk_root->node);
1873                free_extent_buffer(info->chunk_root->commit_root);
1874                info->chunk_root->node = NULL;
1875                info->chunk_root->commit_root = NULL;
1876        }
1877}
1878
1879
1880struct btrfs_root *open_ctree(struct super_block *sb,
1881                              struct btrfs_fs_devices *fs_devices,
1882                              char *options)
1883{
1884        u32 sectorsize;
1885        u32 nodesize;
1886        u32 leafsize;
1887        u32 blocksize;
1888        u32 stripesize;
1889        u64 generation;
1890        u64 features;
1891        struct btrfs_key location;
1892        struct buffer_head *bh;
1893        struct btrfs_super_block *disk_super;
1894        struct btrfs_root *tree_root = btrfs_sb(sb);
1895        struct btrfs_fs_info *fs_info = tree_root->fs_info;
1896        struct btrfs_root *extent_root;
1897        struct btrfs_root *csum_root;
1898        struct btrfs_root *chunk_root;
1899        struct btrfs_root *dev_root;
1900        struct btrfs_root *log_tree_root;
1901        int ret;
1902        int err = -EINVAL;
1903        int num_backups_tried = 0;
1904        int backup_index = 0;
1905
1906        extent_root = fs_info->extent_root =
1907                kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908        csum_root = fs_info->csum_root =
1909                kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910        chunk_root = fs_info->chunk_root =
1911                kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912        dev_root = fs_info->dev_root =
1913                kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915        if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1916                err = -ENOMEM;
1917                goto fail;
1918        }
1919
1920        ret = init_srcu_struct(&fs_info->subvol_srcu);
1921        if (ret) {
1922                err = ret;
1923                goto fail;
1924        }
1925
1926        ret = setup_bdi(fs_info, &fs_info->bdi);
1927        if (ret) {
1928                err = ret;
1929                goto fail_srcu;
1930        }
1931
1932        fs_info->btree_inode = new_inode(sb);
1933        if (!fs_info->btree_inode) {
1934                err = -ENOMEM;
1935                goto fail_bdi;
1936        }
1937
1938        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
1939
1940        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1941        INIT_LIST_HEAD(&fs_info->trans_list);
1942        INIT_LIST_HEAD(&fs_info->dead_roots);
1943        INIT_LIST_HEAD(&fs_info->delayed_iputs);
1944        INIT_LIST_HEAD(&fs_info->hashers);
1945        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1946        INIT_LIST_HEAD(&fs_info->ordered_operations);
1947        INIT_LIST_HEAD(&fs_info->caching_block_groups);
1948        spin_lock_init(&fs_info->delalloc_lock);
1949        spin_lock_init(&fs_info->trans_lock);
1950        spin_lock_init(&fs_info->ref_cache_lock);
1951        spin_lock_init(&fs_info->fs_roots_radix_lock);
1952        spin_lock_init(&fs_info->delayed_iput_lock);
1953        spin_lock_init(&fs_info->defrag_inodes_lock);
1954        spin_lock_init(&fs_info->free_chunk_lock);
1955        mutex_init(&fs_info->reloc_mutex);
1956
1957        init_completion(&fs_info->kobj_unregister);
1958        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1959        INIT_LIST_HEAD(&fs_info->space_info);
1960        btrfs_mapping_init(&fs_info->mapping_tree);
1961        btrfs_init_block_rsv(&fs_info->global_block_rsv);
1962        btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
1963        btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1964        btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1965        btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1966        btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1967        atomic_set(&fs_info->nr_async_submits, 0);
1968        atomic_set(&fs_info->async_delalloc_pages, 0);
1969        atomic_set(&fs_info->async_submit_draining, 0);
1970        atomic_set(&fs_info->nr_async_bios, 0);
1971        atomic_set(&fs_info->defrag_running, 0);
1972        fs_info->sb = sb;
1973        fs_info->max_inline = 8192 * 1024;
1974        fs_info->metadata_ratio = 0;
1975        fs_info->defrag_inodes = RB_ROOT;
1976        fs_info->trans_no_join = 0;
1977        fs_info->free_chunk_space = 0;
1978
1979        /* readahead state */
1980        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981        spin_lock_init(&fs_info->reada_lock);
1982
1983        fs_info->thread_pool_size = min_t(unsigned long,
1984                                          num_online_cpus() + 2, 8);
1985
1986        INIT_LIST_HEAD(&fs_info->ordered_extents);
1987        spin_lock_init(&fs_info->ordered_extent_lock);
1988        fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
1989                                        GFP_NOFS);
1990        if (!fs_info->delayed_root) {
1991                err = -ENOMEM;
1992                goto fail_iput;
1993        }
1994        btrfs_init_delayed_root(fs_info->delayed_root);
1995
1996        mutex_init(&fs_info->scrub_lock);
1997        atomic_set(&fs_info->scrubs_running, 0);
1998        atomic_set(&fs_info->scrub_pause_req, 0);
1999        atomic_set(&fs_info->scrubs_paused, 0);
2000        atomic_set(&fs_info->scrub_cancel_req, 0);
2001        init_waitqueue_head(&fs_info->scrub_pause_wait);
2002        init_rwsem(&fs_info->scrub_super_lock);
2003        fs_info->scrub_workers_refcnt = 0;
2004
2005        sb->s_blocksize = 4096;
2006        sb->s_blocksize_bits = blksize_bits(4096);
2007        sb->s_bdi = &fs_info->bdi;
2008
2009        fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2010        set_nlink(fs_info->btree_inode, 1);
2011        /*
2012         * we set the i_size on the btree inode to the max possible int.
2013         * the real end of the address space is determined by all of
2014         * the devices in the system
2015         */
2016        fs_info->btree_inode->i_size = OFFSET_MAX;
2017        fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2018        fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
2019
2020        RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2021        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2022                             fs_info->btree_inode->i_mapping);
2023        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2024
2025        BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2026
2027        BTRFS_I(fs_info->btree_inode)->root = tree_root;
2028        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2029               sizeof(struct btrfs_key));
2030        BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
2031        insert_inode_hash(fs_info->btree_inode);
2032
2033        spin_lock_init(&fs_info->block_group_cache_lock);
2034        fs_info->block_group_cache_tree = RB_ROOT;
2035
2036        extent_io_tree_init(&fs_info->freed_extents[0],
2037                             fs_info->btree_inode->i_mapping);
2038        extent_io_tree_init(&fs_info->freed_extents[1],
2039                             fs_info->btree_inode->i_mapping);
2040        fs_info->pinned_extents = &fs_info->freed_extents[0];
2041        fs_info->do_barriers = 1;
2042
2043
2044        mutex_init(&fs_info->ordered_operations_mutex);
2045        mutex_init(&fs_info->tree_log_mutex);
2046        mutex_init(&fs_info->chunk_mutex);
2047        mutex_init(&fs_info->transaction_kthread_mutex);
2048        mutex_init(&fs_info->cleaner_mutex);
2049        mutex_init(&fs_info->volume_mutex);
2050        init_rwsem(&fs_info->extent_commit_sem);
2051        init_rwsem(&fs_info->cleanup_work_sem);
2052        init_rwsem(&fs_info->subvol_sem);
2053
2054        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2055        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2056
2057        init_waitqueue_head(&fs_info->transaction_throttle);
2058        init_waitqueue_head(&fs_info->transaction_wait);
2059        init_waitqueue_head(&fs_info->transaction_blocked_wait);
2060        init_waitqueue_head(&fs_info->async_submit_wait);
2061
2062        __setup_root(4096, 4096, 4096, 4096, tree_root,
2063                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
2064
2065        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
2066        if (!bh) {
2067                err = -EINVAL;
2068                goto fail_alloc;
2069        }
2070
2071        memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
2072        memcpy(fs_info->super_for_commit, fs_info->super_copy,
2073               sizeof(*fs_info->super_for_commit));
2074        brelse(bh);
2075
2076        memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
2077
2078        disk_super = fs_info->super_copy;
2079        if (!btrfs_super_root(disk_super))
2080                goto fail_alloc;
2081
2082        /* check FS state, whether FS is broken. */
2083        fs_info->fs_state |= btrfs_super_flags(disk_super);
2084
2085        btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2086
2087        /*
2088         * run through our array of backup supers and setup
2089         * our ring pointer to the oldest one
2090         */
2091        generation = btrfs_super_generation(disk_super);
2092        find_oldest_super_backup(fs_info, generation);
2093
2094        /*
2095         * In the long term, we'll store the compression type in the super
2096         * block, and it'll be used for per file compression control.
2097         */
2098        fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2099
2100        ret = btrfs_parse_options(tree_root, options);
2101        if (ret) {
2102                err = ret;
2103                goto fail_alloc;
2104        }
2105
2106        features = btrfs_super_incompat_flags(disk_super) &
2107                ~BTRFS_FEATURE_INCOMPAT_SUPP;
2108        if (features) {
2109                printk(KERN_ERR "BTRFS: couldn't mount because of "
2110                       "unsupported optional features (%Lx).\n",
2111                       (unsigned long long)features);
2112                err = -EINVAL;
2113                goto fail_alloc;
2114        }
2115
2116        features = btrfs_super_incompat_flags(disk_super);
2117        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
2118        if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
2119                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2120        btrfs_set_super_incompat_flags(disk_super, features);
2121
2122        features = btrfs_super_compat_ro_flags(disk_super) &
2123                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
2124        if (!(sb->s_flags & MS_RDONLY) && features) {
2125                printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
2126                       "unsupported option features (%Lx).\n",
2127                       (unsigned long long)features);
2128                err = -EINVAL;
2129                goto fail_alloc;
2130        }
2131
2132        btrfs_init_workers(&fs_info->generic_worker,
2133                           "genwork", 1, NULL);
2134
2135        btrfs_init_workers(&fs_info->workers, "worker",
2136                           fs_info->thread_pool_size,
2137                           &fs_info->generic_worker);
2138
2139        btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
2140                           fs_info->thread_pool_size,
2141                           &fs_info->generic_worker);
2142
2143        btrfs_init_workers(&fs_info->submit_workers, "submit",
2144                           min_t(u64, fs_devices->num_devices,
2145                           fs_info->thread_pool_size),
2146                           &fs_info->generic_worker);
2147
2148        btrfs_init_workers(&fs_info->caching_workers, "cache",
2149                           2, &fs_info->generic_worker);
2150
2151        /* a higher idle thresh on the submit workers makes it much more
2152         * likely that bios will be send down in a sane order to the
2153         * devices
2154         */
2155        fs_info->submit_workers.idle_thresh = 64;
2156
2157        fs_info->workers.idle_thresh = 16;
2158        fs_info->workers.ordered = 1;
2159
2160        fs_info->delalloc_workers.idle_thresh = 2;
2161        fs_info->delalloc_workers.ordered = 1;
2162
2163        btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
2164                           &fs_info->generic_worker);
2165        btrfs_init_workers(&fs_info->endio_workers, "endio",
2166                           fs_info->thread_pool_size,
2167                           &fs_info->generic_worker);
2168        btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
2169                           fs_info->thread_pool_size,
2170                           &fs_info->generic_worker);
2171        btrfs_init_workers(&fs_info->endio_meta_write_workers,
2172                           "endio-meta-write", fs_info->thread_pool_size,
2173                           &fs_info->generic_worker);
2174        btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2175                           fs_info->thread_pool_size,
2176                           &fs_info->generic_worker);
2177        btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
2178                           1, &fs_info->generic_worker);
2179        btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
2180                           fs_info->thread_pool_size,
2181                           &fs_info->generic_worker);
2182        btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183                           fs_info->thread_pool_size,
2184                           &fs_info->generic_worker);
2185
2186        /*
2187         * endios are largely parallel and should have a very
2188         * low idle thresh
2189         */
2190        fs_info->endio_workers.idle_thresh = 4;
2191        fs_info->endio_meta_workers.idle_thresh = 4;
2192
2193        fs_info->endio_write_workers.idle_thresh = 2;
2194        fs_info->endio_meta_write_workers.idle_thresh = 2;
2195        fs_info->readahead_workers.idle_thresh = 2;
2196
2197        /*
2198         * btrfs_start_workers can really only fail because of ENOMEM so just
2199         * return -ENOMEM if any of these fail.
2200         */
2201        ret = btrfs_start_workers(&fs_info->workers);
2202        ret |= btrfs_start_workers(&fs_info->generic_worker);
2203        ret |= btrfs_start_workers(&fs_info->submit_workers);
2204        ret |= btrfs_start_workers(&fs_info->delalloc_workers);
2205        ret |= btrfs_start_workers(&fs_info->fixup_workers);
2206        ret |= btrfs_start_workers(&fs_info->endio_workers);
2207        ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2208        ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2209        ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2210        ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
2211        ret |= btrfs_start_workers(&fs_info->delayed_workers);
2212        ret |= btrfs_start_workers(&fs_info->caching_workers);
2213        ret |= btrfs_start_workers(&fs_info->readahead_workers);
2214        if (ret) {
2215                ret = -ENOMEM;
2216                goto fail_sb_buffer;
2217        }
2218
2219        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
2220        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
2221                                    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
2222
2223        nodesize = btrfs_super_nodesize(disk_super);
2224        leafsize = btrfs_super_leafsize(disk_super);
2225        sectorsize = btrfs_super_sectorsize(disk_super);
2226        stripesize = btrfs_super_stripesize(disk_super);
2227        tree_root->nodesize = nodesize;
2228        tree_root->leafsize = leafsize;
2229        tree_root->sectorsize = sectorsize;
2230        tree_root->stripesize = stripesize;
2231
2232        sb->s_blocksize = sectorsize;
2233        sb->s_blocksize_bits = blksize_bits(sectorsize);
2234
2235        if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
2236                    sizeof(disk_super->magic))) {
2237                printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
2238                goto fail_sb_buffer;
2239        }
2240
2241        mutex_lock(&fs_info->chunk_mutex);
2242        ret = btrfs_read_sys_array(tree_root);
2243        mutex_unlock(&fs_info->chunk_mutex);
2244        if (ret) {
2245                printk(KERN_WARNING "btrfs: failed to read the system "
2246                       "array on %s\n", sb->s_id);
2247                goto fail_sb_buffer;
2248        }
2249
2250        blocksize = btrfs_level_size(tree_root,
2251                                     btrfs_super_chunk_root_level(disk_super));
2252        generation = btrfs_super_chunk_root_generation(disk_super);
2253
2254        __setup_root(nodesize, leafsize, sectorsize, stripesize,
2255                     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
2256
2257        chunk_root->node = read_tree_block(chunk_root,
2258                                           btrfs_super_chunk_root(disk_super),
2259                                           blocksize, generation);
2260        BUG_ON(!chunk_root->node);
2261        if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
2262                printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
2263                       sb->s_id);
2264                goto fail_tree_roots;
2265        }
2266        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
2267        chunk_root->commit_root = btrfs_root_node(chunk_root);
2268
2269        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
2270           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
2271           BTRFS_UUID_SIZE);
2272
2273        mutex_lock(&fs_info->chunk_mutex);
2274        ret = btrfs_read_chunk_tree(chunk_root);
2275        mutex_unlock(&fs_info->chunk_mutex);
2276        if (ret) {
2277                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2278                       sb->s_id);
2279                goto fail_tree_roots;
2280        }
2281
2282        btrfs_close_extra_devices(fs_devices);
2283
2284retry_root_backup:
2285        blocksize = btrfs_level_size(tree_root,
2286                                     btrfs_super_root_level(disk_super));
2287        generation = btrfs_super_generation(disk_super);
2288
2289        tree_root->node = read_tree_block(tree_root,
2290                                          btrfs_super_root(disk_super),
2291                                          blocksize, generation);
2292        if (!tree_root->node ||
2293            !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
2294                printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
2295                       sb->s_id);
2296
2297                goto recovery_tree_root;
2298        }
2299
2300        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2301        tree_root->commit_root = btrfs_root_node(tree_root);
2302
2303        ret = find_and_setup_root(tree_root, fs_info,
2304                                  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
2305        if (ret)
2306                goto recovery_tree_root;
2307        extent_root->track_dirty = 1;
2308
2309        ret = find_and_setup_root(tree_root, fs_info,
2310                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
2311        if (ret)
2312                goto recovery_tree_root;
2313        dev_root->track_dirty = 1;
2314
2315        ret = find_and_setup_root(tree_root, fs_info,
2316                                  BTRFS_CSUM_TREE_OBJECTID, csum_root);
2317        if (ret)
2318                goto recovery_tree_root;
2319
2320        csum_root->track_dirty = 1;
2321
2322        fs_info->generation = generation;
2323        fs_info->last_trans_committed = generation;
2324        fs_info->data_alloc_profile = (u64)-1;
2325        fs_info->metadata_alloc_profile = (u64)-1;
2326        fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
2327
2328        ret = btrfs_init_space_info(fs_info);
2329        if (ret) {
2330                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
2331                goto fail_block_groups;
2332        }
2333
2334        ret = btrfs_read_block_groups(extent_root);
2335        if (ret) {
2336                printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2337                goto fail_block_groups;
2338        }
2339
2340        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2341                                               "btrfs-cleaner");
2342        if (IS_ERR(fs_info->cleaner_kthread))
2343                goto fail_block_groups;
2344
2345        fs_info->transaction_kthread = kthread_run(transaction_kthread,
2346                                                   tree_root,
2347                                                   "btrfs-transaction");
2348        if (IS_ERR(fs_info->transaction_kthread))
2349                goto fail_cleaner;
2350
2351        if (!btrfs_test_opt(tree_root, SSD) &&
2352            !btrfs_test_opt(tree_root, NOSSD) &&
2353            !fs_info->fs_devices->rotating) {
2354                printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
2355                       "mode\n");
2356                btrfs_set_opt(fs_info->mount_opt, SSD);
2357        }
2358
2359        /* do not make disk changes in broken FS */
2360        if (btrfs_super_log_root(disk_super) != 0 &&
2361            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
2362                u64 bytenr = btrfs_super_log_root(disk_super);
2363
2364                if (fs_devices->rw_devices == 0) {
2365                        printk(KERN_WARNING "Btrfs log replay required "
2366                               "on RO media\n");
2367                        err = -EIO;
2368                        goto fail_trans_kthread;
2369                }
2370                blocksize =
2371                     btrfs_level_size(tree_root,
2372                                      btrfs_super_log_root_level(disk_super));
2373
2374                log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
2375                if (!log_tree_root) {
2376                        err = -ENOMEM;
2377                        goto fail_trans_kthread;
2378                }
2379
2380                __setup_root(nodesize, leafsize, sectorsize, stripesize,
2381                             log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
2382
2383                log_tree_root->node = read_tree_block(tree_root, bytenr,
2384                                                      blocksize,
2385                                                      generation + 1);
2386                ret = btrfs_recover_log_trees(log_tree_root);
2387                BUG_ON(ret);
2388
2389                if (sb->s_flags & MS_RDONLY) {
2390                        ret =  btrfs_commit_super(tree_root);
2391                        BUG_ON(ret);
2392                }
2393        }
2394
2395        ret = btrfs_find_orphan_roots(tree_root);
2396        BUG_ON(ret);
2397
2398        if (!(sb->s_flags & MS_RDONLY)) {
2399                ret = btrfs_cleanup_fs_roots(fs_info);
2400                BUG_ON(ret);
2401
2402                ret = btrfs_recover_relocation(tree_root);
2403                if (ret < 0) {
2404                        printk(KERN_WARNING
2405                               "btrfs: failed to recover relocation\n");
2406                        err = -EINVAL;
2407                        goto fail_trans_kthread;
2408                }
2409        }
2410
2411        location.objectid = BTRFS_FS_TREE_OBJECTID;
2412        location.type = BTRFS_ROOT_ITEM_KEY;
2413        location.offset = (u64)-1;
2414
2415        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2416        if (!fs_info->fs_root)
2417                goto fail_trans_kthread;
2418        if (IS_ERR(fs_info->fs_root)) {
2419                err = PTR_ERR(fs_info->fs_root);
2420                goto fail_trans_kthread;
2421        }
2422
2423        if (!(sb->s_flags & MS_RDONLY)) {
2424                down_read(&fs_info->cleanup_work_sem);
2425                err = btrfs_orphan_cleanup(fs_info->fs_root);
2426                if (!err)
2427                        err = btrfs_orphan_cleanup(fs_info->tree_root);
2428                up_read(&fs_info->cleanup_work_sem);
2429                if (err) {
2430                        close_ctree(tree_root);
2431                        return ERR_PTR(err);
2432                }
2433        }
2434
2435        return tree_root;
2436
2437fail_trans_kthread:
2438        kthread_stop(fs_info->transaction_kthread);
2439fail_cleaner:
2440        kthread_stop(fs_info->cleaner_kthread);
2441
2442        /*
2443         * make sure we're done with the btree inode before we stop our
2444         * kthreads
2445         */
2446        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2447        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2448
2449fail_block_groups:
2450        btrfs_free_block_groups(fs_info);
2451
2452fail_tree_roots:
2453        free_root_pointers(fs_info, 1);
2454
2455fail_sb_buffer:
2456        btrfs_stop_workers(&fs_info->generic_worker);
2457        btrfs_stop_workers(&fs_info->readahead_workers);
2458        btrfs_stop_workers(&fs_info->fixup_workers);
2459        btrfs_stop_workers(&fs_info->delalloc_workers);
2460        btrfs_stop_workers(&fs_info->workers);
2461        btrfs_stop_workers(&fs_info->endio_workers);
2462        btrfs_stop_workers(&fs_info->endio_meta_workers);
2463        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2464        btrfs_stop_workers(&fs_info->endio_write_workers);
2465        btrfs_stop_workers(&fs_info->endio_freespace_worker);
2466        btrfs_stop_workers(&fs_info->submit_workers);
2467        btrfs_stop_workers(&fs_info->delayed_workers);
2468        btrfs_stop_workers(&fs_info->caching_workers);
2469fail_alloc:
2470fail_iput:
2471        btrfs_mapping_tree_free(&fs_info->mapping_tree);
2472
2473        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2474        iput(fs_info->btree_inode);
2475fail_bdi:
2476        bdi_destroy(&fs_info->bdi);
2477fail_srcu:
2478        cleanup_srcu_struct(&fs_info->subvol_srcu);
2479fail:
2480        btrfs_close_devices(fs_info->fs_devices);
2481        free_fs_info(fs_info);
2482        return ERR_PTR(err);
2483
2484recovery_tree_root:
2485        if (!btrfs_test_opt(tree_root, RECOVERY))
2486                goto fail_tree_roots;
2487
2488        free_root_pointers(fs_info, 0);
2489
2490        /* don't use the log in recovery mode, it won't be valid */
2491        btrfs_set_super_log_root(disk_super, 0);
2492
2493        /* we can't trust the free space cache either */
2494        btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2495
2496        ret = next_root_backup(fs_info, fs_info->super_copy,
2497                               &num_backups_tried, &backup_index);
2498        if (ret == -1)
2499                goto fail_block_groups;
2500        goto retry_root_backup;
2501}
2502
2503static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2504{
2505        char b[BDEVNAME_SIZE];
2506
2507        if (uptodate) {
2508                set_buffer_uptodate(bh);
2509        } else {
2510                printk_ratelimited(KERN_WARNING "lost page write due to "
2511                                        "I/O error on %s\n",
2512                                       bdevname(bh->b_bdev, b));
2513                /* note, we dont' set_buffer_write_io_error because we have
2514                 * our own ways of dealing with the IO errors
2515                 */
2516                clear_buffer_uptodate(bh);
2517        }
2518        unlock_buffer(bh);
2519        put_bh(bh);
2520}
2521
2522struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
2523{
2524        struct buffer_head *bh;
2525        struct buffer_head *latest = NULL;
2526        struct btrfs_super_block *super;
2527        int i;
2528        u64 transid = 0;
2529        u64 bytenr;
2530
2531        /* we would like to check all the supers, but that would make
2532         * a btrfs mount succeed after a mkfs from a different FS.
2533         * So, we need to add a special mount option to scan for
2534         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
2535         */
2536        for (i = 0; i < 1; i++) {
2537                bytenr = btrfs_sb_offset(i);
2538                if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
2539                        break;
2540                bh = __bread(bdev, bytenr / 4096, 4096);
2541                if (!bh)
2542                        continue;
2543
2544                super = (struct btrfs_super_block *)bh->b_data;
2545                if (btrfs_super_bytenr(super) != bytenr ||
2546                    strncmp((char *)(&super->magic), BTRFS_MAGIC,
2547                            sizeof(super->magic))) {
2548                        brelse(bh);
2549                        continue;
2550                }
2551
2552                if (!latest || btrfs_super_generation(super) > transid) {
2553                        brelse(latest);
2554                        latest = bh;
2555                        transid = btrfs_super_generation(super);
2556                } else {
2557                        brelse(bh);
2558                }
2559        }
2560        return latest;
2561}
2562
2563/*
2564 * this should be called twice, once with wait == 0 and
2565 * once with wait == 1.  When wait == 0 is done, all the buffer heads
2566 * we write are pinned.
2567 *
2568 * They are released when wait == 1 is done.
2569 * max_mirrors must be the same for both runs, and it indicates how
2570 * many supers on this one device should be written.
2571 *
2572 * max_mirrors == 0 means to write them all.
2573 */
2574static int write_dev_supers(struct btrfs_device *device,
2575                            struct btrfs_super_block *sb,
2576                            int do_barriers, int wait, int max_mirrors)
2577{
2578        struct buffer_head *bh;
2579        int i;
2580        int ret;
2581        int errors = 0;
2582        u32 crc;
2583        u64 bytenr;
2584
2585        if (max_mirrors == 0)
2586                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2587
2588        for (i = 0; i < max_mirrors; i++) {
2589                bytenr = btrfs_sb_offset(i);
2590                if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
2591                        break;
2592
2593                if (wait) {
2594                        bh = __find_get_block(device->bdev, bytenr / 4096,
2595                                              BTRFS_SUPER_INFO_SIZE);
2596                        BUG_ON(!bh);
2597                        wait_on_buffer(bh);
2598                        if (!buffer_uptodate(bh))
2599                                errors++;
2600
2601                        /* drop our reference */
2602                        brelse(bh);
2603
2604                        /* drop the reference from the wait == 0 run */
2605                        brelse(bh);
2606                        continue;
2607                } else {
2608                        btrfs_set_super_bytenr(sb, bytenr);
2609
2610                        crc = ~(u32)0;
2611                        crc = btrfs_csum_data(NULL, (char *)sb +
2612                                              BTRFS_CSUM_SIZE, crc,
2613                                              BTRFS_SUPER_INFO_SIZE -
2614                                              BTRFS_CSUM_SIZE);
2615                        btrfs_csum_final(crc, sb->csum);
2616
2617                        /*
2618                         * one reference for us, and we leave it for the
2619                         * caller
2620                         */
2621                        bh = __getblk(device->bdev, bytenr / 4096,
2622                                      BTRFS_SUPER_INFO_SIZE);
2623                        memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
2624
2625                        /* one reference for submit_bh */
2626                        get_bh(bh);
2627
2628                        set_buffer_uptodate(bh);
2629                        lock_buffer(bh);
2630                        bh->b_end_io = btrfs_end_buffer_write_sync;
2631                }
2632
2633                /*
2634                 * we fua the first super.  The others we allow
2635                 * to go down lazy.
2636                 */
2637                ret = submit_bh(WRITE_FUA, bh);
2638                if (ret)
2639                        errors++;
2640        }
2641        return errors < i ? 0 : -1;
2642}
2643
2644/*
2645 * endio for the write_dev_flush, this will wake anyone waiting
2646 * for the barrier when it is done
2647 */
2648static void btrfs_end_empty_barrier(struct bio *bio, int err)
2649{
2650        if (err) {
2651                if (err == -EOPNOTSUPP)
2652                        set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2653                clear_bit(BIO_UPTODATE, &bio->bi_flags);
2654        }
2655        if (bio->bi_private)
2656                complete(bio->bi_private);
2657        bio_put(bio);
2658}
2659
2660/*
2661 * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
2662 * sent down.  With wait == 1, it waits for the previous flush.
2663 *
2664 * any device where the flush fails with eopnotsupp are flagged as not-barrier
2665 * capable
2666 */
2667static int write_dev_flush(struct btrfs_device *device, int wait)
2668{
2669        struct bio *bio;
2670        int ret = 0;
2671
2672        if (device->nobarriers)
2673                return 0;
2674
2675        if (wait) {
2676                bio = device->flush_bio;
2677                if (!bio)
2678                        return 0;
2679
2680                wait_for_completion(&device->flush_wait);
2681
2682                if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2683                        printk("btrfs: disabling barriers on dev %s\n",
2684                               device->name);
2685                        device->nobarriers = 1;
2686                }
2687                if (!bio_flagged(bio, BIO_UPTODATE)) {
2688                        ret = -EIO;
2689                }
2690
2691                /* drop the reference from the wait == 0 run */
2692                bio_put(bio);
2693                device->flush_bio = NULL;
2694
2695                return ret;
2696        }
2697
2698        /*
2699         * one reference for us, and we leave it for the
2700         * caller
2701         */
2702        device->flush_bio = NULL;;
2703        bio = bio_alloc(GFP_NOFS, 0);
2704        if (!bio)
2705                return -ENOMEM;
2706
2707        bio->bi_end_io = btrfs_end_empty_barrier;
2708        bio->bi_bdev = device->bdev;
2709        init_completion(&device->flush_wait);
2710        bio->bi_private = &device->flush_wait;
2711        device->flush_bio = bio;
2712
2713        bio_get(bio);
2714        submit_bio(WRITE_FLUSH, bio);
2715
2716        return 0;
2717}
2718
2719/*
2720 * send an empty flush down to each device in parallel,
2721 * then wait for them
2722 */
2723static int barrier_all_devices(struct btrfs_fs_info *info)
2724{
2725        struct list_head *head;
2726        struct btrfs_device *dev;
2727        int errors = 0;
2728        int ret;
2729
2730        /* send down all the barriers */
2731        head = &info->fs_devices->devices;
2732        list_for_each_entry_rcu(dev, head, dev_list) {
2733                if (!dev->bdev) {
2734                        errors++;
2735                        continue;
2736                }
2737                if (!dev->in_fs_metadata || !dev->writeable)
2738                        continue;
2739
2740                ret = write_dev_flush(dev, 0);
2741                if (ret)
2742                        errors++;
2743        }
2744
2745        /* wait for all the barriers */
2746        list_for_each_entry_rcu(dev, head, dev_list) {
2747                if (!dev->bdev) {
2748                        errors++;
2749                        continue;
2750                }
2751                if (!dev->in_fs_metadata || !dev->writeable)
2752                        continue;
2753
2754                ret = write_dev_flush(dev, 1);
2755                if (ret)
2756                        errors++;
2757        }
2758        if (errors)
2759                return -EIO;
2760        return 0;
2761}
2762
2763int write_all_supers(struct btrfs_root *root, int max_mirrors)
2764{
2765        struct list_head *head;
2766        struct btrfs_device *dev;
2767        struct btrfs_super_block *sb;
2768        struct btrfs_dev_item *dev_item;
2769        int ret;
2770        int do_barriers;
2771        int max_errors;
2772        int total_errors = 0;
2773        u64 flags;
2774
2775        max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2776        do_barriers = !btrfs_test_opt(root, NOBARRIER);
2777        backup_super_roots(root->fs_info);
2778
2779        sb = root->fs_info->super_for_commit;
2780        dev_item = &sb->dev_item;
2781
2782        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2783        head = &root->fs_info->fs_devices->devices;
2784
2785        if (do_barriers)
2786                barrier_all_devices(root->fs_info);
2787
2788        list_for_each_entry_rcu(dev, head, dev_list) {
2789                if (!dev->bdev) {
2790                        total_errors++;
2791                        continue;
2792                }
2793                if (!dev->in_fs_metadata || !dev->writeable)
2794                        continue;
2795
2796                btrfs_set_stack_device_generation(dev_item, 0);
2797                btrfs_set_stack_device_type(dev_item, dev->type);
2798                btrfs_set_stack_device_id(dev_item, dev->devid);
2799                btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2800                btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2801                btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2802                btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2803                btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2804                memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2805                memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2806
2807                flags = btrfs_super_flags(sb);
2808                btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2809
2810                ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2811                if (ret)
2812                        total_errors++;
2813        }
2814        if (total_errors > max_errors) {
2815                printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2816                       total_errors);
2817                BUG();
2818        }
2819
2820        total_errors = 0;
2821        list_for_each_entry_rcu(dev, head, dev_list) {
2822                if (!dev->bdev)
2823                        continue;
2824                if (!dev->in_fs_metadata || !dev->writeable)
2825                        continue;
2826
2827                ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2828                if (ret)
2829                        total_errors++;
2830        }
2831        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2832        if (total_errors > max_errors) {
2833                printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2834                       total_errors);
2835                BUG();
2836        }
2837        return 0;
2838}
2839
/*
 * Thin wrapper around write_all_supers().
 *
 * @trans is not used; @root and @max_mirrors are passed straight
 * through and the result of write_all_supers() is returned.
 */
int write_ctree_super(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, int max_mirrors)
{
	return write_all_supers(root, max_mirrors);
}
2848
2849int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2850{
2851        spin_lock(&fs_info->fs_roots_radix_lock);
2852        radix_tree_delete(&fs_info->fs_roots_radix,
2853                          (unsigned long)root->root_key.objectid);
2854        spin_unlock(&fs_info->fs_roots_radix_lock);
2855
2856        if (btrfs_root_refs(&root->root_item) == 0)
2857                synchronize_srcu(&fs_info->subvol_srcu);
2858
2859        __btrfs_remove_free_space_cache(root->free_ino_pinned);
2860        __btrfs_remove_free_space_cache(root->free_ino_ctl);
2861        free_fs_root(root);
2862        return 0;
2863}
2864
2865static void free_fs_root(struct btrfs_root *root)
2866{
2867        iput(root->cache_inode);
2868        WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2869        if (root->anon_dev)
2870                free_anon_bdev(root->anon_dev);
2871        free_extent_buffer(root->node);
2872        free_extent_buffer(root->commit_root);
2873        kfree(root->free_ino_ctl);
2874        kfree(root->free_ino_pinned);
2875        kfree(root->name);
2876        kfree(root);
2877}
2878
2879static int del_fs_roots(struct btrfs_fs_info *fs_info)
2880{
2881        int ret;
2882        struct btrfs_root *gang[8];
2883        int i;
2884
2885        while (!list_empty(&fs_info->dead_roots)) {
2886                gang[0] = list_entry(fs_info->dead_roots.next,
2887                                     struct btrfs_root, root_list);
2888                list_del(&gang[0]->root_list);
2889
2890                if (gang[0]->in_radix) {
2891                        btrfs_free_fs_root(fs_info, gang[0]);
2892                } else {
2893                        free_extent_buffer(gang[0]->node);
2894                        free_extent_buffer(gang[0]->commit_root);
2895                        kfree(gang[0]);
2896                }
2897        }
2898
2899        while (1) {
2900                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2901                                             (void **)gang, 0,
2902                                             ARRAY_SIZE(gang));
2903                if (!ret)
2904                        break;
2905                for (i = 0; i < ret; i++)
2906                        btrfs_free_fs_root(fs_info, gang[i]);
2907        }
2908        return 0;
2909}
2910
2911int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2912{
2913        u64 root_objectid = 0;
2914        struct btrfs_root *gang[8];
2915        int i;
2916        int ret;
2917
2918        while (1) {
2919                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2920                                             (void **)gang, root_objectid,
2921                                             ARRAY_SIZE(gang));
2922                if (!ret)
2923                        break;
2924
2925                root_objectid = gang[ret - 1]->root_key.objectid + 1;
2926                for (i = 0; i < ret; i++) {
2927                        int err;
2928
2929                        root_objectid = gang[i]->root_key.objectid;
2930                        err = btrfs_orphan_cleanup(gang[i]);
2931                        if (err)
2932                                return err;
2933                }
2934                root_objectid++;
2935        }
2936        return 0;
2937}
2938
2939int btrfs_commit_super(struct btrfs_root *root)
2940{
2941        struct btrfs_trans_handle *trans;
2942        int ret;
2943
2944        mutex_lock(&root->fs_info->cleaner_mutex);
2945        btrfs_run_delayed_iputs(root);
2946        btrfs_clean_old_snapshots(root);
2947        mutex_unlock(&root->fs_info->cleaner_mutex);
2948
2949        /* wait until ongoing cleanup work done */
2950        down_write(&root->fs_info->cleanup_work_sem);
2951        up_write(&root->fs_info->cleanup_work_sem);
2952
2953        trans = btrfs_join_transaction(root);
2954        if (IS_ERR(trans))
2955                return PTR_ERR(trans);
2956        ret = btrfs_commit_transaction(trans, root);
2957        BUG_ON(ret);
2958        /* run commit again to drop the original snapshot */
2959        trans = btrfs_join_transaction(root);
2960        if (IS_ERR(trans))
2961                return PTR_ERR(trans);
2962        btrfs_commit_transaction(trans, root);
2963        ret = btrfs_write_and_wait_transaction(NULL, root);
2964        BUG_ON(ret);
2965
2966        ret = write_ctree_super(NULL, root, 0);
2967        return ret;
2968}
2969
2970int close_ctree(struct btrfs_root *root)
2971{
2972        struct btrfs_fs_info *fs_info = root->fs_info;
2973        int ret;
2974
2975        fs_info->closing = 1;
2976        smp_mb();
2977
2978        btrfs_scrub_cancel(root);
2979
2980        /* wait for any defraggers to finish */
2981        wait_event(fs_info->transaction_wait,
2982                   (atomic_read(&fs_info->defrag_running) == 0));
2983
2984        /* clear out the rbtree of defraggable inodes */
2985        btrfs_run_defrag_inodes(root->fs_info);
2986
2987        /*
2988         * Here come 2 situations when btrfs is broken to flip readonly:
2989         *
2990         * 1. when btrfs flips readonly somewhere else before
2991         * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
2992         * and btrfs will skip to write sb directly to keep
2993         * ERROR state on disk.
2994         *
2995         * 2. when btrfs flips readonly just in btrfs_commit_super,
2996         * and in such case, btrfs cannot write sb via btrfs_commit_super,
2997         * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2998         * btrfs will cleanup all FS resources first and write sb then.
2999         */
3000        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3001                ret = btrfs_commit_super(root);
3002                if (ret)
3003                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3004        }
3005
3006        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
3007                ret = btrfs_error_commit_super(root);
3008                if (ret)
3009                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
3010        }
3011
3012        btrfs_put_block_group_cache(fs_info);
3013
3014        kthread_stop(root->fs_info->transaction_kthread);
3015        kthread_stop(root->fs_info->cleaner_kthread);
3016
3017        fs_info->closing = 2;
3018        smp_mb();
3019
3020        if (fs_info->delalloc_bytes) {
3021                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
3022                       (unsigned long long)fs_info->delalloc_bytes);
3023        }
3024        if (fs_info->total_ref_cache_size) {
3025                printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
3026                       (unsigned long long)fs_info->total_ref_cache_size);
3027        }
3028
3029        free_extent_buffer(fs_info->extent_root->node);
3030        free_extent_buffer(fs_info->extent_root->commit_root);
3031        free_extent_buffer(fs_info->tree_root->node);
3032        free_extent_buffer(fs_info->tree_root->commit_root);
3033        free_extent_buffer(root->fs_info->chunk_root->node);
3034        free_extent_buffer(root->fs_info->chunk_root->commit_root);
3035        free_extent_buffer(root->fs_info->dev_root->node);
3036        free_extent_buffer(root->fs_info->dev_root->commit_root);
3037        free_extent_buffer(root->fs_info->csum_root->node);
3038        free_extent_buffer(root->fs_info->csum_root->commit_root);
3039
3040        btrfs_free_block_groups(root->fs_info);
3041
3042        del_fs_roots(fs_info);
3043
3044        iput(fs_info->btree_inode);
3045
3046        btrfs_stop_workers(&fs_info->generic_worker);
3047        btrfs_stop_workers(&fs_info->fixup_workers);
3048        btrfs_stop_workers(&fs_info->delalloc_workers);
3049        btrfs_stop_workers(&fs_info->workers);
3050        btrfs_stop_workers(&fs_info->endio_workers);
3051        btrfs_stop_workers(&fs_info->endio_meta_workers);
3052        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
3053        btrfs_stop_workers(&fs_info->endio_write_workers);
3054        btrfs_stop_workers(&fs_info->endio_freespace_worker);
3055        btrfs_stop_workers(&fs_info->submit_workers);
3056        btrfs_stop_workers(&fs_info->delayed_workers);
3057        btrfs_stop_workers(&fs_info->caching_workers);
3058        btrfs_stop_workers(&fs_info->readahead_workers);
3059
3060        btrfs_close_devices(fs_info->fs_devices);
3061        btrfs_mapping_tree_free(&fs_info->mapping_tree);
3062
3063        bdi_destroy(&fs_info->bdi);
3064        cleanup_srcu_struct(&fs_info->subvol_srcu);
3065
3066        free_fs_info(fs_info);
3067
3068        return 0;
3069}
3070
3071int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
3072{
3073        int ret;
3074        struct inode *btree_inode = buf->first_page->mapping->host;
3075
3076        ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
3077                                     NULL);
3078        if (!ret)
3079                return ret;
3080
3081        ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
3082                                    parent_transid);
3083        return !ret;
3084}
3085
3086int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
3087{
3088        struct inode *btree_inode = buf->first_page->mapping->host;
3089        return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
3090                                          buf);
3091}
3092
3093void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3094{
3095        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
3096        u64 transid = btrfs_header_generation(buf);
3097        struct inode *btree_inode = root->fs_info->btree_inode;
3098        int was_dirty;
3099
3100        btrfs_assert_tree_locked(buf);
3101        if (transid != root->fs_info->generation) {
3102                printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
3103                       "found %llu running %llu\n",
3104                        (unsigned long long)buf->start,
3105                        (unsigned long long)transid,
3106                        (unsigned long long)root->fs_info->generation);
3107                WARN_ON(1);
3108        }
3109        was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
3110                                            buf);
3111        if (!was_dirty) {
3112                spin_lock(&root->fs_info->delalloc_lock);
3113                root->fs_info->dirty_metadata_bytes += buf->len;
3114                spin_unlock(&root->fs_info->delalloc_lock);
3115        }
3116}
3117
3118void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3119{
3120        /*
3121         * looks as though older kernels can get into trouble with
3122         * this code, they end up stuck in balance_dirty_pages forever
3123         */
3124        u64 num_dirty;
3125        unsigned long thresh = 32 * 1024 * 1024;
3126
3127        if (current->flags & PF_MEMALLOC)
3128                return;
3129
3130        btrfs_balance_delayed_items(root);
3131
3132        num_dirty = root->fs_info->dirty_metadata_bytes;
3133
3134        if (num_dirty > thresh) {
3135                balance_dirty_pages_ratelimited_nr(
3136                                   root->fs_info->btree_inode->i_mapping, 1);
3137        }
3138        return;
3139}
3140
3141void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3142{
3143        /*
3144         * looks as though older kernels can get into trouble with
3145         * this code, they end up stuck in balance_dirty_pages forever
3146         */
3147        u64 num_dirty;
3148        unsigned long thresh = 32 * 1024 * 1024;
3149
3150        if (current->flags & PF_MEMALLOC)
3151                return;
3152
3153        num_dirty = root->fs_info->dirty_metadata_bytes;
3154
3155        if (num_dirty > thresh) {
3156                balance_dirty_pages_ratelimited_nr(
3157                                   root->fs_info->btree_inode->i_mapping, 1);
3158        }
3159        return;
3160}
3161
3162int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3163{
3164        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
3165        int ret;
3166        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3167        if (ret == 0)
3168                set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
3169        return ret;
3170}
3171
3172static int btree_lock_page_hook(struct page *page, void *data,
3173                                void (*flush_fn)(void *))
3174{
3175        struct inode *inode = page->mapping->host;
3176        struct btrfs_root *root = BTRFS_I(inode)->root;
3177        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3178        struct extent_buffer *eb;
3179        unsigned long len;
3180        u64 bytenr = page_offset(page);
3181
3182        if (page->private == EXTENT_PAGE_PRIVATE)
3183                goto out;
3184
3185        len = page->private >> 2;
3186        eb = find_extent_buffer(io_tree, bytenr, len);
3187        if (!eb)
3188                goto out;
3189
3190        if (!btrfs_try_tree_write_lock(eb)) {
3191                flush_fn(data);
3192                btrfs_tree_lock(eb);
3193        }
3194        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3195
3196        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3197                spin_lock(&root->fs_info->delalloc_lock);
3198                if (root->fs_info->dirty_metadata_bytes >= eb->len)
3199                        root->fs_info->dirty_metadata_bytes -= eb->len;
3200                else
3201                        WARN_ON(1);
3202                spin_unlock(&root->fs_info->delalloc_lock);
3203        }
3204
3205        btrfs_tree_unlock(eb);
3206        free_extent_buffer(eb);
3207out:
3208        if (!trylock_page(page)) {
3209                flush_fn(data);
3210                lock_page(page);
3211        }
3212        return 0;
3213}
3214
3215static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3216                              int read_only)
3217{
3218        if (read_only)
3219                return;
3220
3221        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
3222                printk(KERN_WARNING "warning: mount fs with errors, "
3223                       "running btrfsck is recommended\n");
3224}
3225
3226int btrfs_error_commit_super(struct btrfs_root *root)
3227{
3228        int ret;
3229
3230        mutex_lock(&root->fs_info->cleaner_mutex);
3231        btrfs_run_delayed_iputs(root);
3232        mutex_unlock(&root->fs_info->cleaner_mutex);
3233
3234        down_write(&root->fs_info->cleanup_work_sem);
3235        up_write(&root->fs_info->cleanup_work_sem);
3236
3237        /* cleanup FS via transaction */
3238        btrfs_cleanup_transaction(root);
3239
3240        ret = write_ctree_super(NULL, root, 0);
3241
3242        return ret;
3243}
3244
3245static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
3246{
3247        struct btrfs_inode *btrfs_inode;
3248        struct list_head splice;
3249
3250        INIT_LIST_HEAD(&splice);
3251
3252        mutex_lock(&root->fs_info->ordered_operations_mutex);
3253        spin_lock(&root->fs_info->ordered_extent_lock);
3254
3255        list_splice_init(&root->fs_info->ordered_operations, &splice);
3256        while (!list_empty(&splice)) {
3257                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3258                                         ordered_operations);
3259
3260                list_del_init(&btrfs_inode->ordered_operations);
3261
3262                btrfs_invalidate_inodes(btrfs_inode->root);
3263        }
3264
3265        spin_unlock(&root->fs_info->ordered_extent_lock);
3266        mutex_unlock(&root->fs_info->ordered_operations_mutex);
3267
3268        return 0;
3269}
3270
3271static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
3272{
3273        struct list_head splice;
3274        struct btrfs_ordered_extent *ordered;
3275        struct inode *inode;
3276
3277        INIT_LIST_HEAD(&splice);
3278
3279        spin_lock(&root->fs_info->ordered_extent_lock);
3280
3281        list_splice_init(&root->fs_info->ordered_extents, &splice);
3282        while (!list_empty(&splice)) {
3283                ordered = list_entry(splice.next, struct btrfs_ordered_extent,
3284                                     root_extent_list);
3285
3286                list_del_init(&ordered->root_extent_list);
3287                atomic_inc(&ordered->refs);
3288
3289                /* the inode may be getting freed (in sys_unlink path). */
3290                inode = igrab(ordered->inode);
3291
3292                spin_unlock(&root->fs_info->ordered_extent_lock);
3293                if (inode)
3294                        iput(inode);
3295
3296                atomic_set(&ordered->refs, 1);
3297                btrfs_put_ordered_extent(ordered);
3298
3299                spin_lock(&root->fs_info->ordered_extent_lock);
3300        }
3301
3302        spin_unlock(&root->fs_info->ordered_extent_lock);
3303
3304        return 0;
3305}
3306
3307static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3308                                      struct btrfs_root *root)
3309{
3310        struct rb_node *node;
3311        struct btrfs_delayed_ref_root *delayed_refs;
3312        struct btrfs_delayed_ref_node *ref;
3313        int ret = 0;
3314
3315        delayed_refs = &trans->delayed_refs;
3316
3317        spin_lock(&delayed_refs->lock);
3318        if (delayed_refs->num_entries == 0) {
3319                spin_unlock(&delayed_refs->lock);
3320                printk(KERN_INFO "delayed_refs has NO entry\n");
3321                return ret;
3322        }
3323
3324        node = rb_first(&delayed_refs->root);
3325        while (node) {
3326                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3327                node = rb_next(node);
3328
3329                ref->in_tree = 0;
3330                rb_erase(&ref->rb_node, &delayed_refs->root);
3331                delayed_refs->num_entries--;
3332
3333                atomic_set(&ref->refs, 1);
3334                if (btrfs_delayed_ref_is_head(ref)) {
3335                        struct btrfs_delayed_ref_head *head;
3336
3337                        head = btrfs_delayed_node_to_head(ref);
3338                        mutex_lock(&head->mutex);
3339                        kfree(head->extent_op);
3340                        delayed_refs->num_heads--;
3341                        if (list_empty(&head->cluster))
3342                                delayed_refs->num_heads_ready--;
3343                        list_del_init(&head->cluster);
3344                        mutex_unlock(&head->mutex);
3345                }
3346
3347                spin_unlock(&delayed_refs->lock);
3348                btrfs_put_delayed_ref(ref);
3349
3350                cond_resched();
3351                spin_lock(&delayed_refs->lock);
3352        }
3353
3354        spin_unlock(&delayed_refs->lock);
3355
3356        return ret;
3357}
3358
3359static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
3360{
3361        struct btrfs_pending_snapshot *snapshot;
3362        struct list_head splice;
3363
3364        INIT_LIST_HEAD(&splice);
3365
3366        list_splice_init(&t->pending_snapshots, &splice);
3367
3368        while (!list_empty(&splice)) {
3369                snapshot = list_entry(splice.next,
3370                                      struct btrfs_pending_snapshot,
3371                                      list);
3372
3373                list_del_init(&snapshot->list);
3374
3375                kfree(snapshot);
3376        }
3377
3378        return 0;
3379}
3380
3381static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3382{
3383        struct btrfs_inode *btrfs_inode;
3384        struct list_head splice;
3385
3386        INIT_LIST_HEAD(&splice);
3387
3388        spin_lock(&root->fs_info->delalloc_lock);
3389        list_splice_init(&root->fs_info->delalloc_inodes, &splice);
3390
3391        while (!list_empty(&splice)) {
3392                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3393                                    delalloc_inodes);
3394
3395                list_del_init(&btrfs_inode->delalloc_inodes);
3396
3397                btrfs_invalidate_inodes(btrfs_inode->root);
3398        }
3399
3400        spin_unlock(&root->fs_info->delalloc_lock);
3401
3402        return 0;
3403}
3404
3405static int btrfs_destroy_marked_extents(struct btrfs_root *root,
3406                                        struct extent_io_tree *dirty_pages,
3407                                        int mark)
3408{
3409        int ret;
3410        struct page *page;
3411        struct inode *btree_inode = root->fs_info->btree_inode;
3412        struct extent_buffer *eb;
3413        u64 start = 0;
3414        u64 end;
3415        u64 offset;
3416        unsigned long index;
3417
3418        while (1) {
3419                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
3420                                            mark);
3421                if (ret)
3422                        break;
3423
3424                clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
3425                while (start <= end) {
3426                        index = start >> PAGE_CACHE_SHIFT;
3427                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
3428                        page = find_get_page(btree_inode->i_mapping, index);
3429                        if (!page)
3430                                continue;
3431                        offset = page_offset(page);
3432
3433                        spin_lock(&dirty_pages->buffer_lock);
3434                        eb = radix_tree_lookup(
3435                             &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
3436                                               offset >> PAGE_CACHE_SHIFT);
3437                        spin_unlock(&dirty_pages->buffer_lock);
3438                        if (eb) {
3439                                ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
3440                                                         &eb->bflags);
3441                                atomic_set(&eb->refs, 1);
3442                        }
3443                        if (PageWriteback(page))
3444                                end_page_writeback(page);
3445
3446                        lock_page(page);
3447                        if (PageDirty(page)) {
3448                                clear_page_dirty_for_io(page);
3449                                spin_lock_irq(&page->mapping->tree_lock);
3450                                radix_tree_tag_clear(&page->mapping->page_tree,
3451                                                        page_index(page),
3452                                                        PAGECACHE_TAG_DIRTY);
3453                                spin_unlock_irq(&page->mapping->tree_lock);
3454                        }
3455
3456                        page->mapping->a_ops->invalidatepage(page, 0);
3457                        unlock_page(page);
3458                }
3459        }
3460
3461        return ret;
3462}
3463
3464static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
3465                                       struct extent_io_tree *pinned_extents)
3466{
3467        struct extent_io_tree *unpin;
3468        u64 start;
3469        u64 end;
3470        int ret;
3471
3472        unpin = pinned_extents;
3473        while (1) {
3474                ret = find_first_extent_bit(unpin, 0, &start, &end,
3475                                            EXTENT_DIRTY);
3476                if (ret)
3477                        break;
3478
3479                /* opt_discard */
3480                if (btrfs_test_opt(root, DISCARD))
3481                        ret = btrfs_error_discard_extent(root, start,
3482                                                         end + 1 - start,
3483                                                         NULL);
3484
3485                clear_extent_dirty(unpin, start, end, GFP_NOFS);
3486                btrfs_error_unpin_extent_range(root, start, end);
3487                cond_resched();
3488        }
3489
3490        return 0;
3491}
3492
3493static int btrfs_cleanup_transaction(struct btrfs_root *root)
3494{
3495        struct btrfs_transaction *t;
3496        LIST_HEAD(list);
3497
3498        WARN_ON(1);
3499
3500        mutex_lock(&root->fs_info->transaction_kthread_mutex);
3501
3502        spin_lock(&root->fs_info->trans_lock);
3503        list_splice_init(&root->fs_info->trans_list, &list);
3504        root->fs_info->trans_no_join = 1;
3505        spin_unlock(&root->fs_info->trans_lock);
3506
3507        while (!list_empty(&list)) {
3508                t = list_entry(list.next, struct btrfs_transaction, list);
3509                if (!t)
3510                        break;
3511
3512                btrfs_destroy_ordered_operations(root);
3513
3514                btrfs_destroy_ordered_extents(root);
3515
3516                btrfs_destroy_delayed_refs(t, root);
3517
3518                btrfs_block_rsv_release(root,
3519                                        &root->fs_info->trans_block_rsv,
3520                                        t->dirty_pages.dirty_bytes);
3521
3522                /* FIXME: cleanup wait for commit */
3523                t->in_commit = 1;
3524                t->blocked = 1;
3525                if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3526                        wake_up(&root->fs_info->transaction_blocked_wait);
3527
3528                t->blocked = 0;
3529                if (waitqueue_active(&root->fs_info->transaction_wait))
3530                        wake_up(&root->fs_info->transaction_wait);
3531
3532                t->commit_done = 1;
3533                if (waitqueue_active(&t->commit_wait))
3534                        wake_up(&t->commit_wait);
3535
3536                btrfs_destroy_pending_snapshots(t);
3537
3538                btrfs_destroy_delalloc_inodes(root);
3539
3540                spin_lock(&root->fs_info->trans_lock);
3541                root->fs_info->running_transaction = NULL;
3542                spin_unlock(&root->fs_info->trans_lock);
3543
3544                btrfs_destroy_marked_extents(root, &t->dirty_pages,
3545                                             EXTENT_DIRTY);
3546
3547                btrfs_destroy_pinned_extent(root,
3548                                            root->fs_info->pinned_extents);
3549
3550                atomic_set(&t->use_count, 0);
3551                list_del_init(&t->list);
3552                memset(t, 0, sizeof(*t));
3553                kmem_cache_free(btrfs_transaction_cachep, t);
3554        }
3555
3556        spin_lock(&root->fs_info->trans_lock);
3557        root->fs_info->trans_no_join = 0;
3558        spin_unlock(&root->fs_info->trans_lock);
3559        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3560
3561        return 0;
3562}
3563
3564static struct extent_io_ops btree_extent_io_ops = {
3565        .write_cache_pages_lock_hook = btree_lock_page_hook,
3566        .readpage_end_io_hook = btree_readpage_end_io_hook,
3567        .readpage_io_failed_hook = btree_io_failed_hook,
3568        .submit_bio_hook = btree_submit_bio_hook,
3569        /* note we're sharing with inode.c for the merge bio hook */
3570        .merge_bio_hook = btrfs_merge_bio_hook,
3571};
3572
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.