linux/fs/btrfs/extent_io.c
   1#include <linux/bitops.h>
   2#include <linux/slab.h>
   3#include <linux/bio.h>
   4#include <linux/mm.h>
   5#include <linux/pagemap.h>
   6#include <linux/page-flags.h>
   7#include <linux/module.h>
   8#include <linux/spinlock.h>
   9#include <linux/blkdev.h>
  10#include <linux/swap.h>
  11#include <linux/writeback.h>
  12#include <linux/pagevec.h>
  13#include <linux/prefetch.h>
  14#include <linux/cleancache.h>
  15#include "extent_io.h"
  16#include "extent_map.h"
  17#include "compat.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
  20#include "volumes.h"
  21#include "check-integrity.h"
  22#include "locking.h"
  23#include "rcu-string.h"
  24
  25static struct kmem_cache *extent_state_cache;
  26static struct kmem_cache *extent_buffer_cache;
  27
  28static LIST_HEAD(buffers);
  29static LIST_HEAD(states);
  30
  31#define LEAK_DEBUG 0
  32#if LEAK_DEBUG
  33static DEFINE_SPINLOCK(leak_lock);
  34#endif
  35
  36#define BUFFER_LRU_MAX 64
  37
  38struct tree_entry {
  39        u64 start;
  40        u64 end;
  41        struct rb_node rb_node;
  42};
  43
  44struct extent_page_data {
  45        struct bio *bio;
  46        struct extent_io_tree *tree;
  47        get_extent_t *get_extent;
  48
   49        /* tells writepage not to lock the state bits for this range;
  50         * it still does the unlocking
  51         */
  52        unsigned int extent_locked:1;
  53
  54        /* tells the submit_bio code to use a WRITE_SYNC */
  55        unsigned int sync_io:1;
  56};
  57
  58static noinline void flush_write_bio(void *data);
  59static inline struct btrfs_fs_info *
  60tree_fs_info(struct extent_io_tree *tree)
  61{
  62        return btrfs_sb(tree->mapping->host->i_sb);
  63}
  64
  65int __init extent_io_init(void)
  66{
  67        extent_state_cache = kmem_cache_create("extent_state",
  68                        sizeof(struct extent_state), 0,
  69                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  70        if (!extent_state_cache)
  71                return -ENOMEM;
  72
  73        extent_buffer_cache = kmem_cache_create("extent_buffers",
  74                        sizeof(struct extent_buffer), 0,
  75                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  76        if (!extent_buffer_cache)
  77                goto free_state_cache;
  78        return 0;
  79
  80free_state_cache:
  81        kmem_cache_destroy(extent_state_cache);
  82        return -ENOMEM;
  83}
  84
  85void extent_io_exit(void)
  86{
  87        struct extent_state *state;
  88        struct extent_buffer *eb;
  89
  90        while (!list_empty(&states)) {
  91                state = list_entry(states.next, struct extent_state, leak_list);
  92                printk(KERN_ERR "btrfs state leak: start %llu end %llu "
  93                       "state %lu in tree %p refs %d\n",
  94                       (unsigned long long)state->start,
  95                       (unsigned long long)state->end,
  96                       state->state, state->tree, atomic_read(&state->refs));
  97                list_del(&state->leak_list);
  98                kmem_cache_free(extent_state_cache, state);
  99
 100        }
 101
 102        while (!list_empty(&buffers)) {
 103                eb = list_entry(buffers.next, struct extent_buffer, leak_list);
 104                printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
 105                       "refs %d\n", (unsigned long long)eb->start,
 106                       eb->len, atomic_read(&eb->refs));
 107                list_del(&eb->leak_list);
 108                kmem_cache_free(extent_buffer_cache, eb);
 109        }
 110        if (extent_state_cache)
 111                kmem_cache_destroy(extent_state_cache);
 112        if (extent_buffer_cache)
 113                kmem_cache_destroy(extent_buffer_cache);
 114}
 115
 116void extent_io_tree_init(struct extent_io_tree *tree,
 117                         struct address_space *mapping)
 118{
 119        tree->state = RB_ROOT;
 120        INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 121        tree->ops = NULL;
 122        tree->dirty_bytes = 0;
 123        spin_lock_init(&tree->lock);
 124        spin_lock_init(&tree->buffer_lock);
 125        tree->mapping = mapping;
 126}
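
/*
 * Usage sketch (hypothetical caller; 'io_tree' and 'inode' are illustrative
 * names, not from this file): an io tree only needs the address_space whose
 * pages it shadows, everything else is set up here.
 *
 *      struct extent_io_tree io_tree;
 *
 *      extent_io_tree_init(&io_tree, inode->i_mapping);
 */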
 127
 128static struct extent_state *alloc_extent_state(gfp_t mask)
 129{
 130        struct extent_state *state;
 131#if LEAK_DEBUG
 132        unsigned long flags;
 133#endif
 134
 135        state = kmem_cache_alloc(extent_state_cache, mask);
 136        if (!state)
 137                return state;
 138        state->state = 0;
 139        state->private = 0;
 140        state->tree = NULL;
 141#if LEAK_DEBUG
 142        spin_lock_irqsave(&leak_lock, flags);
 143        list_add(&state->leak_list, &states);
 144        spin_unlock_irqrestore(&leak_lock, flags);
 145#endif
 146        atomic_set(&state->refs, 1);
 147        init_waitqueue_head(&state->wq);
 148        trace_alloc_extent_state(state, mask, _RET_IP_);
 149        return state;
 150}
 151
 152void free_extent_state(struct extent_state *state)
 153{
 154        if (!state)
 155                return;
 156        if (atomic_dec_and_test(&state->refs)) {
 157#if LEAK_DEBUG
 158                unsigned long flags;
 159#endif
 160                WARN_ON(state->tree);
 161#if LEAK_DEBUG
 162                spin_lock_irqsave(&leak_lock, flags);
 163                list_del(&state->leak_list);
 164                spin_unlock_irqrestore(&leak_lock, flags);
 165#endif
 166                trace_free_extent_state(state, _RET_IP_);
 167                kmem_cache_free(extent_state_cache, state);
 168        }
 169}
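
/*
 * Reference-counting sketch, mirroring the pattern used by wait_extent_bit()
 * and find_delalloc_range() below: take a reference while tree->lock is
 * held, drop it with free_extent_state() when done; the struct only goes
 * back to the cache once the last reference is gone and it is off the tree.
 *
 *      atomic_inc(&state->refs);
 *      spin_unlock(&tree->lock);
 *      ...
 *      free_extent_state(state);
 */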
 170
 171static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 172                                   struct rb_node *node)
 173{
 174        struct rb_node **p = &root->rb_node;
 175        struct rb_node *parent = NULL;
 176        struct tree_entry *entry;
 177
 178        while (*p) {
 179                parent = *p;
 180                entry = rb_entry(parent, struct tree_entry, rb_node);
 181
 182                if (offset < entry->start)
 183                        p = &(*p)->rb_left;
 184                else if (offset > entry->end)
 185                        p = &(*p)->rb_right;
 186                else
 187                        return parent;
 188        }
 189
 190        rb_link_node(node, parent, p);
 191        rb_insert_color(node, root);
 192        return NULL;
 193}
 194
 195static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 196                                     struct rb_node **prev_ret,
 197                                     struct rb_node **next_ret)
 198{
 199        struct rb_root *root = &tree->state;
 200        struct rb_node *n = root->rb_node;
 201        struct rb_node *prev = NULL;
 202        struct rb_node *orig_prev = NULL;
 203        struct tree_entry *entry;
 204        struct tree_entry *prev_entry = NULL;
 205
 206        while (n) {
 207                entry = rb_entry(n, struct tree_entry, rb_node);
 208                prev = n;
 209                prev_entry = entry;
 210
 211                if (offset < entry->start)
 212                        n = n->rb_left;
 213                else if (offset > entry->end)
 214                        n = n->rb_right;
 215                else
 216                        return n;
 217        }
 218
 219        if (prev_ret) {
 220                orig_prev = prev;
 221                while (prev && offset > prev_entry->end) {
 222                        prev = rb_next(prev);
 223                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 224                }
 225                *prev_ret = prev;
 226                prev = orig_prev;
 227        }
 228
 229        if (next_ret) {
 230                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 231                while (prev && offset < prev_entry->start) {
 232                        prev = rb_prev(prev);
 233                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 234                }
 235                *next_ret = prev;
 236        }
 237        return NULL;
 238}
 239
 240static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 241                                          u64 offset)
 242{
 243        struct rb_node *prev = NULL;
 244        struct rb_node *ret;
 245
 246        ret = __etree_search(tree, offset, &prev, NULL);
 247        if (!ret)
 248                return prev;
 249        return ret;
 250}
 251
 252static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 253                     struct extent_state *other)
 254{
 255        if (tree->ops && tree->ops->merge_extent_hook)
 256                tree->ops->merge_extent_hook(tree->mapping->host, new,
 257                                             other);
 258}
 259
 260/*
 261 * utility function to look for merge candidates inside a given range.
 262 * Any extents with matching state are merged together into a single
  263 * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
 264 * are not merged because the end_io handlers need to be able to do
 265 * operations on them without sleeping (or doing allocations/splits).
 266 *
 267 * This should be called with the tree lock held.
 268 */
 269static void merge_state(struct extent_io_tree *tree,
 270                        struct extent_state *state)
 271{
 272        struct extent_state *other;
 273        struct rb_node *other_node;
 274
 275        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 276                return;
 277
 278        other_node = rb_prev(&state->rb_node);
 279        if (other_node) {
 280                other = rb_entry(other_node, struct extent_state, rb_node);
 281                if (other->end == state->start - 1 &&
 282                    other->state == state->state) {
 283                        merge_cb(tree, state, other);
 284                        state->start = other->start;
 285                        other->tree = NULL;
 286                        rb_erase(&other->rb_node, &tree->state);
 287                        free_extent_state(other);
 288                }
 289        }
 290        other_node = rb_next(&state->rb_node);
 291        if (other_node) {
 292                other = rb_entry(other_node, struct extent_state, rb_node);
 293                if (other->start == state->end + 1 &&
 294                    other->state == state->state) {
 295                        merge_cb(tree, state, other);
 296                        state->end = other->end;
 297                        other->tree = NULL;
 298                        rb_erase(&other->rb_node, &tree->state);
 299                        free_extent_state(other);
 300                }
 301        }
 302}
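
/*
 * Worked example (hypothetical offsets): if the tree holds [0, 4095] and
 * [4096, 8191] and both carry exactly EXTENT_DELALLOC, merging either one
 * leaves a single [0, 8191] state.  A state that also has EXTENT_LOCKED set
 * hits the EXTENT_IOBITS early return above and is left alone.
 */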
 303
 304static void set_state_cb(struct extent_io_tree *tree,
 305                         struct extent_state *state, int *bits)
 306{
 307        if (tree->ops && tree->ops->set_bit_hook)
 308                tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 309}
 310
 311static void clear_state_cb(struct extent_io_tree *tree,
 312                           struct extent_state *state, int *bits)
 313{
 314        if (tree->ops && tree->ops->clear_bit_hook)
 315                tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 316}
 317
 318static void set_state_bits(struct extent_io_tree *tree,
 319                           struct extent_state *state, int *bits);
 320
 321/*
 322 * insert an extent_state struct into the tree.  'bits' are set on the
 323 * struct before it is inserted.
 324 *
 325 * This may return -EEXIST if the extent is already there, in which case the
  326 * preallocated state struct is left for the caller to free.
 327 *
 328 * The tree lock is not taken internally.  This is a utility function and
 329 * probably isn't what you want to call (see set/clear_extent_bit).
 330 */
 331static int insert_state(struct extent_io_tree *tree,
 332                        struct extent_state *state, u64 start, u64 end,
 333                        int *bits)
 334{
 335        struct rb_node *node;
 336
 337        if (end < start) {
 338                printk(KERN_ERR "btrfs end < start %llu %llu\n",
 339                       (unsigned long long)end,
 340                       (unsigned long long)start);
 341                WARN_ON(1);
 342        }
 343        state->start = start;
 344        state->end = end;
 345
 346        set_state_bits(tree, state, bits);
 347
 348        node = tree_insert(&tree->state, end, &state->rb_node);
 349        if (node) {
 350                struct extent_state *found;
 351                found = rb_entry(node, struct extent_state, rb_node);
 352                printk(KERN_ERR "btrfs found node %llu %llu on insert of "
 353                       "%llu %llu\n", (unsigned long long)found->start,
 354                       (unsigned long long)found->end,
 355                       (unsigned long long)start, (unsigned long long)end);
 356                return -EEXIST;
 357        }
 358        state->tree = tree;
 359        merge_state(tree, state);
 360        return 0;
 361}
 362
 363static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 364                     u64 split)
 365{
 366        if (tree->ops && tree->ops->split_extent_hook)
 367                tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 368}
 369
 370/*
 371 * split a given extent state struct in two, inserting the preallocated
 372 * struct 'prealloc' as the newly created second half.  'split' indicates an
 373 * offset inside 'orig' where it should be split.
 374 *
 375 * Before calling,
 376 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 377 * are two extent state structs in the tree:
 378 * prealloc: [orig->start, split - 1]
 379 * orig: [ split, orig->end ]
 380 *
 381 * The tree locks are not taken by this function. They need to be held
 382 * by the caller.
 383 */
 384static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 385                       struct extent_state *prealloc, u64 split)
 386{
 387        struct rb_node *node;
 388
 389        split_cb(tree, orig, split);
 390
 391        prealloc->start = orig->start;
 392        prealloc->end = split - 1;
 393        prealloc->state = orig->state;
 394        orig->start = split;
 395
 396        node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 397        if (node) {
 398                free_extent_state(prealloc);
 399                return -EEXIST;
 400        }
 401        prealloc->tree = tree;
 402        return 0;
 403}
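
/*
 * Worked example (hypothetical offsets): with 'orig' covering [0, 8191] and
 * split == 4096, the tree ends up holding 'prealloc' at [0, 4095] and 'orig'
 * shrunk to [4096, 8191]; both keep the original state bits.
 */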
 404
 405static struct extent_state *next_state(struct extent_state *state)
 406{
 407        struct rb_node *next = rb_next(&state->rb_node);
 408        if (next)
 409                return rb_entry(next, struct extent_state, rb_node);
 410        else
 411                return NULL;
 412}
 413
 414/*
 415 * utility function to clear some bits in an extent state struct.
  416 * it will optionally wake up anyone waiting on this state (wake == 1).
 417 *
 418 * If no bits are set on the state struct after clearing things, the
 419 * struct is freed and removed from the tree
 420 */
 421static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 422                                            struct extent_state *state,
 423                                            int *bits, int wake)
 424{
 425        struct extent_state *next;
 426        int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 427
 428        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 429                u64 range = state->end - state->start + 1;
 430                WARN_ON(range > tree->dirty_bytes);
 431                tree->dirty_bytes -= range;
 432        }
 433        clear_state_cb(tree, state, bits);
 434        state->state &= ~bits_to_clear;
 435        if (wake)
 436                wake_up(&state->wq);
 437        if (state->state == 0) {
 438                next = next_state(state);
 439                if (state->tree) {
 440                        rb_erase(&state->rb_node, &tree->state);
 441                        state->tree = NULL;
 442                        free_extent_state(state);
 443                } else {
 444                        WARN_ON(1);
 445                }
 446        } else {
 447                merge_state(tree, state);
 448                next = next_state(state);
 449        }
 450        return next;
 451}
 452
 453static struct extent_state *
 454alloc_extent_state_atomic(struct extent_state *prealloc)
 455{
 456        if (!prealloc)
 457                prealloc = alloc_extent_state(GFP_ATOMIC);
 458
 459        return prealloc;
 460}
 461
 462void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 463{
 464        btrfs_panic(tree_fs_info(tree), err, "Locking error: "
 465                    "Extent tree was modified by another "
 466                    "thread while locked.");
 467}
 468
 469/*
 470 * clear some bits on a range in the tree.  This may require splitting
 471 * or inserting elements in the tree, so the gfp mask is used to
 472 * indicate which allocations or sleeping are allowed.
 473 *
 474 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 475 * the given range from the tree regardless of state (ie for truncate).
 476 *
 477 * the range [start, end] is inclusive.
 478 *
 479 * This takes the tree lock, and returns 0 on success and < 0 on error.
 480 */
 481int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 482                     int bits, int wake, int delete,
 483                     struct extent_state **cached_state,
 484                     gfp_t mask)
 485{
 486        struct extent_state *state;
 487        struct extent_state *cached;
 488        struct extent_state *prealloc = NULL;
 489        struct rb_node *node;
 490        u64 last_end;
 491        int err;
 492        int clear = 0;
 493
 494        if (delete)
 495                bits |= ~EXTENT_CTLBITS;
 496        bits |= EXTENT_FIRST_DELALLOC;
 497
 498        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 499                clear = 1;
 500again:
 501        if (!prealloc && (mask & __GFP_WAIT)) {
 502                prealloc = alloc_extent_state(mask);
 503                if (!prealloc)
 504                        return -ENOMEM;
 505        }
 506
 507        spin_lock(&tree->lock);
 508        if (cached_state) {
 509                cached = *cached_state;
 510
 511                if (clear) {
 512                        *cached_state = NULL;
 513                        cached_state = NULL;
 514                }
 515
 516                if (cached && cached->tree && cached->start <= start &&
 517                    cached->end > start) {
 518                        if (clear)
 519                                atomic_dec(&cached->refs);
 520                        state = cached;
 521                        goto hit_next;
 522                }
 523                if (clear)
 524                        free_extent_state(cached);
 525        }
 526        /*
 527         * this search will find the extents that end after
 528         * our range starts
 529         */
 530        node = tree_search(tree, start);
 531        if (!node)
 532                goto out;
 533        state = rb_entry(node, struct extent_state, rb_node);
 534hit_next:
 535        if (state->start > end)
 536                goto out;
 537        WARN_ON(state->end < start);
 538        last_end = state->end;
 539
 540        /* the state doesn't have the wanted bits, go ahead */
 541        if (!(state->state & bits)) {
 542                state = next_state(state);
 543                goto next;
 544        }
 545
 546        /*
 547         *     | ---- desired range ---- |
 548         *  | state | or
 549         *  | ------------- state -------------- |
 550         *
 551         * We need to split the extent we found, and may flip
 552         * bits on second half.
 553         *
 554         * If the extent we found extends past our range, we
 555         * just split and search again.  It'll get split again
 556         * the next time though.
 557         *
 558         * If the extent we found is inside our range, we clear
 559         * the desired bit on it.
 560         */
 561
 562        if (state->start < start) {
 563                prealloc = alloc_extent_state_atomic(prealloc);
 564                BUG_ON(!prealloc);
 565                err = split_state(tree, state, prealloc, start);
 566                if (err)
 567                        extent_io_tree_panic(tree, err);
 568
 569                prealloc = NULL;
 570                if (err)
 571                        goto out;
 572                if (state->end <= end) {
 573                        state = clear_state_bit(tree, state, &bits, wake);
 574                        goto next;
 575                }
 576                goto search_again;
 577        }
 578        /*
 579         * | ---- desired range ---- |
 580         *                        | state |
 581         * We need to split the extent, and clear the bit
 582         * on the first half
 583         */
 584        if (state->start <= end && state->end > end) {
 585                prealloc = alloc_extent_state_atomic(prealloc);
 586                BUG_ON(!prealloc);
 587                err = split_state(tree, state, prealloc, end + 1);
 588                if (err)
 589                        extent_io_tree_panic(tree, err);
 590
 591                if (wake)
 592                        wake_up(&state->wq);
 593
 594                clear_state_bit(tree, prealloc, &bits, wake);
 595
 596                prealloc = NULL;
 597                goto out;
 598        }
 599
 600        state = clear_state_bit(tree, state, &bits, wake);
 601next:
 602        if (last_end == (u64)-1)
 603                goto out;
 604        start = last_end + 1;
 605        if (start <= end && state && !need_resched())
 606                goto hit_next;
 607        goto search_again;
 608
 609out:
 610        spin_unlock(&tree->lock);
 611        if (prealloc)
 612                free_extent_state(prealloc);
 613
 614        return 0;
 615
 616search_again:
 617        if (start > end)
 618                goto out;
 619        spin_unlock(&tree->lock);
 620        if (mask & __GFP_WAIT)
 621                cond_resched();
 622        goto again;
 623}
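
/*
 * Usage sketch (hypothetical caller): drop the dirty accounting bits over an
 * inclusive byte range without waking waiters or deleting the states
 * outright; this mirrors the clear_extent_dirty() wrapper further down.
 *
 *      clear_extent_bit(tree, start, end,
 *                       EXTENT_DIRTY | EXTENT_DELALLOC |
 *                       EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
 */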
 624
 625static void wait_on_state(struct extent_io_tree *tree,
 626                          struct extent_state *state)
 627                __releases(tree->lock)
 628                __acquires(tree->lock)
 629{
 630        DEFINE_WAIT(wait);
 631        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 632        spin_unlock(&tree->lock);
 633        schedule();
 634        spin_lock(&tree->lock);
 635        finish_wait(&state->wq, &wait);
 636}
 637
 638/*
 639 * waits for one or more bits to clear on a range in the state tree.
 640 * The range [start, end] is inclusive.
 641 * The tree lock is taken by this function
 642 */
 643void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 644{
 645        struct extent_state *state;
 646        struct rb_node *node;
 647
 648        spin_lock(&tree->lock);
 649again:
 650        while (1) {
 651                /*
 652                 * this search will find all the extents that end after
 653                 * our range starts
 654                 */
 655                node = tree_search(tree, start);
 656                if (!node)
 657                        break;
 658
 659                state = rb_entry(node, struct extent_state, rb_node);
 660
 661                if (state->start > end)
 662                        goto out;
 663
 664                if (state->state & bits) {
 665                        start = state->start;
 666                        atomic_inc(&state->refs);
 667                        wait_on_state(tree, state);
 668                        free_extent_state(state);
 669                        goto again;
 670                }
 671                start = state->end + 1;
 672
 673                if (start > end)
 674                        break;
 675
 676                cond_resched_lock(&tree->lock);
 677        }
 678out:
 679        spin_unlock(&tree->lock);
 680}
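
/*
 * Usage sketch (hypothetical caller): sleep until no part of the range is
 * locked any more; this is what lock_extent_bits() below does after an
 * -EEXIST collision.
 *
 *      wait_extent_bit(tree, start, end, EXTENT_LOCKED);
 */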
 681
 682static void set_state_bits(struct extent_io_tree *tree,
 683                           struct extent_state *state,
 684                           int *bits)
 685{
 686        int bits_to_set = *bits & ~EXTENT_CTLBITS;
 687
 688        set_state_cb(tree, state, bits);
 689        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 690                u64 range = state->end - state->start + 1;
 691                tree->dirty_bytes += range;
 692        }
 693        state->state |= bits_to_set;
 694}
 695
 696static void cache_state(struct extent_state *state,
 697                        struct extent_state **cached_ptr)
 698{
 699        if (cached_ptr && !(*cached_ptr)) {
 700                if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
 701                        *cached_ptr = state;
 702                        atomic_inc(&state->refs);
 703                }
 704        }
 705}
 706
 707static void uncache_state(struct extent_state **cached_ptr)
 708{
 709        if (cached_ptr && (*cached_ptr)) {
 710                struct extent_state *state = *cached_ptr;
 711                *cached_ptr = NULL;
 712                free_extent_state(state);
 713        }
 714}
 715
 716/*
 717 * set some bits on a range in the tree.  This may require allocations or
 718 * sleeping, so the gfp mask is used to indicate what is allowed.
 719 *
 720 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 721 * part of the range already has the desired bits set.  The start of the
 722 * existing range is returned in failed_start in this case.
 723 *
  724 * [start, end] is inclusive.  This takes the tree lock.
 725 */
 726
 727static int __must_check
 728__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 729                 int bits, int exclusive_bits, u64 *failed_start,
 730                 struct extent_state **cached_state, gfp_t mask)
 731{
 732        struct extent_state *state;
 733        struct extent_state *prealloc = NULL;
 734        struct rb_node *node;
 735        int err = 0;
 736        u64 last_start;
 737        u64 last_end;
 738
 739        bits |= EXTENT_FIRST_DELALLOC;
 740again:
 741        if (!prealloc && (mask & __GFP_WAIT)) {
 742                prealloc = alloc_extent_state(mask);
 743                BUG_ON(!prealloc);
 744        }
 745
 746        spin_lock(&tree->lock);
 747        if (cached_state && *cached_state) {
 748                state = *cached_state;
 749                if (state->start <= start && state->end > start &&
 750                    state->tree) {
 751                        node = &state->rb_node;
 752                        goto hit_next;
 753                }
 754        }
 755        /*
 756         * this search will find all the extents that end after
 757         * our range starts.
 758         */
 759        node = tree_search(tree, start);
 760        if (!node) {
 761                prealloc = alloc_extent_state_atomic(prealloc);
 762                BUG_ON(!prealloc);
 763                err = insert_state(tree, prealloc, start, end, &bits);
 764                if (err)
 765                        extent_io_tree_panic(tree, err);
 766
 767                prealloc = NULL;
 768                goto out;
 769        }
 770        state = rb_entry(node, struct extent_state, rb_node);
 771hit_next:
 772        last_start = state->start;
 773        last_end = state->end;
 774
 775        /*
 776         * | ---- desired range ---- |
 777         * | state |
 778         *
 779         * Just lock what we found and keep going
 780         */
 781        if (state->start == start && state->end <= end) {
 782                if (state->state & exclusive_bits) {
 783                        *failed_start = state->start;
 784                        err = -EEXIST;
 785                        goto out;
 786                }
 787
 788                set_state_bits(tree, state, &bits);
 789                cache_state(state, cached_state);
 790                merge_state(tree, state);
 791                if (last_end == (u64)-1)
 792                        goto out;
 793                start = last_end + 1;
 794                state = next_state(state);
 795                if (start < end && state && state->start == start &&
 796                    !need_resched())
 797                        goto hit_next;
 798                goto search_again;
 799        }
 800
 801        /*
 802         *     | ---- desired range ---- |
 803         * | state |
 804         *   or
 805         * | ------------- state -------------- |
 806         *
 807         * We need to split the extent we found, and may flip bits on
 808         * second half.
 809         *
 810         * If the extent we found extends past our
 811         * range, we just split and search again.  It'll get split
 812         * again the next time though.
 813         *
 814         * If the extent we found is inside our range, we set the
 815         * desired bit on it.
 816         */
 817        if (state->start < start) {
 818                if (state->state & exclusive_bits) {
 819                        *failed_start = start;
 820                        err = -EEXIST;
 821                        goto out;
 822                }
 823
 824                prealloc = alloc_extent_state_atomic(prealloc);
 825                BUG_ON(!prealloc);
 826                err = split_state(tree, state, prealloc, start);
 827                if (err)
 828                        extent_io_tree_panic(tree, err);
 829
 830                prealloc = NULL;
 831                if (err)
 832                        goto out;
 833                if (state->end <= end) {
 834                        set_state_bits(tree, state, &bits);
 835                        cache_state(state, cached_state);
 836                        merge_state(tree, state);
 837                        if (last_end == (u64)-1)
 838                                goto out;
 839                        start = last_end + 1;
 840                        state = next_state(state);
 841                        if (start < end && state && state->start == start &&
 842                            !need_resched())
 843                                goto hit_next;
 844                }
 845                goto search_again;
 846        }
 847        /*
 848         * | ---- desired range ---- |
 849         *     | state | or               | state |
 850         *
 851         * There's a hole, we need to insert something in it and
 852         * ignore the extent we found.
 853         */
 854        if (state->start > start) {
 855                u64 this_end;
 856                if (end < last_start)
 857                        this_end = end;
 858                else
 859                        this_end = last_start - 1;
 860
 861                prealloc = alloc_extent_state_atomic(prealloc);
 862                BUG_ON(!prealloc);
 863
 864                /*
  865                 * Avoid freeing 'prealloc' if it can be merged with
 866                 * the later extent.
 867                 */
 868                err = insert_state(tree, prealloc, start, this_end,
 869                                   &bits);
 870                if (err)
 871                        extent_io_tree_panic(tree, err);
 872
 873                cache_state(prealloc, cached_state);
 874                prealloc = NULL;
 875                start = this_end + 1;
 876                goto search_again;
 877        }
 878        /*
 879         * | ---- desired range ---- |
 880         *                        | state |
 881         * We need to split the extent, and set the bit
 882         * on the first half
 883         */
 884        if (state->start <= end && state->end > end) {
 885                if (state->state & exclusive_bits) {
 886                        *failed_start = start;
 887                        err = -EEXIST;
 888                        goto out;
 889                }
 890
 891                prealloc = alloc_extent_state_atomic(prealloc);
 892                BUG_ON(!prealloc);
 893                err = split_state(tree, state, prealloc, end + 1);
 894                if (err)
 895                        extent_io_tree_panic(tree, err);
 896
 897                set_state_bits(tree, prealloc, &bits);
 898                cache_state(prealloc, cached_state);
 899                merge_state(tree, prealloc);
 900                prealloc = NULL;
 901                goto out;
 902        }
 903
 904        goto search_again;
 905
 906out:
 907        spin_unlock(&tree->lock);
 908        if (prealloc)
 909                free_extent_state(prealloc);
 910
 911        return err;
 912
 913search_again:
 914        if (start > end)
 915                goto out;
 916        spin_unlock(&tree->lock);
 917        if (mask & __GFP_WAIT)
 918                cond_resched();
 919        goto again;
 920}
 921
 922int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 923                   u64 *failed_start, struct extent_state **cached_state,
 924                   gfp_t mask)
 925{
 926        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
 927                                cached_state, mask);
 928}
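
/*
 * Usage sketch (hypothetical caller): a plain, non-exclusive set.  Callers
 * that do not care about collisions pass NULL for failed_start, as the
 * wrappers below do.
 *
 *      set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
 *                     NULL, GFP_NOFS);
 */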
 929
 930
 931/**
 932 * convert_extent_bit - convert all bits in a given range from one bit to
 933 *                      another
 934 * @tree:       the io tree to search
 935 * @start:      the start offset in bytes
 936 * @end:        the end offset in bytes (inclusive)
 937 * @bits:       the bits to set in this range
 938 * @clear_bits: the bits to clear in this range
 939 * @mask:       the allocation mask
 940 *
 941 * This will go through and set bits for the given range.  If any states exist
 942 * already in this range they are set with the given bit and cleared of the
 943 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 944 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 945 * boundary bits like LOCK.
 946 */
 947int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 948                       int bits, int clear_bits, gfp_t mask)
 949{
 950        struct extent_state *state;
 951        struct extent_state *prealloc = NULL;
 952        struct rb_node *node;
 953        int err = 0;
 954        u64 last_start;
 955        u64 last_end;
 956
 957again:
 958        if (!prealloc && (mask & __GFP_WAIT)) {
 959                prealloc = alloc_extent_state(mask);
 960                if (!prealloc)
 961                        return -ENOMEM;
 962        }
 963
 964        spin_lock(&tree->lock);
 965        /*
 966         * this search will find all the extents that end after
 967         * our range starts.
 968         */
 969        node = tree_search(tree, start);
 970        if (!node) {
 971                prealloc = alloc_extent_state_atomic(prealloc);
 972                if (!prealloc) {
 973                        err = -ENOMEM;
 974                        goto out;
 975                }
 976                err = insert_state(tree, prealloc, start, end, &bits);
 977                prealloc = NULL;
 978                if (err)
 979                        extent_io_tree_panic(tree, err);
 980                goto out;
 981        }
 982        state = rb_entry(node, struct extent_state, rb_node);
 983hit_next:
 984        last_start = state->start;
 985        last_end = state->end;
 986
 987        /*
 988         * | ---- desired range ---- |
 989         * | state |
 990         *
 991         * Just lock what we found and keep going
 992         */
 993        if (state->start == start && state->end <= end) {
 994                set_state_bits(tree, state, &bits);
 995                state = clear_state_bit(tree, state, &clear_bits, 0);
 996                if (last_end == (u64)-1)
 997                        goto out;
 998                start = last_end + 1;
 999                if (start < end && state && state->start == start &&
1000                    !need_resched())
1001                        goto hit_next;
1002                goto search_again;
1003        }
1004
1005        /*
1006         *     | ---- desired range ---- |
1007         * | state |
1008         *   or
1009         * | ------------- state -------------- |
1010         *
1011         * We need to split the extent we found, and may flip bits on
1012         * second half.
1013         *
1014         * If the extent we found extends past our
1015         * range, we just split and search again.  It'll get split
1016         * again the next time though.
1017         *
1018         * If the extent we found is inside our range, we set the
1019         * desired bit on it.
1020         */
1021        if (state->start < start) {
1022                prealloc = alloc_extent_state_atomic(prealloc);
1023                if (!prealloc) {
1024                        err = -ENOMEM;
1025                        goto out;
1026                }
1027                err = split_state(tree, state, prealloc, start);
1028                if (err)
1029                        extent_io_tree_panic(tree, err);
1030                prealloc = NULL;
1031                if (err)
1032                        goto out;
1033                if (state->end <= end) {
1034                        set_state_bits(tree, state, &bits);
1035                        state = clear_state_bit(tree, state, &clear_bits, 0);
1036                        if (last_end == (u64)-1)
1037                                goto out;
1038                        start = last_end + 1;
1039                        if (start < end && state && state->start == start &&
1040                            !need_resched())
1041                                goto hit_next;
1042                }
1043                goto search_again;
1044        }
1045        /*
1046         * | ---- desired range ---- |
1047         *     | state | or               | state |
1048         *
1049         * There's a hole, we need to insert something in it and
1050         * ignore the extent we found.
1051         */
1052        if (state->start > start) {
1053                u64 this_end;
1054                if (end < last_start)
1055                        this_end = end;
1056                else
1057                        this_end = last_start - 1;
1058
1059                prealloc = alloc_extent_state_atomic(prealloc);
1060                if (!prealloc) {
1061                        err = -ENOMEM;
1062                        goto out;
1063                }
1064
1065                /*
 1066                 * Avoid freeing 'prealloc' if it can be merged with
1067                 * the later extent.
1068                 */
1069                err = insert_state(tree, prealloc, start, this_end,
1070                                   &bits);
1071                if (err)
1072                        extent_io_tree_panic(tree, err);
1073                prealloc = NULL;
1074                start = this_end + 1;
1075                goto search_again;
1076        }
1077        /*
1078         * | ---- desired range ---- |
1079         *                        | state |
1080         * We need to split the extent, and set the bit
1081         * on the first half
1082         */
1083        if (state->start <= end && state->end > end) {
1084                prealloc = alloc_extent_state_atomic(prealloc);
1085                if (!prealloc) {
1086                        err = -ENOMEM;
1087                        goto out;
1088                }
1089
1090                err = split_state(tree, state, prealloc, end + 1);
1091                if (err)
1092                        extent_io_tree_panic(tree, err);
1093
1094                set_state_bits(tree, prealloc, &bits);
1095                clear_state_bit(tree, prealloc, &clear_bits, 0);
1096                prealloc = NULL;
1097                goto out;
1098        }
1099
1100        goto search_again;
1101
1102out:
1103        spin_unlock(&tree->lock);
1104        if (prealloc)
1105                free_extent_state(prealloc);
1106
1107        return err;
1108
1109search_again:
1110        if (start > end)
1111                goto out;
1112        spin_unlock(&tree->lock);
1113        if (mask & __GFP_WAIT)
1114                cond_resched();
1115        goto again;
1116}
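
/*
 * Usage sketch matching the kernel-doc above (hypothetical range): flip a
 * range from EXTENT_DELALLOC to EXTENT_DIRTY in a single pass instead of a
 * separate clear and set.
 *
 *      convert_extent_bit(tree, start, end, EXTENT_DIRTY,
 *                         EXTENT_DELALLOC, GFP_NOFS);
 */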
1117
1118/* wrappers around set/clear extent bit */
1119int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1120                     gfp_t mask)
1121{
1122        return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1123                              NULL, mask);
1124}
1125
1126int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1127                    int bits, gfp_t mask)
1128{
1129        return set_extent_bit(tree, start, end, bits, NULL,
1130                              NULL, mask);
1131}
1132
1133int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1134                      int bits, gfp_t mask)
1135{
1136        return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1137}
1138
1139int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1140                        struct extent_state **cached_state, gfp_t mask)
1141{
1142        return set_extent_bit(tree, start, end,
1143                              EXTENT_DELALLOC | EXTENT_UPTODATE,
1144                              NULL, cached_state, mask);
1145}
1146
1147int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1148                       gfp_t mask)
1149{
1150        return clear_extent_bit(tree, start, end,
1151                                EXTENT_DIRTY | EXTENT_DELALLOC |
1152                                EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1153}
1154
1155int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1156                     gfp_t mask)
1157{
1158        return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1159                              NULL, mask);
1160}
1161
1162int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1163                        struct extent_state **cached_state, gfp_t mask)
1164{
 1165        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
1166                              cached_state, mask);
1167}
1168
1169int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1170                          struct extent_state **cached_state, gfp_t mask)
1171{
1172        return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1173                                cached_state, mask);
1174}
1175
1176/*
 1177 * either insert or lock the state struct between start and end, waiting
 1178 * until the whole range can be locked.
1179 */
1180int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1181                     int bits, struct extent_state **cached_state)
1182{
1183        int err;
1184        u64 failed_start;
1185        while (1) {
1186                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1187                                       EXTENT_LOCKED, &failed_start,
1188                                       cached_state, GFP_NOFS);
1189                if (err == -EEXIST) {
1190                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1191                        start = failed_start;
1192                } else
1193                        break;
1194                WARN_ON(start > end);
1195        }
1196        return err;
1197}
1198
1199int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1200{
1201        return lock_extent_bits(tree, start, end, 0, NULL);
1202}
1203
1204int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1205{
1206        int err;
1207        u64 failed_start;
1208
1209        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1210                               &failed_start, NULL, GFP_NOFS);
1211        if (err == -EEXIST) {
1212                if (failed_start > start)
1213                        clear_extent_bit(tree, start, failed_start - 1,
1214                                         EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1215                return 0;
1216        }
1217        return 1;
1218}
1219
1220int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1221                         struct extent_state **cached, gfp_t mask)
1222{
1223        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1224                                mask);
1225}
1226
1227int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1228{
1229        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1230                                GFP_NOFS);
1231}
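
/*
 * Typical pairing sketch (hypothetical caller): lock the range, keep the
 * cached state so the unlock can skip the tree search, then drop the lock.
 * find_lock_delalloc_range() below follows this pattern.
 *
 *      struct extent_state *cached = NULL;
 *
 *      lock_extent_bits(tree, start, end, 0, &cached);
 *      ... work on [start, end] ...
 *      unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
 */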
1232
1233/*
1234 * helper function to set both pages and extents in the tree writeback
1235 */
1236static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1237{
1238        unsigned long index = start >> PAGE_CACHE_SHIFT;
1239        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1240        struct page *page;
1241
1242        while (index <= end_index) {
1243                page = find_get_page(tree->mapping, index);
1244                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1245                set_page_writeback(page);
1246                page_cache_release(page);
1247                index++;
1248        }
1249        return 0;
1250}
1251
1252/* find the first state struct with 'bits' set after 'start', and
 1253 * return it.  tree->lock must be held.  NULL will be returned if
1254 * nothing was found after 'start'
1255 */
1256struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1257                                                 u64 start, int bits)
1258{
1259        struct rb_node *node;
1260        struct extent_state *state;
1261
1262        /*
1263         * this search will find all the extents that end after
1264         * our range starts.
1265         */
1266        node = tree_search(tree, start);
1267        if (!node)
1268                goto out;
1269
1270        while (1) {
1271                state = rb_entry(node, struct extent_state, rb_node);
1272                if (state->end >= start && (state->state & bits))
1273                        return state;
1274
1275                node = rb_next(node);
1276                if (!node)
1277                        break;
1278        }
1279out:
1280        return NULL;
1281}
1282
1283/*
1284 * find the first offset in the io tree with 'bits' set. zero is
1285 * returned if we find something, and *start_ret and *end_ret are
1286 * set to reflect the state struct that was found.
1287 *
 1288 * If nothing was found, 1 is returned; if something was found, 0 is returned.
1289 */
1290int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1291                          u64 *start_ret, u64 *end_ret, int bits)
1292{
1293        struct extent_state *state;
1294        int ret = 1;
1295
1296        spin_lock(&tree->lock);
1297        state = find_first_extent_bit_state(tree, start, bits);
1298        if (state) {
1299                *start_ret = state->start;
1300                *end_ret = state->end;
1301                ret = 0;
1302        }
1303        spin_unlock(&tree->lock);
1304        return ret;
1305}
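
/*
 * Usage sketch (hypothetical caller): walk every range with a given bit set,
 * advancing past each hit; a return of 0 means *start_ret and *end_ret are
 * valid.
 *
 *      u64 found_start, found_end;
 *
 *      while (!find_first_extent_bit(tree, start, &found_start,
 *                                    &found_end, EXTENT_DIRTY)) {
 *              ... handle [found_start, found_end] ...
 *              start = found_end + 1;
 *      }
 */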
1306
1307/*
1308 * find a contiguous range of bytes in the file marked as delalloc, not
 1309 * more than 'max_bytes'.  start and end are used to return the range.
1310 *
1311 * 1 is returned if we find something, 0 if nothing was in the tree
1312 */
1313static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1314                                        u64 *start, u64 *end, u64 max_bytes,
1315                                        struct extent_state **cached_state)
1316{
1317        struct rb_node *node;
1318        struct extent_state *state;
1319        u64 cur_start = *start;
1320        u64 found = 0;
1321        u64 total_bytes = 0;
1322
1323        spin_lock(&tree->lock);
1324
1325        /*
1326         * this search will find all the extents that end after
1327         * our range starts.
1328         */
1329        node = tree_search(tree, cur_start);
1330        if (!node) {
1331                if (!found)
1332                        *end = (u64)-1;
1333                goto out;
1334        }
1335
1336        while (1) {
1337                state = rb_entry(node, struct extent_state, rb_node);
1338                if (found && (state->start != cur_start ||
1339                              (state->state & EXTENT_BOUNDARY))) {
1340                        goto out;
1341                }
1342                if (!(state->state & EXTENT_DELALLOC)) {
1343                        if (!found)
1344                                *end = state->end;
1345                        goto out;
1346                }
1347                if (!found) {
1348                        *start = state->start;
1349                        *cached_state = state;
1350                        atomic_inc(&state->refs);
1351                }
1352                found++;
1353                *end = state->end;
1354                cur_start = state->end + 1;
1355                node = rb_next(node);
1356                if (!node)
1357                        break;
1358                total_bytes += state->end - state->start + 1;
1359                if (total_bytes >= max_bytes)
1360                        break;
1361        }
1362out:
1363        spin_unlock(&tree->lock);
1364        return found;
1365}
1366
1367static noinline void __unlock_for_delalloc(struct inode *inode,
1368                                           struct page *locked_page,
1369                                           u64 start, u64 end)
1370{
1371        int ret;
1372        struct page *pages[16];
1373        unsigned long index = start >> PAGE_CACHE_SHIFT;
1374        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1375        unsigned long nr_pages = end_index - index + 1;
1376        int i;
1377
1378        if (index == locked_page->index && end_index == index)
1379                return;
1380
1381        while (nr_pages > 0) {
1382                ret = find_get_pages_contig(inode->i_mapping, index,
1383                                     min_t(unsigned long, nr_pages,
1384                                     ARRAY_SIZE(pages)), pages);
1385                for (i = 0; i < ret; i++) {
1386                        if (pages[i] != locked_page)
1387                                unlock_page(pages[i]);
1388                        page_cache_release(pages[i]);
1389                }
1390                nr_pages -= ret;
1391                index += ret;
1392                cond_resched();
1393        }
1394}
1395
1396static noinline int lock_delalloc_pages(struct inode *inode,
1397                                        struct page *locked_page,
1398                                        u64 delalloc_start,
1399                                        u64 delalloc_end)
1400{
1401        unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1402        unsigned long start_index = index;
1403        unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1404        unsigned long pages_locked = 0;
1405        struct page *pages[16];
1406        unsigned long nrpages;
1407        int ret;
1408        int i;
1409
1410        /* the caller is responsible for locking the start index */
1411        if (index == locked_page->index && index == end_index)
1412                return 0;
1413
1414        /* skip the page at the start index */
1415        nrpages = end_index - index + 1;
1416        while (nrpages > 0) {
1417                ret = find_get_pages_contig(inode->i_mapping, index,
1418                                     min_t(unsigned long,
1419                                     nrpages, ARRAY_SIZE(pages)), pages);
1420                if (ret == 0) {
1421                        ret = -EAGAIN;
1422                        goto done;
1423                }
1424                /* now we have an array of pages, lock them all */
1425                for (i = 0; i < ret; i++) {
1426                        /*
1427                         * the caller is taking responsibility for
1428                         * locked_page
1429                         */
1430                        if (pages[i] != locked_page) {
1431                                lock_page(pages[i]);
1432                                if (!PageDirty(pages[i]) ||
1433                                    pages[i]->mapping != inode->i_mapping) {
1434                                        ret = -EAGAIN;
1435                                        unlock_page(pages[i]);
1436                                        page_cache_release(pages[i]);
1437                                        goto done;
1438                                }
1439                        }
1440                        page_cache_release(pages[i]);
1441                        pages_locked++;
1442                }
1443                nrpages -= ret;
1444                index += ret;
1445                cond_resched();
1446        }
1447        ret = 0;
1448done:
1449        if (ret && pages_locked) {
1450                __unlock_for_delalloc(inode, locked_page,
1451                              delalloc_start,
1452                              ((u64)(start_index + pages_locked - 1)) <<
1453                              PAGE_CACHE_SHIFT);
1454        }
1455        return ret;
1456}
1457
1458/*
1459 * find a contiguous range of bytes in the file marked as delalloc, not
 1460 * more than 'max_bytes'.  start and end are used to return the range.
1461 *
1462 * 1 is returned if we find something, 0 if nothing was in the tree
1463 */
1464static noinline u64 find_lock_delalloc_range(struct inode *inode,
1465                                             struct extent_io_tree *tree,
1466                                             struct page *locked_page,
1467                                             u64 *start, u64 *end,
1468                                             u64 max_bytes)
1469{
1470        u64 delalloc_start;
1471        u64 delalloc_end;
1472        u64 found;
1473        struct extent_state *cached_state = NULL;
1474        int ret;
1475        int loops = 0;
1476
1477again:
1478        /* step one, find a bunch of delalloc bytes starting at start */
1479        delalloc_start = *start;
1480        delalloc_end = 0;
1481        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1482                                    max_bytes, &cached_state);
1483        if (!found || delalloc_end <= *start) {
1484                *start = delalloc_start;
1485                *end = delalloc_end;
1486                free_extent_state(cached_state);
1487                return found;
1488        }
1489
1490        /*
1491         * start comes from the offset of locked_page.  We have to lock
1492         * pages in order, so we can't process delalloc bytes before
1493         * locked_page
1494         */
1495        if (delalloc_start < *start)
1496                delalloc_start = *start;
1497
1498        /*
1499         * make sure to limit the number of pages we try to lock down
1500         * if we're looping.
1501         */
1502        if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1503                delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1504
1505        /* step two, lock all the pages after the page that has start */
1506        ret = lock_delalloc_pages(inode, locked_page,
1507                                  delalloc_start, delalloc_end);
1508        if (ret == -EAGAIN) {
1509                /* some of the pages are gone, lets avoid looping by
1510                 * shortening the size of the delalloc range we're searching
1511                 */
1512                free_extent_state(cached_state);
1513                if (!loops) {
1514                        unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1515                        max_bytes = PAGE_CACHE_SIZE - offset;
1516                        loops = 1;
1517                        goto again;
1518                } else {
1519                        found = 0;
1520                        goto out_failed;
1521                }
1522        }
1523        BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1524
1525        /* step three, lock the state bits for the whole range */
1526        lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1527
1528        /* then test to make sure it is all still delalloc */
1529        ret = test_range_bit(tree, delalloc_start, delalloc_end,
1530                             EXTENT_DELALLOC, 1, cached_state);
1531        if (!ret) {
1532                unlock_extent_cached(tree, delalloc_start, delalloc_end,
1533                                     &cached_state, GFP_NOFS);
1534                __unlock_for_delalloc(inode, locked_page,
1535                              delalloc_start, delalloc_end);
1536                cond_resched();
1537                goto again;
1538        }
1539        free_extent_state(cached_state);
1540        *start = delalloc_start;
1541        *end = delalloc_end;
1542out_failed:
1543        return found;
1544}
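
/*
 * Usage sketch (illustrative only, mirroring the __extent_writepage() caller
 * further down in this file): the page range is walked in a loop, advancing
 * past each delalloc chunk that was found and locked:
 *
 *	delalloc_start = start;
 *	delalloc_end = 0;
 *	while (delalloc_end < page_end) {
 *		nr_delalloc = find_lock_delalloc_range(inode, tree, page,
 *						       &delalloc_start,
 *						       &delalloc_end,
 *						       128 * 1024 * 1024);
 *		if (nr_delalloc == 0) {
 *			delalloc_start = delalloc_end + 1;
 *			continue;
 *		}
 *		(hand [delalloc_start, delalloc_end] to fill_delalloc)
 *		delalloc_start = delalloc_end + 1;
 *	}
 */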
1545
1546int extent_clear_unlock_delalloc(struct inode *inode,
1547                                struct extent_io_tree *tree,
1548                                u64 start, u64 end, struct page *locked_page,
1549                                unsigned long op)
1550{
1551        int ret;
1552        struct page *pages[16];
1553        unsigned long index = start >> PAGE_CACHE_SHIFT;
1554        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1555        unsigned long nr_pages = end_index - index + 1;
1556        int i;
1557        int clear_bits = 0;
1558
1559        if (op & EXTENT_CLEAR_UNLOCK)
1560                clear_bits |= EXTENT_LOCKED;
1561        if (op & EXTENT_CLEAR_DIRTY)
1562                clear_bits |= EXTENT_DIRTY;
1563
1564        if (op & EXTENT_CLEAR_DELALLOC)
1565                clear_bits |= EXTENT_DELALLOC;
1566
1567        clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1568        if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1569                    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1570                    EXTENT_SET_PRIVATE2)))
1571                return 0;
1572
1573        while (nr_pages > 0) {
1574                ret = find_get_pages_contig(inode->i_mapping, index,
1575                                     min_t(unsigned long,
1576                                     nr_pages, ARRAY_SIZE(pages)), pages);
1577                for (i = 0; i < ret; i++) {
1578
1579                        if (op & EXTENT_SET_PRIVATE2)
1580                                SetPagePrivate2(pages[i]);
1581
1582                        if (pages[i] == locked_page) {
1583                                page_cache_release(pages[i]);
1584                                continue;
1585                        }
1586                        if (op & EXTENT_CLEAR_DIRTY)
1587                                clear_page_dirty_for_io(pages[i]);
1588                        if (op & EXTENT_SET_WRITEBACK)
1589                                set_page_writeback(pages[i]);
1590                        if (op & EXTENT_END_WRITEBACK)
1591                                end_page_writeback(pages[i]);
1592                        if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1593                                unlock_page(pages[i]);
1594                        page_cache_release(pages[i]);
1595                }
1596                nr_pages -= ret;
1597                index += ret;
1598                cond_resched();
1599        }
1600        return 0;
1601}
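
/*
 * Usage sketch (illustrative only): callers assemble 'op' from the
 * EXTENT_CLEAR_* and EXTENT_SET_* flags tested above.  For example, to clear
 * the extent lock, dirty and delalloc bits for a range and unlock the
 * corresponding pages, a caller might pass:
 *
 *	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 *				     start, end, locked_page,
 *				     EXTENT_CLEAR_UNLOCK |
 *				     EXTENT_CLEAR_UNLOCK_PAGE |
 *				     EXTENT_CLEAR_DELALLOC |
 *				     EXTENT_CLEAR_DIRTY);
 *
 * The exact mask depends on the call site (see the delalloc paths in
 * inode.c); this is only a sketch of how the flags combine.
 */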
1602
1603/*
1604 * count the number of bytes in the tree that have the given bit(s)
1605 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1606 * cached.  The total number found is returned.
1607 */
1608u64 count_range_bits(struct extent_io_tree *tree,
1609                     u64 *start, u64 search_end, u64 max_bytes,
1610                     unsigned long bits, int contig)
1611{
1612        struct rb_node *node;
1613        struct extent_state *state;
1614        u64 cur_start = *start;
1615        u64 total_bytes = 0;
1616        u64 last = 0;
1617        int found = 0;
1618
1619        if (search_end <= cur_start) {
1620                WARN_ON(1);
1621                return 0;
1622        }
1623
1624        spin_lock(&tree->lock);
1625        if (cur_start == 0 && bits == EXTENT_DIRTY) {
1626                total_bytes = tree->dirty_bytes;
1627                goto out;
1628        }
1629        /*
1630         * this search will find all the extents that end after
1631         * our range starts.
1632         */
1633        node = tree_search(tree, cur_start);
1634        if (!node)
1635                goto out;
1636
1637        while (1) {
1638                state = rb_entry(node, struct extent_state, rb_node);
1639                if (state->start > search_end)
1640                        break;
1641                if (contig && found && state->start > last + 1)
1642                        break;
1643                if (state->end >= cur_start && (state->state & bits) == bits) {
1644                        total_bytes += min(search_end, state->end) + 1 -
1645                                       max(cur_start, state->start);
1646                        if (total_bytes >= max_bytes)
1647                                break;
1648                        if (!found) {
1649                                *start = max(cur_start, state->start);
1650                                found = 1;
1651                        }
1652                        last = state->end;
1653                } else if (contig && found) {
1654                        break;
1655                }
1656                node = rb_next(node);
1657                if (!node)
1658                        break;
1659        }
1660out:
1661        spin_unlock(&tree->lock);
1662        return total_bytes;
1663}
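
/*
 * Usage sketch (illustrative only, mirroring clean_io_failure() below): to
 * test quickly whether any I/O failure records exist at all, search the whole
 * failure tree for at most one byte carrying EXTENT_DIRTY:
 *
 *	private = 0;
 *	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
 *				(u64)-1, 1, EXTENT_DIRTY, 0);
 *	if (!ret)
 *		return 0;
 *
 * On return, 'private' has been advanced to the start of the first matching
 * range, and the return value is the number of matching bytes found.
 */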
1664
1665/*
1666 * set the private field for a given byte offset in the tree.  If there isn't
1667 * an extent_state starting at that offset, -ENOENT is returned and nothing changes.
1668 */
1669int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1670{
1671        struct rb_node *node;
1672        struct extent_state *state;
1673        int ret = 0;
1674
1675        spin_lock(&tree->lock);
1676        /*
1677         * this search will find all the extents that end after
1678         * our range starts.
1679         */
1680        node = tree_search(tree, start);
1681        if (!node) {
1682                ret = -ENOENT;
1683                goto out;
1684        }
1685        state = rb_entry(node, struct extent_state, rb_node);
1686        if (state->start != start) {
1687                ret = -ENOENT;
1688                goto out;
1689        }
1690        state->private = private;
1691out:
1692        spin_unlock(&tree->lock);
1693        return ret;
1694}
1695
1696int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1697{
1698        struct rb_node *node;
1699        struct extent_state *state;
1700        int ret = 0;
1701
1702        spin_lock(&tree->lock);
1703        /*
1704         * this search will find all the extents that end after
1705         * our range starts.
1706         */
1707        node = tree_search(tree, start);
1708        if (!node) {
1709                ret = -ENOENT;
1710                goto out;
1711        }
1712        state = rb_entry(node, struct extent_state, rb_node);
1713        if (state->start != start) {
1714                ret = -ENOENT;
1715                goto out;
1716        }
1717        *private = state->private;
1718out:
1719        spin_unlock(&tree->lock);
1720        return ret;
1721}
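
/*
 * Usage sketch (illustrative only): the read-repair code below stashes a
 * pointer to its io_failure_record in the private field of the failure tree,
 * cast to and from u64:
 *
 *	set_state_private(failure_tree, start, (u64)(unsigned long)failrec);
 *	...
 *	ret = get_state_private(failure_tree, start, &private);
 *	if (!ret)
 *		failrec = (struct io_failure_record *)(unsigned long)private;
 *
 * Both helpers require an extent_state that starts exactly at 'start',
 * otherwise they return -ENOENT.
 */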
1722
1723/*
1724 * searches a range in the state tree for a given mask.
1725 * If 'filled' == 1, this returns 1 only if every extent in the range
1726 * has the bits set.  Otherwise, 1 is returned if any bit in the
1727 * range is found set.
1728 */
1729int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1730                   int bits, int filled, struct extent_state *cached)
1731{
1732        struct extent_state *state = NULL;
1733        struct rb_node *node;
1734        int bitset = 0;
1735
1736        spin_lock(&tree->lock);
1737        if (cached && cached->tree && cached->start <= start &&
1738            cached->end > start)
1739                node = &cached->rb_node;
1740        else
1741                node = tree_search(tree, start);
1742        while (node && start <= end) {
1743                state = rb_entry(node, struct extent_state, rb_node);
1744
1745                if (filled && state->start > start) {
1746                        bitset = 0;
1747                        break;
1748                }
1749
1750                if (state->start > end)
1751                        break;
1752
1753                if (state->state & bits) {
1754                        bitset = 1;
1755                        if (!filled)
1756                                break;
1757                } else if (filled) {
1758                        bitset = 0;
1759                        break;
1760                }
1761
1762                if (state->end == (u64)-1)
1763                        break;
1764
1765                start = state->end + 1;
1766                if (start > end)
1767                        break;
1768                node = rb_next(node);
1769                if (!node) {
1770                        if (filled)
1771                                bitset = 0;
1772                        break;
1773                }
1774        }
1775        spin_unlock(&tree->lock);
1776        return bitset;
1777}
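
/*
 * Usage sketch (illustrative only, mirroring the page helpers below):
 * filled == 1 asks whether the whole range carries the bits, filled == 0
 * asks whether any part of it does:
 *
 *	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 *		SetPageUptodate(page);	(every byte of the range is uptodate)
 *
 *	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
 *		unlock_page(page);	(no byte of the range is locked)
 */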
1778
1779/*
1780 * helper function to set a given page up to date if all the
1781 * extents in the tree for that page are up to date
1782 */
1783static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1784{
1785        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1786        u64 end = start + PAGE_CACHE_SIZE - 1;
1787        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1788                SetPageUptodate(page);
1789}
1790
1791/*
1792 * helper function to unlock a page if all the extents in the tree
1793 * for that page are unlocked
1794 */
1795static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1796{
1797        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1798        u64 end = start + PAGE_CACHE_SIZE - 1;
1799        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1800                unlock_page(page);
1801}
1802
1803/*
1804 * helper function to end page writeback; the extent tree is not consulted
1805 * here, writeback on the page is simply ended
1806 */
1807static void check_page_writeback(struct extent_io_tree *tree,
1808                                 struct page *page)
1809{
1810        end_page_writeback(page);
1811}
1812
1813/*
1814 * When IO fails, either with EIO or because csum verification fails, we
1815 * try other mirrors that might have a good copy of the data.  This
1816 * io_failure_record is used to record state as we go through all the
1817 * mirrors.  If another mirror has good data, the page is set up to date
1818 * and things continue.  If a good mirror can't be found, the original
1819 * bio end_io callback is called to indicate things have failed.
1820 */
1821struct io_failure_record {
1822        struct page *page;
1823        u64 start;
1824        u64 len;
1825        u64 logical;
1826        unsigned long bio_flags;
1827        int this_mirror;
1828        int failed_mirror;
1829        int in_validation;
1830};
1831
1832static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1833                                int did_repair)
1834{
1835        int ret;
1836        int err = 0;
1837        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1838
1839        set_state_private(failure_tree, rec->start, 0);
1840        ret = clear_extent_bits(failure_tree, rec->start,
1841                                rec->start + rec->len - 1,
1842                                EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1843        if (ret)
1844                err = ret;
1845
1846        if (did_repair) {
1847                ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1848                                        rec->start + rec->len - 1,
1849                                        EXTENT_DAMAGED, GFP_NOFS);
1850                if (ret && !err)
1851                        err = ret;
1852        }
1853
1854        kfree(rec);
1855        return err;
1856}
1857
1858static void repair_io_failure_callback(struct bio *bio, int err)
1859{
1860        complete(bio->bi_private);
1861}
1862
1863/*
1864 * this bypasses the standard btrfs submit functions deliberately, as
1865 * the standard behavior is to write all copies in a raid setup. here we only
1866 * want to write the one bad copy. so we do the mapping for ourselves and issue
1867 * submit_bio directly.
1868 * to avoid any synchronization issues, wait for the data after writing, which
1869 * in effect prevents the read that triggered the error from finishing.
1870 * currently, there can be no more than two copies of every data bit. thus,
1871 * exactly one rewrite is required.
1872 */
1873int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1874                        u64 length, u64 logical, struct page *page,
1875                        int mirror_num)
1876{
1877        struct bio *bio;
1878        struct btrfs_device *dev;
1879        DECLARE_COMPLETION_ONSTACK(compl);
1880        u64 map_length = 0;
1881        u64 sector;
1882        struct btrfs_bio *bbio = NULL;
1883        int ret;
1884
1885        BUG_ON(!mirror_num);
1886
1887        bio = bio_alloc(GFP_NOFS, 1);
1888        if (!bio)
1889                return -EIO;
1890        bio->bi_private = &compl;
1891        bio->bi_end_io = repair_io_failure_callback;
1892        bio->bi_size = 0;
1893        map_length = length;
1894
1895        ret = btrfs_map_block(map_tree, WRITE, logical,
1896                              &map_length, &bbio, mirror_num);
1897        if (ret) {
1898                bio_put(bio);
1899                return -EIO;
1900        }
1901        BUG_ON(mirror_num != bbio->mirror_num);
1902        sector = bbio->stripes[mirror_num-1].physical >> 9;
1903        bio->bi_sector = sector;
1904        dev = bbio->stripes[mirror_num-1].dev;
1905        kfree(bbio);
1906        if (!dev || !dev->bdev || !dev->writeable) {
1907                bio_put(bio);
1908                return -EIO;
1909        }
1910        bio->bi_bdev = dev->bdev;
1911        bio_add_page(bio, page, length, start-page_offset(page));
1912        btrfsic_submit_bio(WRITE_SYNC, bio);
1913        wait_for_completion(&compl);
1914
1915        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1916                /* try to remap that extent elsewhere? */
1917                bio_put(bio);
1918                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
1919                return -EIO;
1920        }
1921
1922        printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
1923                      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
1924                      start, rcu_str_deref(dev->name), sector);
1925
1926        bio_put(bio);
1927        return 0;
1928}
1929
1930int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1931                         int mirror_num)
1932{
1933        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
1934        u64 start = eb->start;
1935        unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1936        int ret = 0;
1937
1938        for (i = 0; i < num_pages; i++) {
1939                struct page *p = extent_buffer_page(eb, i);
1940                ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
1941                                        start, p, mirror_num);
1942                if (ret)
1943                        break;
1944                start += PAGE_CACHE_SIZE;
1945        }
1946
1947        return ret;
1948}
1949
1950/*
1951 * each time an IO finishes, we do a fast check in the IO failure tree
1952 * to see if we need to process or clean up an io_failure_record
1953 */
1954static int clean_io_failure(u64 start, struct page *page)
1955{
1956        u64 private;
1957        u64 private_failure;
1958        struct io_failure_record *failrec;
1959        struct btrfs_mapping_tree *map_tree;
1960        struct extent_state *state;
1961        int num_copies;
1962        int did_repair = 0;
1963        int ret;
1964        struct inode *inode = page->mapping->host;
1965
1966        private = 0;
1967        ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1968                                (u64)-1, 1, EXTENT_DIRTY, 0);
1969        if (!ret)
1970                return 0;
1971
1972        ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1973                                &private_failure);
1974        if (ret)
1975                return 0;
1976
1977        failrec = (struct io_failure_record *)(unsigned long) private_failure;
1978        BUG_ON(!failrec->this_mirror);
1979
1980        if (failrec->in_validation) {
1981                /* there was no real error, just free the record */
1982                pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1983                         failrec->start);
1984                did_repair = 1;
1985                goto out;
1986        }
1987
1988        spin_lock(&BTRFS_I(inode)->io_tree.lock);
1989        state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1990                                            failrec->start,
1991                                            EXTENT_LOCKED);
1992        spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1993
1994        if (state && state->start == failrec->start) {
1995                map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1996                num_copies = btrfs_num_copies(map_tree, failrec->logical,
1997                                                failrec->len);
1998                if (num_copies > 1)  {
1999                        ret = repair_io_failure(map_tree, start, failrec->len,
2000                                                failrec->logical, page,
2001                                                failrec->failed_mirror);
2002                        did_repair = !ret;
2003                }
2004        }
2005
2006out:
2007        if (!ret)
2008                ret = free_io_failure(inode, failrec, did_repair);
2009
2010        return ret;
2011}
2012
2013/*
2014 * this is a generic handler for readpage errors (default
2015 * readpage_io_failed_hook). if other copies exist, read those and write back
2016 * good data to the failed position. it does not try to remap the
2017 * failed extent elsewhere, hoping the device will be smart enough to do this as
2018 * needed
2019 */
2020
2021static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2022                                u64 start, u64 end, int failed_mirror,
2023                                struct extent_state *state)
2024{
2025        struct io_failure_record *failrec = NULL;
2026        u64 private;
2027        struct extent_map *em;
2028        struct inode *inode = page->mapping->host;
2029        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2030        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2031        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2032        struct bio *bio;
2033        int num_copies;
2034        int ret;
2035        int read_mode;
2036        u64 logical;
2037
2038        BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2039
2040        ret = get_state_private(failure_tree, start, &private);
2041        if (ret) {
2042                failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2043                if (!failrec)
2044                        return -ENOMEM;
2045                failrec->start = start;
2046                failrec->len = end - start + 1;
2047                failrec->this_mirror = 0;
2048                failrec->bio_flags = 0;
2049                failrec->in_validation = 0;
2050
2051                read_lock(&em_tree->lock);
2052                em = lookup_extent_mapping(em_tree, start, failrec->len);
2053                if (!em) {
2054                        read_unlock(&em_tree->lock);
2055                        kfree(failrec);
2056                        return -EIO;
2057                }
2058
2059                if (em->start > start || em->start + em->len < start) {
2060                        free_extent_map(em);
2061                        em = NULL;
2062                }
2063                read_unlock(&em_tree->lock);
2064
2065                if (!em || IS_ERR(em)) {
2066                        kfree(failrec);
2067                        return -EIO;
2068                }
2069                logical = start - em->start;
2070                logical = em->block_start + logical;
2071                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2072                        logical = em->block_start;
2073                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2074                        extent_set_compress_type(&failrec->bio_flags,
2075                                                 em->compress_type);
2076                }
2077                pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2078                         "len=%llu\n", logical, start, failrec->len);
2079                failrec->logical = logical;
2080                free_extent_map(em);
2081
2082                /* set the bits in the private failure tree */
2083                ret = set_extent_bits(failure_tree, start, end,
2084                                        EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2085                if (ret >= 0)
2086                        ret = set_state_private(failure_tree, start,
2087                                                (u64)(unsigned long)failrec);
2088                /* set the bits in the inode's tree */
2089                if (ret >= 0)
2090                        ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2091                                                GFP_NOFS);
2092                if (ret < 0) {
2093                        kfree(failrec);
2094                        return ret;
2095                }
2096        } else {
2097                failrec = (struct io_failure_record *)(unsigned long)private;
2098                pr_debug("bio_readpage_error: (found) logical=%llu, "
2099                         "start=%llu, len=%llu, validation=%d\n",
2100                         failrec->logical, failrec->start, failrec->len,
2101                         failrec->in_validation);
2102                /*
2103                 * if data can exist on disk in more than two copies, extend
2104                 * failrec here (e.g. with a list of failed mirrors) so that
2105                 * clean_io_failure() can clean all those errors at once.
2106                 */
2107        }
2108        num_copies = btrfs_num_copies(
2109                              &BTRFS_I(inode)->root->fs_info->mapping_tree,
2110                              failrec->logical, failrec->len);
2111        if (num_copies == 1) {
2112                /*
2113                 * we only have a single copy of the data, so don't bother with
2114                 * all the retry and error correction code that follows. no
2115                 * matter what the error is, it is very likely to persist.
2116                 */
2117                pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2118                         "state=%p, num_copies=%d, next_mirror %d, "
2119                         "failed_mirror %d\n", state, num_copies,
2120                         failrec->this_mirror, failed_mirror);
2121                free_io_failure(inode, failrec, 0);
2122                return -EIO;
2123        }
2124
2125        if (!state) {
2126                spin_lock(&tree->lock);
2127                state = find_first_extent_bit_state(tree, failrec->start,
2128                                                    EXTENT_LOCKED);
2129                if (state && state->start != failrec->start)
2130                        state = NULL;
2131                spin_unlock(&tree->lock);
2132        }
2133
2134        /*
2135         * there are two goals here:
2136         *      a) deliver good data to the caller
2137         *      b) correct the bad sectors on disk
2138         */
2139        if (failed_bio->bi_vcnt > 1) {
2140                /*
2141                 * to fulfill b), we need to know the exact failing sectors, as
2142                 * we don't want to rewrite any more than the failed ones. thus,
2143                 * we need separate read requests for the failed bio
2144                 *
2145                 * if the following BUG_ON triggers, our validation request got
2146                 * merged. we need separate requests for our algorithm to work.
2147                 */
2148                BUG_ON(failrec->in_validation);
2149                failrec->in_validation = 1;
2150                failrec->this_mirror = failed_mirror;
2151                read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2152        } else {
2153                /*
2154                 * we're ready to fulfill a) and b) at the same time. get a
2155                 * good copy of the failed sector and if we succeed, we have
2156                 * set up everything for repair_io_failure to do the rest for us.
2157                 */
2158                if (failrec->in_validation) {
2159                        BUG_ON(failrec->this_mirror != failed_mirror);
2160                        failrec->in_validation = 0;
2161                        failrec->this_mirror = 0;
2162                }
2163                failrec->failed_mirror = failed_mirror;
2164                failrec->this_mirror++;
2165                if (failrec->this_mirror == failed_mirror)
2166                        failrec->this_mirror++;
2167                read_mode = READ_SYNC;
2168        }
2169
2170        if (!state || failrec->this_mirror > num_copies) {
2171                pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2172                         "next_mirror %d, failed_mirror %d\n", state,
2173                         num_copies, failrec->this_mirror, failed_mirror);
2174                free_io_failure(inode, failrec, 0);
2175                return -EIO;
2176        }
2177
2178        bio = bio_alloc(GFP_NOFS, 1);
2179        if (!bio) {
2180                free_io_failure(inode, failrec, 0);
2181                return -EIO;
2182        }
2183        bio->bi_private = state;
2184        bio->bi_end_io = failed_bio->bi_end_io;
2185        bio->bi_sector = failrec->logical >> 9;
2186        bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2187        bio->bi_size = 0;
2188
2189        bio_add_page(bio, page, failrec->len, start - page_offset(page));
2190
2191        pr_debug("bio_readpage_error: submitting new read[%#x] to "
2192                 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2193                 failrec->this_mirror, num_copies, failrec->in_validation);
2194
2195        ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2196                                         failrec->this_mirror,
2197                                         failrec->bio_flags, 0);
2198        return ret;
2199}
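
/*
 * Worked example (illustrative only): with num_copies == 2 and
 * failed_mirror == 1, the single-vec path above records failed_mirror and
 * advances this_mirror from 0 to 1, then to 2 so that the retry skips the
 * mirror that just failed; the repair read is submitted against mirror 2.
 * If that read fails as well, the next call advances this_mirror to 3, which
 * exceeds num_copies, so the record is freed and -EIO is returned.
 */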
2200
2201/* lots and lots of room for performance fixes in the end_bio funcs */
2202
2203int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2204{
2205        int uptodate = (err == 0);
2206        struct extent_io_tree *tree;
2207        int ret;
2208
2209        tree = &BTRFS_I(page->mapping->host)->io_tree;
2210
2211        if (tree->ops && tree->ops->writepage_end_io_hook) {
2212                ret = tree->ops->writepage_end_io_hook(page, start,
2213                                               end, NULL, uptodate);
2214                if (ret)
2215                        uptodate = 0;
2216        }
2217
2218        if (!uptodate) {
2219                ClearPageUptodate(page);
2220                SetPageError(page);
2221        }
2222        return 0;
2223}
2224
2225/*
2226 * after a writepage IO is done, we need to:
2227 * clear the uptodate bits on error
2228 * clear the writeback bits in the extent tree for this IO
2229 * end_page_writeback if the page has no more pending IO
2230 *
2231 * Scheduling is not allowed, so the extent state tree is expected
2232 * to have one and only one object corresponding to this IO.
2233 */
2234static void end_bio_extent_writepage(struct bio *bio, int err)
2235{
2236        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2237        struct extent_io_tree *tree;
2238        u64 start;
2239        u64 end;
2240        int whole_page;
2241
2242        do {
2243                struct page *page = bvec->bv_page;
2244                tree = &BTRFS_I(page->mapping->host)->io_tree;
2245
2246                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2247                         bvec->bv_offset;
2248                end = start + bvec->bv_len - 1;
2249
2250                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2251                        whole_page = 1;
2252                else
2253                        whole_page = 0;
2254
2255                if (--bvec >= bio->bi_io_vec)
2256                        prefetchw(&bvec->bv_page->flags);
2257
2258                if (end_extent_writepage(page, err, start, end))
2259                        continue;
2260
2261                if (whole_page)
2262                        end_page_writeback(page);
2263                else
2264                        check_page_writeback(tree, page);
2265        } while (bvec >= bio->bi_io_vec);
2266
2267        bio_put(bio);
2268}
2269
2270/*
2271 * after a readpage IO is done, we need to:
2272 * clear the uptodate bits on error
2273 * set the uptodate bits if things worked
2274 * set the page up to date if all extents in the tree are uptodate
2275 * clear the lock bit in the extent tree
2276 * unlock the page if there are no other extents locked for it
2277 *
2278 * Scheduling is not allowed, so the extent state tree is expected
2279 * to have one and only one object corresponding to this IO.
2280 */
2281static void end_bio_extent_readpage(struct bio *bio, int err)
2282{
2283        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2284        struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2285        struct bio_vec *bvec = bio->bi_io_vec;
2286        struct extent_io_tree *tree;
2287        u64 start;
2288        u64 end;
2289        int whole_page;
2290        int mirror;
2291        int ret;
2292
2293        if (err)
2294                uptodate = 0;
2295
2296        do {
2297                struct page *page = bvec->bv_page;
2298                struct extent_state *cached = NULL;
2299                struct extent_state *state;
2300
2301                pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2302                         "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2303                         (long int)bio->bi_bdev);
2304                tree = &BTRFS_I(page->mapping->host)->io_tree;
2305
2306                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2307                        bvec->bv_offset;
2308                end = start + bvec->bv_len - 1;
2309
2310                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2311                        whole_page = 1;
2312                else
2313                        whole_page = 0;
2314
2315                if (++bvec <= bvec_end)
2316                        prefetchw(&bvec->bv_page->flags);
2317
2318                spin_lock(&tree->lock);
2319                state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2320                if (state && state->start == start) {
2321                        /*
2322                         * take a reference on the state, unlock will drop
2323                         * the ref
2324                         */
2325                        cache_state(state, &cached);
2326                }
2327                spin_unlock(&tree->lock);
2328
2329                mirror = (int)(unsigned long)bio->bi_bdev;
2330                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2331                        ret = tree->ops->readpage_end_io_hook(page, start, end,
2332                                                              state, mirror);
2333                        if (ret)
2334                                uptodate = 0;
2335                        else
2336                                clean_io_failure(start, page);
2337                }
2338
2339                if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
2340                        ret = tree->ops->readpage_io_failed_hook(page, mirror);
2341                        if (!ret && !err &&
2342                            test_bit(BIO_UPTODATE, &bio->bi_flags))
2343                                uptodate = 1;
2344                } else if (!uptodate) {
2345                        /*
2346                         * The generic bio_readpage_error handles errors the
2347                         * following way: If possible, new read requests are
2348                         * created and submitted and will end up in
2349                         * end_bio_extent_readpage as well (if we're lucky, not
2350                         * in the !uptodate case). In that case it returns 0 and
2351                         * we just go on with the next page in our bio. If it
2352                         * can't handle the error it will return -EIO and we
2353                         * remain responsible for that page.
2354                         */
2355                        ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
2356                        if (ret == 0) {
2357                                uptodate =
2358                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
2359                                if (err)
2360                                        uptodate = 0;
2361                                uncache_state(&cached);
2362                                continue;
2363                        }
2364                }
2365
2366                if (uptodate && tree->track_uptodate) {
2367                        set_extent_uptodate(tree, start, end, &cached,
2368                                            GFP_ATOMIC);
2369                }
2370                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2371
2372                if (whole_page) {
2373                        if (uptodate) {
2374                                SetPageUptodate(page);
2375                        } else {
2376                                ClearPageUptodate(page);
2377                                SetPageError(page);
2378                        }
2379                        unlock_page(page);
2380                } else {
2381                        if (uptodate) {
2382                                check_page_uptodate(tree, page);
2383                        } else {
2384                                ClearPageUptodate(page);
2385                                SetPageError(page);
2386                        }
2387                        check_page_locked(tree, page);
2388                }
2389        } while (bvec <= bvec_end);
2390
2391        bio_put(bio);
2392}
2393
2394struct bio *
2395btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2396                gfp_t gfp_flags)
2397{
2398        struct bio *bio;
2399
2400        bio = bio_alloc(gfp_flags, nr_vecs);
2401
2402        if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2403                while (!bio && (nr_vecs /= 2))
2404                        bio = bio_alloc(gfp_flags, nr_vecs);
2405        }
2406
2407        if (bio) {
2408                bio->bi_size = 0;
2409                bio->bi_bdev = bdev;
2410                bio->bi_sector = first_sector;
2411        }
2412        return bio;
2413}
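
/*
 * Usage sketch (illustrative only, mirroring submit_extent_page() below):
 *
 *	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 *	if (!bio)
 *		return -ENOMEM;
 *
 * Note the PF_MEMALLOC fallback above: if the first allocation fails while
 * we are in memory-reclaim context, nr_vecs is halved repeatedly until an
 * allocation succeeds or nr_vecs reaches zero.
 */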
2414
2415/*
2416 * Since writes are async, they will only return -ENOMEM.
2417 * Reads can return the full range of I/O error conditions.
2418 */
2419static int __must_check submit_one_bio(int rw, struct bio *bio,
2420                                       int mirror_num, unsigned long bio_flags)
2421{
2422        int ret = 0;
2423        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2424        struct page *page = bvec->bv_page;
2425        struct extent_io_tree *tree = bio->bi_private;
2426        u64 start;
2427
2428        start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2429
2430        bio->bi_private = NULL;
2431
2432        bio_get(bio);
2433
2434        if (tree->ops && tree->ops->submit_bio_hook)
2435                ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2436                                           mirror_num, bio_flags, start);
2437        else
2438                btrfsic_submit_bio(rw, bio);
2439
2440        if (bio_flagged(bio, BIO_EOPNOTSUPP))
2441                ret = -EOPNOTSUPP;
2442        bio_put(bio);
2443        return ret;
2444}
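
/*
 * Usage sketch (illustrative only, mirroring extent_read_full_page() below):
 * callers that batch pages into a shared bio must submit the leftover bio
 * themselves once they are done:
 *
 *	if (bio)
 *		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
 */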
2445
2446static int merge_bio(struct extent_io_tree *tree, struct page *page,
2447                     unsigned long offset, size_t size, struct bio *bio,
2448                     unsigned long bio_flags)
2449{
2450        int ret = 0;
2451        if (tree->ops && tree->ops->merge_bio_hook)
2452                ret = tree->ops->merge_bio_hook(page, offset, size, bio,
2453                                                bio_flags);
2454        BUG_ON(ret < 0);
2455        return ret;
2456
2457}
2458
2459static int submit_extent_page(int rw, struct extent_io_tree *tree,
2460                              struct page *page, sector_t sector,
2461                              size_t size, unsigned long offset,
2462                              struct block_device *bdev,
2463                              struct bio **bio_ret,
2464                              unsigned long max_pages,
2465                              bio_end_io_t end_io_func,
2466                              int mirror_num,
2467                              unsigned long prev_bio_flags,
2468                              unsigned long bio_flags)
2469{
2470        int ret = 0;
2471        struct bio *bio;
2472        int nr;
2473        int contig = 0;
2474        int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2475        int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2476        size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2477
2478        if (bio_ret && *bio_ret) {
2479                bio = *bio_ret;
2480                if (old_compressed)
2481                        contig = bio->bi_sector == sector;
2482                else
2483                        contig = bio->bi_sector + (bio->bi_size >> 9) ==
2484                                sector;
2485
2486                if (prev_bio_flags != bio_flags || !contig ||
2487                    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
2488                    bio_add_page(bio, page, page_size, offset) < page_size) {
2489                        ret = submit_one_bio(rw, bio, mirror_num,
2490                                             prev_bio_flags);
2491                        if (ret < 0)
2492                                return ret;
2493                        bio = NULL;
2494                } else {
2495                        return 0;
2496                }
2497        }
2498        if (this_compressed)
2499                nr = BIO_MAX_PAGES;
2500        else
2501                nr = bio_get_nr_vecs(bdev);
2502
2503        bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2504        if (!bio)
2505                return -ENOMEM;
2506
2507        bio_add_page(bio, page, page_size, offset);
2508        bio->bi_end_io = end_io_func;
2509        bio->bi_private = tree;
2510
2511        if (bio_ret)
2512                *bio_ret = bio;
2513        else
2514                ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2515
2516        return ret;
2517}
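
/*
 * Usage sketch (illustrative only, mirroring __extent_read_full_page()
 * below): passing a bio_ret pointer lets consecutive calls accumulate
 * contiguous pages into a single bio; the caller is then responsible for
 * submitting whatever bio is left over:
 *
 *	ret = submit_extent_page(READ, tree, page, sector, disk_io_size,
 *				 pg_offset, bdev, bio, pnr,
 *				 end_bio_extent_readpage, mirror_num,
 *				 *bio_flags, this_bio_flag);
 *
 * When bio_ret is NULL the bio is submitted immediately instead.
 */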
2518
2519void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
2520{
2521        if (!PagePrivate(page)) {
2522                SetPagePrivate(page);
2523                page_cache_get(page);
2524                set_page_private(page, (unsigned long)eb);
2525        } else {
2526                WARN_ON(page->private != (unsigned long)eb);
2527        }
2528}
2529
2530void set_page_extent_mapped(struct page *page)
2531{
2532        if (!PagePrivate(page)) {
2533                SetPagePrivate(page);
2534                page_cache_get(page);
2535                set_page_private(page, EXTENT_PAGE_PRIVATE);
2536        }
2537}
2538
2539/*
2540 * basic readpage implementation.  Locked extent state structs are inserted
2541 * into the tree and are removed when the IO is done (by the end_io
2542 * handlers)
2543 * XXX JDM: This needs looking at to ensure proper page locking
2544 */
2545static int __extent_read_full_page(struct extent_io_tree *tree,
2546                                   struct page *page,
2547                                   get_extent_t *get_extent,
2548                                   struct bio **bio, int mirror_num,
2549                                   unsigned long *bio_flags)
2550{
2551        struct inode *inode = page->mapping->host;
2552        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2553        u64 page_end = start + PAGE_CACHE_SIZE - 1;
2554        u64 end;
2555        u64 cur = start;
2556        u64 extent_offset;
2557        u64 last_byte = i_size_read(inode);
2558        u64 block_start;
2559        u64 cur_end;
2560        sector_t sector;
2561        struct extent_map *em;
2562        struct block_device *bdev;
2563        struct btrfs_ordered_extent *ordered;
2564        int ret;
2565        int nr = 0;
2566        size_t pg_offset = 0;
2567        size_t iosize;
2568        size_t disk_io_size;
2569        size_t blocksize = inode->i_sb->s_blocksize;
2570        unsigned long this_bio_flag = 0;
2571
2572        set_page_extent_mapped(page);
2573
2574        if (!PageUptodate(page)) {
2575                if (cleancache_get_page(page) == 0) {
2576                        BUG_ON(blocksize != PAGE_SIZE);
2577                        goto out;
2578                }
2579        }
2580
2581        end = page_end;
2582        while (1) {
2583                lock_extent(tree, start, end);
2584                ordered = btrfs_lookup_ordered_extent(inode, start);
2585                if (!ordered)
2586                        break;
2587                unlock_extent(tree, start, end);
2588                btrfs_start_ordered_extent(inode, ordered, 1);
2589                btrfs_put_ordered_extent(ordered);
2590        }
2591
2592        if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2593                char *userpage;
2594                size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2595
2596                if (zero_offset) {
2597                        iosize = PAGE_CACHE_SIZE - zero_offset;
2598                        userpage = kmap_atomic(page);
2599                        memset(userpage + zero_offset, 0, iosize);
2600                        flush_dcache_page(page);
2601                        kunmap_atomic(userpage);
2602                }
2603        }
2604        while (cur <= end) {
2605                if (cur >= last_byte) {
2606                        char *userpage;
2607                        struct extent_state *cached = NULL;
2608
2609                        iosize = PAGE_CACHE_SIZE - pg_offset;
2610                        userpage = kmap_atomic(page);
2611                        memset(userpage + pg_offset, 0, iosize);
2612                        flush_dcache_page(page);
2613                        kunmap_atomic(userpage);
2614                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2615                                            &cached, GFP_NOFS);
2616                        unlock_extent_cached(tree, cur, cur + iosize - 1,
2617                                             &cached, GFP_NOFS);
2618                        break;
2619                }
2620                em = get_extent(inode, page, pg_offset, cur,
2621                                end - cur + 1, 0);
2622                if (IS_ERR_OR_NULL(em)) {
2623                        SetPageError(page);
2624                        unlock_extent(tree, cur, end);
2625                        break;
2626                }
2627                extent_offset = cur - em->start;
2628                BUG_ON(extent_map_end(em) <= cur);
2629                BUG_ON(end < cur);
2630
2631                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2632                        this_bio_flag = EXTENT_BIO_COMPRESSED;
2633                        extent_set_compress_type(&this_bio_flag,
2634                                                 em->compress_type);
2635                }
2636
2637                iosize = min(extent_map_end(em) - cur, end - cur + 1);
2638                cur_end = min(extent_map_end(em) - 1, end);
2639                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2640                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2641                        disk_io_size = em->block_len;
2642                        sector = em->block_start >> 9;
2643                } else {
2644                        sector = (em->block_start + extent_offset) >> 9;
2645                        disk_io_size = iosize;
2646                }
2647                bdev = em->bdev;
2648                block_start = em->block_start;
2649                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2650                        block_start = EXTENT_MAP_HOLE;
2651                free_extent_map(em);
2652                em = NULL;
2653
2654                /* we've found a hole, just zero and go on */
2655                if (block_start == EXTENT_MAP_HOLE) {
2656                        char *userpage;
2657                        struct extent_state *cached = NULL;
2658
2659                        userpage = kmap_atomic(page);
2660                        memset(userpage + pg_offset, 0, iosize);
2661                        flush_dcache_page(page);
2662                        kunmap_atomic(userpage);
2663
2664                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2665                                            &cached, GFP_NOFS);
2666                        unlock_extent_cached(tree, cur, cur + iosize - 1,
2667                                             &cached, GFP_NOFS);
2668                        cur = cur + iosize;
2669                        pg_offset += iosize;
2670                        continue;
2671                }
2672                /* the get_extent function already copied the data into the page */
2673                if (test_range_bit(tree, cur, cur_end,
2674                                   EXTENT_UPTODATE, 1, NULL)) {
2675                        check_page_uptodate(tree, page);
2676                        unlock_extent(tree, cur, cur + iosize - 1);
2677                        cur = cur + iosize;
2678                        pg_offset += iosize;
2679                        continue;
2680                }
2681                /* we have an inline extent but it didn't get marked up
2682                 * to date.  Error out
2683                 */
2684                if (block_start == EXTENT_MAP_INLINE) {
2685                        SetPageError(page);
2686                        unlock_extent(tree, cur, cur + iosize - 1);
2687                        cur = cur + iosize;
2688                        pg_offset += iosize;
2689                        continue;
2690                }
2691
2692                ret = 0;
2693                if (tree->ops && tree->ops->readpage_io_hook) {
2694                        ret = tree->ops->readpage_io_hook(page, cur,
2695                                                          cur + iosize - 1);
2696                }
2697                if (!ret) {
2698                        unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2699                        pnr -= page->index;
2700                        ret = submit_extent_page(READ, tree, page,
2701                                         sector, disk_io_size, pg_offset,
2702                                         bdev, bio, pnr,
2703                                         end_bio_extent_readpage, mirror_num,
2704                                         *bio_flags,
2705                                         this_bio_flag);
2706                        BUG_ON(ret == -ENOMEM);
2707                        nr++;
2708                        *bio_flags = this_bio_flag;
2709                }
2710                if (ret)
2711                        SetPageError(page);
2712                cur = cur + iosize;
2713                pg_offset += iosize;
2714        }
2715out:
2716        if (!nr) {
2717                if (!PageError(page))
2718                        SetPageUptodate(page);
2719                unlock_page(page);
2720        }
2721        return 0;
2722}
2723
2724int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2725                            get_extent_t *get_extent, int mirror_num)
2726{
2727        struct bio *bio = NULL;
2728        unsigned long bio_flags = 0;
2729        int ret;
2730
2731        ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2732                                      &bio_flags);
2733        if (bio)
2734                ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2735        return ret;
2736}
2737
2738static noinline void update_nr_written(struct page *page,
2739                                      struct writeback_control *wbc,
2740                                      unsigned long nr_written)
2741{
2742        wbc->nr_to_write -= nr_written;
2743        if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2744            wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2745                page->mapping->writeback_index = page->index + nr_written;
2746}
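
/*
 * Worked example (illustrative only): if a range_cyclic writeback pass calls
 * this for the page at index 100 after 4 pages were written, nr_to_write
 * drops by 4 and writeback_index becomes 104, so the next cyclic pass resumes
 * after the pages this call already accounted for.
 */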
2747
2748/*
2749 * the writepage semantics are similar to regular writepage.  extent
2750 * records are inserted to lock ranges in the tree, and as dirty areas
2751 * are found, they are marked writeback.  Then the lock bits are removed
2752 * and the end_io handler clears the writeback ranges
2753 */
2754static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2755                              void *data)
2756{
2757        struct inode *inode = page->mapping->host;
2758        struct extent_page_data *epd = data;
2759        struct extent_io_tree *tree = epd->tree;
2760        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2761        u64 delalloc_start;
2762        u64 page_end = start + PAGE_CACHE_SIZE - 1;
2763        u64 end;
2764        u64 cur = start;
2765        u64 extent_offset;
2766        u64 last_byte = i_size_read(inode);
2767        u64 block_start;
2768        u64 iosize;
2769        sector_t sector;
2770        struct extent_state *cached_state = NULL;
2771        struct extent_map *em;
2772        struct block_device *bdev;
2773        int ret;
2774        int nr = 0;
2775        size_t pg_offset = 0;
2776        size_t blocksize;
2777        loff_t i_size = i_size_read(inode);
2778        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2779        u64 nr_delalloc;
2780        u64 delalloc_end;
2781        int page_started;
2782        int compressed;
2783        int write_flags;
2784        unsigned long nr_written = 0;
2785        bool fill_delalloc = true;
2786
2787        if (wbc->sync_mode == WB_SYNC_ALL)
2788                write_flags = WRITE_SYNC;
2789        else
2790                write_flags = WRITE;
2791
2792        trace___extent_writepage(page, inode, wbc);
2793
2794        WARN_ON(!PageLocked(page));
2795
2796        ClearPageError(page);
2797
2798        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2799        if (page->index > end_index ||
2800           (page->index == end_index && !pg_offset)) {
2801                page->mapping->a_ops->invalidatepage(page, 0);
2802                unlock_page(page);
2803                return 0;
2804        }
2805
2806        if (page->index == end_index) {
2807                char *userpage;
2808
2809                userpage = kmap_atomic(page);
2810                memset(userpage + pg_offset, 0,
2811                       PAGE_CACHE_SIZE - pg_offset);
2812                kunmap_atomic(userpage);
2813                flush_dcache_page(page);
2814        }
2815        pg_offset = 0;
2816
2817        set_page_extent_mapped(page);
2818
2819        if (!tree->ops || !tree->ops->fill_delalloc)
2820                fill_delalloc = false;
2821
2822        delalloc_start = start;
2823        delalloc_end = 0;
2824        page_started = 0;
2825        if (!epd->extent_locked && fill_delalloc) {
2826                u64 delalloc_to_write = 0;
2827                /*
2828                 * make sure the wbc mapping index is at least updated
2829                 * to this page.
2830                 */
2831                update_nr_written(page, wbc, 0);
2832
2833                while (delalloc_end < page_end) {
2834                        nr_delalloc = find_lock_delalloc_range(inode, tree,
2835                                                       page,
2836                                                       &delalloc_start,
2837                                                       &delalloc_end,
2838                                                       128 * 1024 * 1024);
2839                        if (nr_delalloc == 0) {
2840                                delalloc_start = delalloc_end + 1;
2841                                continue;
2842                        }
2843                        ret = tree->ops->fill_delalloc(inode, page,
2844                                                       delalloc_start,
2845                                                       delalloc_end,
2846                                                       &page_started,
2847                                                       &nr_written);
2848                        /* File system has been set read-only */
2849                        if (ret) {
2850                                SetPageError(page);
2851                                goto done;
2852                        }
2853                        /*
2854                         * delalloc_end is already one less than the total
2855                         * length, so we don't subtract one from
2856                         * PAGE_CACHE_SIZE
2857                         */
2858                        delalloc_to_write += (delalloc_end - delalloc_start +
2859                                              PAGE_CACHE_SIZE) >>
2860                                              PAGE_CACHE_SHIFT;
2861                        delalloc_start = delalloc_end + 1;
2862                }
2863                if (wbc->nr_to_write < delalloc_to_write) {
2864                        int thresh = 8192;
2865
2866                        if (delalloc_to_write < thresh * 2)
2867                                thresh = delalloc_to_write;
2868                        wbc->nr_to_write = min_t(u64, delalloc_to_write,
2869                                                 thresh);
2870                }
2871
2872                /* did the fill delalloc function already unlock and start
2873                 * the IO?
2874                 */
2875                if (page_started) {
2876                        ret = 0;
2877                        /*
2878                         * we've unlocked the page, so we can't update
2879                         * the mapping's writeback index, just update
2880                         * nr_to_write.
2881                         */
2882                        wbc->nr_to_write -= nr_written;
2883                        goto done_unlocked;
2884                }
2885        }
2886        if (tree->ops && tree->ops->writepage_start_hook) {
2887                ret = tree->ops->writepage_start_hook(page, start,
2888                                                      page_end);
2889                if (ret) {
2890                        /* Fixup worker will requeue */
2891                        if (ret == -EBUSY)
2892                                wbc->pages_skipped++;
2893                        else
2894                                redirty_page_for_writepage(wbc, page);
2895                        update_nr_written(page, wbc, nr_written);
2896                        unlock_page(page);
2897                        ret = 0;
2898                        goto done_unlocked;
2899                }
2900        }
2901
2902        /*
2903         * we don't want to touch the inode after unlocking the page,
2904         * so we update the mapping writeback index now
2905         */
2906        update_nr_written(page, wbc, nr_written + 1);
2907
2908        end = page_end;
2909        if (last_byte <= start) {
2910                if (tree->ops && tree->ops->writepage_end_io_hook)
2911                        tree->ops->writepage_end_io_hook(page, start,
2912                                                         page_end, NULL, 1);
2913                goto done;
2914        }
2915
2916        blocksize = inode->i_sb->s_blocksize;
2917
2918        while (cur <= end) {
2919                if (cur >= last_byte) {
2920                        if (tree->ops && tree->ops->writepage_end_io_hook)
2921                                tree->ops->writepage_end_io_hook(page, cur,
2922                                                         page_end, NULL, 1);
2923                        break;
2924                }
2925                em = epd->get_extent(inode, page, pg_offset, cur,
2926                                     end - cur + 1, 1);
2927                if (IS_ERR_OR_NULL(em)) {
2928                        SetPageError(page);
2929                        break;
2930                }
2931
2932                extent_offset = cur - em->start;
2933                BUG_ON(extent_map_end(em) <= cur);
2934                BUG_ON(end < cur);
2935                iosize = min(extent_map_end(em) - cur, end - cur + 1);
2936                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2937                sector = (em->block_start + extent_offset) >> 9;
2938                bdev = em->bdev;
2939                block_start = em->block_start;
2940                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2941                free_extent_map(em);
2942                em = NULL;
2943
2944                /*
2945                 * compressed and inline extents are written through other
2946                 * paths in the FS
2947                 */
2948                if (compressed || block_start == EXTENT_MAP_HOLE ||
2949                    block_start == EXTENT_MAP_INLINE) {
2950                        /*
2951                         * end_io notification does not happen here for
2952                         * compressed extents
2953                         */
2954                        if (!compressed && tree->ops &&
2955                            tree->ops->writepage_end_io_hook)
2956                                tree->ops->writepage_end_io_hook(page, cur,
2957                                                         cur + iosize - 1,
2958                                                         NULL, 1);
2959                        else if (compressed) {
2960                                /* we don't want to end_page_writeback on
2961                                 * a compressed extent.  this happens
2962                                 * elsewhere
2963                                 */
2964                                nr++;
2965                        }
2966
2967                        cur += iosize;
2968                        pg_offset += iosize;
2969                        continue;
2970                }
2971                /* leave this out until we have a page_mkwrite call */
2972                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2973                                   EXTENT_DIRTY, 0, NULL)) {
2974                        cur = cur + iosize;
2975                        pg_offset += iosize;
2976                        continue;
2977                }
2978
2979                if (tree->ops && tree->ops->writepage_io_hook) {
2980                        ret = tree->ops->writepage_io_hook(page, cur,
2981                                                cur + iosize - 1);
2982                } else {
2983                        ret = 0;
2984                }
2985                if (ret) {
2986                        SetPageError(page);
2987                } else {
2988                        unsigned long max_nr = end_index + 1;
2989
2990                        set_range_writeback(tree, cur, cur + iosize - 1);
2991                        if (!PageWriteback(page)) {
2992                                printk(KERN_ERR "btrfs warning page %lu not "
2993                                       "writeback, cur %llu end %llu\n",
2994                                       page->index, (unsigned long long)cur,
2995                                       (unsigned long long)end);
2996                        }
2997
2998                        ret = submit_extent_page(write_flags, tree, page,
2999                                                 sector, iosize, pg_offset,
3000                                                 bdev, &epd->bio, max_nr,
3001                                                 end_bio_extent_writepage,
3002                                                 0, 0, 0);
3003                        if (ret)
3004                                SetPageError(page);
3005                }
3006                cur = cur + iosize;
3007                pg_offset += iosize;
3008                nr++;
3009        }
3010done:
3011        if (nr == 0) {
3012                /* make sure the mapping tag for page dirty gets cleared */
3013                set_page_writeback(page);
3014                end_page_writeback(page);
3015        }
3016        unlock_page(page);
3017
3018done_unlocked:
3019
3020        /* drop our reference on any cached states */
3021        free_extent_state(cached_state);
3022        return 0;
3023}
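
/*
 * A minimal sketch (illustration only, not part of the writepage path above)
 * of the page accounting done in the delalloc loop: the range is inclusive,
 * so its length is delalloc_end - delalloc_start + 1, and adding a full
 * PAGE_CACHE_SIZE before shifting rounds the page count up.
 */
static inline u64 delalloc_pages_sketch(u64 delalloc_start, u64 delalloc_end)
{
	/* same as DIV_ROUND_UP(delalloc_end - delalloc_start + 1, PAGE_CACHE_SIZE) */
	return (delalloc_end - delalloc_start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;
}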
3024
3025static int eb_wait(void *word)
3026{
3027        io_schedule();
3028        return 0;
3029}
3030
3031static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3032{
3033        wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
3034                    TASK_UNINTERRUPTIBLE);
3035}
3036
3037static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3038                                     struct btrfs_fs_info *fs_info,
3039                                     struct extent_page_data *epd)
3040{
3041        unsigned long i, num_pages;
3042        int flush = 0;
3043        int ret = 0;
3044
3045        if (!btrfs_try_tree_write_lock(eb)) {
3046                flush = 1;
3047                flush_write_bio(epd);
3048                btrfs_tree_lock(eb);
3049        }
3050
3051        if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3052                btrfs_tree_unlock(eb);
3053                if (!epd->sync_io)
3054                        return 0;
3055                if (!flush) {
3056                        flush_write_bio(epd);
3057                        flush = 1;
3058                }
3059                while (1) {
3060                        wait_on_extent_buffer_writeback(eb);
3061                        btrfs_tree_lock(eb);
3062                        if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3063                                break;
3064                        btrfs_tree_unlock(eb);
3065                }
3066        }
3067
3068        /*
3069         * We need to do this to prevent races with anyone checking whether the
3070         * eb is under IO, since we can end up with no IO bits set for a short
3071         * period of time.
3072         */
3073        spin_lock(&eb->refs_lock);
3074        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3075                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3076                spin_unlock(&eb->refs_lock);
3077                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3078                spin_lock(&fs_info->delalloc_lock);
3079                if (fs_info->dirty_metadata_bytes >= eb->len)
3080                        fs_info->dirty_metadata_bytes -= eb->len;
3081                else
3082                        WARN_ON(1);
3083                spin_unlock(&fs_info->delalloc_lock);
3084                ret = 1;
3085        } else {
3086                spin_unlock(&eb->refs_lock);
3087        }
3088
3089        btrfs_tree_unlock(eb);
3090
3091        if (!ret)
3092                return ret;
3093
3094        num_pages = num_extent_pages(eb->start, eb->len);
3095        for (i = 0; i < num_pages; i++) {
3096                struct page *p = extent_buffer_page(eb, i);
3097
3098                if (!trylock_page(p)) {
3099                        if (!flush) {
3100                                flush_write_bio(epd);
3101                                flush = 1;
3102                        }
3103                        lock_page(p);
3104                }
3105        }
3106
3107        return ret;
3108}
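
/*
 * The "try, flush, then block" idiom at the top of the function above is the
 * pattern used throughout this writeback path: before sleeping on a lock,
 * any bio we have already built is flushed so its pages are not left pending
 * while we block.  Condensed (illustrative only):
 */
#if 0
	if (!btrfs_try_tree_write_lock(eb)) {
		flush_write_bio(epd);	/* submit what we have queued so far */
		btrfs_tree_lock(eb);	/* now it is safe to block */
	}
#endif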
3109
3110static void end_extent_buffer_writeback(struct extent_buffer *eb)
3111{
3112        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3113        smp_mb__after_clear_bit();
3114        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3115}
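
/*
 * wait_on_extent_buffer_writeback() and end_extent_buffer_writeback() form a
 * standard wait_on_bit()/wake_up_bit() pair.  The waker must clear the bit,
 * issue the barrier and only then wake, otherwise a sleeper could check the
 * bit before the wakeup becomes visible.  Condensed (illustrative only):
 */
#if 0
	/* waiter */
	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
		    TASK_UNINTERRUPTIBLE);

	/* waker */
	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
	smp_mb__after_clear_bit();
	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
#endif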
3116
3117static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3118{
3119        int uptodate = err == 0;
3120        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3121        struct extent_buffer *eb;
3122        int done;
3123
3124        do {
3125                struct page *page = bvec->bv_page;
3126
3127                bvec--;
3128                eb = (struct extent_buffer *)page->private;
3129                BUG_ON(!eb);
3130                done = atomic_dec_and_test(&eb->io_pages);
3131
3132                if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3133                        set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3134                        ClearPageUptodate(page);
3135                        SetPageError(page);
3136                }
3137
3138                end_page_writeback(page);
3139
3140                if (!done)
3141                        continue;
3142
3143                end_extent_buffer_writeback(eb);
3144        } while (bvec >= bio->bi_io_vec);
3145
3146        bio_put(bio);
3147
3148}
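
/*
 * Condensed sketch of the io_pages handshake between write_one_eb() below
 * and the completion handler above: the submitter charges one count per
 * page, each per-page completion drops one, and only the final drop ends
 * writeback on the whole buffer (illustrative only):
 */
#if 0
	/* submit side, see write_one_eb() */
	atomic_set(&eb->io_pages, num_pages);

	/* completion side, once per page */
	if (atomic_dec_and_test(&eb->io_pages))
		end_extent_buffer_writeback(eb);
#endif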
3149
3150static int write_one_eb(struct extent_buffer *eb,
3151                        struct btrfs_fs_info *fs_info,
3152                        struct writeback_control *wbc,
3153                        struct extent_page_data *epd)
3154{
3155        struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3156        u64 offset = eb->start;
3157        unsigned long i, num_pages;
3158        int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3159        int ret = 0;
3160
3161        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3162        num_pages = num_extent_pages(eb->start, eb->len);
3163        atomic_set(&eb->io_pages, num_pages);
3164        for (i = 0; i < num_pages; i++) {
3165                struct page *p = extent_buffer_page(eb, i);
3166
3167                clear_page_dirty_for_io(p);
3168                set_page_writeback(p);
3169                ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3170                                         PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3171                                         -1, end_bio_extent_buffer_writepage,
3172                                         0, 0, 0);
3173                if (ret) {
3174                        set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3175                        SetPageError(p);
3176                        if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3177                                end_extent_buffer_writeback(eb);
3178                        ret = -EIO;
3179                        break;
3180                }
3181                offset += PAGE_CACHE_SIZE;
3182                update_nr_written(p, wbc, 1);
3183                unlock_page(p);
3184        }
3185
3186        if (unlikely(ret)) {
3187                for (; i < num_pages; i++) {
3188                        struct page *p = extent_buffer_page(eb, i);
3189                        unlock_page(p);
3190                }
3191        }
3192
3193        return ret;
3194}
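
/*
 * A small sketch of the byte-to-sector conversion used when submitting the
 * buffer's pages above: submit_extent_page() takes 512-byte sector numbers,
 * so the byte offset of each page is shifted down by 9 (illustration only).
 */
static inline sector_t eb_byte_to_sector_sketch(u64 byte_offset)
{
	return (sector_t)(byte_offset >> 9);	/* 1 sector == 512 bytes */
}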
3195
3196int btree_write_cache_pages(struct address_space *mapping,
3197                                   struct writeback_control *wbc)
3198{
3199        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3200        struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3201        struct extent_buffer *eb, *prev_eb = NULL;
3202        struct extent_page_data epd = {
3203                .bio = NULL,
3204                .tree = tree,
3205                .extent_locked = 0,
3206                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3207        };
3208        int ret = 0;
3209        int done = 0;
3210        int nr_to_write_done = 0;
3211        struct pagevec pvec;
3212        int nr_pages;
3213        pgoff_t index;
3214        pgoff_t end;            /* Inclusive */
3215        int scanned = 0;
3216        int tag;
3217
3218        pagevec_init(&pvec, 0);
3219        if (wbc->range_cyclic) {
3220                index = mapping->writeback_index; /* Start from prev offset */
3221                end = -1;
3222        } else {
3223                index = wbc->range_start >> PAGE_CACHE_SHIFT;
3224                end = wbc->range_end >> PAGE_CACHE_SHIFT;
3225                scanned = 1;
3226        }
3227        if (wbc->sync_mode == WB_SYNC_ALL)
3228                tag = PAGECACHE_TAG_TOWRITE;
3229        else
3230                tag = PAGECACHE_TAG_DIRTY;
3231retry:
3232        if (wbc->sync_mode == WB_SYNC_ALL)
3233                tag_pages_for_writeback(mapping, index, end);
3234        while (!done && !nr_to_write_done && (index <= end) &&
3235               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3236                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3237                unsigned i;
3238
3239                scanned = 1;
3240                for (i = 0; i < nr_pages; i++) {
3241                        struct page *page = pvec.pages[i];
3242
3243                        if (!PagePrivate(page))
3244                                continue;
3245
3246                        if (!wbc->range_cyclic && page->index > end) {
3247                                done = 1;
3248                                break;
3249                        }
3250
3251                        eb = (struct extent_buffer *)page->private;
3252                        if (!eb) {
3253                                WARN_ON(1);
3254                                continue;
3255                        }
3256
3257                        if (eb == prev_eb)
3258                                continue;
3259
3260                        if (!atomic_inc_not_zero(&eb->refs)) {
3261                                WARN_ON(1);
3262                                continue;
3263                        }
3264
3265                        prev_eb = eb;
3266                        ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3267                        if (!ret) {
3268                                free_extent_buffer(eb);
3269                                continue;
3270                        }
3271
3272                        ret = write_one_eb(eb, fs_info, wbc, &epd);
3273                        if (ret) {
3274                                done = 1;
3275                                free_extent_buffer(eb);
3276                                break;
3277                        }
3278                        free_extent_buffer(eb);
3279
3280                        /*
3281                         * the filesystem may choose to bump up nr_to_write.
3282                         * We have to make sure to honor the new nr_to_write
3283                         * at any time
3284                         */
3285                        nr_to_write_done = wbc->nr_to_write <= 0;
3286                }
3287                pagevec_release(&pvec);
3288                cond_resched();
3289        }
3290        if (!scanned && !done) {
3291                /*
3292                 * We hit the last page and there is more work to be done: wrap
3293                 * back to the start of the file
3294                 */
3295                scanned = 1;
3296                index = 0;
3297                goto retry;
3298        }
3299        flush_write_bio(&epd);
3300        return ret;
3301}
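
/*
 * Hypothetical caller sketch (not taken from this file): a data-integrity
 * writeback of the whole btree mapping expressed through the interface
 * above.  Real callers get their writeback_control from the generic
 * writeback paths rather than building one by hand.
 */
static int sync_btree_mapping_sketch(struct address_space *mapping)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.nr_to_write	= LONG_MAX,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};

	return btree_write_cache_pages(mapping, &wbc);
}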
3302
3303/**
3304 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3305 * @mapping: address space structure to write
3306 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3307 * @writepage: function called for each page
3308 * @data: data passed to writepage function
3309 *
3310 * If a page is already under I/O, write_cache_pages() skips it, even
3311 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3312 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3313 * and msync() need to guarantee that all the data which was dirty at the time
3314 * the call was made get new I/O started against them.  If wbc->sync_mode is
3315 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3316 * existing IO to complete.
3317 */
3318static int extent_write_cache_pages(struct extent_io_tree *tree,
3319                             struct address_space *mapping,
3320                             struct writeback_control *wbc,
3321                             writepage_t writepage, void *data,
3322                             void (*flush_fn)(void *))
3323{
3324        struct inode *inode = mapping->host;
3325        int ret = 0;
3326        int done = 0;
3327        int nr_to_write_done = 0;
3328        struct pagevec pvec;
3329        int nr_pages;
3330        pgoff_t index;
3331        pgoff_t end;            /* Inclusive */
3332        int scanned = 0;
3333        int tag;
3334
3335        /*
3336         * We have to hold onto the inode so that ordered extents can do their
3337         * work when the IO finishes.  The alternative to this is failing to add
3338         * an ordered extent if the igrab() fails there, and that is a huge pain
3339         * to deal with, so instead just hold onto the inode throughout the
3340         * writepages operation.  If it fails here we are freeing up the inode
3341         * anyway and we'd rather not waste our time writing out stuff that is
3342         * going to be truncated anyway.
3343         */
3344        if (!igrab(inode))
3345                return 0;
3346
3347        pagevec_init(&pvec, 0);
3348        if (wbc->range_cyclic) {
3349                index = mapping->writeback_index; /* Start from prev offset */
3350                end = -1;
3351        } else {
3352                index = wbc->range_start >> PAGE_CACHE_SHIFT;
3353                end = wbc->range_end >> PAGE_CACHE_SHIFT;
3354                scanned = 1;
3355        }
3356        if (wbc->sync_mode == WB_SYNC_ALL)
3357                tag = PAGECACHE_TAG_TOWRITE;
3358        else
3359                tag = PAGECACHE_TAG_DIRTY;
3360retry:
3361        if (wbc->sync_mode == WB_SYNC_ALL)
3362                tag_pages_for_writeback(mapping, index, end);
3363        while (!done && !nr_to_write_done && (index <= end) &&
3364               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3365                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3366                unsigned i;
3367
3368                scanned = 1;
3369                for (i = 0; i < nr_pages; i++) {
3370                        struct page *page = pvec.pages[i];
3371
3372                        /*
3373                         * At this point we hold neither mapping->tree_lock nor
3374                         * lock on the page itself: the page may be truncated or
3375                         * invalidated (changing page->mapping to NULL), or even
3376                         * swizzled back from swapper_space to tmpfs file
3377                         * mapping
3378                         */
3379                        if (tree->ops &&
3380                            tree->ops->write_cache_pages_lock_hook) {
3381                                tree->ops->write_cache_pages_lock_hook(page,
3382                                                               data, flush_fn);
3383                        } else {
3384                                if (!trylock_page(page)) {
3385                                        flush_fn(data);
3386                                        lock_page(page);
3387                                }
3388                        }
3389
3390                        if (unlikely(page->mapping != mapping)) {
3391                                unlock_page(page);
3392                                continue;
3393                        }
3394
3395                        if (!wbc->range_cyclic && page->index > end) {
3396                                done = 1;
3397                                unlock_page(page);
3398                                continue;
3399                        }
3400
3401                        if (wbc->sync_mode != WB_SYNC_NONE) {
3402                                if (PageWriteback(page))
3403                                        flush_fn(data);
3404                                wait_on_page_writeback(page);
3405                        }
3406
3407                        if (PageWriteback(page) ||
3408                            !clear_page_dirty_for_io(page)) {
3409                                unlock_page(page);
3410                                continue;
3411                        }
3412
3413                        ret = (*writepage)(page, wbc, data);
3414
3415                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3416                                unlock_page(page);
3417                                ret = 0;
3418                        }
3419                        if (ret)
3420                                done = 1;
3421
3422                        /*
3423                         * the filesystem may choose to bump up nr_to_write.
3424                         * We have to make sure to honor the new nr_to_write
3425                         * at any time
3426                         */
3427                        nr_to_write_done = wbc->nr_to_write <= 0;
3428                }
3429                pagevec_release(&pvec);
3430                cond_resched();
3431        }
3432        if (!scanned && !done) {
3433                /*
3434                 * We hit the last page and there is more work to be done: wrap
3435                 * back to the start of the file
3436                 */
3437                scanned = 1;
3438                index = 0;
3439                goto retry;
3440        }
3441        btrfs_add_delayed_iput(inode);
3442        return ret;
3443}
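
/*
 * Sketch of the batch-size clamp used by both writeback loops above: ask
 * pagevec_lookup_tag() for at most PAGEVEC_SIZE pages, but never for more
 * indices than remain in the inclusive [index, end] window (illustration
 * only).
 */
static inline pgoff_t lookup_batch_sketch(pgoff_t index, pgoff_t end)
{
	return min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1;
}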
3444
3445static void flush_epd_write_bio(struct extent_page_data *epd)
3446{
3447        if (epd->bio) {
3448                int rw = WRITE;
3449                int ret;
3450
3451                if (epd->sync_io)
3452                        rw = WRITE_SYNC;
3453
3454                ret = submit_one_bio(rw, epd->bio, 0, 0);
3455                BUG_ON(ret < 0); /* -ENOMEM */
3456                epd->bio = NULL;
3457        }
3458}
3459
3460static noinline void flush_write_bio(void *data)
3461{
3462        struct extent_page_data *epd = data;
3463        flush_epd_write_bio(epd);
3464}
3465
3466int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3467                          get_extent_t *get_extent,
3468                          struct writeback_control *wbc)
3469{
3470        int ret;
3471        struct extent_page_data epd = {
3472                .bio = NULL,
3473                .tree = tree,
3474                .get_extent = get_extent,
3475                .extent_locked = 0,
3476                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3477        };
3478
3479        ret = __extent_writepage(page, wbc, &epd);
3480
3481        flush_epd_write_bio(&epd);
3482        return ret;
3483}
3484
3485int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3486                              u64 start, u64 end, get_extent_t *get_extent,
3487                              int mode)
3488{
3489        int ret = 0;
3490        struct address_space *mapping = inode->i_mapping;
3491        struct page *page;
3492        unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3493                PAGE_CACHE_SHIFT;
3494
3495        struct extent_page_data epd = {
3496                .bio = NULL,
3497                .tree = tree,
3498                .get_extent = get_extent,
3499                .extent_locked = 1,
3500                .sync_io = mode == WB_SYNC_ALL,
3501        };
3502        struct writeback_control wbc_writepages = {
3503                .sync_mode      = mode,
3504                .nr_to_write    = nr_pages * 2,
3505                .range_start    = start,
3506                .range_end      = end + 1,
3507        };
3508
3509        while (start <= end) {
3510                page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3511                if (clear_page_dirty_for_io(page))
3512                        ret = __extent_writepage(page, &wbc_writepages, &epd);
3513                else {
3514                        if (tree->ops && tree->ops->writepage_end_io_hook)
3515                                tree->ops->writepage_end_io_hook(page, start,
3516                                                 start + PAGE_CACHE_SIZE - 1,
3517                                                 NULL, 1);
3518                        unlock_page(page);
3519                }
3520                page_cache_release(page);
3521                start += PAGE_CACHE_SIZE;
3522        }
3523
3524        flush_epd_write_bio(&epd);
3525        return ret;
3526}
3527
3528int extent_writepages(struct extent_io_tree *tree,
3529                      struct address_space *mapping,
3530                      get_extent_t *get_extent,
3531                      struct writeback_control *wbc)
3532{
3533        int ret = 0;
3534        struct extent_page_data epd = {
3535                .bio = NULL,
3536                .tree = tree,
3537                .get_extent = get_extent,
3538                .extent_locked = 0,
3539                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3540        };
3541
3542        ret = extent_write_cache_pages(tree, mapping, wbc,
3543                                       __extent_writepage, &epd,
3544                                       flush_write_bio);
3545        flush_epd_write_bio(&epd);
3546        return ret;
3547}
3548
3549int extent_readpages(struct extent_io_tree *tree,
3550                     struct address_space *mapping,
3551                     struct list_head *pages, unsigned nr_pages,
3552                     get_extent_t get_extent)
3553{
3554        struct bio *bio = NULL;
3555        unsigned page_idx;
3556        unsigned long bio_flags = 0;
3557        struct page *pagepool[16];
3558        struct page *page;
3559        int i = 0;
3560        int nr = 0;
3561
3562        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3563                page = list_entry(pages->prev, struct page, lru);
3564
3565                prefetchw(&page->flags);
3566                list_del(&page->lru);
3567                if (add_to_page_cache_lru(page, mapping,
3568                                        page->index, GFP_NOFS)) {
3569                        page_cache_release(page);
3570                        continue;
3571                }
3572
3573                pagepool[nr++] = page;
3574                if (nr < ARRAY_SIZE(pagepool))
3575                        continue;
3576                for (i = 0; i < nr; i++) {
3577                        __extent_read_full_page(tree, pagepool[i], get_extent,
3578                                        &bio, 0, &bio_flags);
3579                        page_cache_release(pagepool[i]);
3580                }
3581                nr = 0;
3582        }
3583        for (i = 0; i < nr; i++) {
3584                __extent_read_full_page(tree, pagepool[i], get_extent,
3585                                        &bio, 0, &bio_flags);
3586                page_cache_release(pagepool[i]);
3587        }
3588
3589        BUG_ON(!list_empty(pages));
3590        if (bio)
3591                return submit_one_bio(READ, bio, 0, bio_flags);
3592        return 0;
3593}
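
/*
 * The readahead loop above batches pages sixteen at a time and flushes the
 * batch in two places.  A helper equivalent to those flush sites might look
 * like this (illustrative sketch, no such helper exists in this file):
 */
static void flush_readpage_batch_sketch(struct extent_io_tree *tree,
					struct page **pagepool, int nr,
					get_extent_t get_extent,
					struct bio **bio,
					unsigned long *bio_flags)
{
	int i;

	for (i = 0; i < nr; i++) {
		__extent_read_full_page(tree, pagepool[i], get_extent,
					bio, 0, bio_flags);
		page_cache_release(pagepool[i]);
	}
}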
3594
3595/*
3596 * basic invalidatepage code, this waits on any locked or writeback
3597 * ranges corresponding to the page, and then deletes any extent state
3598 * records from the tree
3599 */
3600int extent_invalidatepage(struct extent_io_tree *tree,
3601                          struct page *page, unsigned long offset)
3602{
3603        struct extent_state *cached_state = NULL;
3604        u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3605        u64 end = start + PAGE_CACHE_SIZE - 1;
3606        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3607
3608        start += (offset + blocksize - 1) & ~(blocksize - 1);
3609        if (start > end)
3610                return 0;
3611
3612        lock_extent_bits(tree, start, end, 0, &cached_state);
3613        wait_on_page_writeback(page);
3614        clear_extent_bit(tree, start, end,
3615                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3616                         EXTENT_DO_ACCOUNTING,
3617                         1, 1, &cached_state, GFP_NOFS);
3618        return 0;
3619}
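
/*
 * Sketch of the round-up used above: the invalidation offset within the page
 * is aligned up to the next block boundary (blocksize is a power of two), so
 * only state covering whole blocks past 'offset' is dropped (illustration).
 */
static inline u64 round_up_to_block_sketch(u64 offset, u64 blocksize)
{
	return (offset + blocksize - 1) & ~(blocksize - 1);
}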
3620
3621/*
3622 * a helper for releasepage, this tests for areas of the page that
3623 * are locked or under IO and drops the related state bits if it is safe
3624 * to drop the page.
3625 */
3626int try_release_extent_state(struct extent_map_tree *map,
3627                             struct extent_io_tree *tree, struct page *page,
3628                             gfp_t mask)
3629{
3630        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3631        u64 end = start + PAGE_CACHE_SIZE - 1;
3632        int ret = 1;
3633
3634        if (test_range_bit(tree, start, end,
3635                           EXTENT_IOBITS, 0, NULL))
3636                ret = 0;
3637        else {
3638                if ((mask & GFP_NOFS) == GFP_NOFS)
3639                        mask = GFP_NOFS;
3640                /*
3641                 * at this point we can safely clear everything except the
3642                 * locked bit and the nodatasum bit
3643                 */
3644                ret = clear_extent_bit(tree, start, end,
3645                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3646                                 0, 0, NULL, mask);
3647
3648                /* if clear_extent_bit failed for -ENOMEM reasons,
3649                 * we can't allow the release to continue.
3650                 */
3651                if (ret < 0)
3652                        ret = 0;
3653                else
3654                        ret = 1;
3655        }
3656        return ret;
3657}
3658
3659/*
3660 * a helper for releasepage.  As long as there are no locked extents
3661 * in the range corresponding to the page, both state records and extent
3662 * map records are removed
3663 */
3664int try_release_extent_mapping(struct extent_map_tree *map,
3665                               struct extent_io_tree *tree, struct page *page,
3666                               gfp_t mask)
3667{
3668        struct extent_map *em;
3669        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3670        u64 end = start + PAGE_CACHE_SIZE - 1;
3671
3672        if ((mask & __GFP_WAIT) &&
3673            page->mapping->host->i_size > 16 * 1024 * 1024) {
3674                u64 len;
3675                while (start <= end) {
3676                        len = end - start + 1;
3677                        write_lock(&map->lock);
3678                        em = lookup_extent_mapping(map, start, len);
3679                        if (!em) {
3680                                write_unlock(&map->lock);
3681                                break;
3682                        }
3683                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3684                            em->start != start) {
3685                                write_unlock(&map->lock);
3686                                free_extent_map(em);
3687                                break;
3688                        }
3689                        if (!test_range_bit(tree, em->start,
3690                                            extent_map_end(em) - 1,
3691                                            EXTENT_LOCKED | EXTENT_WRITEBACK,
3692                                            0, NULL)) {
3693                                remove_extent_mapping(map, em);
3694                                /* once for the rb tree */
3695                                free_extent_map(em);
3696                        }
3697                        start = extent_map_end(em);
3698                        write_unlock(&map->lock);
3699
3700                        /* once for us */
3701                        free_extent_map(em);
3702                }
3703        }
3704        return try_release_extent_state(map, tree, page, mask);
3705}
3706
3707/*
3708 * helper function for fiemap, which doesn't want to see any holes.
3709 * This maps until we find something past 'last'
3710 */
3711static struct extent_map *get_extent_skip_holes(struct inode *inode,
3712                                                u64 offset,
3713                                                u64 last,
3714                                                get_extent_t *get_extent)
3715{
3716        u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3717        struct extent_map *em;
3718        u64 len;
3719
3720        if (offset >= last)
3721                return NULL;
3722
3723        while (1) {
3724                len = last - offset;
3725                if (len == 0)
3726                        break;
3727                len = (len + sectorsize - 1) & ~(sectorsize - 1);
3728                em = get_extent(inode, NULL, 0, offset, len, 0);
3729                if (IS_ERR_OR_NULL(em))
3730                        return em;
3731
3732                /* if this isn't a hole return it */
3733                if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3734                    em->block_start != EXTENT_MAP_HOLE) {
3735                        return em;
3736                }
3737
3738                /* this is a hole, advance to the next extent */
3739                offset = extent_map_end(em);
3740                free_extent_map(em);
3741                if (offset >= last)
3742                        break;
3743        }
3744        return NULL;
3745}
3746
3747int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3748                __u64 start, __u64 len, get_extent_t *get_extent)
3749{
3750        int ret = 0;
3751        u64 off = start;
3752        u64 max = start + len;
3753        u32 flags = 0;
3754        u32 found_type;
3755        u64 last;
3756        u64 last_for_get_extent = 0;
3757        u64 disko = 0;
3758        u64 isize = i_size_read(inode);
3759        struct btrfs_key found_key;
3760        struct extent_map *em = NULL;
3761        struct extent_state *cached_state = NULL;
3762        struct btrfs_path *path;
3763        struct btrfs_file_extent_item *item;
3764        int end = 0;
3765        u64 em_start = 0;
3766        u64 em_len = 0;
3767        u64 em_end = 0;
3768        unsigned long emflags;
3769
3770        if (len == 0)
3771                return -EINVAL;
3772
3773        path = btrfs_alloc_path();
3774        if (!path)
3775                return -ENOMEM;
3776        path->leave_spinning = 1;
3777
3778        start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3779        len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3780
3781        /*
3782         * lookup the last file extent.  We're not using i_size here
3783         * because there might be preallocation past i_size
3784         */
3785        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3786                                       path, btrfs_ino(inode), -1, 0);
3787        if (ret < 0) {
3788                btrfs_free_path(path);
3789                return ret;
3790        }
3791        WARN_ON(!ret);
3792        path->slots[0]--;
3793        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3794                              struct btrfs_file_extent_item);
3795        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3796        found_type = btrfs_key_type(&found_key);
3797
3798        /* No extents, but there might be delalloc bits */
3799        if (found_key.objectid != btrfs_ino(inode) ||
3800            found_type != BTRFS_EXTENT_DATA_KEY) {
3801                /* have to trust i_size as the end */
3802                last = (u64)-1;
3803                last_for_get_extent = isize;
3804        } else {
3805                /*
3806                 * remember the start of the last extent.  There are a
3807                 * bunch of different factors that go into the length of the
3808                 * extent, so it's much less complex to remember where it started
3809                 */
3810                last = found_key.offset;
3811                last_for_get_extent = last + 1;
3812        }
3813        btrfs_free_path(path);
3814
3815        /*
3816         * we might have some extents allocated but more delalloc past those
3817         * extents.  so, we trust isize unless the start of the last extent is
3818         * beyond isize
3819         */
3820        if (last < isize) {
3821                last = (u64)-1;
3822                last_for_get_extent = isize;
3823        }
3824
3825        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3826                         &cached_state);
3827
3828        em = get_extent_skip_holes(inode, start, last_for_get_extent,
3829                                   get_extent);
3830        if (!em)
3831                goto out;
3832        if (IS_ERR(em)) {
3833                ret = PTR_ERR(em);
3834                goto out;
3835        }
3836
3837        while (!end) {
3838                u64 offset_in_extent;
3839
3840                /* break if the extent we found is outside the range */
3841                if (em->start >= max || extent_map_end(em) < off)
3842                        break;
3843
3844                /*
3845                 * get_extent may return an extent that starts before our
3846                 * requested range.  We have to make sure the ranges
3847                 * we return to fiemap always move forward and don't
3848                 * overlap, so adjust the offsets here
3849                 */
3850                em_start = max(em->start, off);
3851
3852                /*
3853                 * record the offset from the start of the extent
3854                 * for adjusting the disk offset below
3855                 */
3856                offset_in_extent = em_start - em->start;
3857                em_end = extent_map_end(em);
3858                em_len = em_end - em_start;
3859                emflags = em->flags;
3860                disko = 0;
3861                flags = 0;
3862
3863                /*
3864                 * bump off for our next call to get_extent
3865                 */
3866                off = extent_map_end(em);
3867                if (off >= max)
3868                        end = 1;
3869
3870                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3871                        end = 1;
3872                        flags |= FIEMAP_EXTENT_LAST;
3873                } else if (em->block_start == EXTENT_MAP_INLINE) {
3874                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
3875                                  FIEMAP_EXTENT_NOT_ALIGNED);
3876                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
3877                        flags |= (FIEMAP_EXTENT_DELALLOC |
3878                                  FIEMAP_EXTENT_UNKNOWN);
3879                } else {
3880                        disko = em->block_start + offset_in_extent;
3881                }
3882                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3883                        flags |= FIEMAP_EXTENT_ENCODED;
3884
3885                free_extent_map(em);
3886                em = NULL;
3887                if ((em_start >= last) || em_len == (u64)-1 ||
3888                   (last == (u64)-1 && isize <= em_end)) {
3889                        flags |= FIEMAP_EXTENT_LAST;
3890                        end = 1;
3891                }
3892
3893                /* now scan forward to see if this is really the last extent. */
3894                em = get_extent_skip_holes(inode, off, last_for_get_extent,
3895                                           get_extent);
3896                if (IS_ERR(em)) {
3897                        ret = PTR_ERR(em);
3898                        goto out;
3899                }
3900                if (!em) {
3901                        flags |= FIEMAP_EXTENT_LAST;
3902                        end = 1;
3903                }
3904                ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3905                                              em_len, flags);
3906                if (ret)
3907                        goto out_free;
3908        }
3909out_free:
3910        free_extent_map(em);
3911out:
3912        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3913                             &cached_state, GFP_NOFS);
3914        return ret;
3915}
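
/*
 * For reference, a minimal user-space sketch of the ioctl that ultimately
 * lands in extent_fiemap() above.  Assumes the usual userspace headers
 * (<sys/ioctl.h>, <linux/fs.h>, <linux/fiemap.h>) and an open fd; with
 * fm_extent_count == 0 the call only counts mapped extents.
 */
#if 0
	struct fiemap fm = {
		.fm_start	 = 0,
		.fm_length	 = FIEMAP_MAX_OFFSET,
		.fm_extent_count = 0,
	};

	if (ioctl(fd, FS_IOC_FIEMAP, &fm) == 0)
		printf("%u mapped extents\n", fm.fm_mapped_extents);
#endif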
3916
3917inline struct page *extent_buffer_page(struct extent_buffer *eb,
3918                                              unsigned long i)
3919{
3920        return eb->pages[i];
3921}
3922
3923inline unsigned long num_extent_pages(u64 start, u64 len)
3924{
3925        return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3926                (start >> PAGE_CACHE_SHIFT);
3927}
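
/*
 * Worked example for num_extent_pages(), assuming 4K pages: a buffer with
 * start == 0x3000 and len == 0x4000 covers bytes 0x3000..0x6fff, so
 * ((0x3000 + 0x4000 + 0xfff) >> 12) - (0x3000 >> 12) = 7 - 3 = 4 pages.
 */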
3928
3929static void __free_extent_buffer(struct extent_buffer *eb)
3930{
3931#if LEAK_DEBUG
3932        unsigned long flags;
3933        spin_lock_irqsave(&leak_lock, flags);
3934        list_del(&eb->leak_list);
3935        spin_unlock_irqrestore(&leak_lock, flags);
3936#endif
3937        if (eb->pages && eb->pages != eb->inline_pages)
3938                kfree(eb->pages);
3939        kmem_cache_free(extent_buffer_cache, eb);
3940}
3941
3942static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3943                                                   u64 start,
3944                                                   unsigned long len,
3945                                                   gfp_t mask)
3946{
3947        struct extent_buffer *eb = NULL;
3948#if LEAK_DEBUG
3949        unsigned long flags;
3950#endif
3951
3952        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3953        if (eb == NULL)
3954                return NULL;
3955        eb->start = start;
3956        eb->len = len;
3957        eb->tree = tree;
3958        eb->bflags = 0;
3959        rwlock_init(&eb->lock);
3960        atomic_set(&eb->write_locks, 0);
3961        atomic_set(&eb->read_locks, 0);
3962        atomic_set(&eb->blocking_readers, 0);
3963        atomic_set(&eb->blocking_writers, 0);
3964        atomic_set(&eb->spinning_readers, 0);
3965        atomic_set(&eb->spinning_writers, 0);
3966        eb->lock_nested = 0;
3967        init_waitqueue_head(&eb->write_lock_wq);
3968        init_waitqueue_head(&eb->read_lock_wq);
3969
3970#if LEAK_DEBUG
3971        spin_lock_irqsave(&leak_lock, flags);
3972        list_add(&eb->leak_list, &buffers);
3973        spin_unlock_irqrestore(&leak_lock, flags);
3974#endif
3975        spin_lock_init(&eb->refs_lock);
3976        atomic_set(&eb->refs, 1);
3977        atomic_set(&eb->io_pages, 0);
3978
3979        if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
3980                struct page **pages;
3981                int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
3982                        PAGE_CACHE_SHIFT;
3983                pages = kzalloc(num_pages * sizeof(struct page *), mask);
3984                if (!pages) {
3985                        __free_extent_buffer(eb);
3986                        return NULL;
3987                }
3988                eb->pages = pages;
3989        } else {
3990                eb->pages = eb->inline_pages;
3991        }
3992
3993        return eb;
3994}
3995
3996struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
3997{
3998        unsigned long i;
3999        struct page *p;
4000        struct extent_buffer *new;
4001        unsigned long num_pages = num_extent_pages(src->start, src->len);
4002
4003        new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
4004        if (new == NULL)
4005                return NULL;
4006
4007        for (i = 0; i < num_pages; i++) {
4008                p = alloc_page(GFP_ATOMIC);
4009                BUG_ON(!p);
4010                attach_extent_buffer_page(new, p);
4011                WARN_ON(PageDirty(p));
4012                SetPageUptodate(p);
4013                new->pages[i] = p;
4014        }
4015
4016        copy_extent_buffer(new, src, 0, 0, src->len);
4017        set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4018        set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4019
4020        return new;
4021}
4022
4023struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4024{
4025        struct extent_buffer *eb;
4026        unsigned long num_pages = num_extent_pages(0, len);
4027        unsigned long i;
4028
4029        eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
4030        if (!eb)
4031                return NULL;
4032
4033        for (i = 0; i < num_pages; i++) {
4034                eb->pages[i] = alloc_page(GFP_ATOMIC);
4035                if (!eb->pages[i])
4036                        goto err;
4037        }
4038        set_extent_buffer_uptodate(eb);
4039        btrfs_set_header_nritems(eb, 0);
4040        set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4041
4042        return eb;
4043err:
4044        for (; i > 0; i--)
4045                __free_page(eb->pages[i - 1]);
4046        __free_extent_buffer(eb);
4047        return NULL;
4048}
4049
4050static int extent_buffer_under_io(struct extent_buffer *eb)
4051{
4052        return (atomic_read(&eb->io_pages) ||
4053                test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4054                test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4055}
4056
4057/*
4058 * Helper for releasing an extent buffer's pages.
4059 */
4060static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4061                                                unsigned long start_idx)
4062{
4063        unsigned long index;
4064        unsigned long num_pages;
4065        struct page *page;
4066        int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4067
4068        BUG_ON(extent_buffer_under_io(eb));
4069
4070        num_pages = num_extent_pages(eb->start, eb->len);
4071        index = start_idx + num_pages;
4072        if (start_idx >= index)
4073                return;
4074
4075        do {
4076                index--;
4077                page = extent_buffer_page(eb, index);
4078                if (page && mapped) {
4079                        spin_lock(&page->mapping->private_lock);
4080                        /*
4081                         * We do this since we'll remove the pages after we've
4082                         * removed the eb from the radix tree, so we could race
4083                         * and have this page now attached to a new eb.  So
4084                         * only clear page_private if it's still connected to
4085                         * this eb.
4086                         */
4087                        if (PagePrivate(page) &&
4088                            page->private == (unsigned long)eb) {
4089                                BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4090                                BUG_ON(PageDirty(page));
4091                                BUG_ON(PageWriteback(page));
4092                                /*
4093                                 * We need to make sure we haven't been attached
4094                                 * to a new eb.
4095                                 */
4096                                ClearPagePrivate(page);
4097                                set_page_private(page, 0);
4098                                /* One for the page private */
4099                                page_cache_release(page);
4100                        }
4101                        spin_unlock(&page->mapping->private_lock);
4102
4103                }
4104                if (page) {
4105                        /* One for when we allocated the page */
4106                        page_cache_release(page);
4107                }
4108        } while (index != start_idx);
4109}
4110
4111/*
4112 * Helper for releasing the extent buffer.
4113 */
4114static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4115{
4116        btrfs_release_extent_buffer_page(eb, 0);
4117        __free_extent_buffer(eb);
4118}
4119
4120static void check_buffer_tree_ref(struct extent_buffer *eb)
4121{
4122        /* the ref bit is tricky.  We have to make sure it is set
4123         * if we have the buffer dirty.   Otherwise the
4124         * code to free a buffer can end up dropping a dirty
4125         * page
4126         *
4127         * Once the ref bit is set, it won't go away while the
4128         * buffer is dirty or in writeback, and it also won't
4129         * go away while we have the reference count on the
4130         * eb bumped.
4131         *
4132         * We can't just set the ref bit without bumping the
4133         * ref on the eb because free_extent_buffer might
4134         * see the ref bit and try to clear it.  If this happens
4135         * free_extent_buffer might end up dropping our original
4136         * ref by mistake and freeing the page before we are able
4137         * to add one more ref.
4138         *
4139         * So bump the ref count first, then set the bit.  If someone
4140         * beat us to it, drop the ref we added.
4141         */
4142        spin_lock(&eb->refs_lock);
4143        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4144                atomic_inc(&eb->refs);
4145        spin_unlock(&eb->refs_lock);
4146}
4147
4148static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4149{
4150        unsigned long num_pages, i;
4151
4152        check_buffer_tree_ref(eb);
4153
4154        num_pages = num_extent_pages(eb->start, eb->len);
4155        for (i = 0; i < num_pages; i++) {
4156                struct page *p = extent_buffer_page(eb, i);
4157                mark_page_accessed(p);
4158        }
4159}
4160
4161struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4162                                          u64 start, unsigned long len)
4163{
4164        unsigned long num_pages = num_extent_pages(start, len);
4165        unsigned long i;
4166        unsigned long index = start >> PAGE_CACHE_SHIFT;
4167        struct extent_buffer *eb;
4168        struct extent_buffer *exists = NULL;
4169        struct page *p;
4170        struct address_space *mapping = tree->mapping;
4171        int uptodate = 1;
4172        int ret;
4173
4174        rcu_read_lock();
4175        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4176        if (eb && atomic_inc_not_zero(&eb->refs)) {
4177                rcu_read_unlock();
4178                mark_extent_buffer_accessed(eb);
4179                return eb;
4180        }
4181        rcu_read_unlock();
4182
4183        eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
4184        if (!eb)
4185                return NULL;
4186
4187        for (i = 0; i < num_pages; i++, index++) {
4188                p = find_or_create_page(mapping, index, GFP_NOFS);
4189                if (!p) {
4190                        WARN_ON(1);
4191                        goto free_eb;
4192                }
4193
4194                spin_lock(&mapping->private_lock);
4195                if (PagePrivate(p)) {
4196                        /*
4197                         * We could have already allocated an eb for this page
4198                         * and attached it, so let's see if we can get a ref on
4199                         * the existing eb.  If we can, we know it's good and we
4200                         * can just return that one; otherwise we know we can
4201                         * safely overwrite page->private.
4202                         */
4203                        exists = (struct extent_buffer *)p->private;
4204                        if (atomic_inc_not_zero(&exists->refs)) {
4205                                spin_unlock(&mapping->private_lock);
4206                                unlock_page(p);
4207                                page_cache_release(p);
4208                                mark_extent_buffer_accessed(exists);
4209                                goto free_eb;
4210                        }
4211
4212                        /*
4213                         * Do this so attach_extent_buffer_page() doesn't complain,
4214                         * and drop the ref the old eb had on this page.
4215                         */
4216                        ClearPagePrivate(p);
4217                        WARN_ON(PageDirty(p));
4218                        page_cache_release(p);
4219                }
4220                attach_extent_buffer_page(eb, p);
4221                spin_unlock(&mapping->private_lock);
4222                WARN_ON(PageDirty(p));
4223                mark_page_accessed(p);
4224                eb->pages[i] = p;
4225                if (!PageUptodate(p))
4226                        uptodate = 0;
4227
4228                /*
4229                 * see below about how we avoid a nasty race with release page
4230                 * and why we unlock later
4231                 */
4232        }
4233        if (uptodate)
4234                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4235again:
4236        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4237        if (ret)
4238                goto free_eb;
4239
4240        spin_lock(&tree->buffer_lock);
4241        ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
4242        if (ret == -EEXIST) {
4243                exists = radix_tree_lookup(&tree->buffer,
4244                                                start >> PAGE_CACHE_SHIFT);
4245                if (!atomic_inc_not_zero(&exists->refs)) {
4246                        spin_unlock(&tree->buffer_lock);
4247                        radix_tree_preload_end();
4248                        exists = NULL;
4249                        goto again;
4250                }
4251                spin_unlock(&tree->buffer_lock);
4252                radix_tree_preload_end();
4253                mark_extent_buffer_accessed(exists);
4254                goto free_eb;
4255        }
4256        /* add one reference for the tree */
4257        check_buffer_tree_ref(eb);
4258        spin_unlock(&tree->buffer_lock);
4259        radix_tree_preload_end();
4260
4261        /*
4262         * there is a race where release page may have
4263         * tried to find this extent buffer in the radix
4264         * tree but failed.  It will tell the VM it is safe
4265         * to reclaim the page, and it will clear the page private bit.
4266         * We must make sure to set the page private bit properly
4267         * after the extent buffer is in the radix tree so
4268         * it doesn't get lost
4269         */
4270        SetPageChecked(eb->pages[0]);
4271        for (i = 1; i < num_pages; i++) {
4272                p = extent_buffer_page(eb, i);
4273                ClearPageChecked(p);
4274                unlock_page(p);
4275        }
4276        unlock_page(eb->pages[0]);
4277        return eb;
4278
4279free_eb:
4280        for (i = 0; i < num_pages; i++) {
4281                if (eb->pages[i])
4282                        unlock_page(eb->pages[i]);
4283        }
4284
4285        WARN_ON(!atomic_dec_and_test(&eb->refs));
4286        btrfs_release_extent_buffer(eb);
4287        return exists;
4288}
4289
4290struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4291                                         u64 start, unsigned long len)
4292{
4293        struct extent_buffer *eb;
4294
4295        rcu_read_lock();
4296        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4297        if (eb && atomic_inc_not_zero(&eb->refs)) {
4298                rcu_read_unlock();
4299                mark_extent_buffer_accessed(eb);
4300                return eb;
4301        }
4302        rcu_read_unlock();
4303
4304        return NULL;
4305}
4306
4307static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4308{
4309        struct extent_buffer *eb =
4310                        container_of(head, struct extent_buffer, rcu_head);
4311
4312        __free_extent_buffer(eb);
4313}
4314
4315/* Expects to have eb->refs_lock already held */
4316static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4317{
4318        WARN_ON(atomic_read(&eb->refs) == 0);
4319        if (atomic_dec_and_test(&eb->refs)) {
4320                if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4321                        spin_unlock(&eb->refs_lock);
4322                } else {
4323                        struct extent_io_tree *tree = eb->tree;
4324
4325                        spin_unlock(&eb->refs_lock);
4326
4327                        spin_lock(&tree->buffer_lock);
4328                        radix_tree_delete(&tree->buffer,
4329                                          eb->start >> PAGE_CACHE_SHIFT);
4330                        spin_unlock(&tree->buffer_lock);
4331                }
4332
4333                /* Should be safe to release our pages at this point */
4334                btrfs_release_extent_buffer_page(eb, 0);
4335
4336                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4337                return 1;
4338        }
4339        spin_unlock(&eb->refs_lock);
4340
4341        return 0;
4342}
4343
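    /*
     * Drop the caller's reference on an extent buffer.  When only the caller
     * and the tree still hold references (refs == 2) and the buffer is a
     * dummy, or has been marked stale and is not under IO, the extra
     * reference is dropped here as well so release_extent_buffer() can
     * actually free it.
     */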
4344void free_extent_buffer(struct extent_buffer *eb)
4345{
4346        if (!eb)
4347                return;
4348
4349        spin_lock(&eb->refs_lock);
4350        if (atomic_read(&eb->refs) == 2 &&
4351            test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4352                atomic_dec(&eb->refs);
4353
4354        if (atomic_read(&eb->refs) == 2 &&
4355            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4356            !extent_buffer_under_io(eb) &&
4357            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4358                atomic_dec(&eb->refs);
4359
4360        /*
4361         * I know this is terrible, but it's temporary until we stop tracking
4362         * the uptodate bits and such for the extent buffers.
4363         */
4364        release_extent_buffer(eb, GFP_ATOMIC);
4365}
4366
4367void free_extent_buffer_stale(struct extent_buffer *eb)
4368{
4369        if (!eb)
4370                return;
4371
4372        spin_lock(&eb->refs_lock);
4373        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
4374
4375        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
4376            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4377                atomic_dec(&eb->refs);
4378        release_extent_buffer(eb, GFP_NOFS);
4379}
4380
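    /*
     * Clear the dirty state of every page backing the extent buffer.  Besides
     * clearing the per-page dirty bit we also clear PAGECACHE_TAG_DIRTY in
     * the mapping's radix tree, otherwise writeback would still treat the
     * pages as dirty.
     */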
4381void clear_extent_buffer_dirty(struct extent_buffer *eb)
4382{
4383        unsigned long i;
4384        unsigned long num_pages;
4385        struct page *page;
4386
4387        num_pages = num_extent_pages(eb->start, eb->len);
4388
4389        for (i = 0; i < num_pages; i++) {
4390                page = extent_buffer_page(eb, i);
4391                if (!PageDirty(page))
4392                        continue;
4393
4394                lock_page(page);
4395                WARN_ON(!PagePrivate(page));
4396
4397                clear_page_dirty_for_io(page);
4398                spin_lock_irq(&page->mapping->tree_lock);
4399                if (!PageDirty(page)) {
4400                        radix_tree_tag_clear(&page->mapping->page_tree,
4401                                                page_index(page),
4402                                                PAGECACHE_TAG_DIRTY);
4403                }
4404                spin_unlock_irq(&page->mapping->tree_lock);
4405                ClearPageError(page);
4406                unlock_page(page);
4407        }
4408        WARN_ON(atomic_read(&eb->refs) == 0);
4409}
4410
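    /*
     * Mark every page of the extent buffer dirty.  Returns non-zero if the
     * buffer was already dirty, so callers can tell whether they dirtied it
     * for the first time.
     */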
4411int set_extent_buffer_dirty(struct extent_buffer *eb)
4412{
4413        unsigned long i;
4414        unsigned long num_pages;
4415        int was_dirty = 0;
4416
4417        check_buffer_tree_ref(eb);
4418
4419        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4420
4421        num_pages = num_extent_pages(eb->start, eb->len);
4422        WARN_ON(atomic_read(&eb->refs) == 0);
4423        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4424
4425        for (i = 0; i < num_pages; i++)
4426                set_page_dirty(extent_buffer_page(eb, i));
4427        return was_dirty;
4428}
4429
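    /*
     * Return 1 if [start, start + len) is not aligned to whole pages, i.e.
     * the range is shorter than a page or begins or ends inside a page.
     */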
4430static int range_straddles_pages(u64 start, u64 len)
4431{
4432        if (len < PAGE_CACHE_SIZE)
4433                return 1;
4434        if (start & (PAGE_CACHE_SIZE - 1))
4435                return 1;
4436        if ((start + len) & (PAGE_CACHE_SIZE - 1))
4437                return 1;
4438        return 0;
4439}
4440
4441int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4442{
4443        unsigned long i;
4444        struct page *page;
4445        unsigned long num_pages;
4446
4447        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4448        num_pages = num_extent_pages(eb->start, eb->len);
4449        for (i = 0; i < num_pages; i++) {
4450                page = extent_buffer_page(eb, i);
4451                if (page)
4452                        ClearPageUptodate(page);
4453        }
4454        return 0;
4455}
4456
4457int set_extent_buffer_uptodate(struct extent_buffer *eb)
4458{
4459        unsigned long i;
4460        struct page *page;
4461        unsigned long num_pages;
4462
4463        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4464        num_pages = num_extent_pages(eb->start, eb->len);
4465        for (i = 0; i < num_pages; i++) {
4466                page = extent_buffer_page(eb, i);
4467                SetPageUptodate(page);
4468        }
4469        return 0;
4470}
4471
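    /*
     * Return 1 if every byte in [start, end] is up to date, consulting the
     * EXTENT_UPTODATE bits for unaligned ranges first and falling back to
     * the per-page uptodate flags.
     */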
4472int extent_range_uptodate(struct extent_io_tree *tree,
4473                          u64 start, u64 end)
4474{
4475        struct page *page;
4476        int ret;
4477        int pg_uptodate = 1;
4478        int uptodate;
4479        unsigned long index;
4480
4481        if (range_straddles_pages(start, end - start + 1)) {
4482                ret = test_range_bit(tree, start, end,
4483                                     EXTENT_UPTODATE, 1, NULL);
4484                if (ret)
4485                        return 1;
4486        }
4487        while (start <= end) {
4488                index = start >> PAGE_CACHE_SHIFT;
4489                page = find_get_page(tree->mapping, index);
4490                if (!page)
4491                        return 1;
4492                uptodate = PageUptodate(page);
4493                page_cache_release(page);
4494                if (!uptodate) {
4495                        pg_uptodate = 0;
4496                        break;
4497                }
4498                start += PAGE_CACHE_SIZE;
4499        }
4500        return pg_uptodate;
4501}
4502
4503int extent_buffer_uptodate(struct extent_buffer *eb)
4504{
4505        return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4506}
4507
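    /*
     * Read the pages backing an extent buffer from disk.  With WAIT_NONE the
     * pages are only tried with trylock_page() and the read is not waited
     * for; with WAIT_COMPLETE we block until every page is unlocked again
     * and return -EIO if any of them failed to become uptodate.
     */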
4508int read_extent_buffer_pages(struct extent_io_tree *tree,
4509                             struct extent_buffer *eb, u64 start, int wait,
4510                             get_extent_t *get_extent, int mirror_num)
4511{
4512        unsigned long i;
4513        unsigned long start_i;
4514        struct page *page;
4515        int err;
4516        int ret = 0;
4517        int locked_pages = 0;
4518        int all_uptodate = 1;
4519        unsigned long num_pages;
4520        unsigned long num_reads = 0;
4521        struct bio *bio = NULL;
4522        unsigned long bio_flags = 0;
4523
4524        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4525                return 0;
4526
4527        if (start) {
4528                WARN_ON(start < eb->start);
4529                start_i = (start >> PAGE_CACHE_SHIFT) -
4530                        (eb->start >> PAGE_CACHE_SHIFT);
4531        } else {
4532                start_i = 0;
4533        }
4534
4535        num_pages = num_extent_pages(eb->start, eb->len);
4536        for (i = start_i; i < num_pages; i++) {
4537                page = extent_buffer_page(eb, i);
4538                if (wait == WAIT_NONE) {
4539                        if (!trylock_page(page))
4540                                goto unlock_exit;
4541                } else {
4542                        lock_page(page);
4543                }
4544                locked_pages++;
4545                if (!PageUptodate(page)) {
4546                        num_reads++;
4547                        all_uptodate = 0;
4548                }
4549        }
4550        if (all_uptodate) {
4551                if (start_i == 0)
4552                        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4553                goto unlock_exit;
4554        }
4555
4556        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
4557        eb->read_mirror = 0;
4558        atomic_set(&eb->io_pages, num_reads);
4559        for (i = start_i; i < num_pages; i++) {
4560                page = extent_buffer_page(eb, i);
4561                if (!PageUptodate(page)) {
4562                        ClearPageError(page);
4563                        err = __extent_read_full_page(tree, page,
4564                                                      get_extent, &bio,
4565                                                      mirror_num, &bio_flags);
4566                        if (err)
4567                                ret = err;
4568                } else {
4569                        unlock_page(page);
4570                }
4571        }
4572
4573        if (bio) {
4574                err = submit_one_bio(READ, bio, mirror_num, bio_flags);
4575                if (err)
4576                        return err;
4577        }
4578
4579        if (ret || wait != WAIT_COMPLETE)
4580                return ret;
4581
4582        for (i = start_i; i < num_pages; i++) {
4583                page = extent_buffer_page(eb, i);
4584                wait_on_page_locked(page);
4585                if (!PageUptodate(page))
4586                        ret = -EIO;
4587        }
4588
4589        return ret;
4590
4591unlock_exit:
4592        i = start_i;
4593        while (locked_pages > 0) {
4594                page = extent_buffer_page(eb, i);
4595                i++;
4596                unlock_page(page);
4597                locked_pages--;
4598        }
4599        return ret;
4600}
4601
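    /*
     * Copy 'len' bytes starting at offset 'start' inside the extent buffer
     * into the caller supplied memory at 'dstv', walking page by page.
     *
     * Hypothetical usage sketch (the offset variable below is illustrative
     * only, not taken from this file):
     *
     *	struct btrfs_disk_key disk_key;
     *	read_extent_buffer(eb, &disk_key, key_offset, sizeof(disk_key));
     */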
4602void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4603                        unsigned long start,
4604                        unsigned long len)
4605{
4606        size_t cur;
4607        size_t offset;
4608        struct page *page;
4609        char *kaddr;
4610        char *dst = (char *)dstv;
4611        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4612        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4613
4614        WARN_ON(start > eb->len);
4615        WARN_ON(start + len > eb->start + eb->len);
4616
4617        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4618
4619        while (len > 0) {
4620                page = extent_buffer_page(eb, i);
4621
4622                cur = min(len, (PAGE_CACHE_SIZE - offset));
4623                kaddr = page_address(page);
4624                memcpy(dst, kaddr + offset, cur);
4625
4626                dst += cur;
4627                len -= cur;
4628                offset = 0;
4629                i++;
4630        }
4631}
4632
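    /*
     * Map 'min_len' bytes at offset 'start' directly, without copying.  This
     * only works while the requested range fits inside a single page; if it
     * crosses a page boundary -EINVAL is returned and callers typically fall
     * back to read_extent_buffer()/write_extent_buffer().
     */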
4633int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4634                               unsigned long min_len, char **map,
4635                               unsigned long *map_start,
4636                               unsigned long *map_len)
4637{
4638        size_t offset = start & (PAGE_CACHE_SIZE - 1);
4639        char *kaddr;
4640        struct page *p;
4641        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4642        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4643        unsigned long end_i = (start_offset + start + min_len - 1) >>
4644                PAGE_CACHE_SHIFT;
4645
4646        if (i != end_i)
4647                return -EINVAL;
4648
4649        if (i == 0) {
4650                offset = start_offset;
4651                *map_start = 0;
4652        } else {
4653                offset = 0;
4654                *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4655        }
4656
4657        if (start + min_len > eb->len) {
4658                printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4659                       "wanted %lu %lu\n", (unsigned long long)eb->start,
4660                       eb->len, start, min_len);
4661                WARN_ON(1);
4662                return -EINVAL;
4663        }
4664
4665        p = extent_buffer_page(eb, i);
4666        kaddr = page_address(p);
4667        *map = kaddr + offset;
4668        *map_len = PAGE_CACHE_SIZE - offset;
4669        return 0;
4670}
4671
4672int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4673                          unsigned long start,
4674                          unsigned long len)
4675{
4676        size_t cur;
4677        size_t offset;
4678        struct page *page;
4679        char *kaddr;
4680        char *ptr = (char *)ptrv;
4681        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4682        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4683        int ret = 0;
4684
4685        WARN_ON(start > eb->len);
4686        WARN_ON(start + len > eb->start + eb->len);
4687
4688        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4689
4690        while (len > 0) {
4691                page = extent_buffer_page(eb, i);
4692
4693                cur = min(len, (PAGE_CACHE_SIZE - offset));
4694
4695                kaddr = page_address(page);
4696                ret = memcmp(ptr, kaddr + offset, cur);
4697                if (ret)
4698                        break;
4699
4700                ptr += cur;
4701                len -= cur;
4702                offset = 0;
4703                i++;
4704        }
4705        return ret;
4706}
4707
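    /*
     * The write/memset helpers below mirror read_extent_buffer(): they walk
     * the buffer page by page.  The WARN_ON(!PageUptodate(page)) catches
     * callers modifying pages that were never read, since only part of each
     * page is written.
     */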
4708void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4709                         unsigned long start, unsigned long len)
4710{
4711        size_t cur;
4712        size_t offset;
4713        struct page *page;
4714        char *kaddr;
4715        char *src = (char *)srcv;
4716        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4717        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4718
4719        WARN_ON(start > eb->len);
4720        WARN_ON(start + len > eb->start + eb->len);
4721
4722        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4723
4724        while (len > 0) {
4725                page = extent_buffer_page(eb, i);
4726                WARN_ON(!PageUptodate(page));
4727
4728                cur = min(len, PAGE_CACHE_SIZE - offset);
4729                kaddr = page_address(page);
4730                memcpy(kaddr + offset, src, cur);
4731
4732                src += cur;
4733                len -= cur;
4734                offset = 0;
4735                i++;
4736        }
4737}
4738
4739void memset_extent_buffer(struct extent_buffer *eb, char c,
4740                          unsigned long start, unsigned long len)
4741{
4742        size_t cur;
4743        size_t offset;
4744        struct page *page;
4745        char *kaddr;
4746        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4747        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4748
4749        WARN_ON(start > eb->len);
4750        WARN_ON(start + len > eb->start + eb->len);
4751
4752        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4753
4754        while (len > 0) {
4755                page = extent_buffer_page(eb, i);
4756                WARN_ON(!PageUptodate(page));
4757
4758                cur = min(len, PAGE_CACHE_SIZE - offset);
4759                kaddr = page_address(page);
4760                memset(kaddr + offset, c, cur);
4761
4762                len -= cur;
4763                offset = 0;
4764                i++;
4765        }
4766}
4767
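    /*
     * Copy 'len' bytes from 'src' at src_offset into 'dst' at dst_offset.
     * The two buffers are expected to have the same length (see the
     * WARN_ON); copies within a single buffer should use
     * memcpy_extent_buffer() or memmove_extent_buffer() below.
     */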
4768void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4769                        unsigned long dst_offset, unsigned long src_offset,
4770                        unsigned long len)
4771{
4772        u64 dst_len = dst->len;
4773        size_t cur;
4774        size_t offset;
4775        struct page *page;
4776        char *kaddr;
4777        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4778        unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4779
4780        WARN_ON(src->len != dst_len);
4781
4782        offset = (start_offset + dst_offset) &
4783                ((unsigned long)PAGE_CACHE_SIZE - 1);
4784
4785        while (len > 0) {
4786                page = extent_buffer_page(dst, i);
4787                WARN_ON(!PageUptodate(page));
4788
4789                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4790
4791                kaddr = page_address(page);
4792                read_extent_buffer(src, kaddr + offset, src_offset, cur);
4793
4794                src_offset += cur;
4795                len -= cur;
4796                offset = 0;
4797                i++;
4798        }
4799}
4800
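    /*
     * Low level helpers for memcpy_extent_buffer() and
     * memmove_extent_buffer().  move_pages() copies backwards so that
     * overlapping regions are handled correctly even when source and
     * destination live in different pages of the same buffer; copy_pages()
     * only resorts to memmove() when both offsets are in the same page and
     * actually overlap.
     */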
4801static void move_pages(struct page *dst_page, struct page *src_page,
4802                       unsigned long dst_off, unsigned long src_off,
4803                       unsigned long len)
4804{
4805        char *dst_kaddr = page_address(dst_page);
4806        if (dst_page == src_page) {
4807                memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4808        } else {
4809                char *src_kaddr = page_address(src_page);
4810                char *p = dst_kaddr + dst_off + len;
4811                char *s = src_kaddr + src_off + len;
4812
4813                while (len--)
4814                        *--p = *--s;
4815        }
4816}
4817
4818static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4819{
4820        unsigned long distance = (src > dst) ? src - dst : dst - src;
4821        return distance < len;
4822}
4823
4824static void copy_pages(struct page *dst_page, struct page *src_page,
4825                       unsigned long dst_off, unsigned long src_off,
4826                       unsigned long len)
4827{
4828        char *dst_kaddr = page_address(dst_page);
4829        char *src_kaddr;
4830        int must_memmove = 0;
4831
4832        if (dst_page != src_page) {
4833                src_kaddr = page_address(src_page);
4834        } else {
4835                src_kaddr = dst_kaddr;
4836                if (areas_overlap(src_off, dst_off, len))
4837                        must_memmove = 1;
4838        }
4839
4840        if (must_memmove)
4841                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4842        else
4843                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4844}
4845
4846void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4847                           unsigned long src_offset, unsigned long len)
4848{
4849        size_t cur;
4850        size_t dst_off_in_page;
4851        size_t src_off_in_page;
4852        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4853        unsigned long dst_i;
4854        unsigned long src_i;
4855
4856        if (src_offset + len > dst->len) {
4857                printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
4858                       "len %lu dst len %lu\n", src_offset, len, dst->len);
4859                BUG_ON(1);
4860        }
4861        if (dst_offset + len > dst->len) {
4862                printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
4863                       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4864                BUG_ON(1);
4865        }
4866
4867        while (len > 0) {
4868                dst_off_in_page = (start_offset + dst_offset) &
4869                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4870                src_off_in_page = (start_offset + src_offset) &
4871                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4872
4873                dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4874                src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4875
4876                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4877                                               src_off_in_page));
4878                cur = min_t(unsigned long, cur,
4879                        (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4880
4881                copy_pages(extent_buffer_page(dst, dst_i),
4882                           extent_buffer_page(dst, src_i),
4883                           dst_off_in_page, src_off_in_page, cur);
4884
4885                src_offset += cur;
4886                dst_offset += cur;
4887                len -= cur;
4888        }
4889}
4890
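    /*
     * Like memcpy_extent_buffer(), but safe for overlapping ranges inside
     * the same buffer: when the destination is above the source the copy is
     * done from the end of the range backwards via move_pages().
     */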
4891void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4892                           unsigned long src_offset, unsigned long len)
4893{
4894        size_t cur;
4895        size_t dst_off_in_page;
4896        size_t src_off_in_page;
4897        unsigned long dst_end = dst_offset + len - 1;
4898        unsigned long src_end = src_offset + len - 1;
4899        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4900        unsigned long dst_i;
4901        unsigned long src_i;
4902
4903        if (src_offset + len > dst->len) {
4904                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4905                       "len %lu dst len %lu\n", src_offset, len, dst->len);
4906                BUG_ON(1);
4907        }
4908        if (dst_offset + len > dst->len) {
4909                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4910                       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4911                BUG_ON(1);
4912        }
4913        if (dst_offset < src_offset) {
4914                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4915                return;
4916        }
4917        while (len > 0) {
4918                dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4919                src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4920
4921                dst_off_in_page = (start_offset + dst_end) &
4922                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4923                src_off_in_page = (start_offset + src_end) &
4924                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4925
4926                cur = min_t(unsigned long, len, src_off_in_page + 1);
4927                cur = min(cur, dst_off_in_page + 1);
4928                move_pages(extent_buffer_page(dst, dst_i),
4929                           extent_buffer_page(dst, src_i),
4930                           dst_off_in_page - cur + 1,
4931                           src_off_in_page - cur + 1, cur);
4932
4933                dst_end -= cur;
4934                src_end -= cur;
4935                len -= cur;
4936        }
4937}
4938
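    /*
     * Called when the VM wants to release 'page'.  The page can only be let
     * go if the extent buffer it belongs to is down to the tree's own
     * reference and is not under IO, in which case that last reference is
     * dropped here.  Returns 1 if the page can be released, 0 otherwise.
     */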
4939int try_release_extent_buffer(struct page *page, gfp_t mask)
4940{
4941        struct extent_buffer *eb;
4942
4943        /*
4944         * We need to make sure nobody is attaching this page to an eb right
4945         * now.
4946         */
4947        spin_lock(&page->mapping->private_lock);
4948        if (!PagePrivate(page)) {
4949                spin_unlock(&page->mapping->private_lock);
4950                return 1;
4951        }
4952
4953        eb = (struct extent_buffer *)page->private;
4954        BUG_ON(!eb);
4955
4956        /*
4957         * This is a little awful but should be ok, we need to make sure that
4958         * the eb doesn't disappear out from under us while we're looking at
4959         * this page.
4960         */
4961        spin_lock(&eb->refs_lock);
4962        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
4963                spin_unlock(&eb->refs_lock);
4964                spin_unlock(&page->mapping->private_lock);
4965                return 0;
4966        }
4967        spin_unlock(&page->mapping->private_lock);
4968
4969        if ((mask & GFP_NOFS) == GFP_NOFS)
4970                mask = GFP_NOFS;
4971
4972        /*
4973         * If tree ref isn't set then we know the ref on this eb is a real ref,
4974         * so just return, this page will likely be freed soon anyway.
4975         */
4976        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4977                spin_unlock(&eb->refs_lock);
4978                return 0;
4979        }
4980
4981        return release_extent_buffer(eb, mask);
4982}
4983