linux/fs/btrfs/extent_io.c
<<
>>
Prefs
   1#include <linux/bitops.h>
   2#include <linux/slab.h>
   3#include <linux/bio.h>
   4#include <linux/mm.h>
   5#include <linux/pagemap.h>
   6#include <linux/page-flags.h>
   7#include <linux/module.h>
   8#include <linux/spinlock.h>
   9#include <linux/blkdev.h>
  10#include <linux/swap.h>
  11#include <linux/writeback.h>
  12#include <linux/pagevec.h>
  13#include <linux/prefetch.h>
  14#include <linux/cleancache.h>
  15#include "extent_io.h"
  16#include "extent_map.h"
  17#include "compat.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
  20#include "volumes.h"
  21#include "check-integrity.h"
  22
  23static struct kmem_cache *extent_state_cache;
  24static struct kmem_cache *extent_buffer_cache;
  25
  26static LIST_HEAD(buffers);
  27static LIST_HEAD(states);
  28
  29#define LEAK_DEBUG 0
  30#if LEAK_DEBUG
  31static DEFINE_SPINLOCK(leak_lock);
  32#endif
  33
  34#define BUFFER_LRU_MAX 64
  35
  36struct tree_entry {
  37        u64 start;
  38        u64 end;
  39        struct rb_node rb_node;
  40};
  41
  42struct extent_page_data {
  43        struct bio *bio;
  44        struct extent_io_tree *tree;
  45        get_extent_t *get_extent;
  46
  47        /* tells writepage not to lock the state bits for this range
  48         * it still does the unlocking
  49         */
  50        unsigned int extent_locked:1;
  51
  52        /* tells the submit_bio code to use a WRITE_SYNC */
  53        unsigned int sync_io:1;
  54};
  55
  56int __init extent_io_init(void)
  57{
  58        extent_state_cache = kmem_cache_create("extent_state",
  59                        sizeof(struct extent_state), 0,
  60                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  61        if (!extent_state_cache)
  62                return -ENOMEM;
  63
  64        extent_buffer_cache = kmem_cache_create("extent_buffers",
  65                        sizeof(struct extent_buffer), 0,
  66                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  67        if (!extent_buffer_cache)
  68                goto free_state_cache;
  69        return 0;
  70
  71free_state_cache:
  72        kmem_cache_destroy(extent_state_cache);
  73        return -ENOMEM;
  74}
  75
  76void extent_io_exit(void)
  77{
  78        struct extent_state *state;
  79        struct extent_buffer *eb;
  80
  81        while (!list_empty(&states)) {
  82                state = list_entry(states.next, struct extent_state, leak_list);
  83                printk(KERN_ERR "btrfs state leak: start %llu end %llu "
  84                       "state %lu in tree %p refs %d\n",
  85                       (unsigned long long)state->start,
  86                       (unsigned long long)state->end,
  87                       state->state, state->tree, atomic_read(&state->refs));
  88                list_del(&state->leak_list);
  89                kmem_cache_free(extent_state_cache, state);
  90
  91        }
  92
  93        while (!list_empty(&buffers)) {
  94                eb = list_entry(buffers.next, struct extent_buffer, leak_list);
  95                printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
  96                       "refs %d\n", (unsigned long long)eb->start,
  97                       eb->len, atomic_read(&eb->refs));
  98                list_del(&eb->leak_list);
  99                kmem_cache_free(extent_buffer_cache, eb);
 100        }
 101        if (extent_state_cache)
 102                kmem_cache_destroy(extent_state_cache);
 103        if (extent_buffer_cache)
 104                kmem_cache_destroy(extent_buffer_cache);
 105}
 106
 107void extent_io_tree_init(struct extent_io_tree *tree,
 108                         struct address_space *mapping)
 109{
 110        tree->state = RB_ROOT;
 111        INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 112        tree->ops = NULL;
 113        tree->dirty_bytes = 0;
 114        spin_lock_init(&tree->lock);
 115        spin_lock_init(&tree->buffer_lock);
 116        tree->mapping = mapping;
 117}
 118
 119static struct extent_state *alloc_extent_state(gfp_t mask)
 120{
 121        struct extent_state *state;
 122#if LEAK_DEBUG
 123        unsigned long flags;
 124#endif
 125
 126        state = kmem_cache_alloc(extent_state_cache, mask);
 127        if (!state)
 128                return state;
 129        state->state = 0;
 130        state->private = 0;
 131        state->tree = NULL;
 132#if LEAK_DEBUG
 133        spin_lock_irqsave(&leak_lock, flags);
 134        list_add(&state->leak_list, &states);
 135        spin_unlock_irqrestore(&leak_lock, flags);
 136#endif
 137        atomic_set(&state->refs, 1);
 138        init_waitqueue_head(&state->wq);
 139        return state;
 140}
 141
 142void free_extent_state(struct extent_state *state)
 143{
 144        if (!state)
 145                return;
 146        if (atomic_dec_and_test(&state->refs)) {
 147#if LEAK_DEBUG
 148                unsigned long flags;
 149#endif
 150                WARN_ON(state->tree);
 151#if LEAK_DEBUG
 152                spin_lock_irqsave(&leak_lock, flags);
 153                list_del(&state->leak_list);
 154                spin_unlock_irqrestore(&leak_lock, flags);
 155#endif
 156                kmem_cache_free(extent_state_cache, state);
 157        }
 158}
 159
 160static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 161                                   struct rb_node *node)
 162{
 163        struct rb_node **p = &root->rb_node;
 164        struct rb_node *parent = NULL;
 165        struct tree_entry *entry;
 166
 167        while (*p) {
 168                parent = *p;
 169                entry = rb_entry(parent, struct tree_entry, rb_node);
 170
 171                if (offset < entry->start)
 172                        p = &(*p)->rb_left;
 173                else if (offset > entry->end)
 174                        p = &(*p)->rb_right;
 175                else
 176                        return parent;
 177        }
 178
 179        entry = rb_entry(node, struct tree_entry, rb_node);
 180        rb_link_node(node, parent, p);
 181        rb_insert_color(node, root);
 182        return NULL;
 183}
 184
 185static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 186                                     struct rb_node **prev_ret,
 187                                     struct rb_node **next_ret)
 188{
 189        struct rb_root *root = &tree->state;
 190        struct rb_node *n = root->rb_node;
 191        struct rb_node *prev = NULL;
 192        struct rb_node *orig_prev = NULL;
 193        struct tree_entry *entry;
 194        struct tree_entry *prev_entry = NULL;
 195
 196        while (n) {
 197                entry = rb_entry(n, struct tree_entry, rb_node);
 198                prev = n;
 199                prev_entry = entry;
 200
 201                if (offset < entry->start)
 202                        n = n->rb_left;
 203                else if (offset > entry->end)
 204                        n = n->rb_right;
 205                else
 206                        return n;
 207        }
 208
 209        if (prev_ret) {
 210                orig_prev = prev;
 211                while (prev && offset > prev_entry->end) {
 212                        prev = rb_next(prev);
 213                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 214                }
 215                *prev_ret = prev;
 216                prev = orig_prev;
 217        }
 218
 219        if (next_ret) {
 220                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 221                while (prev && offset < prev_entry->start) {
 222                        prev = rb_prev(prev);
 223                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 224                }
 225                *next_ret = prev;
 226        }
 227        return NULL;
 228}
 229
 230static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 231                                          u64 offset)
 232{
 233        struct rb_node *prev = NULL;
 234        struct rb_node *ret;
 235
 236        ret = __etree_search(tree, offset, &prev, NULL);
 237        if (!ret)
 238                return prev;
 239        return ret;
 240}
 241
 242static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 243                     struct extent_state *other)
 244{
 245        if (tree->ops && tree->ops->merge_extent_hook)
 246                tree->ops->merge_extent_hook(tree->mapping->host, new,
 247                                             other);
 248}
 249
 250/*
 251 * utility function to look for merge candidates inside a given range.
 252 * Any extents with matching state are merged together into a single
 253 * extent in the tree.  Extents with EXTENT_IO in their state field
 254 * are not merged because the end_io handlers need to be able to do
 255 * operations on them without sleeping (or doing allocations/splits).
 256 *
 257 * This should be called with the tree lock held.
 258 */
 259static void merge_state(struct extent_io_tree *tree,
 260                        struct extent_state *state)
 261{
 262        struct extent_state *other;
 263        struct rb_node *other_node;
 264
 265        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 266                return;
 267
 268        other_node = rb_prev(&state->rb_node);
 269        if (other_node) {
 270                other = rb_entry(other_node, struct extent_state, rb_node);
 271                if (other->end == state->start - 1 &&
 272                    other->state == state->state) {
 273                        merge_cb(tree, state, other);
 274                        state->start = other->start;
 275                        other->tree = NULL;
 276                        rb_erase(&other->rb_node, &tree->state);
 277                        free_extent_state(other);
 278                }
 279        }
 280        other_node = rb_next(&state->rb_node);
 281        if (other_node) {
 282                other = rb_entry(other_node, struct extent_state, rb_node);
 283                if (other->start == state->end + 1 &&
 284                    other->state == state->state) {
 285                        merge_cb(tree, state, other);
 286                        state->end = other->end;
 287                        other->tree = NULL;
 288                        rb_erase(&other->rb_node, &tree->state);
 289                        free_extent_state(other);
 290                }
 291        }
 292}
 293
 294static void set_state_cb(struct extent_io_tree *tree,
 295                         struct extent_state *state, int *bits)
 296{
 297        if (tree->ops && tree->ops->set_bit_hook)
 298                tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 299}
 300
 301static void clear_state_cb(struct extent_io_tree *tree,
 302                           struct extent_state *state, int *bits)
 303{
 304        if (tree->ops && tree->ops->clear_bit_hook)
 305                tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 306}
 307
 308static void set_state_bits(struct extent_io_tree *tree,
 309                           struct extent_state *state, int *bits);
 310
 311/*
 312 * insert an extent_state struct into the tree.  'bits' are set on the
 313 * struct before it is inserted.
 314 *
 315 * This may return -EEXIST if the extent is already there, in which case the
 316 * state struct is freed.
 317 *
 318 * The tree lock is not taken internally.  This is a utility function and
 319 * probably isn't what you want to call (see set/clear_extent_bit).
 320 */
 321static int insert_state(struct extent_io_tree *tree,
 322                        struct extent_state *state, u64 start, u64 end,
 323                        int *bits)
 324{
 325        struct rb_node *node;
 326
 327        if (end < start) {
 328                printk(KERN_ERR "btrfs end < start %llu %llu\n",
 329                       (unsigned long long)end,
 330                       (unsigned long long)start);
 331                WARN_ON(1);
 332        }
 333        state->start = start;
 334        state->end = end;
 335
 336        set_state_bits(tree, state, bits);
 337
 338        node = tree_insert(&tree->state, end, &state->rb_node);
 339        if (node) {
 340                struct extent_state *found;
 341                found = rb_entry(node, struct extent_state, rb_node);
 342                printk(KERN_ERR "btrfs found node %llu %llu on insert of "
 343                       "%llu %llu\n", (unsigned long long)found->start,
 344                       (unsigned long long)found->end,
 345                       (unsigned long long)start, (unsigned long long)end);
 346                return -EEXIST;
 347        }
 348        state->tree = tree;
 349        merge_state(tree, state);
 350        return 0;
 351}
 352
 353static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 354                     u64 split)
 355{
 356        if (tree->ops && tree->ops->split_extent_hook)
 357                tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 358}
 359
 360/*
 361 * split a given extent state struct in two, inserting the preallocated
 362 * struct 'prealloc' as the newly created second half.  'split' indicates an
 363 * offset inside 'orig' where it should be split.
 364 *
 365 * Before calling,
 366 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 367 * are two extent state structs in the tree:
 368 * prealloc: [orig->start, split - 1]
 369 * orig: [ split, orig->end ]
 370 *
 371 * The tree locks are not taken by this function. They need to be held
 372 * by the caller.
 373 */
 374static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 375                       struct extent_state *prealloc, u64 split)
 376{
 377        struct rb_node *node;
 378
 379        split_cb(tree, orig, split);
 380
 381        prealloc->start = orig->start;
 382        prealloc->end = split - 1;
 383        prealloc->state = orig->state;
 384        orig->start = split;
 385
 386        node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 387        if (node) {
 388                free_extent_state(prealloc);
 389                return -EEXIST;
 390        }
 391        prealloc->tree = tree;
 392        return 0;
 393}
 394
 395/*
 396 * utility function to clear some bits in an extent state struct.
 397 * it will optionally wake up any one waiting on this state (wake == 1), or
 398 * forcibly remove the state from the tree (delete == 1).
 399 *
 400 * If no bits are set on the state struct after clearing things, the
 401 * struct is freed and removed from the tree
 402 */
 403static int clear_state_bit(struct extent_io_tree *tree,
 404                            struct extent_state *state,
 405                            int *bits, int wake)
 406{
 407        int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 408        int ret = state->state & bits_to_clear;
 409
 410        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 411                u64 range = state->end - state->start + 1;
 412                WARN_ON(range > tree->dirty_bytes);
 413                tree->dirty_bytes -= range;
 414        }
 415        clear_state_cb(tree, state, bits);
 416        state->state &= ~bits_to_clear;
 417        if (wake)
 418                wake_up(&state->wq);
 419        if (state->state == 0) {
 420                if (state->tree) {
 421                        rb_erase(&state->rb_node, &tree->state);
 422                        state->tree = NULL;
 423                        free_extent_state(state);
 424                } else {
 425                        WARN_ON(1);
 426                }
 427        } else {
 428                merge_state(tree, state);
 429        }
 430        return ret;
 431}
 432
 433static struct extent_state *
 434alloc_extent_state_atomic(struct extent_state *prealloc)
 435{
 436        if (!prealloc)
 437                prealloc = alloc_extent_state(GFP_ATOMIC);
 438
 439        return prealloc;
 440}
 441
 442/*
 443 * clear some bits on a range in the tree.  This may require splitting
 444 * or inserting elements in the tree, so the gfp mask is used to
 445 * indicate which allocations or sleeping are allowed.
 446 *
 447 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 448 * the given range from the tree regardless of state (ie for truncate).
 449 *
 450 * the range [start, end] is inclusive.
 451 *
 452 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 453 * bits were already set, or zero if none of the bits were already set.
 454 */
 455int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 456                     int bits, int wake, int delete,
 457                     struct extent_state **cached_state,
 458                     gfp_t mask)
 459{
 460        struct extent_state *state;
 461        struct extent_state *cached;
 462        struct extent_state *prealloc = NULL;
 463        struct rb_node *next_node;
 464        struct rb_node *node;
 465        u64 last_end;
 466        int err;
 467        int set = 0;
 468        int clear = 0;
 469
 470        if (delete)
 471                bits |= ~EXTENT_CTLBITS;
 472        bits |= EXTENT_FIRST_DELALLOC;
 473
 474        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 475                clear = 1;
 476again:
 477        if (!prealloc && (mask & __GFP_WAIT)) {
 478                prealloc = alloc_extent_state(mask);
 479                if (!prealloc)
 480                        return -ENOMEM;
 481        }
 482
 483        spin_lock(&tree->lock);
 484        if (cached_state) {
 485                cached = *cached_state;
 486
 487                if (clear) {
 488                        *cached_state = NULL;
 489                        cached_state = NULL;
 490                }
 491
 492                if (cached && cached->tree && cached->start <= start &&
 493                    cached->end > start) {
 494                        if (clear)
 495                                atomic_dec(&cached->refs);
 496                        state = cached;
 497                        goto hit_next;
 498                }
 499                if (clear)
 500                        free_extent_state(cached);
 501        }
 502        /*
 503         * this search will find the extents that end after
 504         * our range starts
 505         */
 506        node = tree_search(tree, start);
 507        if (!node)
 508                goto out;
 509        state = rb_entry(node, struct extent_state, rb_node);
 510hit_next:
 511        if (state->start > end)
 512                goto out;
 513        WARN_ON(state->end < start);
 514        last_end = state->end;
 515
 516        if (state->end < end && !need_resched())
 517                next_node = rb_next(&state->rb_node);
 518        else
 519                next_node = NULL;
 520
 521        /* the state doesn't have the wanted bits, go ahead */
 522        if (!(state->state & bits))
 523                goto next;
 524
 525        /*
 526         *     | ---- desired range ---- |
 527         *  | state | or
 528         *  | ------------- state -------------- |
 529         *
 530         * We need to split the extent we found, and may flip
 531         * bits on second half.
 532         *
 533         * If the extent we found extends past our range, we
 534         * just split and search again.  It'll get split again
 535         * the next time though.
 536         *
 537         * If the extent we found is inside our range, we clear
 538         * the desired bit on it.
 539         */
 540
 541        if (state->start < start) {
 542                prealloc = alloc_extent_state_atomic(prealloc);
 543                BUG_ON(!prealloc);
 544                err = split_state(tree, state, prealloc, start);
 545                BUG_ON(err == -EEXIST);
 546                prealloc = NULL;
 547                if (err)
 548                        goto out;
 549                if (state->end <= end) {
 550                        set |= clear_state_bit(tree, state, &bits, wake);
 551                        if (last_end == (u64)-1)
 552                                goto out;
 553                        start = last_end + 1;
 554                }
 555                goto search_again;
 556        }
 557        /*
 558         * | ---- desired range ---- |
 559         *                        | state |
 560         * We need to split the extent, and clear the bit
 561         * on the first half
 562         */
 563        if (state->start <= end && state->end > end) {
 564                prealloc = alloc_extent_state_atomic(prealloc);
 565                BUG_ON(!prealloc);
 566                err = split_state(tree, state, prealloc, end + 1);
 567                BUG_ON(err == -EEXIST);
 568                if (wake)
 569                        wake_up(&state->wq);
 570
 571                set |= clear_state_bit(tree, prealloc, &bits, wake);
 572
 573                prealloc = NULL;
 574                goto out;
 575        }
 576
 577        set |= clear_state_bit(tree, state, &bits, wake);
 578next:
 579        if (last_end == (u64)-1)
 580                goto out;
 581        start = last_end + 1;
 582        if (start <= end && next_node) {
 583                state = rb_entry(next_node, struct extent_state,
 584                                 rb_node);
 585                goto hit_next;
 586        }
 587        goto search_again;
 588
 589out:
 590        spin_unlock(&tree->lock);
 591        if (prealloc)
 592                free_extent_state(prealloc);
 593
 594        return set;
 595
 596search_again:
 597        if (start > end)
 598                goto out;
 599        spin_unlock(&tree->lock);
 600        if (mask & __GFP_WAIT)
 601                cond_resched();
 602        goto again;
 603}
 604
 605static int wait_on_state(struct extent_io_tree *tree,
 606                         struct extent_state *state)
 607                __releases(tree->lock)
 608                __acquires(tree->lock)
 609{
 610        DEFINE_WAIT(wait);
 611        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 612        spin_unlock(&tree->lock);
 613        schedule();
 614        spin_lock(&tree->lock);
 615        finish_wait(&state->wq, &wait);
 616        return 0;
 617}
 618
 619/*
 620 * waits for one or more bits to clear on a range in the state tree.
 621 * The range [start, end] is inclusive.
 622 * The tree lock is taken by this function
 623 */
 624int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 625{
 626        struct extent_state *state;
 627        struct rb_node *node;
 628
 629        spin_lock(&tree->lock);
 630again:
 631        while (1) {
 632                /*
 633                 * this search will find all the extents that end after
 634                 * our range starts
 635                 */
 636                node = tree_search(tree, start);
 637                if (!node)
 638                        break;
 639
 640                state = rb_entry(node, struct extent_state, rb_node);
 641
 642                if (state->start > end)
 643                        goto out;
 644
 645                if (state->state & bits) {
 646                        start = state->start;
 647                        atomic_inc(&state->refs);
 648                        wait_on_state(tree, state);
 649                        free_extent_state(state);
 650                        goto again;
 651                }
 652                start = state->end + 1;
 653
 654                if (start > end)
 655                        break;
 656
 657                cond_resched_lock(&tree->lock);
 658        }
 659out:
 660        spin_unlock(&tree->lock);
 661        return 0;
 662}
 663
 664static void set_state_bits(struct extent_io_tree *tree,
 665                           struct extent_state *state,
 666                           int *bits)
 667{
 668        int bits_to_set = *bits & ~EXTENT_CTLBITS;
 669
 670        set_state_cb(tree, state, bits);
 671        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 672                u64 range = state->end - state->start + 1;
 673                tree->dirty_bytes += range;
 674        }
 675        state->state |= bits_to_set;
 676}
 677
 678static void cache_state(struct extent_state *state,
 679                        struct extent_state **cached_ptr)
 680{
 681        if (cached_ptr && !(*cached_ptr)) {
 682                if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
 683                        *cached_ptr = state;
 684                        atomic_inc(&state->refs);
 685                }
 686        }
 687}
 688
 689static void uncache_state(struct extent_state **cached_ptr)
 690{
 691        if (cached_ptr && (*cached_ptr)) {
 692                struct extent_state *state = *cached_ptr;
 693                *cached_ptr = NULL;
 694                free_extent_state(state);
 695        }
 696}
 697
 698/*
 699 * set some bits on a range in the tree.  This may require allocations or
 700 * sleeping, so the gfp mask is used to indicate what is allowed.
 701 *
 702 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 703 * part of the range already has the desired bits set.  The start of the
 704 * existing range is returned in failed_start in this case.
 705 *
 706 * [start, end] is inclusive This takes the tree lock.
 707 */
 708
 709int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 710                   int bits, int exclusive_bits, u64 *failed_start,
 711                   struct extent_state **cached_state, gfp_t mask)
 712{
 713        struct extent_state *state;
 714        struct extent_state *prealloc = NULL;
 715        struct rb_node *node;
 716        int err = 0;
 717        u64 last_start;
 718        u64 last_end;
 719
 720        bits |= EXTENT_FIRST_DELALLOC;
 721again:
 722        if (!prealloc && (mask & __GFP_WAIT)) {
 723                prealloc = alloc_extent_state(mask);
 724                BUG_ON(!prealloc);
 725        }
 726
 727        spin_lock(&tree->lock);
 728        if (cached_state && *cached_state) {
 729                state = *cached_state;
 730                if (state->start <= start && state->end > start &&
 731                    state->tree) {
 732                        node = &state->rb_node;
 733                        goto hit_next;
 734                }
 735        }
 736        /*
 737         * this search will find all the extents that end after
 738         * our range starts.
 739         */
 740        node = tree_search(tree, start);
 741        if (!node) {
 742                prealloc = alloc_extent_state_atomic(prealloc);
 743                BUG_ON(!prealloc);
 744                err = insert_state(tree, prealloc, start, end, &bits);
 745                prealloc = NULL;
 746                BUG_ON(err == -EEXIST);
 747                goto out;
 748        }
 749        state = rb_entry(node, struct extent_state, rb_node);
 750hit_next:
 751        last_start = state->start;
 752        last_end = state->end;
 753
 754        /*
 755         * | ---- desired range ---- |
 756         * | state |
 757         *
 758         * Just lock what we found and keep going
 759         */
 760        if (state->start == start && state->end <= end) {
 761                struct rb_node *next_node;
 762                if (state->state & exclusive_bits) {
 763                        *failed_start = state->start;
 764                        err = -EEXIST;
 765                        goto out;
 766                }
 767
 768                set_state_bits(tree, state, &bits);
 769
 770                cache_state(state, cached_state);
 771                merge_state(tree, state);
 772                if (last_end == (u64)-1)
 773                        goto out;
 774
 775                start = last_end + 1;
 776                next_node = rb_next(&state->rb_node);
 777                if (next_node && start < end && prealloc && !need_resched()) {
 778                        state = rb_entry(next_node, struct extent_state,
 779                                         rb_node);
 780                        if (state->start == start)
 781                                goto hit_next;
 782                }
 783                goto search_again;
 784        }
 785
 786        /*
 787         *     | ---- desired range ---- |
 788         * | state |
 789         *   or
 790         * | ------------- state -------------- |
 791         *
 792         * We need to split the extent we found, and may flip bits on
 793         * second half.
 794         *
 795         * If the extent we found extends past our
 796         * range, we just split and search again.  It'll get split
 797         * again the next time though.
 798         *
 799         * If the extent we found is inside our range, we set the
 800         * desired bit on it.
 801         */
 802        if (state->start < start) {
 803                if (state->state & exclusive_bits) {
 804                        *failed_start = start;
 805                        err = -EEXIST;
 806                        goto out;
 807                }
 808
 809                prealloc = alloc_extent_state_atomic(prealloc);
 810                BUG_ON(!prealloc);
 811                err = split_state(tree, state, prealloc, start);
 812                BUG_ON(err == -EEXIST);
 813                prealloc = NULL;
 814                if (err)
 815                        goto out;
 816                if (state->end <= end) {
 817                        set_state_bits(tree, state, &bits);
 818                        cache_state(state, cached_state);
 819                        merge_state(tree, state);
 820                        if (last_end == (u64)-1)
 821                                goto out;
 822                        start = last_end + 1;
 823                }
 824                goto search_again;
 825        }
 826        /*
 827         * | ---- desired range ---- |
 828         *     | state | or               | state |
 829         *
 830         * There's a hole, we need to insert something in it and
 831         * ignore the extent we found.
 832         */
 833        if (state->start > start) {
 834                u64 this_end;
 835                if (end < last_start)
 836                        this_end = end;
 837                else
 838                        this_end = last_start - 1;
 839
 840                prealloc = alloc_extent_state_atomic(prealloc);
 841                BUG_ON(!prealloc);
 842
 843                /*
 844                 * Avoid to free 'prealloc' if it can be merged with
 845                 * the later extent.
 846                 */
 847                err = insert_state(tree, prealloc, start, this_end,
 848                                   &bits);
 849                BUG_ON(err == -EEXIST);
 850                if (err) {
 851                        free_extent_state(prealloc);
 852                        prealloc = NULL;
 853                        goto out;
 854                }
 855                cache_state(prealloc, cached_state);
 856                prealloc = NULL;
 857                start = this_end + 1;
 858                goto search_again;
 859        }
 860        /*
 861         * | ---- desired range ---- |
 862         *                        | state |
 863         * We need to split the extent, and set the bit
 864         * on the first half
 865         */
 866        if (state->start <= end && state->end > end) {
 867                if (state->state & exclusive_bits) {
 868                        *failed_start = start;
 869                        err = -EEXIST;
 870                        goto out;
 871                }
 872
 873                prealloc = alloc_extent_state_atomic(prealloc);
 874                BUG_ON(!prealloc);
 875                err = split_state(tree, state, prealloc, end + 1);
 876                BUG_ON(err == -EEXIST);
 877
 878                set_state_bits(tree, prealloc, &bits);
 879                cache_state(prealloc, cached_state);
 880                merge_state(tree, prealloc);
 881                prealloc = NULL;
 882                goto out;
 883        }
 884
 885        goto search_again;
 886
 887out:
 888        spin_unlock(&tree->lock);
 889        if (prealloc)
 890                free_extent_state(prealloc);
 891
 892        return err;
 893
 894search_again:
 895        if (start > end)
 896                goto out;
 897        spin_unlock(&tree->lock);
 898        if (mask & __GFP_WAIT)
 899                cond_resched();
 900        goto again;
 901}
 902
 903/**
 904 * convert_extent - convert all bits in a given range from one bit to another
 905 * @tree:       the io tree to search
 906 * @start:      the start offset in bytes
 907 * @end:        the end offset in bytes (inclusive)
 908 * @bits:       the bits to set in this range
 909 * @clear_bits: the bits to clear in this range
 910 * @mask:       the allocation mask
 911 *
 912 * This will go through and set bits for the given range.  If any states exist
 913 * already in this range they are set with the given bit and cleared of the
 914 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 915 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 916 * boundary bits like LOCK.
 917 */
 918int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 919                       int bits, int clear_bits, gfp_t mask)
 920{
 921        struct extent_state *state;
 922        struct extent_state *prealloc = NULL;
 923        struct rb_node *node;
 924        int err = 0;
 925        u64 last_start;
 926        u64 last_end;
 927
 928again:
 929        if (!prealloc && (mask & __GFP_WAIT)) {
 930                prealloc = alloc_extent_state(mask);
 931                if (!prealloc)
 932                        return -ENOMEM;
 933        }
 934
 935        spin_lock(&tree->lock);
 936        /*
 937         * this search will find all the extents that end after
 938         * our range starts.
 939         */
 940        node = tree_search(tree, start);
 941        if (!node) {
 942                prealloc = alloc_extent_state_atomic(prealloc);
 943                if (!prealloc) {
 944                        err = -ENOMEM;
 945                        goto out;
 946                }
 947                err = insert_state(tree, prealloc, start, end, &bits);
 948                prealloc = NULL;
 949                BUG_ON(err == -EEXIST);
 950                goto out;
 951        }
 952        state = rb_entry(node, struct extent_state, rb_node);
 953hit_next:
 954        last_start = state->start;
 955        last_end = state->end;
 956
 957        /*
 958         * | ---- desired range ---- |
 959         * | state |
 960         *
 961         * Just lock what we found and keep going
 962         */
 963        if (state->start == start && state->end <= end) {
 964                struct rb_node *next_node;
 965
 966                set_state_bits(tree, state, &bits);
 967                clear_state_bit(tree, state, &clear_bits, 0);
 968                if (last_end == (u64)-1)
 969                        goto out;
 970
 971                start = last_end + 1;
 972                next_node = rb_next(&state->rb_node);
 973                if (next_node && start < end && prealloc && !need_resched()) {
 974                        state = rb_entry(next_node, struct extent_state,
 975                                         rb_node);
 976                        if (state->start == start)
 977                                goto hit_next;
 978                }
 979                goto search_again;
 980        }
 981
 982        /*
 983         *     | ---- desired range ---- |
 984         * | state |
 985         *   or
 986         * | ------------- state -------------- |
 987         *
 988         * We need to split the extent we found, and may flip bits on
 989         * second half.
 990         *
 991         * If the extent we found extends past our
 992         * range, we just split and search again.  It'll get split
 993         * again the next time though.
 994         *
 995         * If the extent we found is inside our range, we set the
 996         * desired bit on it.
 997         */
 998        if (state->start < start) {
 999                prealloc = alloc_extent_state_atomic(prealloc);
1000                if (!prealloc) {
1001                        err = -ENOMEM;
1002                        goto out;
1003                }
1004                err = split_state(tree, state, prealloc, start);
1005                BUG_ON(err == -EEXIST);
1006                prealloc = NULL;
1007                if (err)
1008                        goto out;
1009                if (state->end <= end) {
1010                        set_state_bits(tree, state, &bits);
1011                        clear_state_bit(tree, state, &clear_bits, 0);
1012                        if (last_end == (u64)-1)
1013                                goto out;
1014                        start = last_end + 1;
1015                }
1016                goto search_again;
1017        }
1018        /*
1019         * | ---- desired range ---- |
1020         *     | state | or               | state |
1021         *
1022         * There's a hole, we need to insert something in it and
1023         * ignore the extent we found.
1024         */
1025        if (state->start > start) {
1026                u64 this_end;
1027                if (end < last_start)
1028                        this_end = end;
1029                else
1030                        this_end = last_start - 1;
1031
1032                prealloc = alloc_extent_state_atomic(prealloc);
1033                if (!prealloc) {
1034                        err = -ENOMEM;
1035                        goto out;
1036                }
1037
1038                /*
1039                 * Avoid to free 'prealloc' if it can be merged with
1040                 * the later extent.
1041                 */
1042                err = insert_state(tree, prealloc, start, this_end,
1043                                   &bits);
1044                BUG_ON(err == -EEXIST);
1045                if (err) {
1046                        free_extent_state(prealloc);
1047                        prealloc = NULL;
1048                        goto out;
1049                }
1050                prealloc = NULL;
1051                start = this_end + 1;
1052                goto search_again;
1053        }
1054        /*
1055         * | ---- desired range ---- |
1056         *                        | state |
1057         * We need to split the extent, and set the bit
1058         * on the first half
1059         */
1060        if (state->start <= end && state->end > end) {
1061                prealloc = alloc_extent_state_atomic(prealloc);
1062                if (!prealloc) {
1063                        err = -ENOMEM;
1064                        goto out;
1065                }
1066
1067                err = split_state(tree, state, prealloc, end + 1);
1068                BUG_ON(err == -EEXIST);
1069
1070                set_state_bits(tree, prealloc, &bits);
1071                clear_state_bit(tree, prealloc, &clear_bits, 0);
1072                prealloc = NULL;
1073                goto out;
1074        }
1075
1076        goto search_again;
1077
1078out:
1079        spin_unlock(&tree->lock);
1080        if (prealloc)
1081                free_extent_state(prealloc);
1082
1083        return err;
1084
1085search_again:
1086        if (start > end)
1087                goto out;
1088        spin_unlock(&tree->lock);
1089        if (mask & __GFP_WAIT)
1090                cond_resched();
1091        goto again;
1092}
1093
1094/* wrappers around set/clear extent bit */
1095int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1096                     gfp_t mask)
1097{
1098        return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
1099                              NULL, mask);
1100}
1101
1102int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1103                    int bits, gfp_t mask)
1104{
1105        return set_extent_bit(tree, start, end, bits, 0, NULL,
1106                              NULL, mask);
1107}
1108
1109int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1110                      int bits, gfp_t mask)
1111{
1112        return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1113}
1114
1115int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1116                        struct extent_state **cached_state, gfp_t mask)
1117{
1118        return set_extent_bit(tree, start, end,
1119                              EXTENT_DELALLOC | EXTENT_UPTODATE,
1120                              0, NULL, cached_state, mask);
1121}
1122
1123int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1124                       gfp_t mask)
1125{
1126        return clear_extent_bit(tree, start, end,
1127                                EXTENT_DIRTY | EXTENT_DELALLOC |
1128                                EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1129}
1130
1131int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1132                     gfp_t mask)
1133{
1134        return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
1135                              NULL, mask);
1136}
1137
1138int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1139                        struct extent_state **cached_state, gfp_t mask)
1140{
1141        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1142                              NULL, cached_state, mask);
1143}
1144
1145static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
1146                                 u64 end, struct extent_state **cached_state,
1147                                 gfp_t mask)
1148{
1149        return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1150                                cached_state, mask);
1151}
1152
1153/*
1154 * either insert or lock state struct between start and end use mask to tell
1155 * us if waiting is desired.
1156 */
1157int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1158                     int bits, struct extent_state **cached_state, gfp_t mask)
1159{
1160        int err;
1161        u64 failed_start;
1162        while (1) {
1163                err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1164                                     EXTENT_LOCKED, &failed_start,
1165                                     cached_state, mask);
1166                if (err == -EEXIST && (mask & __GFP_WAIT)) {
1167                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1168                        start = failed_start;
1169                } else {
1170                        break;
1171                }
1172                WARN_ON(start > end);
1173        }
1174        return err;
1175}
1176
1177int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1178{
1179        return lock_extent_bits(tree, start, end, 0, NULL, mask);
1180}
1181
1182int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1183                    gfp_t mask)
1184{
1185        int err;
1186        u64 failed_start;
1187
1188        err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1189                             &failed_start, NULL, mask);
1190        if (err == -EEXIST) {
1191                if (failed_start > start)
1192                        clear_extent_bit(tree, start, failed_start - 1,
1193                                         EXTENT_LOCKED, 1, 0, NULL, mask);
1194                return 0;
1195        }
1196        return 1;
1197}
1198
1199int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1200                         struct extent_state **cached, gfp_t mask)
1201{
1202        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1203                                mask);
1204}
1205
1206int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1207{
1208        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1209                                mask);
1210}
1211
1212/*
1213 * helper function to set both pages and extents in the tree writeback
1214 */
1215static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1216{
1217        unsigned long index = start >> PAGE_CACHE_SHIFT;
1218        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1219        struct page *page;
1220
1221        while (index <= end_index) {
1222                page = find_get_page(tree->mapping, index);
1223                BUG_ON(!page);
1224                set_page_writeback(page);
1225                page_cache_release(page);
1226                index++;
1227        }
1228        return 0;
1229}
1230
1231/* find the first state struct with 'bits' set after 'start', and
1232 * return it.  tree->lock must be held.  NULL will returned if
1233 * nothing was found after 'start'
1234 */
1235struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1236                                                 u64 start, int bits)
1237{
1238        struct rb_node *node;
1239        struct extent_state *state;
1240
1241        /*
1242         * this search will find all the extents that end after
1243         * our range starts.
1244         */
1245        node = tree_search(tree, start);
1246        if (!node)
1247                goto out;
1248
1249        while (1) {
1250                state = rb_entry(node, struct extent_state, rb_node);
1251                if (state->end >= start && (state->state & bits))
1252                        return state;
1253
1254                node = rb_next(node);
1255                if (!node)
1256                        break;
1257        }
1258out:
1259        return NULL;
1260}
1261
1262/*
1263 * find the first offset in the io tree with 'bits' set. zero is
1264 * returned if we find something, and *start_ret and *end_ret are
1265 * set to reflect the state struct that was found.
1266 *
1267 * If nothing was found, 1 is returned, < 0 on error
1268 */
1269int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1270                          u64 *start_ret, u64 *end_ret, int bits)
1271{
1272        struct extent_state *state;
1273        int ret = 1;
1274
1275        spin_lock(&tree->lock);
1276        state = find_first_extent_bit_state(tree, start, bits);
1277        if (state) {
1278                *start_ret = state->start;
1279                *end_ret = state->end;
1280                ret = 0;
1281        }
1282        spin_unlock(&tree->lock);
1283        return ret;
1284}
1285
1286/*
1287 * find a contiguous range of bytes in the file marked as delalloc, not
1288 * more than 'max_bytes'.  start and end are used to return the range,
1289 *
1290 * 1 is returned if we find something, 0 if nothing was in the tree
1291 */
1292static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1293                                        u64 *start, u64 *end, u64 max_bytes,
1294                                        struct extent_state **cached_state)
1295{
1296        struct rb_node *node;
1297        struct extent_state *state;
1298        u64 cur_start = *start;
1299        u64 found = 0;
1300        u64 total_bytes = 0;
1301
1302        spin_lock(&tree->lock);
1303
1304        /*
1305         * this search will find all the extents that end after
1306         * our range starts.
1307         */
1308        node = tree_search(tree, cur_start);
1309        if (!node) {
1310                if (!found)
1311                        *end = (u64)-1;
1312                goto out;
1313        }
1314
1315        while (1) {
1316                state = rb_entry(node, struct extent_state, rb_node);
1317                if (found && (state->start != cur_start ||
1318                              (state->state & EXTENT_BOUNDARY))) {
1319                        goto out;
1320                }
1321                if (!(state->state & EXTENT_DELALLOC)) {
1322                        if (!found)
1323                                *end = state->end;
1324                        goto out;
1325                }
1326                if (!found) {
1327                        *start = state->start;
1328                        *cached_state = state;
1329                        atomic_inc(&state->refs);
1330                }
1331                found++;
1332                *end = state->end;
1333                cur_start = state->end + 1;
1334                node = rb_next(node);
1335                if (!node)
1336                        break;
1337                total_bytes += state->end - state->start + 1;
1338                if (total_bytes >= max_bytes)
1339                        break;
1340        }
1341out:
1342        spin_unlock(&tree->lock);
1343        return found;
1344}
1345
1346static noinline int __unlock_for_delalloc(struct inode *inode,
1347                                          struct page *locked_page,
1348                                          u64 start, u64 end)
1349{
1350        int ret;
1351        struct page *pages[16];
1352        unsigned long index = start >> PAGE_CACHE_SHIFT;
1353        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1354        unsigned long nr_pages = end_index - index + 1;
1355        int i;
1356
1357        if (index == locked_page->index && end_index == index)
1358                return 0;
1359
1360        while (nr_pages > 0) {
1361                ret = find_get_pages_contig(inode->i_mapping, index,
1362                                     min_t(unsigned long, nr_pages,
1363                                     ARRAY_SIZE(pages)), pages);
1364                for (i = 0; i < ret; i++) {
1365                        if (pages[i] != locked_page)
1366                                unlock_page(pages[i]);
1367                        page_cache_release(pages[i]);
1368                }
1369                nr_pages -= ret;
1370                index += ret;
1371                cond_resched();
1372        }
1373        return 0;
1374}
1375
1376static noinline int lock_delalloc_pages(struct inode *inode,
1377                                        struct page *locked_page,
1378                                        u64 delalloc_start,
1379                                        u64 delalloc_end)
1380{
1381        unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1382        unsigned long start_index = index;
1383        unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1384        unsigned long pages_locked = 0;
1385        struct page *pages[16];
1386        unsigned long nrpages;
1387        int ret;
1388        int i;
1389
1390        /* the caller is responsible for locking the start index */
1391        if (index == locked_page->index && index == end_index)
1392                return 0;
1393
1394        /* skip the page at the start index */
1395        nrpages = end_index - index + 1;
1396        while (nrpages > 0) {
1397                ret = find_get_pages_contig(inode->i_mapping, index,
1398                                     min_t(unsigned long,
1399                                     nrpages, ARRAY_SIZE(pages)), pages);
1400                if (ret == 0) {
1401                        ret = -EAGAIN;
1402                        goto done;
1403                }
1404                /* now we have an array of pages, lock them all */
1405                for (i = 0; i < ret; i++) {
1406                        /*
1407                         * the caller is taking responsibility for
1408                         * locked_page
1409                         */
1410                        if (pages[i] != locked_page) {
1411                                lock_page(pages[i]);
1412                                if (!PageDirty(pages[i]) ||
1413                                    pages[i]->mapping != inode->i_mapping) {
1414                                        ret = -EAGAIN;
1415                                        unlock_page(pages[i]);
1416                                        page_cache_release(pages[i]);
1417                                        goto done;
1418                                }
1419                        }
1420                        page_cache_release(pages[i]);
1421                        pages_locked++;
1422                }
1423                nrpages -= ret;
1424                index += ret;
1425                cond_resched();
1426        }
1427        ret = 0;
1428done:
1429        if (ret && pages_locked) {
1430                __unlock_for_delalloc(inode, locked_page,
1431                              delalloc_start,
1432                              ((u64)(start_index + pages_locked - 1)) <<
1433                              PAGE_CACHE_SHIFT);
1434        }
1435        return ret;
1436}
1437
1438/*
1439 * find a contiguous range of bytes in the file marked as delalloc, not
1440 * more than 'max_bytes'.  start and end are used to return the range,
1441 *
1442 * 1 is returned if we find something, 0 if nothing was in the tree
1443 */
1444static noinline u64 find_lock_delalloc_range(struct inode *inode,
1445                                             struct extent_io_tree *tree,
1446                                             struct page *locked_page,
1447                                             u64 *start, u64 *end,
1448                                             u64 max_bytes)
1449{
1450        u64 delalloc_start;
1451        u64 delalloc_end;
1452        u64 found;
1453        struct extent_state *cached_state = NULL;
1454        int ret;
1455        int loops = 0;
1456
1457again:
1458        /* step one, find a bunch of delalloc bytes starting at start */
1459        delalloc_start = *start;
1460        delalloc_end = 0;
1461        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1462                                    max_bytes, &cached_state);
1463        if (!found || delalloc_end <= *start) {
1464                *start = delalloc_start;
1465                *end = delalloc_end;
1466                free_extent_state(cached_state);
1467                return found;
1468        }
1469
1470        /*
1471         * start comes from the offset of locked_page.  We have to lock
1472         * pages in order, so we can't process delalloc bytes before
1473         * locked_page
1474         */
1475        if (delalloc_start < *start)
1476                delalloc_start = *start;
1477
1478        /*
1479         * make sure to limit the number of pages we try to lock down
1480         * if we're looping.
1481         */
1482        if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1483                delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1484
1485        /* step two, lock all the pages after the page that has start */
1486        ret = lock_delalloc_pages(inode, locked_page,
1487                                  delalloc_start, delalloc_end);
1488        if (ret == -EAGAIN) {
1489                /* some of the pages are gone, lets avoid looping by
1490                 * shortening the size of the delalloc range we're searching
1491                 */
1492                free_extent_state(cached_state);
1493                if (!loops) {
1494                        unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1495                        max_bytes = PAGE_CACHE_SIZE - offset;
1496                        loops = 1;
1497                        goto again;
1498                } else {
1499                        found = 0;
1500                        goto out_failed;
1501                }
1502        }
1503        BUG_ON(ret);
1504
1505        /* step three, lock the state bits for the whole range */
1506        lock_extent_bits(tree, delalloc_start, delalloc_end,
1507                         0, &cached_state, GFP_NOFS);
1508
1509        /* then test to make sure it is all still delalloc */
1510        ret = test_range_bit(tree, delalloc_start, delalloc_end,
1511                             EXTENT_DELALLOC, 1, cached_state);
1512        if (!ret) {
1513                unlock_extent_cached(tree, delalloc_start, delalloc_end,
1514                                     &cached_state, GFP_NOFS);
1515                __unlock_for_delalloc(inode, locked_page,
1516                              delalloc_start, delalloc_end);
1517                cond_resched();
1518                goto again;
1519        }
1520        free_extent_state(cached_state);
1521        *start = delalloc_start;
1522        *end = delalloc_end;
1523out_failed:
1524        return found;
1525}
1526
1527int extent_clear_unlock_delalloc(struct inode *inode,
1528                                struct extent_io_tree *tree,
1529                                u64 start, u64 end, struct page *locked_page,
1530                                unsigned long op)
1531{
1532        int ret;
1533        struct page *pages[16];
1534        unsigned long index = start >> PAGE_CACHE_SHIFT;
1535        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1536        unsigned long nr_pages = end_index - index + 1;
1537        int i;
1538        int clear_bits = 0;
1539
1540        if (op & EXTENT_CLEAR_UNLOCK)
1541                clear_bits |= EXTENT_LOCKED;
1542        if (op & EXTENT_CLEAR_DIRTY)
1543                clear_bits |= EXTENT_DIRTY;
1544
1545        if (op & EXTENT_CLEAR_DELALLOC)
1546                clear_bits |= EXTENT_DELALLOC;
1547
1548        clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1549        if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1550                    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1551                    EXTENT_SET_PRIVATE2)))
1552                return 0;
1553
1554        while (nr_pages > 0) {
1555                ret = find_get_pages_contig(inode->i_mapping, index,
1556                                     min_t(unsigned long,
1557                                     nr_pages, ARRAY_SIZE(pages)), pages);
1558                for (i = 0; i < ret; i++) {
1559
1560                        if (op & EXTENT_SET_PRIVATE2)
1561                                SetPagePrivate2(pages[i]);
1562
1563                        if (pages[i] == locked_page) {
1564                                page_cache_release(pages[i]);
1565                                continue;
1566                        }
1567                        if (op & EXTENT_CLEAR_DIRTY)
1568                                clear_page_dirty_for_io(pages[i]);
1569                        if (op & EXTENT_SET_WRITEBACK)
1570                                set_page_writeback(pages[i]);
1571                        if (op & EXTENT_END_WRITEBACK)
1572                                end_page_writeback(pages[i]);
1573                        if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1574                                unlock_page(pages[i]);
1575                        page_cache_release(pages[i]);
1576                }
1577                nr_pages -= ret;
1578                index += ret;
1579                cond_resched();
1580        }
1581        return 0;
1582}
1583
1584/*
1585 * count the number of bytes in the tree that have a given bit(s)
1586 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1587 * cached.  The total number found is returned.
1588 */
1589u64 count_range_bits(struct extent_io_tree *tree,
1590                     u64 *start, u64 search_end, u64 max_bytes,
1591                     unsigned long bits, int contig)
1592{
1593        struct rb_node *node;
1594        struct extent_state *state;
1595        u64 cur_start = *start;
1596        u64 total_bytes = 0;
1597        u64 last = 0;
1598        int found = 0;
1599
1600        if (search_end <= cur_start) {
1601                WARN_ON(1);
1602                return 0;
1603        }
1604
1605        spin_lock(&tree->lock);
1606        if (cur_start == 0 && bits == EXTENT_DIRTY) {
1607                total_bytes = tree->dirty_bytes;
1608                goto out;
1609        }
1610        /*
1611         * this search will find all the extents that end after
1612         * our range starts.
1613         */
1614        node = tree_search(tree, cur_start);
1615        if (!node)
1616                goto out;
1617
1618        while (1) {
1619                state = rb_entry(node, struct extent_state, rb_node);
1620                if (state->start > search_end)
1621                        break;
1622                if (contig && found && state->start > last + 1)
1623                        break;
1624                if (state->end >= cur_start && (state->state & bits) == bits) {
1625                        total_bytes += min(search_end, state->end) + 1 -
1626                                       max(cur_start, state->start);
1627                        if (total_bytes >= max_bytes)
1628                                break;
1629                        if (!found) {
1630                                *start = max(cur_start, state->start);
1631                                found = 1;
1632                        }
1633                        last = state->end;
1634                } else if (contig && found) {
1635                        break;
1636                }
1637                node = rb_next(node);
1638                if (!node)
1639                        break;
1640        }
1641out:
1642        spin_unlock(&tree->lock);
1643        return total_bytes;
1644}
1645
1646/*
1647 * set the private field for a given byte offset in the tree.  If there isn't
1648 * an extent_state there already, this does nothing.
1649 */
1650int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1651{
1652        struct rb_node *node;
1653        struct extent_state *state;
1654        int ret = 0;
1655
1656        spin_lock(&tree->lock);
1657        /*
1658         * this search will find all the extents that end after
1659         * our range starts.
1660         */
1661        node = tree_search(tree, start);
1662        if (!node) {
1663                ret = -ENOENT;
1664                goto out;
1665        }
1666        state = rb_entry(node, struct extent_state, rb_node);
1667        if (state->start != start) {
1668                ret = -ENOENT;
1669                goto out;
1670        }
1671        state->private = private;
1672out:
1673        spin_unlock(&tree->lock);
1674        return ret;
1675}
1676
1677int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1678{
1679        struct rb_node *node;
1680        struct extent_state *state;
1681        int ret = 0;
1682
1683        spin_lock(&tree->lock);
1684        /*
1685         * this search will find all the extents that end after
1686         * our range starts.
1687         */
1688        node = tree_search(tree, start);
1689        if (!node) {
1690                ret = -ENOENT;
1691                goto out;
1692        }
1693        state = rb_entry(node, struct extent_state, rb_node);
1694        if (state->start != start) {
1695                ret = -ENOENT;
1696                goto out;
1697        }
1698        *private = state->private;
1699out:
1700        spin_unlock(&tree->lock);
1701        return ret;
1702}
1703
1704/*
1705 * searches a range in the state tree for a given mask.
1706 * If 'filled' == 1, this returns 1 only if every extent in the tree
1707 * has the bits set.  Otherwise, 1 is returned if any bit in the
1708 * range is found set.
1709 */
1710int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1711                   int bits, int filled, struct extent_state *cached)
1712{
1713        struct extent_state *state = NULL;
1714        struct rb_node *node;
1715        int bitset = 0;
1716
1717        spin_lock(&tree->lock);
1718        if (cached && cached->tree && cached->start <= start &&
1719            cached->end > start)
1720                node = &cached->rb_node;
1721        else
1722                node = tree_search(tree, start);
1723        while (node && start <= end) {
1724                state = rb_entry(node, struct extent_state, rb_node);
1725
1726                if (filled && state->start > start) {
1727                        bitset = 0;
1728                        break;
1729                }
1730
1731                if (state->start > end)
1732                        break;
1733
1734                if (state->state & bits) {
1735                        bitset = 1;
1736                        if (!filled)
1737                                break;
1738                } else if (filled) {
1739                        bitset = 0;
1740                        break;
1741                }
1742
1743                if (state->end == (u64)-1)
1744                        break;
1745
1746                start = state->end + 1;
1747                if (start > end)
1748                        break;
1749                node = rb_next(node);
1750                if (!node) {
1751                        if (filled)
1752                                bitset = 0;
1753                        break;
1754                }
1755        }
1756        spin_unlock(&tree->lock);
1757        return bitset;
1758}
1759
1760/*
1761 * helper function to set a given page up to date if all the
1762 * extents in the tree for that page are up to date
1763 */
1764static int check_page_uptodate(struct extent_io_tree *tree,
1765                               struct page *page)
1766{
1767        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1768        u64 end = start + PAGE_CACHE_SIZE - 1;
1769        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1770                SetPageUptodate(page);
1771        return 0;
1772}
1773
1774/*
1775 * helper function to unlock a page if all the extents in the tree
1776 * for that page are unlocked
1777 */
1778static int check_page_locked(struct extent_io_tree *tree,
1779                             struct page *page)
1780{
1781        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1782        u64 end = start + PAGE_CACHE_SIZE - 1;
1783        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1784                unlock_page(page);
1785        return 0;
1786}
1787
1788/*
1789 * helper function to end page writeback if all the extents
1790 * in the tree for that page are done with writeback
1791 */
1792static int check_page_writeback(struct extent_io_tree *tree,
1793                             struct page *page)
1794{
1795        end_page_writeback(page);
1796        return 0;
1797}
1798
1799/*
1800 * When IO fails, either with EIO or csum verification fails, we
1801 * try other mirrors that might have a good copy of the data.  This
1802 * io_failure_record is used to record state as we go through all the
1803 * mirrors.  If another mirror has good data, the page is set up to date
1804 * and things continue.  If a good mirror can't be found, the original
1805 * bio end_io callback is called to indicate things have failed.
1806 */
1807struct io_failure_record {
1808        struct page *page;
1809        u64 start;
1810        u64 len;
1811        u64 logical;
1812        unsigned long bio_flags;
1813        int this_mirror;
1814        int failed_mirror;
1815        int in_validation;
1816};
1817
1818static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1819                                int did_repair)
1820{
1821        int ret;
1822        int err = 0;
1823        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1824
1825        set_state_private(failure_tree, rec->start, 0);
1826        ret = clear_extent_bits(failure_tree, rec->start,
1827                                rec->start + rec->len - 1,
1828                                EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1829        if (ret)
1830                err = ret;
1831
1832        if (did_repair) {
1833                ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1834                                        rec->start + rec->len - 1,
1835                                        EXTENT_DAMAGED, GFP_NOFS);
1836                if (ret && !err)
1837                        err = ret;
1838        }
1839
1840        kfree(rec);
1841        return err;
1842}
1843
1844static void repair_io_failure_callback(struct bio *bio, int err)
1845{
1846        complete(bio->bi_private);
1847}
1848
1849/*
1850 * this bypasses the standard btrfs submit functions deliberately, as
1851 * the standard behavior is to write all copies in a raid setup. here we only
1852 * want to write the one bad copy. so we do the mapping for ourselves and issue
1853 * submit_bio directly.
1854 * to avoid any synchonization issues, wait for the data after writing, which
1855 * actually prevents the read that triggered the error from finishing.
1856 * currently, there can be no more than two copies of every data bit. thus,
1857 * exactly one rewrite is required.
1858 */
1859int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1860                        u64 length, u64 logical, struct page *page,
1861                        int mirror_num)
1862{
1863        struct bio *bio;
1864        struct btrfs_device *dev;
1865        DECLARE_COMPLETION_ONSTACK(compl);
1866        u64 map_length = 0;
1867        u64 sector;
1868        struct btrfs_bio *bbio = NULL;
1869        int ret;
1870
1871        BUG_ON(!mirror_num);
1872
1873        bio = bio_alloc(GFP_NOFS, 1);
1874        if (!bio)
1875                return -EIO;
1876        bio->bi_private = &compl;
1877        bio->bi_end_io = repair_io_failure_callback;
1878        bio->bi_size = 0;
1879        map_length = length;
1880
1881        ret = btrfs_map_block(map_tree, WRITE, logical,
1882                              &map_length, &bbio, mirror_num);
1883        if (ret) {
1884                bio_put(bio);
1885                return -EIO;
1886        }
1887        BUG_ON(mirror_num != bbio->mirror_num);
1888        sector = bbio->stripes[mirror_num-1].physical >> 9;
1889        bio->bi_sector = sector;
1890        dev = bbio->stripes[mirror_num-1].dev;
1891        kfree(bbio);
1892        if (!dev || !dev->bdev || !dev->writeable) {
1893                bio_put(bio);
1894                return -EIO;
1895        }
1896        bio->bi_bdev = dev->bdev;
1897        bio_add_page(bio, page, length, start-page_offset(page));
1898        btrfsic_submit_bio(WRITE_SYNC, bio);
1899        wait_for_completion(&compl);
1900
1901        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1902                /* try to remap that extent elsewhere? */
1903                bio_put(bio);
1904                return -EIO;
1905        }
1906
1907        printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1908                        "sector %llu)\n", page->mapping->host->i_ino, start,
1909                        dev->name, sector);
1910
1911        bio_put(bio);
1912        return 0;
1913}
1914
1915/*
1916 * each time an IO finishes, we do a fast check in the IO failure tree
1917 * to see if we need to process or clean up an io_failure_record
1918 */
1919static int clean_io_failure(u64 start, struct page *page)
1920{
1921        u64 private;
1922        u64 private_failure;
1923        struct io_failure_record *failrec;
1924        struct btrfs_mapping_tree *map_tree;
1925        struct extent_state *state;
1926        int num_copies;
1927        int did_repair = 0;
1928        int ret;
1929        struct inode *inode = page->mapping->host;
1930
1931        private = 0;
1932        ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1933                                (u64)-1, 1, EXTENT_DIRTY, 0);
1934        if (!ret)
1935                return 0;
1936
1937        ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1938                                &private_failure);
1939        if (ret)
1940                return 0;
1941
1942        failrec = (struct io_failure_record *)(unsigned long) private_failure;
1943        BUG_ON(!failrec->this_mirror);
1944
1945        if (failrec->in_validation) {
1946                /* there was no real error, just free the record */
1947                pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1948                         failrec->start);
1949                did_repair = 1;
1950                goto out;
1951        }
1952
1953        spin_lock(&BTRFS_I(inode)->io_tree.lock);
1954        state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1955                                            failrec->start,
1956                                            EXTENT_LOCKED);
1957        spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1958
1959        if (state && state->start == failrec->start) {
1960                map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1961                num_copies = btrfs_num_copies(map_tree, failrec->logical,
1962                                                failrec->len);
1963                if (num_copies > 1)  {
1964                        ret = repair_io_failure(map_tree, start, failrec->len,
1965                                                failrec->logical, page,
1966                                                failrec->failed_mirror);
1967                        did_repair = !ret;
1968                }
1969        }
1970
1971out:
1972        if (!ret)
1973                ret = free_io_failure(inode, failrec, did_repair);
1974
1975        return ret;
1976}
1977
1978/*
1979 * this is a generic handler for readpage errors (default
1980 * readpage_io_failed_hook). if other copies exist, read those and write back
1981 * good data to the failed position. does not investigate in remapping the
1982 * failed extent elsewhere, hoping the device will be smart enough to do this as
1983 * needed
1984 */
1985
1986static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1987                                u64 start, u64 end, int failed_mirror,
1988                                struct extent_state *state)
1989{
1990        struct io_failure_record *failrec = NULL;
1991        u64 private;
1992        struct extent_map *em;
1993        struct inode *inode = page->mapping->host;
1994        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1995        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1996        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1997        struct bio *bio;
1998        int num_copies;
1999        int ret;
2000        int read_mode;
2001        u64 logical;
2002
2003        BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2004
2005        ret = get_state_private(failure_tree, start, &private);
2006        if (ret) {
2007                failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2008                if (!failrec)
2009                        return -ENOMEM;
2010                failrec->start = start;
2011                failrec->len = end - start + 1;
2012                failrec->this_mirror = 0;
2013                failrec->bio_flags = 0;
2014                failrec->in_validation = 0;
2015
2016                read_lock(&em_tree->lock);
2017                em = lookup_extent_mapping(em_tree, start, failrec->len);
2018                if (!em) {
2019                        read_unlock(&em_tree->lock);
2020                        kfree(failrec);
2021                        return -EIO;
2022                }
2023
2024                if (em->start > start || em->start + em->len < start) {
2025                        free_extent_map(em);
2026                        em = NULL;
2027                }
2028                read_unlock(&em_tree->lock);
2029
2030                if (!em || IS_ERR(em)) {
2031                        kfree(failrec);
2032                        return -EIO;
2033                }
2034                logical = start - em->start;
2035                logical = em->block_start + logical;
2036                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2037                        logical = em->block_start;
2038                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2039                        extent_set_compress_type(&failrec->bio_flags,
2040                                                 em->compress_type);
2041                }
2042                pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2043                         "len=%llu\n", logical, start, failrec->len);
2044                failrec->logical = logical;
2045                free_extent_map(em);
2046
2047                /* set the bits in the private failure tree */
2048                ret = set_extent_bits(failure_tree, start, end,
2049                                        EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2050                if (ret >= 0)
2051                        ret = set_state_private(failure_tree, start,
2052                                                (u64)(unsigned long)failrec);
2053                /* set the bits in the inode's tree */
2054                if (ret >= 0)
2055                        ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2056                                                GFP_NOFS);
2057                if (ret < 0) {
2058                        kfree(failrec);
2059                        return ret;
2060                }
2061        } else {
2062                failrec = (struct io_failure_record *)(unsigned long)private;
2063                pr_debug("bio_readpage_error: (found) logical=%llu, "
2064                         "start=%llu, len=%llu, validation=%d\n",
2065                         failrec->logical, failrec->start, failrec->len,
2066                         failrec->in_validation);
2067                /*
2068                 * when data can be on disk more than twice, add to failrec here
2069                 * (e.g. with a list for failed_mirror) to make
2070                 * clean_io_failure() clean all those errors at once.
2071                 */
2072        }
2073        num_copies = btrfs_num_copies(
2074                              &BTRFS_I(inode)->root->fs_info->mapping_tree,
2075                              failrec->logical, failrec->len);
2076        if (num_copies == 1) {
2077                /*
2078                 * we only have a single copy of the data, so don't bother with
2079                 * all the retry and error correction code that follows. no
2080                 * matter what the error is, it is very likely to persist.
2081                 */
2082                pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2083                         "state=%p, num_copies=%d, next_mirror %d, "
2084                         "failed_mirror %d\n", state, num_copies,
2085                         failrec->this_mirror, failed_mirror);
2086                free_io_failure(inode, failrec, 0);
2087                return -EIO;
2088        }
2089
2090        if (!state) {
2091                spin_lock(&tree->lock);
2092                state = find_first_extent_bit_state(tree, failrec->start,
2093                                                    EXTENT_LOCKED);
2094                if (state && state->start != failrec->start)
2095                        state = NULL;
2096                spin_unlock(&tree->lock);
2097        }
2098
2099        /*
2100         * there are two premises:
2101         *      a) deliver good data to the caller
2102         *      b) correct the bad sectors on disk
2103         */
2104        if (failed_bio->bi_vcnt > 1) {
2105                /*
2106                 * to fulfill b), we need to know the exact failing sectors, as
2107                 * we don't want to rewrite any more than the failed ones. thus,
2108                 * we need separate read requests for the failed bio
2109                 *
2110                 * if the following BUG_ON triggers, our validation request got
2111                 * merged. we need separate requests for our algorithm to work.
2112                 */
2113                BUG_ON(failrec->in_validation);
2114                failrec->in_validation = 1;
2115                failrec->this_mirror = failed_mirror;
2116                read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2117        } else {
2118                /*
2119                 * we're ready to fulfill a) and b) alongside. get a good copy
2120                 * of the failed sector and if we succeed, we have setup
2121                 * everything for repair_io_failure to do the rest for us.
2122                 */
2123                if (failrec->in_validation) {
2124                        BUG_ON(failrec->this_mirror != failed_mirror);
2125                        failrec->in_validation = 0;
2126                        failrec->this_mirror = 0;
2127                }
2128                failrec->failed_mirror = failed_mirror;
2129                failrec->this_mirror++;
2130                if (failrec->this_mirror == failed_mirror)
2131                        failrec->this_mirror++;
2132                read_mode = READ_SYNC;
2133        }
2134
2135        if (!state || failrec->this_mirror > num_copies) {
2136                pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2137                         "next_mirror %d, failed_mirror %d\n", state,
2138                         num_copies, failrec->this_mirror, failed_mirror);
2139                free_io_failure(inode, failrec, 0);
2140                return -EIO;
2141        }
2142
2143        bio = bio_alloc(GFP_NOFS, 1);
2144        bio->bi_private = state;
2145        bio->bi_end_io = failed_bio->bi_end_io;
2146        bio->bi_sector = failrec->logical >> 9;
2147        bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2148        bio->bi_size = 0;
2149
2150        bio_add_page(bio, page, failrec->len, start - page_offset(page));
2151
2152        pr_debug("bio_readpage_error: submitting new read[%#x] to "
2153                 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2154                 failrec->this_mirror, num_copies, failrec->in_validation);
2155
2156        ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2157                                         failrec->this_mirror,
2158                                         failrec->bio_flags, 0);
2159        return ret;
2160}
2161
2162/* lots and lots of room for performance fixes in the end_bio funcs */
2163
2164int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2165{
2166        int uptodate = (err == 0);
2167        struct extent_io_tree *tree;
2168        int ret;
2169
2170        tree = &BTRFS_I(page->mapping->host)->io_tree;
2171
2172        if (tree->ops && tree->ops->writepage_end_io_hook) {
2173                ret = tree->ops->writepage_end_io_hook(page, start,
2174                                               end, NULL, uptodate);
2175                if (ret)
2176                        uptodate = 0;
2177        }
2178
2179        if (!uptodate && tree->ops &&
2180            tree->ops->writepage_io_failed_hook) {
2181                ret = tree->ops->writepage_io_failed_hook(NULL, page,
2182                                                 start, end, NULL);
2183                /* Writeback already completed */
2184                if (ret == 0)
2185                        return 1;
2186        }
2187
2188        if (!uptodate) {
2189                clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2190                ClearPageUptodate(page);
2191                SetPageError(page);
2192        }
2193        return 0;
2194}
2195
2196/*
2197 * after a writepage IO is done, we need to:
2198 * clear the uptodate bits on error
2199 * clear the writeback bits in the extent tree for this IO
2200 * end_page_writeback if the page has no more pending IO
2201 *
2202 * Scheduling is not allowed, so the extent state tree is expected
2203 * to have one and only one object corresponding to this IO.
2204 */
2205static void end_bio_extent_writepage(struct bio *bio, int err)
2206{
2207        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2208        struct extent_io_tree *tree;
2209        u64 start;
2210        u64 end;
2211        int whole_page;
2212
2213        do {
2214                struct page *page = bvec->bv_page;
2215                tree = &BTRFS_I(page->mapping->host)->io_tree;
2216
2217                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2218                         bvec->bv_offset;
2219                end = start + bvec->bv_len - 1;
2220
2221                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2222                        whole_page = 1;
2223                else
2224                        whole_page = 0;
2225
2226                if (--bvec >= bio->bi_io_vec)
2227                        prefetchw(&bvec->bv_page->flags);
2228
2229                if (end_extent_writepage(page, err, start, end))
2230                        continue;
2231
2232                if (whole_page)
2233                        end_page_writeback(page);
2234                else
2235                        check_page_writeback(tree, page);
2236        } while (bvec >= bio->bi_io_vec);
2237
2238        bio_put(bio);
2239}
2240
2241/*
2242 * after a readpage IO is done, we need to:
2243 * clear the uptodate bits on error
2244 * set the uptodate bits if things worked
2245 * set the page up to date if all extents in the tree are uptodate
2246 * clear the lock bit in the extent tree
2247 * unlock the page if there are no other extents locked for it
2248 *
2249 * Scheduling is not allowed, so the extent state tree is expected
2250 * to have one and only one object corresponding to this IO.
2251 */
2252static void end_bio_extent_readpage(struct bio *bio, int err)
2253{
2254        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2255        struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2256        struct bio_vec *bvec = bio->bi_io_vec;
2257        struct extent_io_tree *tree;
2258        u64 start;
2259        u64 end;
2260        int whole_page;
2261        int ret;
2262
2263        if (err)
2264                uptodate = 0;
2265
2266        do {
2267                struct page *page = bvec->bv_page;
2268                struct extent_state *cached = NULL;
2269                struct extent_state *state;
2270
2271                pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2272                         "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2273                         (long int)bio->bi_bdev);
2274                tree = &BTRFS_I(page->mapping->host)->io_tree;
2275
2276                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2277                        bvec->bv_offset;
2278                end = start + bvec->bv_len - 1;
2279
2280                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2281                        whole_page = 1;
2282                else
2283                        whole_page = 0;
2284
2285                if (++bvec <= bvec_end)
2286                        prefetchw(&bvec->bv_page->flags);
2287
2288                spin_lock(&tree->lock);
2289                state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2290                if (state && state->start == start) {
2291                        /*
2292                         * take a reference on the state, unlock will drop
2293                         * the ref
2294                         */
2295                        cache_state(state, &cached);
2296                }
2297                spin_unlock(&tree->lock);
2298
2299                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2300                        ret = tree->ops->readpage_end_io_hook(page, start, end,
2301                                                              state);
2302                        if (ret)
2303                                uptodate = 0;
2304                        else
2305                                clean_io_failure(start, page);
2306                }
2307                if (!uptodate) {
2308                        int failed_mirror;
2309                        failed_mirror = (int)(unsigned long)bio->bi_bdev;
2310                        /*
2311                         * The generic bio_readpage_error handles errors the
2312                         * following way: If possible, new read requests are
2313                         * created and submitted and will end up in
2314                         * end_bio_extent_readpage as well (if we're lucky, not
2315                         * in the !uptodate case). In that case it returns 0 and
2316                         * we just go on with the next page in our bio. If it
2317                         * can't handle the error it will return -EIO and we
2318                         * remain responsible for that page.
2319                         */
2320                        ret = bio_readpage_error(bio, page, start, end,
2321                                                        failed_mirror, NULL);
2322                        if (ret == 0) {
2323error_handled:
2324                                uptodate =
2325                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
2326                                if (err)
2327                                        uptodate = 0;
2328                                uncache_state(&cached);
2329                                continue;
2330                        }
2331                        if (tree->ops && tree->ops->readpage_io_failed_hook) {
2332                                ret = tree->ops->readpage_io_failed_hook(
2333                                                        bio, page, start, end,
2334                                                        failed_mirror, state);
2335                                if (ret == 0)
2336                                        goto error_handled;
2337                        }
2338                }
2339
2340                if (uptodate) {
2341                        set_extent_uptodate(tree, start, end, &cached,
2342                                            GFP_ATOMIC);
2343                }
2344                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2345
2346                if (whole_page) {
2347                        if (uptodate) {
2348                                SetPageUptodate(page);
2349                        } else {
2350                                ClearPageUptodate(page);
2351                                SetPageError(page);
2352                        }
2353                        unlock_page(page);
2354                } else {
2355                        if (uptodate) {
2356                                check_page_uptodate(tree, page);
2357                        } else {
2358                                ClearPageUptodate(page);
2359                                SetPageError(page);
2360                        }
2361                        check_page_locked(tree, page);
2362                }
2363        } while (bvec <= bvec_end);
2364
2365        bio_put(bio);
2366}
2367
2368struct bio *
2369btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2370                gfp_t gfp_flags)
2371{
2372        struct bio *bio;
2373
2374        bio = bio_alloc(gfp_flags, nr_vecs);
2375
2376        if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2377                while (!bio && (nr_vecs /= 2))
2378                        bio = bio_alloc(gfp_flags, nr_vecs);
2379        }
2380
2381        if (bio) {
2382                bio->bi_size = 0;
2383                bio->bi_bdev = bdev;
2384                bio->bi_sector = first_sector;
2385        }
2386        return bio;
2387}
2388
2389static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2390                          unsigned long bio_flags)
2391{
2392        int ret = 0;
2393        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2394        struct page *page = bvec->bv_page;
2395        struct extent_io_tree *tree = bio->bi_private;
2396        u64 start;
2397
2398        start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2399
2400        bio->bi_private = NULL;
2401
2402        bio_get(bio);
2403
2404        if (tree->ops && tree->ops->submit_bio_hook)
2405                ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2406                                           mirror_num, bio_flags, start);
2407        else
2408                btrfsic_submit_bio(rw, bio);
2409
2410        if (bio_flagged(bio, BIO_EOPNOTSUPP))
2411                ret = -EOPNOTSUPP;
2412        bio_put(bio);
2413        return ret;
2414}
2415
2416static int submit_extent_page(int rw, struct extent_io_tree *tree,
2417                              struct page *page, sector_t sector,
2418                              size_t size, unsigned long offset,
2419                              struct block_device *bdev,
2420                              struct bio **bio_ret,
2421                              unsigned long max_pages,
2422                              bio_end_io_t end_io_func,
2423                              int mirror_num,
2424                              unsigned long prev_bio_flags,
2425                              unsigned long bio_flags)
2426{
2427        int ret = 0;
2428        struct bio *bio;
2429        int nr;
2430        int contig = 0;
2431        int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2432        int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2433        size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2434
2435        if (bio_ret && *bio_ret) {
2436                bio = *bio_ret;
2437                if (old_compressed)
2438                        contig = bio->bi_sector == sector;
2439                else
2440                        contig = bio->bi_sector + (bio->bi_size >> 9) ==
2441                                sector;
2442
2443                if (prev_bio_flags != bio_flags || !contig ||
2444                    (tree->ops && tree->ops->merge_bio_hook &&
2445                     tree->ops->merge_bio_hook(page, offset, page_size, bio,
2446                                               bio_flags)) ||
2447                    bio_add_page(bio, page, page_size, offset) < page_size) {
2448                        ret = submit_one_bio(rw, bio, mirror_num,
2449                                             prev_bio_flags);
2450                        bio = NULL;
2451                } else {
2452                        return 0;
2453                }
2454        }
2455        if (this_compressed)
2456                nr = BIO_MAX_PAGES;
2457        else
2458                nr = bio_get_nr_vecs(bdev);
2459
2460        bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2461        if (!bio)
2462                return -ENOMEM;
2463
2464        bio_add_page(bio, page, page_size, offset);
2465        bio->bi_end_io = end_io_func;
2466        bio->bi_private = tree;
2467
2468        if (bio_ret)
2469                *bio_ret = bio;
2470        else
2471                ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2472
2473        return ret;
2474}
2475
2476void set_page_extent_mapped(struct page *page)
2477{
2478        if (!PagePrivate(page)) {
2479                SetPagePrivate(page);
2480                page_cache_get(page);
2481                set_page_private(page, EXTENT_PAGE_PRIVATE);
2482        }
2483}
2484
2485static void set_page_extent_head(struct page *page, unsigned long len)
2486{
2487        WARN_ON(!PagePrivate(page));
2488        set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
2489}
2490
2491/*
2492 * basic readpage implementation.  Locked extent state structs are inserted
2493 * into the tree that are removed when the IO is done (by the end_io
2494 * handlers)
2495 */
2496static int __extent_read_full_page(struct extent_io_tree *tree,
2497                                   struct page *page,
2498                                   get_extent_t *get_extent,
2499                                   struct bio **bio, int mirror_num,
2500                                   unsigned long *bio_flags)
2501{
2502        struct inode *inode = page->mapping->host;
2503        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2504        u64 page_end = start + PAGE_CACHE_SIZE - 1;
2505        u64 end;
2506        u64 cur = start;
2507        u64 extent_offset;
2508        u64 last_byte = i_size_read(inode);
2509        u64 block_start;
2510        u64 cur_end;
2511        sector_t sector;
2512        struct extent_map *em;
2513        struct block_device *bdev;
2514        struct btrfs_ordered_extent *ordered;
2515        int ret;
2516        int nr = 0;
2517        size_t pg_offset = 0;
2518        size_t iosize;
2519        size_t disk_io_size;
2520        size_t blocksize = inode->i_sb->s_blocksize;
2521        unsigned long this_bio_flag = 0;
2522
2523        set_page_extent_mapped(page);
2524
2525        if (!PageUptodate(page)) {
2526                if (cleancache_get_page(page) == 0) {
2527                        BUG_ON(blocksize != PAGE_SIZE);
2528                        goto out;
2529                }
2530        }
2531
2532        end = page_end;
2533        while (1) {
2534                lock_extent(tree, start, end, GFP_NOFS);
2535                ordered = btrfs_lookup_ordered_extent(inode, start);
2536                if (!ordered)
2537                        break;
2538                unlock_extent(tree, start, end, GFP_NOFS);
2539                btrfs_start_ordered_extent(inode, ordered, 1);
2540                btrfs_put_ordered_extent(ordered);
2541        }
2542
2543        if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2544                char *userpage;
2545                size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2546
2547                if (zero_offset) {
2548                        iosize = PAGE_CACHE_SIZE - zero_offset;
2549                        userpage = kmap_atomic(page, KM_USER0);
2550                        memset(userpage + zero_offset, 0, iosize);
2551                        flush_dcache_page(page);
2552                        kunmap_atomic(userpage, KM_USER0);
2553                }
2554        }
2555        while (cur <= end) {
2556                if (cur >= last_byte) {
2557                        char *userpage;
2558                        struct extent_state *cached = NULL;
2559
2560                        iosize = PAGE_CACHE_SIZE - pg_offset;
2561                        userpage = kmap_atomic(page, KM_USER0);
2562                        memset(userpage + pg_offset, 0, iosize);
2563                        flush_dcache_page(page);
2564                        kunmap_atomic(userpage, KM_USER0);
2565                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2566                                            &cached, GFP_NOFS);
2567                        unlock_extent_cached(tree, cur, cur + iosize - 1,
2568                                             &cached, GFP_NOFS);
2569                        break;
2570                }
2571                em = get_extent(inode, page, pg_offset, cur,
2572                                end - cur + 1, 0);
2573                if (IS_ERR_OR_NULL(em)) {
2574                        SetPageError(page);
2575                        unlock_extent(tree, cur, end, GFP_NOFS);
2576                        break;
2577                }
2578                extent_offset = cur - em->start;
2579                BUG_ON(extent_map_end(em) <= cur);
2580                BUG_ON(end < cur);
2581
2582                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2583                        this_bio_flag = EXTENT_BIO_COMPRESSED;
2584                        extent_set_compress_type(&this_bio_flag,
2585                                                 em->compress_type);
2586                }
2587
2588                iosize = min(extent_map_end(em) - cur, end - cur + 1);
2589                cur_end = min(extent_map_end(em) - 1, end);
2590                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2591                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2592                        disk_io_size = em->block_len;
2593                        sector = em->block_start >> 9;
2594                } else {
2595                        sector = (em->block_start + extent_offset) >> 9;
2596                        disk_io_size = iosize;
2597                }
2598                bdev = em->bdev;
2599                block_start = em->block_start;
2600                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2601                        block_start = EXTENT_MAP_HOLE;
2602                free_extent_map(em);
2603                em = NULL;
2604
2605                /* we've found a hole, just zero and go on */
2606                if (block_start == EXTENT_MAP_HOLE) {
2607                        char *userpage;
2608                        struct extent_state *cached = NULL;
2609
2610                        userpage = kmap_atomic(page, KM_USER0);
2611                        memset(userpage + pg_offset, 0, iosize);
2612                        flush_dcache_page(page);
2613                        kunmap_atomic(userpage, KM_USER0);
2614
2615                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2616                                            &cached, GFP_NOFS);
2617                        unlock_extent_cached(tree, cur, cur + iosize - 1,
2618                                             &cached, GFP_NOFS);
2619                        cur = cur + iosize;
2620                        pg_offset += iosize;
2621                        continue;
2622                }
2623                /* the get_extent function already copied into the page */
2624                if (test_range_bit(tree, cur, cur_end,
2625                                   EXTENT_UPTODATE, 1, NULL)) {
2626                        check_page_uptodate(tree, page);
2627                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2628                        cur = cur + iosize;
2629                        pg_offset += iosize;
2630                        continue;
2631                }
2632                /* we have an inline extent but it didn't get marked up
2633                 * to date.  Error out
2634                 */
2635                if (block_start == EXTENT_MAP_INLINE) {
2636                        SetPageError(page);
2637                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2638                        cur = cur + iosize;
2639                        pg_offset += iosize;
2640                        continue;
2641                }
2642
2643                ret = 0;
2644                if (tree->ops && tree->ops->readpage_io_hook) {
2645                        ret = tree->ops->readpage_io_hook(page, cur,
2646                                                          cur + iosize - 1);
2647                }
2648                if (!ret) {
2649                        unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2650                        pnr -= page->index;
2651                        ret = submit_extent_page(READ, tree, page,
2652                                         sector, disk_io_size, pg_offset,
2653                                         bdev, bio, pnr,
2654                                         end_bio_extent_readpage, mirror_num,
2655                                         *bio_flags,
2656                                         this_bio_flag);
2657                        nr++;
2658                        *bio_flags = this_bio_flag;
2659                }
2660                if (ret)
2661                        SetPageError(page);
2662                cur = cur + iosize;
2663                pg_offset += iosize;
2664        }
2665out:
2666        if (!nr) {
2667                if (!PageError(page))
2668                        SetPageUptodate(page);
2669                unlock_page(page);
2670        }
2671        return 0;
2672}
2673
2674int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2675                            get_extent_t *get_extent, int mirror_num)
2676{
2677        struct bio *bio = NULL;
2678        unsigned long bio_flags = 0;
2679        int ret;
2680
2681        ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2682                                      &bio_flags);
2683        if (bio)
2684                ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2685        return ret;
2686}
2687
2688static noinline void update_nr_written(struct page *page,
2689                                      struct writeback_control *wbc,
2690                                      unsigned long nr_written)
2691{
2692        wbc->nr_to_write -= nr_written;
2693        if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2694            wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2695                page->mapping->writeback_index = page->index + nr_written;
2696}
2697
2698/*
2699 * the writepage semantics are similar to regular writepage.  extent
2700 * records are inserted to lock ranges in the tree, and as dirty areas
2701 * are found, they are marked writeback.  Then the lock bits are removed
2702 * and the end_io handler clears the writeback ranges
2703 */
2704static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2705                              void *data)
2706{
2707        struct inode *inode = page->mapping->host;
2708        struct extent_page_data *epd = data;
2709        struct extent_io_tree *tree = epd->tree;
2710        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2711        u64 delalloc_start;
2712        u64 page_end = start + PAGE_CACHE_SIZE - 1;
2713        u64 end;
2714        u64 cur = start;
2715        u64 extent_offset;
2716        u64 last_byte = i_size_read(inode);
2717        u64 block_start;
2718        u64 iosize;
2719        sector_t sector;
2720        struct extent_state *cached_state = NULL;
2721        struct extent_map *em;
2722        struct block_device *bdev;
2723        int ret;
2724        int nr = 0;
2725        size_t pg_offset = 0;
2726        size_t blocksize;
2727        loff_t i_size = i_size_read(inode);
2728        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2729        u64 nr_delalloc;
2730        u64 delalloc_end;
2731        int page_started;
2732        int compressed;
2733        int write_flags;
2734        unsigned long nr_written = 0;
2735        bool fill_delalloc = true;
2736
2737        if (wbc->sync_mode == WB_SYNC_ALL)
2738                write_flags = WRITE_SYNC;
2739        else
2740                write_flags = WRITE;
2741
2742        trace___extent_writepage(page, inode, wbc);
2743
2744        WARN_ON(!PageLocked(page));
2745
2746        ClearPageError(page);
2747
2748        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2749        if (page->index > end_index ||
2750           (page->index == end_index && !pg_offset)) {
2751                page->mapping->a_ops->invalidatepage(page, 0);
2752                unlock_page(page);
2753                return 0;
2754        }
2755
2756        if (page->index == end_index) {
2757                char *userpage;
2758
2759                userpage = kmap_atomic(page, KM_USER0);
2760                memset(userpage + pg_offset, 0,
2761                       PAGE_CACHE_SIZE - pg_offset);
2762                kunmap_atomic(userpage, KM_USER0);
2763                flush_dcache_page(page);
2764        }
2765        pg_offset = 0;
2766
2767        set_page_extent_mapped(page);
2768
2769        if (!tree->ops || !tree->ops->fill_delalloc)
2770                fill_delalloc = false;
2771
2772        delalloc_start = start;
2773        delalloc_end = 0;
2774        page_started = 0;
2775        if (!epd->extent_locked && fill_delalloc) {
2776                u64 delalloc_to_write = 0;
2777                /*
2778                 * make sure the wbc mapping index is at least updated
2779                 * to this page.
2780                 */
2781                update_nr_written(page, wbc, 0);
2782
2783                while (delalloc_end < page_end) {
2784                        nr_delalloc = find_lock_delalloc_range(inode, tree,
2785                                                       page,
2786                                                       &delalloc_start,
2787                                                       &delalloc_end,
2788                                                       128 * 1024 * 1024);
2789                        if (nr_delalloc == 0) {
2790                                delalloc_start = delalloc_end + 1;
2791                                continue;
2792                        }
2793                        ret = tree->ops->fill_delalloc(inode, page,
2794                                                       delalloc_start,
2795                                                       delalloc_end,
2796                                                       &page_started,
2797                                                       &nr_written);
2798                        BUG_ON(ret);
2799                        /*
2800                         * delalloc_end is already one less than the total
2801                         * length, so we don't subtract one from
2802                         * PAGE_CACHE_SIZE
2803                         */
2804                        delalloc_to_write += (delalloc_end - delalloc_start +
2805                                              PAGE_CACHE_SIZE) >>
2806                                              PAGE_CACHE_SHIFT;
2807                        delalloc_start = delalloc_end + 1;
2808                }
2809                if (wbc->nr_to_write < delalloc_to_write) {
2810                        int thresh = 8192;
2811
2812                        if (delalloc_to_write < thresh * 2)
2813                                thresh = delalloc_to_write;
2814                        wbc->nr_to_write = min_t(u64, delalloc_to_write,
2815                                                 thresh);
2816                }
2817
2818                /* did the fill delalloc function already unlock and start
2819                 * the IO?
2820                 */
2821                if (page_started) {
2822                        ret = 0;
2823                        /*
2824                         * we've unlocked the page, so we can't update
2825                         * the mapping's writeback index, just update
2826                         * nr_to_write.
2827                         */
2828                        wbc->nr_to_write -= nr_written;
2829                        goto done_unlocked;
2830                }
2831        }
2832        if (tree->ops && tree->ops->writepage_start_hook) {
2833                ret = tree->ops->writepage_start_hook(page, start,
2834                                                      page_end);
2835                if (ret) {
2836                        /* Fixup worker will requeue */
2837                        if (ret == -EBUSY)
2838                                wbc->pages_skipped++;
2839                        else
2840                                redirty_page_for_writepage(wbc, page);
2841                        update_nr_written(page, wbc, nr_written);
2842                        unlock_page(page);
2843                        ret = 0;
2844                        goto done_unlocked;
2845                }
2846        }
2847
2848        /*
2849         * we don't want to touch the inode after unlocking the page,
2850         * so we update the mapping writeback index now
2851         */
2852        update_nr_written(page, wbc, nr_written + 1);
2853
2854        end = page_end;
2855        if (last_byte <= start) {
2856                if (tree->ops && tree->ops->writepage_end_io_hook)
2857                        tree->ops->writepage_end_io_hook(page, start,
2858                                                         page_end, NULL, 1);
2859                goto done;
2860        }
2861
2862        blocksize = inode->i_sb->s_blocksize;
2863
2864        while (cur <= end) {
2865                if (cur >= last_byte) {
2866                        if (tree->ops && tree->ops->writepage_end_io_hook)
2867                                tree->ops->writepage_end_io_hook(page, cur,
2868                                                         page_end, NULL, 1);
2869                        break;
2870                }
2871                em = epd->get_extent(inode, page, pg_offset, cur,
2872                                     end - cur + 1, 1);
2873                if (IS_ERR_OR_NULL(em)) {
2874                        SetPageError(page);
2875                        break;
2876                }
2877
2878                extent_offset = cur - em->start;
2879                BUG_ON(extent_map_end(em) <= cur);
2880                BUG_ON(end < cur);
2881                iosize = min(extent_map_end(em) - cur, end - cur + 1);
2882                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2883                sector = (em->block_start + extent_offset) >> 9;
2884                bdev = em->bdev;
2885                block_start = em->block_start;
2886                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2887                free_extent_map(em);
2888                em = NULL;
2889
2890                /*
2891                 * compressed and inline extents are written through other
2892                 * paths in the FS
2893                 */
2894                if (compressed || block_start == EXTENT_MAP_HOLE ||
2895                    block_start == EXTENT_MAP_INLINE) {
2896                        /*
2897                         * end_io notification does not happen here for
2898                         * compressed extents
2899                         */
2900                        if (!compressed && tree->ops &&
2901                            tree->ops->writepage_end_io_hook)
2902                                tree->ops->writepage_end_io_hook(page, cur,
2903                                                         cur + iosize - 1,
2904                                                         NULL, 1);
2905                        else if (compressed) {
2906                                /* we don't want to end_page_writeback on
2907                                 * a compressed extent.  this happens
2908                                 * elsewhere
2909                                 */
2910                                nr++;
2911                        }
2912
2913                        cur += iosize;
2914                        pg_offset += iosize;
2915                        continue;
2916                }
2917                /* leave this out until we have a page_mkwrite call */
2918                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2919                                   EXTENT_DIRTY, 0, NULL)) {
2920                        cur = cur + iosize;
2921                        pg_offset += iosize;
2922                        continue;
2923                }
2924
2925                if (tree->ops && tree->ops->writepage_io_hook) {
2926                        ret = tree->ops->writepage_io_hook(page, cur,
2927                                                cur + iosize - 1);
2928                } else {
2929                        ret = 0;
2930                }
2931                if (ret) {
2932                        SetPageError(page);
2933                } else {
2934                        unsigned long max_nr = end_index + 1;
2935
2936                        set_range_writeback(tree, cur, cur + iosize - 1);
2937                        if (!PageWriteback(page)) {
2938                                printk(KERN_ERR "btrfs warning page %lu not "
2939                                       "writeback, cur %llu end %llu\n",
2940                                       page->index, (unsigned long long)cur,
2941                                       (unsigned long long)end);
2942                        }
2943
2944                        ret = submit_extent_page(write_flags, tree, page,
2945                                                 sector, iosize, pg_offset,
2946                                                 bdev, &epd->bio, max_nr,
2947                                                 end_bio_extent_writepage,
2948                                                 0, 0, 0);
2949                        if (ret)
2950                                SetPageError(page);
2951                }
2952                cur = cur + iosize;
2953                pg_offset += iosize;
2954                nr++;
2955        }
2956done:
2957        if (nr == 0) {
2958                /* make sure the mapping tag for page dirty gets cleared */
2959                set_page_writeback(page);
2960                end_page_writeback(page);
2961        }
2962        unlock_page(page);
2963
2964done_unlocked:
2965
2966        /* drop our reference on any cached states */
2967        free_extent_state(cached_state);
2968        return 0;
2969}
2970
2971/**
2972 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2973 * @mapping: address space structure to write
2974 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2975 * @writepage: function called for each page
2976 * @data: data passed to writepage function
2977 *
2978 * If a page is already under I/O, write_cache_pages() skips it, even
2979 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2980 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2981 * and msync() need to guarantee that all the data which was dirty at the time
2982 * the call was made get new I/O started against them.  If wbc->sync_mode is
2983 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2984 * existing IO to complete.
2985 */
2986static int extent_write_cache_pages(struct extent_io_tree *tree,
2987                             struct address_space *mapping,
2988                             struct writeback_control *wbc,
2989                             writepage_t writepage, void *data,
2990                             void (*flush_fn)(void *))
2991{
2992        int ret = 0;
2993        int done = 0;
2994        int nr_to_write_done = 0;
2995        struct pagevec pvec;
2996        int nr_pages;
2997        pgoff_t index;
2998        pgoff_t end;            /* Inclusive */
2999        int scanned = 0;
3000        int tag;
3001
3002        pagevec_init(&pvec, 0);
3003        if (wbc->range_cyclic) {
3004                index = mapping->writeback_index; /* Start from prev offset */
3005                end = -1;
3006        } else {
3007                index = wbc->range_start >> PAGE_CACHE_SHIFT;
3008                end = wbc->range_end >> PAGE_CACHE_SHIFT;
3009                scanned = 1;
3010        }
3011        if (wbc->sync_mode == WB_SYNC_ALL)
3012                tag = PAGECACHE_TAG_TOWRITE;
3013        else
3014                tag = PAGECACHE_TAG_DIRTY;
3015retry:
3016        if (wbc->sync_mode == WB_SYNC_ALL)
3017                tag_pages_for_writeback(mapping, index, end);
3018        while (!done && !nr_to_write_done && (index <= end) &&
3019               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3020                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3021                unsigned i;
3022
3023                scanned = 1;
3024                for (i = 0; i < nr_pages; i++) {
3025                        struct page *page = pvec.pages[i];
3026
3027                        /*
3028                         * At this point we hold neither mapping->tree_lock nor
3029                         * lock on the page itself: the page may be truncated or
3030                         * invalidated (changing page->mapping to NULL), or even
3031                         * swizzled back from swapper_space to tmpfs file
3032                         * mapping
3033                         */
3034                        if (tree->ops &&
3035                            tree->ops->write_cache_pages_lock_hook) {
3036                                tree->ops->write_cache_pages_lock_hook(page,
3037                                                               data, flush_fn);
3038                        } else {
3039                                if (!trylock_page(page)) {
3040                                        flush_fn(data);
3041                                        lock_page(page);
3042                                }
3043                        }
3044
3045                        if (unlikely(page->mapping != mapping)) {
3046                                unlock_page(page);
3047                                continue;
3048                        }
3049
3050                        if (!wbc->range_cyclic && page->index > end) {
3051                                done = 1;
3052                                unlock_page(page);
3053                                continue;
3054                        }
3055
3056                        if (wbc->sync_mode != WB_SYNC_NONE) {
3057                                if (PageWriteback(page))
3058                                        flush_fn(data);
3059                                wait_on_page_writeback(page);
3060                        }
3061
3062                        if (PageWriteback(page) ||
3063                            !clear_page_dirty_for_io(page)) {
3064                                unlock_page(page);
3065                                continue;
3066                        }
3067
3068                        ret = (*writepage)(page, wbc, data);
3069
3070                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3071                                unlock_page(page);
3072                                ret = 0;
3073                        }
3074                        if (ret)
3075                                done = 1;
3076
3077                        /*
3078                         * the filesystem may choose to bump up nr_to_write.
3079                         * We have to make sure to honor the new nr_to_write
3080                         * at any time
3081                         */
3082                        nr_to_write_done = wbc->nr_to_write <= 0;
3083                }
3084                pagevec_release(&pvec);
3085                cond_resched();
3086        }
3087        if (!scanned && !done) {
3088                /*
3089                 * We hit the last page and there is more work to be done: wrap
3090                 * back to the start of the file
3091                 */
3092                scanned = 1;
3093                index = 0;
3094                goto retry;
3095        }
3096        return ret;
3097}
3098
3099static void flush_epd_write_bio(struct extent_page_data *epd)
3100{
3101        if (epd->bio) {
3102                if (epd->sync_io)
3103                        submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
3104                else
3105                        submit_one_bio(WRITE, epd->bio, 0, 0);
3106                epd->bio = NULL;
3107        }
3108}
3109
3110static noinline void flush_write_bio(void *data)
3111{
3112        struct extent_page_data *epd = data;
3113        flush_epd_write_bio(epd);
3114}
3115
3116int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3117                          get_extent_t *get_extent,
3118                          struct writeback_control *wbc)
3119{
3120        int ret;
3121        struct extent_page_data epd = {
3122                .bio = NULL,
3123                .tree = tree,
3124                .get_extent = get_extent,
3125                .extent_locked = 0,
3126                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3127        };
3128
3129        ret = __extent_writepage(page, wbc, &epd);
3130
3131        flush_epd_write_bio(&epd);
3132        return ret;
3133}
3134
3135int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3136                              u64 start, u64 end, get_extent_t *get_extent,
3137                              int mode)
3138{
3139        int ret = 0;
3140        struct address_space *mapping = inode->i_mapping;
3141        struct page *page;
3142        unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3143                PAGE_CACHE_SHIFT;
3144
3145        struct extent_page_data epd = {
3146                .bio = NULL,
3147                .tree = tree,
3148                .get_extent = get_extent,
3149                .extent_locked = 1,
3150                .sync_io = mode == WB_SYNC_ALL,
3151        };
3152        struct writeback_control wbc_writepages = {
3153                .sync_mode      = mode,
3154                .nr_to_write    = nr_pages * 2,
3155                .range_start    = start,
3156                .range_end      = end + 1,
3157        };
3158
3159        while (start <= end) {
3160                page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3161                if (clear_page_dirty_for_io(page))
3162                        ret = __extent_writepage(page, &wbc_writepages, &epd);
3163                else {
3164                        if (tree->ops && tree->ops->writepage_end_io_hook)
3165                                tree->ops->writepage_end_io_hook(page, start,
3166                                                 start + PAGE_CACHE_SIZE - 1,
3167                                                 NULL, 1);
3168                        unlock_page(page);
3169                }
3170                page_cache_release(page);
3171                start += PAGE_CACHE_SIZE;
3172        }
3173
3174        flush_epd_write_bio(&epd);
3175        return ret;
3176}
3177
3178int extent_writepages(struct extent_io_tree *tree,
3179                      struct address_space *mapping,
3180                      get_extent_t *get_extent,
3181                      struct writeback_control *wbc)
3182{
3183        int ret = 0;
3184        struct extent_page_data epd = {
3185                .bio = NULL,
3186                .tree = tree,
3187                .get_extent = get_extent,
3188                .extent_locked = 0,
3189                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3190        };
3191
3192        ret = extent_write_cache_pages(tree, mapping, wbc,
3193                                       __extent_writepage, &epd,
3194                                       flush_write_bio);
3195        flush_epd_write_bio(&epd);
3196        return ret;
3197}
3198
3199int extent_readpages(struct extent_io_tree *tree,
3200                     struct address_space *mapping,
3201                     struct list_head *pages, unsigned nr_pages,
3202                     get_extent_t get_extent)
3203{
3204        struct bio *bio = NULL;
3205        unsigned page_idx;
3206        unsigned long bio_flags = 0;
3207
3208        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3209                struct page *page = list_entry(pages->prev, struct page, lru);
3210
3211                prefetchw(&page->flags);
3212                list_del(&page->lru);
3213                if (!add_to_page_cache_lru(page, mapping,
3214                                        page->index, GFP_NOFS)) {
3215                        __extent_read_full_page(tree, page, get_extent,
3216                                                &bio, 0, &bio_flags);
3217                }
3218                page_cache_release(page);
3219        }
3220        BUG_ON(!list_empty(pages));
3221        if (bio)
3222                submit_one_bio(READ, bio, 0, bio_flags);
3223        return 0;
3224}
3225
3226/*
3227 * basic invalidatepage code, this waits on any locked or writeback
3228 * ranges corresponding to the page, and then deletes any extent state
3229 * records from the tree
3230 */
3231int extent_invalidatepage(struct extent_io_tree *tree,
3232                          struct page *page, unsigned long offset)
3233{
3234        struct extent_state *cached_state = NULL;
3235        u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3236        u64 end = start + PAGE_CACHE_SIZE - 1;
3237        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3238
3239        start += (offset + blocksize - 1) & ~(blocksize - 1);
3240        if (start > end)
3241                return 0;
3242
3243        lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
3244        wait_on_page_writeback(page);
3245        clear_extent_bit(tree, start, end,
3246                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3247                         EXTENT_DO_ACCOUNTING,
3248                         1, 1, &cached_state, GFP_NOFS);
3249        return 0;
3250}
3251
3252/*
3253 * a helper for releasepage, this tests for areas of the page that
3254 * are locked or under IO and drops the related state bits if it is safe
3255 * to drop the page.
3256 */
3257int try_release_extent_state(struct extent_map_tree *map,
3258                             struct extent_io_tree *tree, struct page *page,
3259                             gfp_t mask)
3260{
3261        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3262        u64 end = start + PAGE_CACHE_SIZE - 1;
3263        int ret = 1;
3264
3265        if (test_range_bit(tree, start, end,
3266                           EXTENT_IOBITS, 0, NULL))
3267                ret = 0;
3268        else {
3269                if ((mask & GFP_NOFS) == GFP_NOFS)
3270                        mask = GFP_NOFS;
3271                /*
3272                 * at this point we can safely clear everything except the
3273                 * locked bit and the nodatasum bit
3274                 */
3275                ret = clear_extent_bit(tree, start, end,
3276                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3277                                 0, 0, NULL, mask);
3278
3279                /* if clear_extent_bit failed for enomem reasons,
3280                 * we can't allow the release to continue.
3281                 */
3282                if (ret < 0)
3283                        ret = 0;
3284                else
3285                        ret = 1;
3286        }
3287        return ret;
3288}
3289
3290/*
3291 * a helper for releasepage.  As long as there are no locked extents
3292 * in the range corresponding to the page, both state records and extent
3293 * map records are removed
3294 */
3295int try_release_extent_mapping(struct extent_map_tree *map,
3296                               struct extent_io_tree *tree, struct page *page,
3297                               gfp_t mask)
3298{
3299        struct extent_map *em;
3300        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3301        u64 end = start + PAGE_CACHE_SIZE - 1;
3302
3303        if ((mask & __GFP_WAIT) &&
3304            page->mapping->host->i_size > 16 * 1024 * 1024) {
3305                u64 len;
3306                while (start <= end) {
3307                        len = end - start + 1;
3308                        write_lock(&map->lock);
3309                        em = lookup_extent_mapping(map, start, len);
3310                        if (!em) {
3311                                write_unlock(&map->lock);
3312                                break;
3313                        }
3314                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3315                            em->start != start) {
3316                                write_unlock(&map->lock);
3317                                free_extent_map(em);
3318                                break;
3319                        }
3320                        if (!test_range_bit(tree, em->start,
3321                                            extent_map_end(em) - 1,
3322                                            EXTENT_LOCKED | EXTENT_WRITEBACK,
3323                                            0, NULL)) {
3324                                remove_extent_mapping(map, em);
3325                                /* once for the rb tree */
3326                                free_extent_map(em);
3327                        }
3328                        start = extent_map_end(em);
3329                        write_unlock(&map->lock);
3330
3331                        /* once for us */
3332                        free_extent_map(em);
3333                }
3334        }
3335        return try_release_extent_state(map, tree, page, mask);
3336}
3337
3338/*
3339 * helper function for fiemap, which doesn't want to see any holes.
3340 * This maps until we find something past 'last'
3341 */
3342static struct extent_map *get_extent_skip_holes(struct inode *inode,
3343                                                u64 offset,
3344                                                u64 last,
3345                                                get_extent_t *get_extent)
3346{
3347        u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3348        struct extent_map *em;
3349        u64 len;
3350
3351        if (offset >= last)
3352                return NULL;
3353
3354        while(1) {
3355                len = last - offset;
3356                if (len == 0)
3357                        break;
3358                len = (len + sectorsize - 1) & ~(sectorsize - 1);
3359                em = get_extent(inode, NULL, 0, offset, len, 0);
3360                if (IS_ERR_OR_NULL(em))
3361                        return em;
3362
3363                /* if this isn't a hole return it */
3364                if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3365                    em->block_start != EXTENT_MAP_HOLE) {
3366                        return em;
3367                }
3368
3369                /* this is a hole, advance to the next extent */
3370                offset = extent_map_end(em);
3371                free_extent_map(em);
3372                if (offset >= last)
3373                        break;
3374        }
3375        return NULL;
3376}
3377
3378int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3379                __u64 start, __u64 len, get_extent_t *get_extent)
3380{
3381        int ret = 0;
3382        u64 off = start;
3383        u64 max = start + len;
3384        u32 flags = 0;
3385        u32 found_type;
3386        u64 last;
3387        u64 last_for_get_extent = 0;
3388        u64 disko = 0;
3389        u64 isize = i_size_read(inode);
3390        struct btrfs_key found_key;
3391        struct extent_map *em = NULL;
3392        struct extent_state *cached_state = NULL;
3393        struct btrfs_path *path;
3394        struct btrfs_file_extent_item *item;
3395        int end = 0;
3396        u64 em_start = 0;
3397        u64 em_len = 0;
3398        u64 em_end = 0;
3399        unsigned long emflags;
3400
3401        if (len == 0)
3402                return -EINVAL;
3403
3404        path = btrfs_alloc_path();
3405        if (!path)
3406                return -ENOMEM;
3407        path->leave_spinning = 1;
3408
3409        start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3410        len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3411
3412        /*
3413         * lookup the last file extent.  We're not using i_size here
3414         * because there might be preallocation past i_size
3415         */
3416        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3417                                       path, btrfs_ino(inode), -1, 0);
3418        if (ret < 0) {
3419                btrfs_free_path(path);
3420                return ret;
3421        }
3422        WARN_ON(!ret);
3423        path->slots[0]--;
3424        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3425                              struct btrfs_file_extent_item);
3426        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3427        found_type = btrfs_key_type(&found_key);
3428
3429        /* No extents, but there might be delalloc bits */
3430        if (found_key.objectid != btrfs_ino(inode) ||
3431            found_type != BTRFS_EXTENT_DATA_KEY) {
3432                /* have to trust i_size as the end */
3433                last = (u64)-1;
3434                last_for_get_extent = isize;
3435        } else {
3436                /*
3437                 * remember the start of the last extent.  There are a
3438                 * bunch of different factors that go into the length of the
3439                 * extent, so its much less complex to remember where it started
3440                 */
3441                last = found_key.offset;
3442                last_for_get_extent = last + 1;
3443        }
3444        btrfs_free_path(path);
3445
3446        /*
3447         * we might have some extents allocated but more delalloc past those
3448         * extents.  so, we trust isize unless the start of the last extent is
3449         * beyond isize
3450         */
3451        if (last < isize) {
3452                last = (u64)-1;
3453                last_for_get_extent = isize;
3454        }
3455
3456        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3457                         &cached_state, GFP_NOFS);
3458
3459        em = get_extent_skip_holes(inode, start, last_for_get_extent,
3460                                   get_extent);
3461        if (!em)
3462                goto out;
3463        if (IS_ERR(em)) {
3464                ret = PTR_ERR(em);
3465                goto out;
3466        }
3467
3468        while (!end) {
3469                u64 offset_in_extent;
3470
3471                /* break if the extent we found is outside the range */
3472                if (em->start >= max || extent_map_end(em) < off)
3473                        break;
3474
3475                /*
3476                 * get_extent may return an extent that starts before our
3477                 * requested range.  We have to make sure the ranges
3478                 * we return to fiemap always move forward and don't
3479                 * overlap, so adjust the offsets here
3480                 */
3481                em_start = max(em->start, off);
3482
3483                /*
3484                 * record the offset from the start of the extent
3485                 * for adjusting the disk offset below
3486                 */
3487                offset_in_extent = em_start - em->start;
3488                em_end = extent_map_end(em);
3489                em_len = em_end - em_start;
3490                emflags = em->flags;
3491                disko = 0;
3492                flags = 0;
3493
3494                /*
3495                 * bump off for our next call to get_extent
3496                 */
3497                off = extent_map_end(em);
3498                if (off >= max)
3499                        end = 1;
3500
3501                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3502                        end = 1;
3503                        flags |= FIEMAP_EXTENT_LAST;
3504                } else if (em->block_start == EXTENT_MAP_INLINE) {
3505                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
3506                                  FIEMAP_EXTENT_NOT_ALIGNED);
3507                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
3508                        flags |= (FIEMAP_EXTENT_DELALLOC |
3509                                  FIEMAP_EXTENT_UNKNOWN);
3510                } else {
3511                        disko = em->block_start + offset_in_extent;
3512                }
3513                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3514                        flags |= FIEMAP_EXTENT_ENCODED;
3515
3516                free_extent_map(em);
3517                em = NULL;
3518                if ((em_start >= last) || em_len == (u64)-1 ||
3519                   (last == (u64)-1 && isize <= em_end)) {
3520                        flags |= FIEMAP_EXTENT_LAST;
3521                        end = 1;
3522                }
3523
3524                /* now scan forward to see if this is really the last extent. */
3525                em = get_extent_skip_holes(inode, off, last_for_get_extent,
3526                                           get_extent);
3527                if (IS_ERR(em)) {
3528                        ret = PTR_ERR(em);
3529                        goto out;
3530                }
3531                if (!em) {
3532                        flags |= FIEMAP_EXTENT_LAST;
3533                        end = 1;
3534                }
3535                ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3536                                              em_len, flags);
3537                if (ret)
3538                        goto out_free;
3539        }
3540out_free:
3541        free_extent_map(em);
3542out:
3543        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3544                             &cached_state, GFP_NOFS);
3545        return ret;
3546}
3547
3548inline struct page *extent_buffer_page(struct extent_buffer *eb,
3549                                              unsigned long i)
3550{
3551        struct page *p;
3552        struct address_space *mapping;
3553
3554        if (i == 0)
3555                return eb->first_page;
3556        i += eb->start >> PAGE_CACHE_SHIFT;
3557        mapping = eb->first_page->mapping;
3558        if (!mapping)
3559                return NULL;
3560
3561        /*
3562         * extent_buffer_page is only called after pinning the page
3563         * by increasing the reference count.  So we know the page must
3564         * be in the radix tree.
3565         */
3566        rcu_read_lock();
3567        p = radix_tree_lookup(&mapping->page_tree, i);
3568        rcu_read_unlock();
3569
3570        return p;
3571}
3572
3573inline unsigned long num_extent_pages(u64 start, u64 len)
3574{
3575        return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3576                (start >> PAGE_CACHE_SHIFT);
3577}
3578
3579static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3580                                                   u64 start,
3581                                                   unsigned long len,
3582                                                   gfp_t mask)
3583{
3584        struct extent_buffer *eb = NULL;
3585#if LEAK_DEBUG
3586        unsigned long flags;
3587#endif
3588
3589        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3590        if (eb == NULL)
3591                return NULL;
3592        eb->start = start;
3593        eb->len = len;
3594        rwlock_init(&eb->lock);
3595        atomic_set(&eb->write_locks, 0);
3596        atomic_set(&eb->read_locks, 0);
3597        atomic_set(&eb->blocking_readers, 0);
3598        atomic_set(&eb->blocking_writers, 0);
3599        atomic_set(&eb->spinning_readers, 0);
3600        atomic_set(&eb->spinning_writers, 0);
3601        eb->lock_nested = 0;
3602        init_waitqueue_head(&eb->write_lock_wq);
3603        init_waitqueue_head(&eb->read_lock_wq);
3604
3605#if LEAK_DEBUG
3606        spin_lock_irqsave(&leak_lock, flags);
3607        list_add(&eb->leak_list, &buffers);
3608        spin_unlock_irqrestore(&leak_lock, flags);
3609#endif
3610        atomic_set(&eb->refs, 1);
3611
3612        return eb;
3613}
3614
3615static void __free_extent_buffer(struct extent_buffer *eb)
3616{
3617#if LEAK_DEBUG
3618        unsigned long flags;
3619        spin_lock_irqsave(&leak_lock, flags);
3620        list_del(&eb->leak_list);
3621        spin_unlock_irqrestore(&leak_lock, flags);
3622#endif
3623        kmem_cache_free(extent_buffer_cache, eb);
3624}
3625
3626/*
3627 * Helper for releasing extent buffer page.
3628 */
3629static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3630                                                unsigned long start_idx)
3631{
3632        unsigned long index;
3633        struct page *page;
3634
3635        if (!eb->first_page)
3636                return;
3637
3638        index = num_extent_pages(eb->start, eb->len);
3639        if (start_idx >= index)
3640                return;
3641
3642        do {
3643                index--;
3644                page = extent_buffer_page(eb, index);
3645                if (page)
3646                        page_cache_release(page);
3647        } while (index != start_idx);
3648}
3649
3650/*
3651 * Helper for releasing the extent buffer.
3652 */
3653static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3654{
3655        btrfs_release_extent_buffer_page(eb, 0);
3656        __free_extent_buffer(eb);
3657}
3658
3659struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3660                                          u64 start, unsigned long len,
3661                                          struct page *page0)
3662{
3663        unsigned long num_pages = num_extent_pages(start, len);
3664        unsigned long i;
3665        unsigned long index = start >> PAGE_CACHE_SHIFT;
3666        struct extent_buffer *eb;
3667        struct extent_buffer *exists = NULL;
3668        struct page *p;
3669        struct address_space *mapping = tree->mapping;
3670        int uptodate = 1;
3671        int ret;
3672
3673        rcu_read_lock();
3674        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3675        if (eb && atomic_inc_not_zero(&eb->refs)) {
3676                rcu_read_unlock();
3677                mark_page_accessed(eb->first_page);
3678                return eb;
3679        }
3680        rcu_read_unlock();
3681
3682        eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3683        if (!eb)
3684                return NULL;
3685
3686        if (page0) {
3687                eb->first_page = page0;
3688                i = 1;
3689                index++;
3690                page_cache_get(page0);
3691                mark_page_accessed(page0);
3692                set_page_extent_mapped(page0);
3693                set_page_extent_head(page0, len);
3694                uptodate = PageUptodate(page0);
3695        } else {
3696                i = 0;
3697        }
3698        for (; i < num_pages; i++, index++) {
3699                p = find_or_create_page(mapping, index, GFP_NOFS);
3700                if (!p) {
3701                        WARN_ON(1);
3702                        goto free_eb;
3703                }
3704                set_page_extent_mapped(p);
3705                mark_page_accessed(p);
3706                if (i == 0) {
3707                        eb->first_page = p;
3708                        set_page_extent_head(p, len);
3709                } else {
3710                        set_page_private(p, EXTENT_PAGE_PRIVATE);
3711                }
3712                if (!PageUptodate(p))
3713                        uptodate = 0;
3714
3715                /*
3716                 * see below about how we avoid a nasty race with release page
3717                 * and why we unlock later
3718                 */
3719                if (i != 0)
3720                        unlock_page(p);
3721        }
3722        if (uptodate)
3723                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3724
3725        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3726        if (ret)
3727                goto free_eb;
3728
3729        spin_lock(&tree->buffer_lock);
3730        ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3731        if (ret == -EEXIST) {
3732                exists = radix_tree_lookup(&tree->buffer,
3733                                                start >> PAGE_CACHE_SHIFT);
3734                /* add one reference for the caller */
3735                atomic_inc(&exists->refs);
3736                spin_unlock(&tree->buffer_lock);
3737                radix_tree_preload_end();
3738                goto free_eb;
3739        }
3740        /* add one reference for the tree */
3741        atomic_inc(&eb->refs);
3742        spin_unlock(&tree->buffer_lock);
3743        radix_tree_preload_end();
3744
3745        /*
3746         * there is a race where release page may have
3747         * tried to find this extent buffer in the radix
3748         * but failed.  It will tell the VM it is safe to
3749         * reclaim the, and it will clear the page private bit.
3750         * We must make sure to set the page private bit properly
3751         * after the extent buffer is in the radix tree so
3752         * it doesn't get lost
3753         */
3754        set_page_extent_mapped(eb->first_page);
3755        set_page_extent_head(eb->first_page, eb->len);
3756        if (!page0)
3757                unlock_page(eb->first_page);
3758        return eb;
3759
3760free_eb:
3761        if (eb->first_page && !page0)
3762                unlock_page(eb->first_page);
3763
3764        if (!atomic_dec_and_test(&eb->refs))
3765                return exists;
3766        btrfs_release_extent_buffer(eb);
3767        return exists;
3768}
3769
3770struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3771                                         u64 start, unsigned long len)
3772{
3773        struct extent_buffer *eb;
3774
3775        rcu_read_lock();
3776        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3777        if (eb && atomic_inc_not_zero(&eb->refs)) {
3778                rcu_read_unlock();
3779                mark_page_accessed(eb->first_page);
3780                return eb;
3781        }
3782        rcu_read_unlock();
3783
3784        return NULL;
3785}
3786
3787void free_extent_buffer(struct extent_buffer *eb)
3788{
3789        if (!eb)
3790                return;
3791
3792        if (!atomic_dec_and_test(&eb->refs))
3793                return;
3794
3795        WARN_ON(1);
3796}
3797
3798int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3799                              struct extent_buffer *eb)
3800{
3801        unsigned long i;
3802        unsigned long num_pages;
3803        struct page *page;
3804
3805        num_pages = num_extent_pages(eb->start, eb->len);
3806
3807        for (i = 0; i < num_pages; i++) {
3808                page = extent_buffer_page(eb, i);
3809                if (!PageDirty(page))
3810                        continue;
3811
3812                lock_page(page);
3813                WARN_ON(!PagePrivate(page));
3814
3815                set_page_extent_mapped(page);
3816                if (i == 0)
3817                        set_page_extent_head(page, eb->len);
3818
3819                clear_page_dirty_for_io(page);
3820                spin_lock_irq(&page->mapping->tree_lock);
3821                if (!PageDirty(page)) {
3822                        radix_tree_tag_clear(&page->mapping->page_tree,
3823                                                page_index(page),
3824                                                PAGECACHE_TAG_DIRTY);
3825                }
3826                spin_unlock_irq(&page->mapping->tree_lock);
3827                ClearPageError(page);
3828                unlock_page(page);
3829        }
3830        return 0;
3831}
3832
3833int set_extent_buffer_dirty(struct extent_io_tree *tree,
3834                             struct extent_buffer *eb)
3835{
3836        unsigned long i;
3837        unsigned long num_pages;
3838        int was_dirty = 0;
3839
3840        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3841        num_pages = num_extent_pages(eb->start, eb->len);
3842        for (i = 0; i < num_pages; i++)
3843                __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3844        return was_dirty;
3845}
3846
3847static int __eb_straddles_pages(u64 start, u64 len)
3848{
3849        if (len < PAGE_CACHE_SIZE)
3850                return 1;
3851        if (start & (PAGE_CACHE_SIZE - 1))
3852                return 1;
3853        if ((start + len) & (PAGE_CACHE_SIZE - 1))
3854                return 1;
3855        return 0;
3856}
3857
3858static int eb_straddles_pages(struct extent_buffer *eb)
3859{
3860        return __eb_straddles_pages(eb->start, eb->len);
3861}
3862
3863int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3864                                struct extent_buffer *eb,
3865                                struct extent_state **cached_state)
3866{
3867        unsigned long i;
3868        struct page *page;
3869        unsigned long num_pages;
3870
3871        num_pages = num_extent_pages(eb->start, eb->len);
3872        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3873
3874        clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3875                              cached_state, GFP_NOFS);
3876
3877        for (i = 0; i < num_pages; i++) {
3878                page = extent_buffer_page(eb, i);
3879                if (page)
3880                        ClearPageUptodate(page);
3881        }
3882        return 0;
3883}
3884
3885int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3886                                struct extent_buffer *eb)
3887{
3888        unsigned long i;
3889        struct page *page;
3890        unsigned long num_pages;
3891
3892        num_pages = num_extent_pages(eb->start, eb->len);
3893
3894        if (eb_straddles_pages(eb)) {
3895                set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3896                                    NULL, GFP_NOFS);
3897        }
3898        for (i = 0; i < num_pages; i++) {
3899                page = extent_buffer_page(eb, i);
3900                if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3901                    ((i == num_pages - 1) &&
3902                     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3903                        check_page_uptodate(tree, page);
3904                        continue;
3905                }
3906                SetPageUptodate(page);
3907        }
3908        return 0;
3909}
3910
3911int extent_range_uptodate(struct extent_io_tree *tree,
3912                          u64 start, u64 end)
3913{
3914        struct page *page;
3915        int ret;
3916        int pg_uptodate = 1;
3917        int uptodate;
3918        unsigned long index;
3919
3920        if (__eb_straddles_pages(start, end - start + 1)) {
3921                ret = test_range_bit(tree, start, end,
3922                                     EXTENT_UPTODATE, 1, NULL);
3923                if (ret)
3924                        return 1;
3925        }
3926        while (start <= end) {
3927                index = start >> PAGE_CACHE_SHIFT;
3928                page = find_get_page(tree->mapping, index);
3929                if (!page)
3930                        return 1;
3931                uptodate = PageUptodate(page);
3932                page_cache_release(page);
3933                if (!uptodate) {
3934                        pg_uptodate = 0;
3935                        break;
3936                }
3937                start += PAGE_CACHE_SIZE;
3938        }
3939        return pg_uptodate;
3940}
3941
3942int extent_buffer_uptodate(struct extent_io_tree *tree,
3943                           struct extent_buffer *eb,
3944                           struct extent_state *cached_state)
3945{
3946        int ret = 0;
3947        unsigned long num_pages;
3948        unsigned long i;
3949        struct page *page;
3950        int pg_uptodate = 1;
3951
3952        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3953                return 1;
3954
3955        if (eb_straddles_pages(eb)) {
3956                ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3957                                   EXTENT_UPTODATE, 1, cached_state);
3958                if (ret)
3959                        return ret;
3960        }
3961
3962        num_pages = num_extent_pages(eb->start, eb->len);
3963        for (i = 0; i < num_pages; i++) {
3964                page = extent_buffer_page(eb, i);
3965                if (!PageUptodate(page)) {
3966                        pg_uptodate = 0;
3967                        break;
3968                }
3969        }
3970        return pg_uptodate;
3971}
3972
3973int read_extent_buffer_pages(struct extent_io_tree *tree,
3974                             struct extent_buffer *eb, u64 start, int wait,
3975                             get_extent_t *get_extent, int mirror_num)
3976{
3977        unsigned long i;
3978        unsigned long start_i;
3979        struct page *page;
3980        int err;
3981        int ret = 0;
3982        int locked_pages = 0;
3983        int all_uptodate = 1;
3984        int inc_all_pages = 0;
3985        unsigned long num_pages;
3986        struct bio *bio = NULL;
3987        unsigned long bio_flags = 0;
3988
3989        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3990                return 0;
3991
3992        if (eb_straddles_pages(eb)) {
3993                if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3994                                   EXTENT_UPTODATE, 1, NULL)) {
3995                        return 0;
3996                }
3997        }
3998
3999        if (start) {
4000                WARN_ON(start < eb->start);
4001                start_i = (start >> PAGE_CACHE_SHIFT) -
4002                        (eb->start >> PAGE_CACHE_SHIFT);
4003        } else {
4004                start_i = 0;
4005        }
4006
4007        num_pages = num_extent_pages(eb->start, eb->len);
4008        for (i = start_i; i < num_pages; i++) {
4009                page = extent_buffer_page(eb, i);
4010                if (wait == WAIT_NONE) {
4011                        if (!trylock_page(page))
4012                                goto unlock_exit;
4013                } else {
4014                        lock_page(page);
4015                }
4016                locked_pages++;
4017                if (!PageUptodate(page))
4018                        all_uptodate = 0;
4019        }
4020        if (all_uptodate) {
4021                if (start_i == 0)
4022                        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4023                goto unlock_exit;
4024        }
4025
4026        for (i = start_i; i < num_pages; i++) {
4027                page = extent_buffer_page(eb, i);
4028
4029                WARN_ON(!PagePrivate(page));
4030
4031                set_page_extent_mapped(page);
4032                if (i == 0)
4033                        set_page_extent_head(page, eb->len);
4034
4035                if (inc_all_pages)
4036                        page_cache_get(page);
4037                if (!PageUptodate(page)) {
4038                        if (start_i == 0)
4039                                inc_all_pages = 1;
4040                        ClearPageError(page);
4041                        err = __extent_read_full_page(tree, page,
4042                                                      get_extent, &bio,
4043                                                      mirror_num, &bio_flags);
4044                        if (err)
4045                                ret = err;
4046                } else {
4047                        unlock_page(page);
4048                }
4049        }
4050
4051        if (bio)
4052                submit_one_bio(READ, bio, mirror_num, bio_flags);
4053
4054        if (ret || wait != WAIT_COMPLETE)
4055                return ret;
4056
4057        for (i = start_i; i < num_pages; i++) {
4058                page = extent_buffer_page(eb, i);
4059                wait_on_page_locked(page);
4060                if (!PageUptodate(page))
4061                        ret = -EIO;
4062        }
4063
4064        if (!ret)
4065                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4066        return ret;
4067
4068unlock_exit:
4069        i = start_i;
4070        while (locked_pages > 0) {
4071                page = extent_buffer_page(eb, i);
4072                i++;
4073                unlock_page(page);
4074                locked_pages--;
4075        }
4076        return ret;
4077}
4078
4079void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4080                        unsigned long start,
4081                        unsigned long len)
4082{
4083        size_t cur;
4084        size_t offset;
4085        struct page *page;
4086        char *kaddr;
4087        char *dst = (char *)dstv;
4088        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4089        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4090
4091        WARN_ON(start > eb->len);
4092        WARN_ON(start + len > eb->start + eb->len);
4093
4094        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4095
4096        while (len > 0) {
4097                page = extent_buffer_page(eb, i);
4098
4099                cur = min(len, (PAGE_CACHE_SIZE - offset));
4100                kaddr = page_address(page);
4101                memcpy(dst, kaddr + offset, cur);
4102
4103                dst += cur;
4104                len -= cur;
4105                offset = 0;
4106                i++;
4107        }
4108}
4109
4110int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4111                               unsigned long min_len, char **map,
4112                               unsigned long *map_start,
4113                               unsigned long *map_len)
4114{
4115        size_t offset = start & (PAGE_CACHE_SIZE - 1);
4116        char *kaddr;
4117        struct page *p;
4118        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4119        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4120        unsigned long end_i = (start_offset + start + min_len - 1) >>
4121                PAGE_CACHE_SHIFT;
4122
4123        if (i != end_i)
4124                return -EINVAL;
4125
4126        if (i == 0) {
4127                offset = start_offset;
4128                *map_start = 0;
4129        } else {
4130                offset = 0;
4131                *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4132        }
4133
4134        if (start + min_len > eb->len) {
4135                printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4136                       "wanted %lu %lu\n", (unsigned long long)eb->start,
4137                       eb->len, start, min_len);
4138                WARN_ON(1);
4139                return -EINVAL;
4140        }
4141
4142        p = extent_buffer_page(eb, i);
4143        kaddr = page_address(p);
4144        *map = kaddr + offset;
4145        *map_len = PAGE_CACHE_SIZE - offset;
4146        return 0;
4147}
4148
4149int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4150                          unsigned long start,
4151                          unsigned long len)
4152{
4153        size_t cur;
4154        size_t offset;
4155        struct page *page;
4156        char *kaddr;
4157        char *ptr = (char *)ptrv;
4158        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4159        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4160        int ret = 0;
4161
4162        WARN_ON(start > eb->len);
4163        WARN_ON(start + len > eb->start + eb->len);
4164
4165        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4166
4167        while (len > 0) {
4168                page = extent_buffer_page(eb, i);
4169
4170                cur = min(len, (PAGE_CACHE_SIZE - offset));
4171
4172                kaddr = page_address(page);
4173                ret = memcmp(ptr, kaddr + offset, cur);
4174                if (ret)
4175                        break;
4176
4177                ptr += cur;
4178                len -= cur;
4179                offset = 0;
4180                i++;
4181        }
4182        return ret;
4183}
4184
4185void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4186                         unsigned long start, unsigned long len)
4187{
4188        size_t cur;
4189        size_t offset;
4190        struct page *page;
4191        char *kaddr;
4192        char *src = (char *)srcv;
4193        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4194        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4195
4196        WARN_ON(start > eb->len);
4197        WARN_ON(start + len > eb->start + eb->len);
4198
4199        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4200
4201        while (len > 0) {
4202                page = extent_buffer_page(eb, i);
4203                WARN_ON(!PageUptodate(page));
4204
4205                cur = min(len, PAGE_CACHE_SIZE - offset);
4206                kaddr = page_address(page);
4207                memcpy(kaddr + offset, src, cur);
4208
4209                src += cur;
4210                len -= cur;
4211                offset = 0;
4212                i++;
4213        }
4214}
4215
4216void memset_extent_buffer(struct extent_buffer *eb, char c,
4217                          unsigned long start, unsigned long len)
4218{
4219        size_t cur;
4220        size_t offset;
4221        struct page *page;
4222        char *kaddr;
4223        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4224        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4225
4226        WARN_ON(start > eb->len);
4227        WARN_ON(start + len > eb->start + eb->len);
4228
4229        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4230
4231        while (len > 0) {
4232                page = extent_buffer_page(eb, i);
4233                WARN_ON(!PageUptodate(page));
4234
4235                cur = min(len, PAGE_CACHE_SIZE - offset);
4236                kaddr = page_address(page);
4237                memset(kaddr + offset, c, cur);
4238
4239                len -= cur;
4240                offset = 0;
4241                i++;
4242        }
4243}
4244
4245void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4246                        unsigned long dst_offset, unsigned long src_offset,
4247                        unsigned long len)
4248{
4249        u64 dst_len = dst->len;
4250        size_t cur;
4251        size_t offset;
4252        struct page *page;
4253        char *kaddr;
4254        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4255        unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4256
4257        WARN_ON(src->len != dst_len);
4258
4259        offset = (start_offset + dst_offset) &
4260                ((unsigned long)PAGE_CACHE_SIZE - 1);
4261
4262        while (len > 0) {
4263                page = extent_buffer_page(dst, i);
4264                WARN_ON(!PageUptodate(page));
4265
4266                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4267
4268                kaddr = page_address(page);
4269                read_extent_buffer(src, kaddr + offset, src_offset, cur);
4270
4271                src_offset += cur;
4272                len -= cur;
4273                offset = 0;
4274                i++;
4275        }
4276}
4277
4278static void move_pages(struct page *dst_page, struct page *src_page,
4279                       unsigned long dst_off, unsigned long src_off,
4280                       unsigned long len)
4281{
4282        char *dst_kaddr = page_address(dst_page);
4283        if (dst_page == src_page) {
4284                memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4285        } else {
4286                char *src_kaddr = page_address(src_page);
4287                char *p = dst_kaddr + dst_off + len;
4288                char *s = src_kaddr + src_off + len;
4289
4290                while (len--)
4291                        *--p = *--s;
4292        }
4293}
4294
4295static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4296{
4297        unsigned long distance = (src > dst) ? src - dst : dst - src;
4298        return distance < len;
4299}
4300
4301static void copy_pages(struct page *dst_page, struct page *src_page,
4302                       unsigned long dst_off, unsigned long src_off,
4303                       unsigned long len)
4304{
4305        char *dst_kaddr = page_address(dst_page);
4306        char *src_kaddr;
4307
4308        if (dst_page != src_page) {
4309                src_kaddr = page_address(src_page);
4310        } else {
4311                src_kaddr = dst_kaddr;
4312                BUG_ON(areas_overlap(src_off, dst_off, len));
4313        }
4314
4315        memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4316}
4317
4318void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4319                           unsigned long src_offset, unsigned long len)
4320{
4321        size_t cur;
4322        size_t dst_off_in_page;
4323        size_t src_off_in_page;
4324        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4325        unsigned long dst_i;
4326        unsigned long src_i;
4327
4328        if (src_offset + len > dst->len) {
4329                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4330                       "len %lu dst len %lu\n", src_offset, len, dst->len);
4331                BUG_ON(1);
4332        }
4333        if (dst_offset + len > dst->len) {
4334                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4335                       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4336                BUG_ON(1);
4337        }
4338
4339        while (len > 0) {
4340                dst_off_in_page = (start_offset + dst_offset) &
4341                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4342                src_off_in_page = (start_offset + src_offset) &
4343                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4344
4345                dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4346                src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4347
4348                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4349                                               src_off_in_page));
4350                cur = min_t(unsigned long, cur,
4351                        (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4352
4353                copy_pages(extent_buffer_page(dst, dst_i),
4354                           extent_buffer_page(dst, src_i),
4355                           dst_off_in_page, src_off_in_page, cur);
4356
4357                src_offset += cur;
4358                dst_offset += cur;
4359                len -= cur;
4360        }
4361}
4362
4363void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4364                           unsigned long src_offset, unsigned long len)
4365{
4366        size_t cur;
4367        size_t dst_off_in_page;
4368        size_t src_off_in_page;
4369        unsigned long dst_end = dst_offset + len - 1;
4370        unsigned long src_end = src_offset + len - 1;
4371        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4372        unsigned long dst_i;
4373        unsigned long src_i;
4374
4375        if (src_offset + len > dst->len) {
4376                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4377                       "len %lu len %lu\n", src_offset, len, dst->len);
4378                BUG_ON(1);
4379        }
4380        if (dst_offset + len > dst->len) {
4381                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4382                       "len %lu len %lu\n", dst_offset, len, dst->len);
4383                BUG_ON(1);
4384        }
4385        if (!areas_overlap(src_offset, dst_offset, len)) {
4386                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4387                return;
4388        }
4389        while (len > 0) {
4390                dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4391                src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4392
4393                dst_off_in_page = (start_offset + dst_end) &
4394                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4395                src_off_in_page = (start_offset + src_end) &
4396                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4397
4398                cur = min_t(unsigned long, len, src_off_in_page + 1);
4399                cur = min(cur, dst_off_in_page + 1);
4400                move_pages(extent_buffer_page(dst, dst_i),
4401                           extent_buffer_page(dst, src_i),
4402                           dst_off_in_page - cur + 1,
4403                           src_off_in_page - cur + 1, cur);
4404
4405                dst_end -= cur;
4406                src_end -= cur;
4407                len -= cur;
4408        }
4409}
4410
4411static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4412{
4413        struct extent_buffer *eb =
4414                        container_of(head, struct extent_buffer, rcu_head);
4415
4416        btrfs_release_extent_buffer(eb);
4417}
4418
4419int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4420{
4421        u64 start = page_offset(page);
4422        struct extent_buffer *eb;
4423        int ret = 1;
4424
4425        spin_lock(&tree->buffer_lock);
4426        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4427        if (!eb) {
4428                spin_unlock(&tree->buffer_lock);
4429                return ret;
4430        }
4431
4432        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4433                ret = 0;
4434                goto out;
4435        }
4436
4437        /*
4438         * set @eb->refs to 0 if it is already 1, and then release the @eb.
4439         * Or go back.
4440         */
4441        if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
4442                ret = 0;
4443                goto out;
4444        }
4445
4446        radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4447out:
4448        spin_unlock(&tree->buffer_lock);
4449
4450        /* at this point we can safely release the extent buffer */
4451        if (atomic_read(&eb->refs) == 0)
4452                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4453        return ret;
4454}
4455
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.