linux/fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}

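/*
 * Look up a device on @head by devid; if @uuid is non-NULL the device
 * UUID must match as well, otherwise devid alone decides.
 */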
static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {
		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;
		if (need_resched())
			cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				if (need_resched())
					cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
		if (batch_run % 64 == 0) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

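/*
 * Note a device found during a scan.  If the fsid is new, a fresh
 * btrfs_fs_devices is created; if the device is new, it is added to the
 * existing fs_devices (unless the fs is already open, -EBUSY); if only
 * the path changed, the recorded name is updated.  The newest generation
 * seen so far is tracked in latest_devid/latest_trans.
 */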
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
		if (!fs_devices)
			return -ENOMEM;
		INIT_LIST_HEAD(&fs_devices->devices);
		INIT_LIST_HEAD(&fs_devices->alloc_list);
		list_add(&fs_devices->list, &fs_uuids);
		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
		mutex_init(&fs_devices->device_list_mutex);
		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}
	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device) {
			/* we can safely leave the fs_devices entry around */
			return -ENOMEM;
		}
		device->devid = devid;
		device->dev_stats_valid = 0;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, disk_super->dev_item.uuid,
		       BTRFS_UUID_SIZE);
		spin_lock_init(&device->io_lock);

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			kfree(device);
			return -ENOMEM;
		}
		rcu_assign_pointer(device->name, name);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		/* init readahead state */
		spin_lock_init(&device->reada_lock);
		device->reada_curr_zone = NULL;
		atomic_set(&device->reada_in_flight, 0);
		device->reada_next = 0;
		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	} else if (!device->name || strcmp(device->name->str, path)) {
		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	if (found_transid > fs_devices->latest_trans) {
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
	return 0;
}

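/*
 * Duplicate the device list (devid, uuid, name) of @orig.  The copies
 * carry no open block device and sit on no allocation list; this is how
 * a record of the seed devices is kept under the old fsid when a seed
 * filesystem is sprouted (see btrfs_prepare_sprout() below).
 */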
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!fs_devices)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&fs_devices->devices);
	INIT_LIST_HEAD(&fs_devices->alloc_list);
	INIT_LIST_HEAD(&fs_devices->list);
	mutex_init(&fs_devices->device_list_mutex);
	fs_devices->latest_devid = orig->latest_devid;
	fs_devices->latest_trans = orig->latest_trans;
	fs_devices->total_devices = orig->total_devices;
	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device)
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
		if (!name) {
			kfree(device);
			goto error;
		}
		rcu_assign_pointer(device->name, name);

		device->devid = orig_dev->devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
		spin_lock_init(&device->io_lock);
		INIT_LIST_HEAD(&device->dev_list);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

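/*
 * Drop devices that were scanned but turned out not to be referenced by
 * the filesystem metadata: close their block devices and free them.
 * Also recompute which remaining device carries the newest generation.
 */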
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *next;
	struct block_device *latest_bdev = NULL;
	u64 latest_devid = 0;
	u64 latest_transid = 0;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata) {
			if (!latest_transid ||
			    device->generation > latest_transid) {
				latest_devid = device->devid;
				latest_transid = device->generation;
				latest_bdev = device->bdev;
			}
			continue;
		}

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		rcu_string_free(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;

	mutex_unlock(&uuid_mutex);
}

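/*
 * Freeing a btrfs_device happens in two stages: free_device() runs as an
 * RCU callback and only schedules __free_device() on a workqueue, because
 * blkdev_put() may sleep and therefore must not be called from RCU
 * callback (softirq) context.
 */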
static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);

	rcu_string_free(device->name);
	kfree(device);
}

static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}

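/*
 * Drop one open reference on @fs_devices.  On the final close, every
 * device on the list is replaced (under RCU) by a copy without an open
 * bdev, and the old structure is freed via call_rcu() so that lockless
 * readers of the device list never see a stale pointer.
 */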
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		struct btrfs_device *new_device;
		struct rcu_string *name;

		if (device->bdev)
			fs_devices->open_devices--;

		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			fs_devices->rw_devices--;
		}

		if (device->can_discard)
			fs_devices->num_can_discard--;

		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
		BUG_ON(!new_device); /* -ENOMEM */
		memcpy(new_device, device, sizeof(*new_device));

		/* Safe because we are under uuid_mutex */
		if (device->name) {
			name = rcu_string_strdup(device->name->str, GFP_NOFS);
			BUG_ON(device->name && !name); /* -ENOMEM */
			rcu_assign_pointer(new_device->name, name);
		}
		new_device->bdev = NULL;
		new_device->writeable = 0;
		new_device->in_fs_metadata = 0;
		new_device->can_discard = 0;
		list_replace_rcu(&device->dev_list, &new_device->dev_list);

		call_rcu(&device->rcu, free_device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

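/*
 * Open every scanned device of @fs_devices and verify that the on-disk
 * superblock still matches the devid/uuid recorded at scan time.  Devices
 * that fail to open or to validate are simply skipped; the open only
 * fails with -EINVAL if no device at all could be opened.  The device
 * with the newest generation becomes latest_bdev.
 */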
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		bdev = blkdev_get_by_path(device->name->str, flags, holder);
		if (IS_ERR(bdev)) {
			printk(KERN_INFO "open %s failed\n", device->name->str);
			goto error;
		}
		filemap_write_and_wait(bdev->bd_inode->i_mapping);
		invalidate_bdev(bdev);
		set_blocksize(bdev, 4096);

		bh = btrfs_read_dev_super(bdev);
		if (!bh)
			goto error_close;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q)) {
			device->can_discard = 1;
			fs_devices->num_can_discard++;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
error_close:
		blkdev_put(bdev, flags);
error:
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

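/*
 * Read the superblock of a single device and register it with
 * device_list_add().  This is the workhorse behind scanning a device for
 * btrfs membership; it also prints the label (or fsid), devid and
 * transid of what it found.
 */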
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct buffer_head *bh;
	int ret;
	u64 devid;
	u64 transid;
	u64 total_devices;

	flags |= FMODE_EXCL;
	bdev = blkdev_get_by_path(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	mutex_lock(&uuid_mutex);
	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto error_close;
	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EINVAL;
		goto error_close;
	}
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	total_devices = btrfs_super_num_devices(disk_super);
	if (disk_super->label[0])
		printk(KERN_INFO "device label %s ", disk_super->label);
	else
		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;
	brelse(bh);
error_close:
	mutex_unlock(&uuid_mutex);
	blkdev_put(bdev, flags);
error:
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		/*
		 * Four overlap cases: the extent can cover the whole range,
		 * overlap its start, sit entirely inside it, or overlap its
		 * end.  Anything starting past @end terminates the walk.
		 */
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device in which we search for free space
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space
 * @len:        the size of the free space that we find, or the size of the
 *              largest free space if we don't find anything suitable
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space that we find.  But
 * if we don't find suitable free space, it stores the start position of
 * the largest free space instead.
 *
 * @len is used to store the size of the free space that we find.  But if
 * we don't find suitable free space, it is used to store the size of the
 * largest free space.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_start;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

	max_hole_start = search_start;
	max_hole_size = 0;
	hole_size = 0;

	if (search_start >= search_end) {
		ret = -ENOSPC;
		goto error;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the largest free space that we have
			 * found so far, so max_hole_start already points
			 * to the start of this free space and its length
			 * is stored in max_hole_size.  Thus, we return
			 * max_hole_start and max_hole_size and go back to
			 * the caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start)
		hole_size = search_end - search_start;

	if (hole_size > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = hole_size;
	}

	/* See above. */
	if (hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
error:
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

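/*
 * Remove the dev extent item covering @start on @device and return the
 * freed bytes to the device and free_chunk accounting.  If the search
 * overshoots, step back to the previous dev extent and verify that it
 * really covers @start before deleting it.
 */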
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_error(root->fs_info, ret, "Slot search failed");
		goto out;
	}

	if (device->bytes_used > 0) {
		u64 len = btrfs_dev_extent_length(leaf, extent);
		device->bytes_used -= len;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += len;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_error(root->fs_info, ret,
			    "Failed to remove dev extent item");
	}
out:
	btrfs_free_path(path);
	return ret;
}

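/*
 * Insert a dev extent item mapping [@start, @start + @num_bytes) on
 * @device back to the owning chunk, identified by @chunk_tree,
 * @chunk_objectid and @chunk_offset.
 */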
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
		    BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

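/*
 * Find the logical offset right after the last chunk item with the given
 * objectid, i.e. the first offset where a new chunk may be placed.
 */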
static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid)
			*offset = 0;
		else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

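/*
 * Find the smallest unused devid: one past the highest DEV_ITEM key
 * offset in the chunk root, or 1 if no device item exists yet.
 */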
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*objectid = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*objectid = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = (unsigned long)btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

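/*
 * Delete the DEV_ITEM of @device from the chunk root, in a transaction
 * of its own.
 */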
static int btrfs_rm_dev_item(struct btrfs_root *root,
			     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;
	lock_chunks(root);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;
out:
	btrfs_free_path(path);
	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);
	return ret;
}

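/*
 * Remove a device from a mounted filesystem.  @device_path names the
 * device, or is the literal string "missing" to pick the first device
 * known to the metadata but absent at the block layer.  Removal is
 * refused if it would drop a RAID1/RAID10 profile below its minimum
 * device count or remove the last writeable device.  The device is
 * shrunk to zero, its items are deleted, and its superblock magic is
 * wiped so it is no longer detected as part of the filesystem.
 */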
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
	struct btrfs_device *device;
	struct btrfs_device *next_device;
	struct block_device *bdev;
	struct buffer_head *bh = NULL;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_devices *cur_devices;
	u64 all_avail;
	u64 devid;
	u64 num_devices;
	u8 *dev_uuid;
	int ret = 0;
	bool clear_super = false;

	mutex_lock(&uuid_mutex);

	all_avail = root->fs_info->avail_data_alloc_bits |
		root->fs_info->avail_system_alloc_bits |
		root->fs_info->avail_metadata_alloc_bits;

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
	    root->fs_info->fs_devices->num_devices <= 4) {
		printk(KERN_ERR "btrfs: unable to go below four devices "
		       "on raid10\n");
		ret = -EINVAL;
		goto out;
	}

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
	    root->fs_info->fs_devices->num_devices <= 2) {
		printk(KERN_ERR "btrfs: unable to go below two "
		       "devices on raid1\n");
		ret = -EINVAL;
		goto out;
	}

	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		device = NULL;
		devices = &root->fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				device = tmp;
				break;
			}
		}
		bdev = NULL;
		bh = NULL;
		disk_super = NULL;
		if (!device) {
			printk(KERN_ERR "btrfs: no missing devices found to "
			       "remove\n");
			goto out;
		}
	} else {
		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
					  root->fs_info->bdev_holder);
		if (IS_ERR(bdev)) {
			ret = PTR_ERR(bdev);
			goto out;
		}

		set_blocksize(bdev, 4096);
		invalidate_bdev(bdev);
		bh = btrfs_read_dev_super(bdev);
		if (!bh) {
			ret = -EINVAL;
			goto error_close;
		}
		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		dev_uuid = disk_super->dev_item.uuid;
		device = btrfs_find_device(root, devid, dev_uuid,
					   disk_super->fsid);
		if (!device) {
			ret = -ENOENT;
			goto error_brelse;
		}
	}

	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
		printk(KERN_ERR "btrfs: unable to remove the only writeable "
		       "device\n");
		ret = -EINVAL;
		goto error_brelse;
	}

	if (device->writeable) {
		lock_chunks(root);
		list_del_init(&device->dev_alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices--;
		clear_super = true;
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
	if (ret)
		goto error_undo;

	spin_lock(&root->fs_info->free_chunk_lock);
	root->fs_info->free_chunk_space = device->total_bytes -
		device->bytes_used;
	spin_unlock(&root->fs_info->free_chunk_lock);

	device->in_fs_metadata = 0;
	btrfs_scrub_cancel_dev(root, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (device->missing)
		root->fs_info->fs_devices->missing_devices--;

	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (device->bdev == root->fs_info->sb->s_bdev)
		root->fs_info->sb->s_bdev = next_device->bdev;
	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
		root->fs_info->fs_devices->latest_bdev = next_device->bdev;

	if (device->bdev)
		device->fs_devices->open_devices--;

	call_rcu(&device->rcu, free_device);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = root->fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices)
				break;
			fs_devices = fs_devices->seed;
		}
		fs_devices->seed = cur_devices->seed;
		cur_devices->seed = NULL;
		lock_chunks(root);
		__btrfs_close_devices(cur_devices);
		unlock_chunks(root);
		free_fs_devices(cur_devices);
	}

	/*
	 * at this point, the device is zero sized.  We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (clear_super) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
	mutex_unlock(&uuid_mutex);
	return ret;
error_undo:
	if (device->writeable) {
		lock_chunks(root);
		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices++;
	}
	goto error_brelse;
}

1512/*
1513 * does all the dirty work required for changing the file system's UUID:
     * the existing devices are handed over to a new "seed" fs_devices
     * struct and the live filesystem carries on under a freshly
     * generated fsid.
1514 */
1515static int btrfs_prepare_sprout(struct btrfs_root *root)
1516{
1517        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1518        struct btrfs_fs_devices *old_devices;
1519        struct btrfs_fs_devices *seed_devices;
1520        struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1521        struct btrfs_device *device;
1522        u64 super_flags;
1523
1524        BUG_ON(!mutex_is_locked(&uuid_mutex));
1525        if (!fs_devices->seeding)
1526                return -EINVAL;
1527
1528        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1529        if (!seed_devices)
1530                return -ENOMEM;
1531
1532        old_devices = clone_fs_devices(fs_devices);
1533        if (IS_ERR(old_devices)) {
1534                kfree(seed_devices);
1535                return PTR_ERR(old_devices);
1536        }
1537
1538        list_add(&old_devices->list, &fs_uuids);
1539
1540        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1541        seed_devices->opened = 1;
1542        INIT_LIST_HEAD(&seed_devices->devices);
1543        INIT_LIST_HEAD(&seed_devices->alloc_list);
1544        mutex_init(&seed_devices->device_list_mutex);
1545
1546        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1547        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1548                              synchronize_rcu);
1549        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1550
1551        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1552        list_for_each_entry(device, &seed_devices->devices, dev_list) {
1553                device->fs_devices = seed_devices;
1554        }
1555
1556        fs_devices->seeding = 0;
1557        fs_devices->num_devices = 0;
1558        fs_devices->open_devices = 0;
1559        fs_devices->total_devices = 0;
1560        fs_devices->seed = seed_devices;
1561
1562        generate_random_uuid(fs_devices->fsid);
1563        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1564        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1565        super_flags = btrfs_super_flags(disk_super) &
1566                      ~BTRFS_SUPER_FLAG_SEEDING;
1567        btrfs_set_super_flags(disk_super, super_flags);
1568
1569        return 0;
1570}
1571
1572/*
1573 * store the expected generation for seed devices in device items.
1574 */
1575static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1576                               struct btrfs_root *root)
1577{
1578        struct btrfs_path *path;
1579        struct extent_buffer *leaf;
1580        struct btrfs_dev_item *dev_item;
1581        struct btrfs_device *device;
1582        struct btrfs_key key;
1583        u8 fs_uuid[BTRFS_UUID_SIZE];
1584        u8 dev_uuid[BTRFS_UUID_SIZE];
1585        u64 devid;
1586        int ret;
1587
1588        path = btrfs_alloc_path();
1589        if (!path)
1590                return -ENOMEM;
1591
1592        root = root->fs_info->chunk_root;
1593        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1594        key.offset = 0;
1595        key.type = BTRFS_DEV_ITEM_KEY;
1596
1597        while (1) {
1598                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1599                if (ret < 0)
1600                        goto error;
1601
1602                leaf = path->nodes[0];
1603next_slot:
1604                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1605                        ret = btrfs_next_leaf(root, path);
1606                        if (ret > 0)
1607                                break;
1608                        if (ret < 0)
1609                                goto error;
1610                        leaf = path->nodes[0];
1611                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1612                        btrfs_release_path(path);
1613                        continue;
1614                }
1615
1616                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1617                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1618                    key.type != BTRFS_DEV_ITEM_KEY)
1619                        break;
1620
1621                dev_item = btrfs_item_ptr(leaf, path->slots[0],
1622                                          struct btrfs_dev_item);
1623                devid = btrfs_device_id(leaf, dev_item);
1624                read_extent_buffer(leaf, dev_uuid,
1625                                   (unsigned long)btrfs_device_uuid(dev_item),
1626                                   BTRFS_UUID_SIZE);
1627                read_extent_buffer(leaf, fs_uuid,
1628                                   (unsigned long)btrfs_device_fsid(dev_item),
1629                                   BTRFS_UUID_SIZE);
1630                device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1631                BUG_ON(!device); /* Logic error */
1632
1633                if (device->fs_devices->seeding) {
1634                        btrfs_set_device_generation(leaf, dev_item,
1635                                                    device->generation);
1636                        btrfs_mark_buffer_dirty(leaf);
1637                }
1638
1639                path->slots[0]++;
1640                goto next_slot;
1641        }
1642        ret = 0;
1643error:
1644        btrfs_free_path(path);
1645        return ret;
1646}
1647
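    /*
     * Add a new writable device to a mounted filesystem: open the block
     * device exclusively, allocate and initialize the btrfs_device, link
     * it into fs_devices under device_list_mutex and grow the super's
     * total_bytes and num_devices.  On a seeding filesystem this first
     * sprouts a new fsid (btrfs_prepare_sprout) and then records the
     * seed device generations (btrfs_finish_sprout).
     */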
1648int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1649{
1650        struct request_queue *q;
1651        struct btrfs_trans_handle *trans;
1652        struct btrfs_device *device;
1653        struct block_device *bdev;
1654        struct list_head *devices;
1655        struct super_block *sb = root->fs_info->sb;
1656        struct rcu_string *name;
1657        u64 total_bytes;
1658        int seeding_dev = 0;
1659        int ret = 0;
1660
1661        if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1662                return -EROFS;
1663
1664        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1665                                  root->fs_info->bdev_holder);
1666        if (IS_ERR(bdev))
1667                return PTR_ERR(bdev);
1668
1669        if (root->fs_info->fs_devices->seeding) {
1670                seeding_dev = 1;
1671                down_write(&sb->s_umount);
1672                mutex_lock(&uuid_mutex);
1673        }
1674
1675        filemap_write_and_wait(bdev->bd_inode->i_mapping);
1676
1677        devices = &root->fs_info->fs_devices->devices;
1678        /*
1679         * we have the volume lock, so we don't need the extra
1680         * device list mutex while reading the list here.
1681         */
1682        list_for_each_entry(device, devices, dev_list) {
1683                if (device->bdev == bdev) {
1684                        ret = -EEXIST;
1685                        goto error;
1686                }
1687        }
1688
1689        device = kzalloc(sizeof(*device), GFP_NOFS);
1690        if (!device) {
1691                /* we can safely leave the fs_devices entry around */
1692                ret = -ENOMEM;
1693                goto error;
1694        }
1695
1696        name = rcu_string_strdup(device_path, GFP_NOFS);
1697        if (!name) {
1698                kfree(device);
1699                ret = -ENOMEM;
1700                goto error;
1701        }
1702        rcu_assign_pointer(device->name, name);
1703
1704        ret = find_next_devid(root, &device->devid);
1705        if (ret) {
1706                rcu_string_free(device->name);
1707                kfree(device);
1708                goto error;
1709        }
1710
1711        trans = btrfs_start_transaction(root, 0);
1712        if (IS_ERR(trans)) {
1713                rcu_string_free(device->name);
1714                kfree(device);
1715                ret = PTR_ERR(trans);
1716                goto error;
1717        }
1718
1719        lock_chunks(root);
1720
1721        q = bdev_get_queue(bdev);
1722        if (blk_queue_discard(q))
1723                device->can_discard = 1;
1724        device->writeable = 1;
1725        device->work.func = pending_bios_fn;
1726        generate_random_uuid(device->uuid);
1727        spin_lock_init(&device->io_lock);
1728        device->generation = trans->transid;
1729        device->io_width = root->sectorsize;
1730        device->io_align = root->sectorsize;
1731        device->sector_size = root->sectorsize;
1732        device->total_bytes = i_size_read(bdev->bd_inode);
1733        device->disk_total_bytes = device->total_bytes;
1734        device->dev_root = root->fs_info->dev_root;
1735        device->bdev = bdev;
1736        device->in_fs_metadata = 1;
1737        device->mode = FMODE_EXCL;
1738        set_blocksize(device->bdev, 4096);
1739
1740        if (seeding_dev) {
1741                sb->s_flags &= ~MS_RDONLY;
1742                ret = btrfs_prepare_sprout(root);
1743                BUG_ON(ret); /* -ENOMEM */
1744        }
1745
1746        device->fs_devices = root->fs_info->fs_devices;
1747
1748        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1749        list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1750        list_add(&device->dev_alloc_list,
1751                 &root->fs_info->fs_devices->alloc_list);
1752        root->fs_info->fs_devices->num_devices++;
1753        root->fs_info->fs_devices->open_devices++;
1754        root->fs_info->fs_devices->rw_devices++;
1755        root->fs_info->fs_devices->total_devices++;
1756        if (device->can_discard)
1757                root->fs_info->fs_devices->num_can_discard++;
1758        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1759
1760        spin_lock(&root->fs_info->free_chunk_lock);
1761        root->fs_info->free_chunk_space += device->total_bytes;
1762        spin_unlock(&root->fs_info->free_chunk_lock);
1763
1764        if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1765                root->fs_info->fs_devices->rotating = 1;
1766
1767        total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1768        btrfs_set_super_total_bytes(root->fs_info->super_copy,
1769                                    total_bytes + device->total_bytes);
1770
1771        total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1772        btrfs_set_super_num_devices(root->fs_info->super_copy,
1773                                    total_bytes + 1);
1774        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1775
1776        if (seeding_dev) {
1777                ret = init_first_rw_device(trans, root, device);
1778                if (ret)
1779                        goto error_trans;
1780                ret = btrfs_finish_sprout(trans, root);
1781                if (ret)
1782                        goto error_trans;
1783        } else {
1784                ret = btrfs_add_device(trans, root, device);
1785                if (ret)
1786                        goto error_trans;
1787        }
1788
1789        /*
1790         * we've got more storage, clear any full flags on the space
1791         * infos
1792         */
1793        btrfs_clear_space_info_full(root->fs_info);
1794
1795        unlock_chunks(root);
1796        ret = btrfs_commit_transaction(trans, root);
1797
1798        if (seeding_dev) {
1799                mutex_unlock(&uuid_mutex);
1800                up_write(&sb->s_umount);
1801
1802                if (ret) /* transaction commit */
1803                        return ret;
1804
1805                ret = btrfs_relocate_sys_chunks(root);
1806                if (ret < 0)
1807                        btrfs_error(root->fs_info, ret,
1808                                    "Failed to relocate sys chunks after "
1809                                    "device initialization. This can be fixed "
1810                                    "using the \"btrfs balance\" command.");
1811        }
1812
1813        return ret;
1814
1815error_trans:
1816        unlock_chunks(root);
1817        btrfs_abort_transaction(trans, root, ret);
1818        btrfs_end_transaction(trans, root);
1819        rcu_string_free(device->name);
1820        kfree(device);
1821error:
1822        blkdev_put(bdev, FMODE_EXCL);
1823        if (seeding_dev) {
1824                mutex_unlock(&uuid_mutex);
1825                up_write(&sb->s_umount);
1826        }
1827        return ret;
1828}
1829
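    /*
     * Write the in-memory state of @device back into its dev item in the
     * chunk tree.  Returns -ENOENT if the item is missing.
     */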
1830static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1831                                        struct btrfs_device *device)
1832{
1833        int ret;
1834        struct btrfs_path *path;
1835        struct btrfs_root *root;
1836        struct btrfs_dev_item *dev_item;
1837        struct extent_buffer *leaf;
1838        struct btrfs_key key;
1839
1840        root = device->dev_root->fs_info->chunk_root;
1841
1842        path = btrfs_alloc_path();
1843        if (!path)
1844                return -ENOMEM;
1845
1846        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1847        key.type = BTRFS_DEV_ITEM_KEY;
1848        key.offset = device->devid;
1849
1850        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1851        if (ret < 0)
1852                goto out;
1853
1854        if (ret > 0) {
1855                ret = -ENOENT;
1856                goto out;
1857        }
1858
1859        leaf = path->nodes[0];
1860        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1861
1862        btrfs_set_device_id(leaf, dev_item, device->devid);
1863        btrfs_set_device_type(leaf, dev_item, device->type);
1864        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1865        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1866        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1867        btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1868        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1869        btrfs_mark_buffer_dirty(leaf);
1870
1871out:
1872        btrfs_free_path(path);
1873        return ret;
1874}
1875
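    /*
     * Grow @device to @new_size: add the difference to the superblock
     * total and the rw byte count, then persist the new sizes through
     * btrfs_update_device.  Callers hold the chunk mutex;
     * btrfs_grow_device() below is the locking wrapper.
     */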
1876static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1877                      struct btrfs_device *device, u64 new_size)
1878{
1879        struct btrfs_super_block *super_copy =
1880                device->dev_root->fs_info->super_copy;
1881        u64 old_total = btrfs_super_total_bytes(super_copy);
1882        u64 diff = new_size - device->total_bytes;
1883
1884        if (!device->writeable)
1885                return -EACCES;
1886        if (new_size <= device->total_bytes)
1887                return -EINVAL;
1888
1889        btrfs_set_super_total_bytes(super_copy, old_total + diff);
1890        device->fs_devices->total_rw_bytes += diff;
1891
1892        device->total_bytes = new_size;
1893        device->disk_total_bytes = new_size;
1894        btrfs_clear_space_info_full(device->dev_root->fs_info);
1895
1896        return btrfs_update_device(trans, device);
1897}
1898
1899int btrfs_grow_device(struct btrfs_trans_handle *trans,
1900                      struct btrfs_device *device, u64 new_size)
1901{
1902        int ret;
1903        lock_chunks(device->dev_root);
1904        ret = __btrfs_grow_device(trans, device, new_size);
1905        unlock_chunks(device->dev_root);
1906        return ret;
1907}
1908
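    /*
     * Delete the chunk item at @chunk_offset from the chunk tree.  A
     * missing item here means a logic error or corruption, so it is
     * reported via btrfs_error.
     */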
1909static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1910                            struct btrfs_root *root,
1911                            u64 chunk_tree, u64 chunk_objectid,
1912                            u64 chunk_offset)
1913{
1914        int ret;
1915        struct btrfs_path *path;
1916        struct btrfs_key key;
1917
1918        root = root->fs_info->chunk_root;
1919        path = btrfs_alloc_path();
1920        if (!path)
1921                return -ENOMEM;
1922
1923        key.objectid = chunk_objectid;
1924        key.offset = chunk_offset;
1925        key.type = BTRFS_CHUNK_ITEM_KEY;
1926
1927        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1928        if (ret < 0)
1929                goto out;
1930        else if (ret > 0) { /* Logic error or corruption */
1931                btrfs_error(root->fs_info, -ENOENT,
1932                            "Failed lookup while freeing chunk.");
1933                ret = -ENOENT;
1934                goto out;
1935        }
1936
1937        ret = btrfs_del_item(trans, root, path);
1938        if (ret < 0)
1939                btrfs_error(root->fs_info, ret,
1940                            "Failed to delete chunk item.");
1941out:
1942        btrfs_free_path(path);
1943        return ret;
1944}
1945
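    /*
     * SYSTEM chunks are duplicated in the superblock's sys_chunk_array
     * so that the chunk tree itself can be located at mount time.  The
     * array is a packed sequence of variable-sized records:
     *
     *   | disk_key | chunk (num_stripes stripes) | disk_key | chunk | ...
     *
     * Find the record matching @chunk_objectid/@chunk_offset, memmove()
     * the tail of the array down over it and shrink sys_array_size.
     */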
1946static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1947                        chunk_offset)
1948{
1949        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1950        struct btrfs_disk_key *disk_key;
1951        struct btrfs_chunk *chunk;
1952        u8 *ptr;
1953        int ret = 0;
1954        u32 num_stripes;
1955        u32 array_size;
1956        u32 len = 0;
1957        u32 cur;
1958        struct btrfs_key key;
1959
1960        array_size = btrfs_super_sys_array_size(super_copy);
1961
1962        ptr = super_copy->sys_chunk_array;
1963        cur = 0;
1964
1965        while (cur < array_size) {
1966                disk_key = (struct btrfs_disk_key *)ptr;
1967                btrfs_disk_key_to_cpu(&key, disk_key);
1968
1969                len = sizeof(*disk_key);
1970
1971                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1972                        chunk = (struct btrfs_chunk *)(ptr + len);
1973                        num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1974                        len += btrfs_chunk_item_size(num_stripes);
1975                } else {
1976                        ret = -EIO;
1977                        break;
1978                }
1979                if (key.objectid == chunk_objectid &&
1980                    key.offset == chunk_offset) {
1981                        memmove(ptr, ptr + len, array_size - (cur + len));
1982                        array_size -= len;
1983                        btrfs_set_super_sys_array_size(super_copy, array_size);
1984                } else {
1985                        ptr += len;
1986                        cur += len;
1987                }
1988        }
1989        return ret;
1990}
1991
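    /*
     * Relocate one chunk and remove every trace of it: first move the
     * live extents out, then, in a transaction, free the per-device
     * extents, delete the chunk item (plus its sys_chunk_array copy for
     * SYSTEM chunks), remove the block group and drop the extent mapping.
     */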
1992static int btrfs_relocate_chunk(struct btrfs_root *root,
1993                         u64 chunk_tree, u64 chunk_objectid,
1994                         u64 chunk_offset)
1995{
1996        struct extent_map_tree *em_tree;
1997        struct btrfs_root *extent_root;
1998        struct btrfs_trans_handle *trans;
1999        struct extent_map *em;
2000        struct map_lookup *map;
2001        int ret;
2002        int i;
2003
2004        root = root->fs_info->chunk_root;
2005        extent_root = root->fs_info->extent_root;
2006        em_tree = &root->fs_info->mapping_tree.map_tree;
2007
2008        ret = btrfs_can_relocate(extent_root, chunk_offset);
2009        if (ret)
2010                return -ENOSPC;
2011
2012        /* step one, relocate all the extents inside this chunk */
2013        ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2014        if (ret)
2015                return ret;
2016
2017        trans = btrfs_start_transaction(root, 0);
2018        BUG_ON(IS_ERR(trans));
2019
2020        lock_chunks(root);
2021
2022        /*
2023         * step two, delete the device extents and the
2024         * chunk tree entries
2025         */
2026        read_lock(&em_tree->lock);
2027        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2028        read_unlock(&em_tree->lock);
2029
2030        BUG_ON(!em || em->start > chunk_offset ||
2031               em->start + em->len < chunk_offset);
2032        map = (struct map_lookup *)em->bdev;
2033
2034        for (i = 0; i < map->num_stripes; i++) {
2035                ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2036                                            map->stripes[i].physical);
2037                BUG_ON(ret);
2038
2039                if (map->stripes[i].dev) {
2040                        ret = btrfs_update_device(trans, map->stripes[i].dev);
2041                        BUG_ON(ret);
2042                }
2043        }
2044        ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2045                               chunk_offset);
2046
2047        BUG_ON(ret);
2048
2049        trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2050
2051        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2052                ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2053                BUG_ON(ret);
2054        }
2055
2056        ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2057        BUG_ON(ret);
2058
2059        write_lock(&em_tree->lock);
2060        remove_extent_mapping(em_tree, em);
2061        write_unlock(&em_tree->lock);
2062
2063        kfree(map);
2064        em->bdev = NULL;
2065
2066        /* once for the tree */
2067        free_extent_map(em);
2068        /* once for us */
2069        free_extent_map(em);
2070
2071        unlock_chunks(root);
2072        btrfs_end_transaction(trans, root);
2073        return 0;
2074}
2075
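    /*
     * Walk the chunk tree backwards and relocate every SYSTEM chunk.
     * Chunks failing with -ENOSPC are retried in one extra pass, since
     * relocating the others may have freed up enough space.
     */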
2076static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2077{
2078        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2079        struct btrfs_path *path;
2080        struct extent_buffer *leaf;
2081        struct btrfs_chunk *chunk;
2082        struct btrfs_key key;
2083        struct btrfs_key found_key;
2084        u64 chunk_tree = chunk_root->root_key.objectid;
2085        u64 chunk_type;
2086        bool retried = false;
2087        int failed = 0;
2088        int ret;
2089
2090        path = btrfs_alloc_path();
2091        if (!path)
2092                return -ENOMEM;
2093
2094again:
2095        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2096        key.offset = (u64)-1;
2097        key.type = BTRFS_CHUNK_ITEM_KEY;
2098
2099        while (1) {
2100                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2101                if (ret < 0)
2102                        goto error;
2103                BUG_ON(ret == 0); /* Corruption */
2104
2105                ret = btrfs_previous_item(chunk_root, path, key.objectid,
2106                                          key.type);
2107                if (ret < 0)
2108                        goto error;
2109                if (ret > 0)
2110                        break;
2111
2112                leaf = path->nodes[0];
2113                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2114
2115                chunk = btrfs_item_ptr(leaf, path->slots[0],
2116                                       struct btrfs_chunk);
2117                chunk_type = btrfs_chunk_type(leaf, chunk);
2118                btrfs_release_path(path);
2119
2120                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2121                        ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2122                                                   found_key.objectid,
2123                                                   found_key.offset);
2124                        if (ret == -ENOSPC)
2125                                failed++;
2126                        else if (ret)
2127                                BUG();
2128                }
2129
2130                if (found_key.offset == 0)
2131                        break;
2132                key.offset = found_key.offset - 1;
2133        }
2134        ret = 0;
2135        if (failed && !retried) {
2136                failed = 0;
2137                retried = true;
2138                goto again;
2139        } else if (failed && retried) {
2140                WARN_ON(1);
2141                ret = -ENOSPC;
2142        }
2143error:
2144        btrfs_free_path(path);
2145        return ret;
2146}
2147
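    /*
     * Persist the balance state: serialize the per-type balance args
     * into the BTRFS_BALANCE_ITEM in the tree root and commit, so that
     * an interrupted balance can be resumed on the next mount by
     * btrfs_recover_balance().
     */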
2148static int insert_balance_item(struct btrfs_root *root,
2149                               struct btrfs_balance_control *bctl)
2150{
2151        struct btrfs_trans_handle *trans;
2152        struct btrfs_balance_item *item;
2153        struct btrfs_disk_balance_args disk_bargs;
2154        struct btrfs_path *path;
2155        struct extent_buffer *leaf;
2156        struct btrfs_key key;
2157        int ret, err;
2158
2159        path = btrfs_alloc_path();
2160        if (!path)
2161                return -ENOMEM;
2162
2163        trans = btrfs_start_transaction(root, 0);
2164        if (IS_ERR(trans)) {
2165                btrfs_free_path(path);
2166                return PTR_ERR(trans);
2167        }
2168
2169        key.objectid = BTRFS_BALANCE_OBJECTID;
2170        key.type = BTRFS_BALANCE_ITEM_KEY;
2171        key.offset = 0;
2172
2173        ret = btrfs_insert_empty_item(trans, root, path, &key,
2174                                      sizeof(*item));
2175        if (ret)
2176                goto out;
2177
2178        leaf = path->nodes[0];
2179        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2180
2181        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2182
2183        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2184        btrfs_set_balance_data(leaf, item, &disk_bargs);
2185        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2186        btrfs_set_balance_meta(leaf, item, &disk_bargs);
2187        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2188        btrfs_set_balance_sys(leaf, item, &disk_bargs);
2189
2190        btrfs_set_balance_flags(leaf, item, bctl->flags);
2191
2192        btrfs_mark_buffer_dirty(leaf);
2193out:
2194        btrfs_free_path(path);
2195        err = btrfs_commit_transaction(trans, root);
2196        if (err && !ret)
2197                ret = err;
2198        return ret;
2199}
2200
2201static int del_balance_item(struct btrfs_root *root)
2202{
2203        struct btrfs_trans_handle *trans;
2204        struct btrfs_path *path;
2205        struct btrfs_key key;
2206        int ret, err;
2207
2208        path = btrfs_alloc_path();
2209        if (!path)
2210                return -ENOMEM;
2211
2212        trans = btrfs_start_transaction(root, 0);
2213        if (IS_ERR(trans)) {
2214                btrfs_free_path(path);
2215                return PTR_ERR(trans);
2216        }
2217
2218        key.objectid = BTRFS_BALANCE_OBJECTID;
2219        key.type = BTRFS_BALANCE_ITEM_KEY;
2220        key.offset = 0;
2221
2222        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2223        if (ret < 0)
2224                goto out;
2225        if (ret > 0) {
2226                ret = -ENOENT;
2227                goto out;
2228        }
2229
2230        ret = btrfs_del_item(trans, root, path);
2231out:
2232        btrfs_free_path(path);
2233        err = btrfs_commit_transaction(trans, root);
2234        if (err && !ret)
2235                ret = err;
2236        return ret;
2237}
2238
2239/*
2240 * This is a heuristic used to reduce the number of chunks balanced on
2241 * resume after balance was interrupted.
2242 */
2243static void update_balance_args(struct btrfs_balance_control *bctl)
2244{
2245        /*
2246         * Turn on soft mode for chunk types that were being converted.
2247         */
2248        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2249                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2250        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2251                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2252        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2253                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2254
2255        /*
2256         * Turn on the usage filter if it is not already in use.  The idea is
2257         * that chunks that we have already balanced should be
2258         * reasonably full.  Don't do it for chunks that are being
2259         * converted - that will keep us from relocating unconverted
2260         * (albeit full) chunks.
2261         */
2262        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2263            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2264                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2265                bctl->data.usage = 90;
2266        }
2267        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2268            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2269                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2270                bctl->sys.usage = 90;
2271        }
2272        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2273            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2274                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2275                bctl->meta.usage = 90;
2276        }
2277}
2278
2279/*
2280 * Should be called with both balance and volume mutexes held to
2281 * serialize other volume operations (add_dev/rm_dev/resize) with
2282 * restriper.  Same goes for unset_balance_control.
2283 */
2284static void set_balance_control(struct btrfs_balance_control *bctl)
2285{
2286        struct btrfs_fs_info *fs_info = bctl->fs_info;
2287
2288        BUG_ON(fs_info->balance_ctl);
2289
2290        spin_lock(&fs_info->balance_lock);
2291        fs_info->balance_ctl = bctl;
2292        spin_unlock(&fs_info->balance_lock);
2293}
2294
2295static void unset_balance_control(struct btrfs_fs_info *fs_info)
2296{
2297        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2298
2299        BUG_ON(!fs_info->balance_ctl);
2300
2301        spin_lock(&fs_info->balance_lock);
2302        fs_info->balance_ctl = NULL;
2303        spin_unlock(&fs_info->balance_lock);
2304
2305        kfree(bctl);
2306}
2307
2308/*
2309 * Balance filters.  Return 1 if chunk should be filtered out
2310 * (should not be balanced).
2311 */
2312static int chunk_profiles_filter(u64 chunk_type,
2313                                 struct btrfs_balance_args *bargs)
2314{
2315        chunk_type = chunk_to_extended(chunk_type) &
2316                                BTRFS_EXTENDED_PROFILE_MASK;
2317
2318        if (bargs->profiles & chunk_type)
2319                return 0;
2320
2321        return 1;
2322}
2323
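    /* return @factor percent of @num; do_div() keeps it 32-bit safe */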
2324static u64 div_factor_fine(u64 num, int factor)
2325{
2326        if (factor <= 0)
2327                return 0;
2328        if (factor >= 100)
2329                return num;
2330
2331        num *= factor;
2332        do_div(num, 100);
2333        return num;
2334}
2335
2336static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2337                              struct btrfs_balance_args *bargs)
2338{
2339        struct btrfs_block_group_cache *cache;
2340        u64 chunk_used, user_thresh;
2341        int ret = 1;
2342
2343        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2344        chunk_used = btrfs_block_group_used(&cache->item);
2345
2346        user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2347        if (chunk_used < user_thresh)
2348                ret = 0;
2349
2350        btrfs_put_block_group(cache);
2351        return ret;
2352}
2353
2354static int chunk_devid_filter(struct extent_buffer *leaf,
2355                              struct btrfs_chunk *chunk,
2356                              struct btrfs_balance_args *bargs)
2357{
2358        struct btrfs_stripe *stripe;
2359        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2360        int i;
2361
2362        for (i = 0; i < num_stripes; i++) {
2363                stripe = btrfs_stripe_nr(chunk, i);
2364                if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2365                        return 0;
2366        }
2367
2368        return 1;
2369}
2370
2371/* [pstart, pend) */
2372static int chunk_drange_filter(struct extent_buffer *leaf,
2373                               struct btrfs_chunk *chunk,
2374                               u64 chunk_offset,
2375                               struct btrfs_balance_args *bargs)
2376{
2377        struct btrfs_stripe *stripe;
2378        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2379        u64 stripe_offset;
2380        u64 stripe_length;
2381        int factor;
2382        int i;
2383
2384        if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2385                return 0;
2386
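            /*
             * Estimate how much of the chunk's length each stripe covers
             * on its device: mirrored profiles (DUP, RAID1, RAID10) keep
             * two copies, so a stripe spans length / (num_stripes / 2)
             * bytes; otherwise it spans length / num_stripes.
             */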
2387        if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2388             BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2389                factor = 2;
2390        else
2391                factor = 1;
2392        factor = num_stripes / factor;
2393
2394        for (i = 0; i < num_stripes; i++) {
2395                stripe = btrfs_stripe_nr(chunk, i);
2396                if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2397                        continue;
2398
2399                stripe_offset = btrfs_stripe_offset(leaf, stripe);
2400                stripe_length = btrfs_chunk_length(leaf, chunk);
2401                do_div(stripe_length, factor);
2402
2403                if (stripe_offset < bargs->pend &&
2404                    stripe_offset + stripe_length > bargs->pstart)
2405                        return 0;
2406        }
2407
2408        return 1;
2409}
2410
2411/* [vstart, vend) */
2412static int chunk_vrange_filter(struct extent_buffer *leaf,
2413                               struct btrfs_chunk *chunk,
2414                               u64 chunk_offset,
2415                               struct btrfs_balance_args *bargs)
2416{
2417        if (chunk_offset < bargs->vend &&
2418            chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2419                /* at least part of the chunk is inside this vrange */
2420                return 0;
2421
2422        return 1;
2423}
2424
2425static int chunk_soft_convert_filter(u64 chunk_type,
2426                                     struct btrfs_balance_args *bargs)
2427{
2428        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2429                return 0;
2430
2431        chunk_type = chunk_to_extended(chunk_type) &
2432                                BTRFS_EXTENDED_PROFILE_MASK;
2433
2434        if (bargs->target == chunk_type)
2435                return 1;
2436
2437        return 0;
2438}
2439
2440static int should_balance_chunk(struct btrfs_root *root,
2441                                struct extent_buffer *leaf,
2442                                struct btrfs_chunk *chunk, u64 chunk_offset)
2443{
2444        struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2445        struct btrfs_balance_args *bargs = NULL;
2446        u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2447
2448        /* type filter */
2449        if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2450              (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2451                return 0;
2452        }
2453
2454        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2455                bargs = &bctl->data;
2456        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2457                bargs = &bctl->sys;
2458        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2459                bargs = &bctl->meta;
2460
2461        /* profiles filter */
2462        if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2463            chunk_profiles_filter(chunk_type, bargs)) {
2464                return 0;
2465        }
2466
2467        /* usage filter */
2468        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2469            chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2470                return 0;
2471        }
2472
2473        /* devid filter */
2474        if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2475            chunk_devid_filter(leaf, chunk, bargs)) {
2476                return 0;
2477        }
2478
2479        /* drange filter, makes sense only with devid filter */
2480        if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2481            chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2482                return 0;
2483        }
2484
2485        /* vrange filter */
2486        if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2487            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2488                return 0;
2489        }
2490
2491        /* soft profile changing mode */
2492        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2493            chunk_soft_convert_filter(chunk_type, bargs)) {
2494                return 0;
2495        }
2496
2497        return 1;
2498}
2499
2500static u64 div_factor(u64 num, int factor)
2501{
2502        if (factor == 10)
2503                return num;
2504        num *= factor;
2505        do_div(num, 10);
2506        return num;
2507}
2508
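    /*
     * The main balance loop.  Walks the chunk tree twice: a first
     * "counting" pass that only records how many chunks pass the filters
     * (bctl->stat.expected), then the real pass that relocates them,
     * honouring pause and cancel requests between chunks.
     */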
2509static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2510{
2511        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2512        struct btrfs_root *chunk_root = fs_info->chunk_root;
2513        struct btrfs_root *dev_root = fs_info->dev_root;
2514        struct list_head *devices;
2515        struct btrfs_device *device;
2516        u64 old_size;
2517        u64 size_to_free;
2518        struct btrfs_chunk *chunk;
2519        struct btrfs_path *path;
2520        struct btrfs_key key;
2521        struct btrfs_key found_key;
2522        struct btrfs_trans_handle *trans;
2523        struct extent_buffer *leaf;
2524        int slot;
2525        int ret;
2526        int enospc_errors = 0;
2527        bool counting = true;
2528
2529        /* step one, make some room on all the devices */
2530        devices = &fs_info->fs_devices->devices;
2531        list_for_each_entry(device, devices, dev_list) {
2532                old_size = device->total_bytes;
2533                size_to_free = div_factor(old_size, 1);
2534                size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2535                if (!device->writeable ||
2536                    device->total_bytes - device->bytes_used > size_to_free)
2537                        continue;
2538
2539                ret = btrfs_shrink_device(device, old_size - size_to_free);
2540                if (ret == -ENOSPC)
2541                        break;
2542                BUG_ON(ret);
2543
2544                trans = btrfs_start_transaction(dev_root, 0);
2545                BUG_ON(IS_ERR(trans));
2546
2547                ret = btrfs_grow_device(trans, device, old_size);
2548                BUG_ON(ret);
2549
2550                btrfs_end_transaction(trans, dev_root);
2551        }
2552
2553        /* step two, relocate all the chunks */
2554        path = btrfs_alloc_path();
2555        if (!path) {
2556                ret = -ENOMEM;
2557                goto error;
2558        }
2559
2560        /* zero out stat counters */
2561        spin_lock(&fs_info->balance_lock);
2562        memset(&bctl->stat, 0, sizeof(bctl->stat));
2563        spin_unlock(&fs_info->balance_lock);
2564again:
2565        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2566        key.offset = (u64)-1;
2567        key.type = BTRFS_CHUNK_ITEM_KEY;
2568
2569        while (1) {
2570                if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2571                    atomic_read(&fs_info->balance_cancel_req)) {
2572                        ret = -ECANCELED;
2573                        goto error;
2574                }
2575
2576                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2577                if (ret < 0)
2578                        goto error;
2579
2580                /*
2581                 * this shouldn't happen, it means the last relocate
2582                 * failed
2583                 */
2584                if (ret == 0)
2585                        BUG(); /* FIXME break ? */
2586
2587                ret = btrfs_previous_item(chunk_root, path, 0,
2588                                          BTRFS_CHUNK_ITEM_KEY);
2589                if (ret) {
2590                        ret = 0;
2591                        break;
2592                }
2593
2594                leaf = path->nodes[0];
2595                slot = path->slots[0];
2596                btrfs_item_key_to_cpu(leaf, &found_key, slot);
2597
2598                if (found_key.objectid != key.objectid)
2599                        break;
2600
2601                /* chunk zero is special */
2602                if (found_key.offset == 0)
2603                        break;
2604
2605                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2606
2607                if (!counting) {
2608                        spin_lock(&fs_info->balance_lock);
2609                        bctl->stat.considered++;
2610                        spin_unlock(&fs_info->balance_lock);
2611                }
2612
2613                ret = should_balance_chunk(chunk_root, leaf, chunk,
2614                                           found_key.offset);
2615                btrfs_release_path(path);
2616                if (!ret)
2617                        goto loop;
2618
2619                if (counting) {
2620                        spin_lock(&fs_info->balance_lock);
2621                        bctl->stat.expected++;
2622                        spin_unlock(&fs_info->balance_lock);
2623                        goto loop;
2624                }
2625
2626                ret = btrfs_relocate_chunk(chunk_root,
2627                                           chunk_root->root_key.objectid,
2628                                           found_key.objectid,
2629                                           found_key.offset);
2630                if (ret && ret != -ENOSPC)
2631                        goto error;
2632                if (ret == -ENOSPC) {
2633                        enospc_errors++;
2634                } else {
2635                        spin_lock(&fs_info->balance_lock);
2636                        bctl->stat.completed++;
2637                        spin_unlock(&fs_info->balance_lock);
2638                }
2639loop:
2640                key.offset = found_key.offset - 1;
2641        }
2642
2643        if (counting) {
2644                btrfs_release_path(path);
2645                counting = false;
2646                goto again;
2647        }
2648error:
2649        btrfs_free_path(path);
2650        if (enospc_errors) {
2651                printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2652                       enospc_errors);
2653                if (!ret)
2654                        ret = -ENOSPC;
2655        }
2656
2657        return ret;
2658}
2659
2660/**
2661 * alloc_profile_is_valid - see if a given profile is valid and reduced
2662 * @flags: profile to validate
2663 * @extended: if true @flags is treated as an extended profile
2664 */
2665static int alloc_profile_is_valid(u64 flags, int extended)
2666{
2667        u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
2668                               BTRFS_BLOCK_GROUP_PROFILE_MASK);
2669
2670        flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2671
2672        /* 1) check that all other bits are zeroed */
2673        if (flags & ~mask)
2674                return 0;
2675
2676        /* 2) see if profile is reduced */
2677        if (flags == 0)
2678                return !extended; /* "0" is valid for usual profiles */
2679
2680        /* true if exactly one bit set */
2681        return (flags & (flags - 1)) == 0;
2682}
2683
2684static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2685{
2686        /* cancel requested || normal exit path */
2687        return atomic_read(&fs_info->balance_cancel_req) ||
2688                (atomic_read(&fs_info->balance_pause_req) == 0 &&
2689                 atomic_read(&fs_info->balance_cancel_req) == 0);
2690}
2691
2692static void __cancel_balance(struct btrfs_fs_info *fs_info)
2693{
2694        int ret;
2695
2696        unset_balance_control(fs_info);
2697        ret = del_balance_item(fs_info->tree_root);
2698        BUG_ON(ret);
2699}
2700
2701void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2702                               struct btrfs_ioctl_balance_args *bargs);
2703
2704/*
2705 * Should be called with both balance and volume mutexes held
2706 */
2707int btrfs_balance(struct btrfs_balance_control *bctl,
2708                  struct btrfs_ioctl_balance_args *bargs)
2709{
2710        struct btrfs_fs_info *fs_info = bctl->fs_info;
2711        u64 allowed;
2712        int mixed = 0;
2713        int ret;
2714
2715        if (btrfs_fs_closing(fs_info) ||
2716            atomic_read(&fs_info->balance_pause_req) ||
2717            atomic_read(&fs_info->balance_cancel_req)) {
2718                ret = -EINVAL;
2719                goto out;
2720        }
2721
2722        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2723        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
2724                mixed = 1;
2725
2726        /*
2727         * In case of mixed groups both data and meta should be picked,
2728         * and identical options should be given for both of them.
2729         */
2730        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
2731        if (mixed && (bctl->flags & allowed)) {
2732                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2733                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2734                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2735                        printk(KERN_ERR "btrfs: with mixed groups data and "
2736                               "metadata balance options must be the same\n");
2737                        ret = -EINVAL;
2738                        goto out;
2739                }
2740        }
2741
2742        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2743        if (fs_info->fs_devices->num_devices == 1)
2744                allowed |= BTRFS_BLOCK_GROUP_DUP;
2745        else if (fs_info->fs_devices->num_devices < 4)
2746                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2747        else
2748                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2749                                BTRFS_BLOCK_GROUP_RAID10);
2750
2751        if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2752            (!alloc_profile_is_valid(bctl->data.target, 1) ||
2753             (bctl->data.target & ~allowed))) {
2754                printk(KERN_ERR "btrfs: unable to start balance with target "
2755                       "data profile %llu\n",
2756                       (unsigned long long)bctl->data.target);
2757                ret = -EINVAL;
2758                goto out;
2759        }
2760        if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2761            (!alloc_profile_is_valid(bctl->meta.target, 1) ||
2762             (bctl->meta.target & ~allowed))) {
2763                printk(KERN_ERR "btrfs: unable to start balance with target "
2764                       "metadata profile %llu\n",
2765                       (unsigned long long)bctl->meta.target);
2766                ret = -EINVAL;
2767                goto out;
2768        }
2769        if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2770            (!alloc_profile_is_valid(bctl->sys.target, 1) ||
2771             (bctl->sys.target & ~allowed))) {
2772                printk(KERN_ERR "btrfs: unable to start balance with target "
2773                       "system profile %llu\n",
2774                       (unsigned long long)bctl->sys.target);
2775                ret = -EINVAL;
2776                goto out;
2777        }
2778
2779        /* allow dup'ed data chunks only in mixed mode */
2780        if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2781            (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
2782                printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2783                ret = -EINVAL;
2784                goto out;
2785        }
2786
2787        /* allow to reduce meta or sys integrity only if force set */
2788        allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2789                        BTRFS_BLOCK_GROUP_RAID10;
2790        if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2791             (fs_info->avail_system_alloc_bits & allowed) &&
2792             !(bctl->sys.target & allowed)) ||
2793            ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2794             (fs_info->avail_metadata_alloc_bits & allowed) &&
2795             !(bctl->meta.target & allowed))) {
2796                if (bctl->flags & BTRFS_BALANCE_FORCE) {
2797                        printk(KERN_INFO "btrfs: force reducing metadata "
2798                               "integrity\n");
2799                } else {
2800                        printk(KERN_ERR "btrfs: balance will reduce metadata "
2801                               "integrity, use force if you want this\n");
2802                        ret = -EINVAL;
2803                        goto out;
2804                }
2805        }
2806
2807        ret = insert_balance_item(fs_info->tree_root, bctl);
2808        if (ret && ret != -EEXIST)
2809                goto out;
2810
2811        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2812                BUG_ON(ret == -EEXIST);
2813                set_balance_control(bctl);
2814        } else {
2815                BUG_ON(ret != -EEXIST);
2816                spin_lock(&fs_info->balance_lock);
2817                update_balance_args(bctl);
2818                spin_unlock(&fs_info->balance_lock);
2819        }
2820
2821        atomic_inc(&fs_info->balance_running);
2822        mutex_unlock(&fs_info->balance_mutex);
2823
2824        ret = __btrfs_balance(fs_info);
2825
2826        mutex_lock(&fs_info->balance_mutex);
2827        atomic_dec(&fs_info->balance_running);
2828
2829        if (bargs) {
2830                memset(bargs, 0, sizeof(*bargs));
2831                update_ioctl_balance_args(fs_info, 0, bargs);
2832        }
2833
2834        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2835            balance_need_close(fs_info)) {
2836                __cancel_balance(fs_info);
2837        }
2838
2839        wake_up(&fs_info->balance_wait_q);
2840
2841        return ret;
2842out:
2843        if (bctl->flags & BTRFS_BALANCE_RESUME)
2844                __cancel_balance(fs_info);
2845        else
2846                kfree(bctl);
2847        return ret;
2848}
2849
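    /*
     * Kthread used to continue an interrupted balance after mount: it
     * just takes the volume and balance mutexes and re-enters
     * btrfs_balance() with the recovered balance_ctl.
     */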
2850static int balance_kthread(void *data)
2851{
2852        struct btrfs_fs_info *fs_info = data;
2853        int ret = 0;
2854
2855        mutex_lock(&fs_info->volume_mutex);
2856        mutex_lock(&fs_info->balance_mutex);
2857
2858        if (fs_info->balance_ctl) {
2859                printk(KERN_INFO "btrfs: continuing balance\n");
2860                ret = btrfs_balance(fs_info->balance_ctl, NULL);
2861        }
2862
2863        mutex_unlock(&fs_info->balance_mutex);
2864        mutex_unlock(&fs_info->volume_mutex);
2865
2866        return ret;
2867}
2868
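    /*
     * Called during mount: if btrfs_recover_balance() installed a
     * balance_ctl, spawn balance_kthread to continue it, unless the
     * skip_balance mount option was given.
     */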
2869int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2870{
2871        struct task_struct *tsk;
2872
2873        spin_lock(&fs_info->balance_lock);
2874        if (!fs_info->balance_ctl) {
2875                spin_unlock(&fs_info->balance_lock);
2876                return 0;
2877        }
2878        spin_unlock(&fs_info->balance_lock);
2879
2880        if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2881                printk(KERN_INFO "btrfs: force skipping balance\n");
2882                return 0;
2883        }
2884
2885        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2886        if (IS_ERR(tsk))
2887                return PTR_ERR(tsk);
2888
2889        return 0;
2890}
2891
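    /*
     * Read the balance item left behind by an interrupted balance,
     * rebuild the btrfs_balance_control from it, tag it with
     * BTRFS_BALANCE_RESUME and install it as the active balance_ctl.
     */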
2892int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
2893{
2894        struct btrfs_balance_control *bctl;
2895        struct btrfs_balance_item *item;
2896        struct btrfs_disk_balance_args disk_bargs;
2897        struct btrfs_path *path;
2898        struct extent_buffer *leaf;
2899        struct btrfs_key key;
2900        int ret;
2901
2902        path = btrfs_alloc_path();
2903        if (!path)
2904                return -ENOMEM;
2905
2906        key.objectid = BTRFS_BALANCE_OBJECTID;
2907        key.type = BTRFS_BALANCE_ITEM_KEY;
2908        key.offset = 0;
2909
2910        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2911        if (ret < 0)
2912                goto out;
2913        if (ret > 0) { /* ret = -ENOENT; */
2914                ret = 0;
2915                goto out;
2916        }
2917
2918        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2919        if (!bctl) {
2920                ret = -ENOMEM;
2921                goto out;
2922        }
2923
2924        leaf = path->nodes[0];
2925        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2926
2927        bctl->fs_info = fs_info;
2928        bctl->flags = btrfs_balance_flags(leaf, item);
2929        bctl->flags |= BTRFS_BALANCE_RESUME;
2930
2931        btrfs_balance_data(leaf, item, &disk_bargs);
2932        btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2933        btrfs_balance_meta(leaf, item, &disk_bargs);
2934        btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2935        btrfs_balance_sys(leaf, item, &disk_bargs);
2936        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2937
2938        mutex_lock(&fs_info->volume_mutex);
2939        mutex_lock(&fs_info->balance_mutex);
2940
2941        set_balance_control(bctl);
2942
2943        mutex_unlock(&fs_info->balance_mutex);
2944        mutex_unlock(&fs_info->volume_mutex);
2945out:
2946        btrfs_free_path(path);
2947        return ret;
2948}
2949
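    /*
     * Pause a running balance.  Raising balance_pause_req makes
     * __btrfs_balance() bail out with -ECANCELED at the next chunk
     * boundary; we then wait for balance_running to hit zero.  The
     * balance_ctl (and the on-disk item) stay around for a later resume.
     */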
2950int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2951{
2952        int ret = 0;
2953
2954        mutex_lock(&fs_info->balance_mutex);
2955        if (!fs_info->balance_ctl) {
2956                mutex_unlock(&fs_info->balance_mutex);
2957                return -ENOTCONN;
2958        }
2959
2960        if (atomic_read(&fs_info->balance_running)) {
2961                atomic_inc(&fs_info->balance_pause_req);
2962                mutex_unlock(&fs_info->balance_mutex);
2963
2964                wait_event(fs_info->balance_wait_q,
2965                           atomic_read(&fs_info->balance_running) == 0);
2966
2967                mutex_lock(&fs_info->balance_mutex);
2968                /* we are good with balance_ctl ripped off from under us */
2969                BUG_ON(atomic_read(&fs_info->balance_running));
2970                atomic_dec(&fs_info->balance_pause_req);
2971        } else {
2972                ret = -ENOTCONN;
2973        }
2974
2975        mutex_unlock(&fs_info->balance_mutex);
2976        return ret;
2977}
2978
2979int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2980{
2981        mutex_lock(&fs_info->balance_mutex);
2982        if (!fs_info->balance_ctl) {
2983                mutex_unlock(&fs_info->balance_mutex);
2984                return -ENOTCONN;
2985        }
2986
2987        atomic_inc(&fs_info->balance_cancel_req);
2988        /*
2989         * if balance is running, just wait and return; the balance
2990         * item is deleted in btrfs_balance() in that case
2991         */
2992        if (atomic_read(&fs_info->balance_running)) {
2993                mutex_unlock(&fs_info->balance_mutex);
2994                wait_event(fs_info->balance_wait_q,
2995                           atomic_read(&fs_info->balance_running) == 0);
2996                mutex_lock(&fs_info->balance_mutex);
2997        } else {
2998                /* __cancel_balance needs volume_mutex */
2999                mutex_unlock(&fs_info->balance_mutex);
3000                mutex_lock(&fs_info->volume_mutex);
3001                mutex_lock(&fs_info->balance_mutex);
3002
3003                if (fs_info->balance_ctl)
3004                        __cancel_balance(fs_info);
3005
3006                mutex_unlock(&fs_info->volume_mutex);
3007        }
3008
3009        BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3010        atomic_dec(&fs_info->balance_cancel_req);
3011        mutex_unlock(&fs_info->balance_mutex);
3012        return 0;
3013}
3014
3015/*
3016 * Shrinking a device means finding all of the device extents past
3017 * the new size, and then following the back refs to the chunks.
3018 * The chunk relocation code actually frees the device extents.
3019 */
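    /*
     * Sketch of the flow below (matches the code): device extents are
     * walked from the highest offset downwards and every extent that
     * ends beyond new_size has its owning chunk relocated.  Relocation
     * itself needs free space, so a pass can fail with -ENOSPC; the
     * loop retries once after the first pass has freed some room, and
     * only if the retry also fails is the size change rolled back.
     */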
3020int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3021{
3022        struct btrfs_trans_handle *trans;
3023        struct btrfs_root *root = device->dev_root;
3024        struct btrfs_dev_extent *dev_extent = NULL;
3025        struct btrfs_path *path;
3026        u64 length;
3027        u64 chunk_tree;
3028        u64 chunk_objectid;
3029        u64 chunk_offset;
3030        int ret;
3031        int slot;
3032        int failed = 0;
3033        bool retried = false;
3034        struct extent_buffer *l;
3035        struct btrfs_key key;
3036        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3037        u64 old_total = btrfs_super_total_bytes(super_copy);
3038        u64 old_size = device->total_bytes;
3039        u64 diff = device->total_bytes - new_size;
3040
3041        if (new_size >= device->total_bytes)
3042                return -EINVAL;
3043
3044        path = btrfs_alloc_path();
3045        if (!path)
3046                return -ENOMEM;
3047
3048        path->reada = 2;
3049
3050        lock_chunks(root);
3051
3052        device->total_bytes = new_size;
3053        if (device->writeable) {
3054                device->fs_devices->total_rw_bytes -= diff;
3055                spin_lock(&root->fs_info->free_chunk_lock);
3056                root->fs_info->free_chunk_space -= diff;
3057                spin_unlock(&root->fs_info->free_chunk_lock);
3058        }
3059        unlock_chunks(root);
3060
3061again:
3062        key.objectid = device->devid;
3063        key.offset = (u64)-1;
3064        key.type = BTRFS_DEV_EXTENT_KEY;
3065
3066        do {
3067                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3068                if (ret < 0)
3069                        goto done;
3070
3071                ret = btrfs_previous_item(root, path, 0, key.type);
3072                if (ret < 0)
3073                        goto done;
3074                if (ret) {
3075                        ret = 0;
3076                        btrfs_release_path(path);
3077                        break;
3078                }
3079
3080                l = path->nodes[0];
3081                slot = path->slots[0];
3082                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3083
3084                if (key.objectid != device->devid) {
3085                        btrfs_release_path(path);
3086                        break;
3087                }
3088
3089                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3090                length = btrfs_dev_extent_length(l, dev_extent);
3091
3092                if (key.offset + length <= new_size) {
3093                        btrfs_release_path(path);
3094                        break;
3095                }
3096
3097                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3098                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3099                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3100                btrfs_release_path(path);
3101
3102                ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3103                                           chunk_offset);
3104                if (ret && ret != -ENOSPC)
3105                        goto done;
3106                if (ret == -ENOSPC)
3107                        failed++;
3108        } while (key.offset-- > 0);
3109
3110        if (failed && !retried) {
3111                failed = 0;
3112                retried = true;
3113                goto again;
3114        } else if (failed && retried) {
3115                ret = -ENOSPC;
3116                lock_chunks(root);
3117
3118                device->total_bytes = old_size;
3119                if (device->writeable)
3120                        device->fs_devices->total_rw_bytes += diff;
3121                spin_lock(&root->fs_info->free_chunk_lock);
3122                root->fs_info->free_chunk_space += diff;
3123                spin_unlock(&root->fs_info->free_chunk_lock);
3124                unlock_chunks(root);
3125                goto done;
3126        }
3127
3128        /* Shrinking succeeded, else we would be at "done". */
3129        trans = btrfs_start_transaction(root, 0);
3130        if (IS_ERR(trans)) {
3131                ret = PTR_ERR(trans);
3132                goto done;
3133        }
3134
3135        lock_chunks(root);
3136
3137        device->disk_total_bytes = new_size;
3138        /* Now btrfs_update_device() will change the on-disk size. */
3139        ret = btrfs_update_device(trans, device);
3140        if (ret) {
3141                unlock_chunks(root);
3142                btrfs_end_transaction(trans, root);
3143                goto done;
3144        }
3145        WARN_ON(diff > old_total);
3146        btrfs_set_super_total_bytes(super_copy, old_total - diff);
3147        unlock_chunks(root);
3148        btrfs_end_transaction(trans, root);
3149done:
3150        btrfs_free_path(path);
3151        return ret;
3152}
3153
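    /*
     * Append a SYSTEM chunk description to the superblock's embedded
     * sys_chunk_array.  The array is a packed sequence of
     * (struct btrfs_disk_key, struct btrfs_chunk) pairs and is what
     * allows mount to bootstrap the chunk tree before any tree can be
     * read; -EFBIG is returned once the fixed-size array
     * (BTRFS_SYSTEM_CHUNK_ARRAY_SIZE bytes) is full.
     */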
3154static int btrfs_add_system_chunk(struct btrfs_root *root,
3155                           struct btrfs_key *key,
3156                           struct btrfs_chunk *chunk, int item_size)
3157{
3158        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3159        struct btrfs_disk_key disk_key;
3160        u32 array_size;
3161        u8 *ptr;
3162
3163        array_size = btrfs_super_sys_array_size(super_copy);
3164        if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3165                return -EFBIG;
3166
3167        ptr = super_copy->sys_chunk_array + array_size;
3168        btrfs_cpu_key_to_disk(&disk_key, key);
3169        memcpy(ptr, &disk_key, sizeof(disk_key));
3170        ptr += sizeof(disk_key);
3171        memcpy(ptr, chunk, item_size);
3172        item_size += sizeof(disk_key);
3173        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3174        return 0;
3175}
3176
3177/*
3178 * sort the devices in descending order by max_avail, total_avail
3179 */
3180static int btrfs_cmp_device_info(const void *a, const void *b)
3181{
3182        const struct btrfs_device_info *di_a = a;
3183        const struct btrfs_device_info *di_b = b;
3184
3185        if (di_a->max_avail > di_b->max_avail)
3186                return -1;
3187        if (di_a->max_avail < di_b->max_avail)
3188                return 1;
3189        if (di_a->total_avail > di_b->total_avail)
3190                return -1;
3191        if (di_a->total_avail < di_b->total_avail)
3192                return 1;
3193        return 0;
3194}
3195
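    /*
     * Phase one of chunk allocation: pick the devices with the largest
     * holes, compute the stripe geometry for the requested profile,
     * insert the logical->physical extent mapping, create the block
     * group and allocate the per-device extents.  Nothing in here
     * modifies the chunk tree itself; that is left to
     * __finish_chunk_alloc().
     */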
3196static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3197                               struct btrfs_root *extent_root,
3198                               struct map_lookup **map_ret,
3199                               u64 *num_bytes_out, u64 *stripe_size_out,
3200                               u64 start, u64 type)
3201{
3202        struct btrfs_fs_info *info = extent_root->fs_info;
3203        struct btrfs_fs_devices *fs_devices = info->fs_devices;
3204        struct list_head *cur;
3205        struct map_lookup *map = NULL;
3206        struct extent_map_tree *em_tree;
3207        struct extent_map *em;
3208        struct btrfs_device_info *devices_info = NULL;
3209        u64 total_avail;
3210        int num_stripes;        /* total number of stripes to allocate */
3211        int sub_stripes;        /* sub_stripes info for map */
3212        int dev_stripes;        /* stripes per dev */
3213        int devs_max;           /* max devs to use */
3214        int devs_min;           /* min devs needed */
3215        int devs_increment;     /* ndevs has to be a multiple of this */
3216        int ncopies;            /* how many copies the data has */
3217        int ret;
3218        u64 max_stripe_size;
3219        u64 max_chunk_size;
3220        u64 stripe_size;
3221        u64 num_bytes;
3222        int ndevs;
3223        int i;
3224        int j;
3225
3226        BUG_ON(!alloc_profile_is_valid(type, 0));
3227
3228        if (list_empty(&fs_devices->alloc_list))
3229                return -ENOSPC;
3230
3231        sub_stripes = 1;
3232        dev_stripes = 1;
3233        devs_increment = 1;
3234        ncopies = 1;
3235        devs_max = 0;   /* 0 == as many as possible */
3236        devs_min = 1;
3237
3238        /*
3239         * define the properties of each RAID type.
3240         * FIXME: move this to a global table and use it in all RAID
3241         * calculation code
3242         */
3243        if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3244                dev_stripes = 2;
3245                ncopies = 2;
3246                devs_max = 1;
3247        } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3248                devs_min = 2;
3249        } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3250                devs_increment = 2;
3251                ncopies = 2;
3252                devs_max = 2;
3253                devs_min = 2;
3254        } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3255                sub_stripes = 2;
3256                devs_increment = 2;
3257                ncopies = 2;
3258                devs_min = 4;
3259        } else {
3260                devs_max = 1;
3261        }
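            /*
             * Resulting table (illustrative summary of the branches
             * above):
             *
             *   profile  dev_stripes devs_min devs_max devs_incr ncopies
             *   single        1          1        1        1        1
             *   DUP           2          1        1        1        2
             *   RAID0         1          2        0*       1        1
             *   RAID1         1          2        2        2        2
             *   RAID10        1          4        0*       2        2
             *
             * (*) 0 == as many devices as possible; RAID10 is also the
             * only profile with sub_stripes = 2.
             */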
3262
3263        if (type & BTRFS_BLOCK_GROUP_DATA) {
3264                max_stripe_size = 1024 * 1024 * 1024;
3265                max_chunk_size = 10 * max_stripe_size;
3266        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3267                /* for larger filesystems, use larger metadata chunks */
3268                if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3269                        max_stripe_size = 1024 * 1024 * 1024;
3270                else
3271                        max_stripe_size = 256 * 1024 * 1024;
3272                max_chunk_size = max_stripe_size;
3273        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3274                max_stripe_size = 32 * 1024 * 1024;
3275                max_chunk_size = 2 * max_stripe_size;
3276        } else {
3277                printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3278                       type);
3279                BUG_ON(1);
3280        }
3281
3282        /* we don't want a chunk larger than 10% of writeable space */
3283        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3284                             max_chunk_size);
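            /* div_factor(n, 1) evaluates to n * 1 / 10, i.e. the 10% above */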
3285
3286        devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3287                               GFP_NOFS);
3288        if (!devices_info)
3289                return -ENOMEM;
3290
3291        cur = fs_devices->alloc_list.next;
3292
3293        /*
3294         * in the first pass through the devices list, we gather information
3295         * about the available holes on each device.
3296         */
3297        ndevs = 0;
3298        while (cur != &fs_devices->alloc_list) {
3299                struct btrfs_device *device;
3300                u64 max_avail;
3301                u64 dev_offset;
3302
3303                device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3304
3305                cur = cur->next;
3306
3307                if (!device->writeable) {
3308                        printk(KERN_ERR
3309                               "btrfs: read-only device in alloc_list\n");
3310                        WARN_ON(1);
3311                        continue;
3312                }
3313
3314                if (!device->in_fs_metadata)
3315                        continue;
3316
3317                if (device->total_bytes > device->bytes_used)
3318                        total_avail = device->total_bytes - device->bytes_used;
3319                else
3320                        total_avail = 0;
3321
3322                /* If there is no space on this device, skip it. */
3323                if (total_avail == 0)
3324                        continue;
3325
3326                ret = find_free_dev_extent(device,
3327                                           max_stripe_size * dev_stripes,
3328                                           &dev_offset, &max_avail);
3329                if (ret && ret != -ENOSPC)
3330                        goto error;
3331
3332                if (ret == 0)
3333                        max_avail = max_stripe_size * dev_stripes;
3334
3335                if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3336                        continue;
3337
3338                devices_info[ndevs].dev_offset = dev_offset;
3339                devices_info[ndevs].max_avail = max_avail;
3340                devices_info[ndevs].total_avail = total_avail;
3341                devices_info[ndevs].dev = device;
3342                ++ndevs;
3343        }
3344
3345        /*
3346         * now sort the devices by hole size / available space
3347         */
3348        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3349             btrfs_cmp_device_info, NULL);
3350
3351        /* round down to number of usable stripes */
3352        ndevs -= ndevs % devs_increment;
3353
3354        if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3355                ret = -ENOSPC;
3356                goto error;
3357        }
3358
3359        if (devs_max && ndevs > devs_max)
3360                ndevs = devs_max;
3361        /*
3362         * the primary goal is to maximize the number of stripes, so use as many
3363         * devices as possible, even if the stripes are not maximum sized.
3364         */
3365        stripe_size = devices_info[ndevs-1].max_avail;
3366        num_stripes = ndevs * dev_stripes;
3367
3368        if (stripe_size * ndevs > max_chunk_size * ncopies) {
3369                stripe_size = max_chunk_size * ncopies;
3370                do_div(stripe_size, ndevs);
3371        }
3372
3373        do_div(stripe_size, dev_stripes);
3374
3375        /* align to BTRFS_STRIPE_LEN */
3376        do_div(stripe_size, BTRFS_STRIPE_LEN);
3377        stripe_size *= BTRFS_STRIPE_LEN;
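            /*
             * Worked example (illustrative): a RAID1 data chunk on two
             * devices that each have at least 1GiB free gives ndevs = 2,
             * dev_stripes = 1, ncopies = 2, so stripe_size = 1GiB and
             * num_stripes = 2.  The max_chunk_size cap does not kick in
             * (2GiB <= 10GiB * 2), and the chunk will provide
             * stripe_size * (num_stripes / ncopies) = 1GiB of logical
             * space while consuming 1GiB on each device.
             */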
3378
3379        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3380        if (!map) {
3381                ret = -ENOMEM;
3382                goto error;
3383        }
3384        map->num_stripes = num_stripes;
3385
3386        for (i = 0; i < ndevs; ++i) {
3387                for (j = 0; j < dev_stripes; ++j) {
3388                        int s = i * dev_stripes + j;
3389                        map->stripes[s].dev = devices_info[i].dev;
3390                        map->stripes[s].physical = devices_info[i].dev_offset +
3391                                                   j * stripe_size;
3392                }
3393        }
3394        map->sector_size = extent_root->sectorsize;
3395        map->stripe_len = BTRFS_STRIPE_LEN;
3396        map->io_align = BTRFS_STRIPE_LEN;
3397        map->io_width = BTRFS_STRIPE_LEN;
3398        map->type = type;
3399        map->sub_stripes = sub_stripes;
3400
3401        *map_ret = map;
3402        num_bytes = stripe_size * (num_stripes / ncopies);
3403
3404        *stripe_size_out = stripe_size;
3405        *num_bytes_out = num_bytes;
3406
3407        trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3408
3409        em = alloc_extent_map();
3410        if (!em) {
3411                ret = -ENOMEM;
3412                goto error;
3413        }
3414        em->bdev = (struct block_device *)map;
3415        em->start = start;
3416        em->len = num_bytes;
3417        em->block_start = 0;
3418        em->block_len = em->len;
3419
3420        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3421        write_lock(&em_tree->lock);
3422        ret = add_extent_mapping(em_tree, em);
3423        write_unlock(&em_tree->lock);
3424        free_extent_map(em);
3425        if (ret)
3426                goto error;
3427
3428        ret = btrfs_make_block_group(trans, extent_root, 0, type,
3429                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3430                                     start, num_bytes);
3431        if (ret)
3432                goto error;
3433
3434        for (i = 0; i < map->num_stripes; ++i) {
3435                struct btrfs_device *device;
3436                u64 dev_offset;
3437
3438                device = map->stripes[i].dev;
3439                dev_offset = map->stripes[i].physical;
3440
3441                ret = btrfs_alloc_dev_extent(trans, device,
3442                                info->chunk_root->root_key.objectid,
3443                                BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3444                                start, dev_offset, stripe_size);
3445                if (ret) {
3446                        btrfs_abort_transaction(trans, extent_root, ret);
3447                        goto error;
3448                }
3449        }
3450
3451        kfree(devices_info);
3452        return 0;
3453
3454error:
3455        kfree(map);
3456        kfree(devices_info);
3457        return ret;
3458}
3459
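    /*
     * Phase two of chunk allocation: account the new stripes in each
     * device item, build the on-disk chunk item (header plus one
     * btrfs_stripe per stripe) and insert it into the chunk tree;
     * SYSTEM chunks are additionally mirrored into the superblock's
     * sys_chunk_array.
     */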
3460static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3461                                struct btrfs_root *extent_root,
3462                                struct map_lookup *map, u64 chunk_offset,
3463                                u64 chunk_size, u64 stripe_size)
3464{
3465        u64 dev_offset;
3466        struct btrfs_key key;
3467        struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3468        struct btrfs_device *device;
3469        struct btrfs_chunk *chunk;
3470        struct btrfs_stripe *stripe;
3471        size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3472        int index = 0;
3473        int ret;
3474
3475        chunk = kzalloc(item_size, GFP_NOFS);
3476        if (!chunk)
3477                return -ENOMEM;
3478
3479        index = 0;
3480        while (index < map->num_stripes) {
3481                device = map->stripes[index].dev;
3482                device->bytes_used += stripe_size;
3483                ret = btrfs_update_device(trans, device);
3484                if (ret)
3485                        goto out_free;
3486                index++;
3487        }
3488
3489        spin_lock(&extent_root->fs_info->free_chunk_lock);
3490        extent_root->fs_info->free_chunk_space -= (stripe_size *
3491                                                   map->num_stripes);
3492        spin_unlock(&extent_root->fs_info->free_chunk_lock);
3493
3494        index = 0;
3495        stripe = &chunk->stripe;
3496        while (index < map->num_stripes) {
3497                device = map->stripes[index].dev;
3498                dev_offset = map->stripes[index].physical;
3499
3500                btrfs_set_stack_stripe_devid(stripe, device->devid);
3501                btrfs_set_stack_stripe_offset(stripe, dev_offset);
3502                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3503                stripe++;
3504                index++;
3505        }
3506
3507        btrfs_set_stack_chunk_length(chunk, chunk_size);
3508        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3509        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3510        btrfs_set_stack_chunk_type(chunk, map->type);
3511        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3512        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3513        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3514        btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3515        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3516
3517        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3518        key.type = BTRFS_CHUNK_ITEM_KEY;
3519        key.offset = chunk_offset;
3520
3521        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3522
3523        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3524                /*
3525                 * TODO: Cleanup of inserted chunk root in case of
3526                 * failure.
3527                 */
3528                ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
3529                                             item_size);
3530        }
3531
3532out_free:
3533        kfree(chunk);
3534        return ret;
3535}
3536
3537/*
3538 * Chunk allocation falls into two parts. The first part
3539 * (__btrfs_alloc_chunk) does the work that makes the newly allocated
3540 * chunk usable but does not modify the chunk tree. The second part
3541 * (__finish_chunk_alloc) does the work that requires modifying the
3542 * chunk tree. This split matters for bootstrapping a seed btrfs.
3543 */
3544int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3545                      struct btrfs_root *extent_root, u64 type)
3546{
3547        u64 chunk_offset;
3548        u64 chunk_size;
3549        u64 stripe_size;
3550        struct map_lookup *map;
3551        struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3552        int ret;
3553
3554        ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3555                              &chunk_offset);
3556        if (ret)
3557                return ret;
3558
3559        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3560                                  &stripe_size, chunk_offset, type);
3561        if (ret)
3562                return ret;
3563
3564        ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3565                                   chunk_size, stripe_size);
3566        if (ret)
3567                return ret;
3568        return 0;
3569}
3570
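    /*
     * Bootstrap the first writeable device of a filesystem grown from
     * a seed: run phase one for a metadata chunk and a system chunk
     * placed back to back, add the new device item, and only then run
     * the chunk-tree-modifying phase two for both chunks (see the
     * comment further down for why the ordering matters).
     */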
3571static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3572                                         struct btrfs_root *root,
3573                                         struct btrfs_device *device)
3574{
3575        u64 chunk_offset;
3576        u64 sys_chunk_offset;
3577        u64 chunk_size;
3578        u64 sys_chunk_size;
3579        u64 stripe_size;
3580        u64 sys_stripe_size;
3581        u64 alloc_profile;
3582        struct map_lookup *map;
3583        struct map_lookup *sys_map;
3584        struct btrfs_fs_info *fs_info = root->fs_info;
3585        struct btrfs_root *extent_root = fs_info->extent_root;
3586        int ret;
3587
3588        ret = find_next_chunk(fs_info->chunk_root,
3589                              BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3590        if (ret)
3591                return ret;
3592
3593        alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3594                                fs_info->avail_metadata_alloc_bits;
3595        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3596
3597        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3598                                  &stripe_size, chunk_offset, alloc_profile);
3599        if (ret)
3600                return ret;
3601
3602        sys_chunk_offset = chunk_offset + chunk_size;
3603
3604        alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3605                                fs_info->avail_system_alloc_bits;
3606        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3607
3608        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3609                                  &sys_chunk_size, &sys_stripe_size,
3610                                  sys_chunk_offset, alloc_profile);
3611        if (ret)
3612                goto abort;
3613
3614        ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3615        if (ret)
3616                goto abort;
3617
3618        /*
3619         * Modifying the chunk tree needs to allocate new blocks from
3620         * both the system and the metadata block group, so we can only
3621         * do the operations that modify the chunk tree after both
3622         * block groups have been created.
3623         */
3624        ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3625                                   chunk_size, stripe_size);
3626        if (ret)
3627                goto abort;
3628
3629        ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3630                                   sys_chunk_offset, sys_chunk_size,
3631                                   sys_stripe_size);
3632        if (ret)
3633                goto abort;
3634
3635        return 0;
3636
3637abort:
3638        btrfs_abort_transaction(trans, root, ret);
3639        return ret;
3640}
3641
3642int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3643{
3644        struct extent_map *em;
3645        struct map_lookup *map;
3646        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3647        int readonly = 0;
3648        int i;
3649
3650        read_lock(&map_tree->map_tree.lock);
3651        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3652        read_unlock(&map_tree->map_tree.lock);
3653        if (!em)
3654                return 1;
3655
3656        if (btrfs_test_opt(root, DEGRADED)) {
3657                free_extent_map(em);
3658                return 0;
3659        }
3660
3661        map = (struct map_lookup *)em->bdev;
3662        for (i = 0; i < map->num_stripes; i++) {
3663                if (!map->stripes[i].dev->writeable) {
3664                        readonly = 1;
3665                        break;
3666                }
3667        }
3668        free_extent_map(em);
3669        return readonly;
3670}
3671
3672void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3673{
3674        extent_map_tree_init(&tree->map_tree);
3675}
3676
3677void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3678{
3679        struct extent_map *em;
3680
3681        while (1) {
3682                write_lock(&tree->map_tree.lock);
3683                em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3684                if (em)
3685                        remove_extent_mapping(&tree->map_tree, em);
3686                write_unlock(&tree->map_tree.lock);
3687                if (!em)
3688                        break;
3689                kfree(em->bdev);
3690                /* once for us */
3691                free_extent_map(em);
3692                /* once for the tree */
3693                free_extent_map(em);
3694        }
3695}
3696
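    /*
     * Return how many copies of the data at [logical, logical + len)
     * exist: num_stripes for DUP and RAID1, sub_stripes for RAID10,
     * and 1 for everything else.
     */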
3697int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3698{
3699        struct extent_map *em;
3700        struct map_lookup *map;
3701        struct extent_map_tree *em_tree = &map_tree->map_tree;
3702        int ret;
3703
3704        read_lock(&em_tree->lock);
3705        em = lookup_extent_mapping(em_tree, logical, len);
3706        read_unlock(&em_tree->lock);
3707        BUG_ON(!em);
3708
3709        BUG_ON(em->start > logical || em->start + em->len < logical);
3710        map = (struct map_lookup *)em->bdev;
3711        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3712                ret = map->num_stripes;
3713        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3714                ret = map->sub_stripes;
3715        else
3716                ret = 1;
3717        free_extent_map(em);
3718        return ret;
3719}
3720
3721static int find_live_mirror(struct map_lookup *map, int first, int num,
3722                            int optimal)
3723{
3724        int i;
3725        if (map->stripes[optimal].dev->bdev)
3726                return optimal;
3727        for (i = first; i < first + num; i++) {
3728                if (map->stripes[i].dev->bdev)
3729                        return i;
3730        }
3731        /* we couldn't find one that doesn't fail.  Just return something
3732         * and the io error handling code will clean up eventually
3733         */
3734        return optimal;
3735}
3736
3737static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3738                             u64 logical, u64 *length,
3739                             struct btrfs_bio **bbio_ret,
3740                             int mirror_num)
3741{
3742        struct extent_map *em;
3743        struct map_lookup *map;
3744        struct extent_map_tree *em_tree = &map_tree->map_tree;
3745        u64 offset;
3746        u64 stripe_offset;
3747        u64 stripe_end_offset;
3748        u64 stripe_nr;
3749        u64 stripe_nr_orig;
3750        u64 stripe_nr_end;
3751        int stripe_index;
3752        int i;
3753        int ret = 0;
3754        int num_stripes;
3755        int max_errors = 0;
3756        struct btrfs_bio *bbio = NULL;
3757
3758        read_lock(&em_tree->lock);
3759        em = lookup_extent_mapping(em_tree, logical, *length);
3760        read_unlock(&em_tree->lock);
3761
3762        if (!em) {
3763                printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3764                       (unsigned long long)logical,
3765                       (unsigned long long)*length);
3766                BUG();
3767        }
3768
3769        BUG_ON(em->start > logical || em->start + em->len < logical);
3770        map = (struct map_lookup *)em->bdev;
3771        offset = logical - em->start;
3772
3773        if (mirror_num > map->num_stripes)
3774                mirror_num = 0;
3775
3776        stripe_nr = offset;
3777        /*
3778         * stripe_nr counts the total number of stripes we have to stride
3779         * to get to this block
3780         */
3781        do_div(stripe_nr, map->stripe_len);
3782
3783        stripe_offset = stripe_nr * map->stripe_len;
3784        BUG_ON(offset < stripe_offset);
3785
3786        /* stripe_offset is the offset of this block in its stripe */
3787        stripe_offset = offset - stripe_offset;
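            /*
             * Example (illustrative, stripe_len = 64KiB): offset = 200KiB
             * yields stripe_nr = 3 and stripe_offset = 200KiB - 192KiB =
             * 8KiB into the fourth stripe.
             */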
3788
3789        if (rw & REQ_DISCARD)
3790                *length = min_t(u64, em->len - offset, *length);
3791        else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3792                /* we limit the length of each bio to what fits in a stripe */
3793                *length = min_t(u64, em->len - offset,
3794                                map->stripe_len - stripe_offset);
3795        } else {
3796                *length = em->len - offset;
3797        }
3798
3799        if (!bbio_ret)
3800                goto out;
3801
3802        num_stripes = 1;
3803        stripe_index = 0;
3804        stripe_nr_orig = stripe_nr;
3805        stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3806                        (~(map->stripe_len - 1));
3807        do_div(stripe_nr_end, map->stripe_len);
3808        stripe_end_offset = stripe_nr_end * map->stripe_len -
3809                            (offset + *length);
3810        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3811                if (rw & REQ_DISCARD)
3812                        num_stripes = min_t(u64, map->num_stripes,
3813                                            stripe_nr_end - stripe_nr_orig);
3814                stripe_index = do_div(stripe_nr, map->num_stripes);
3815        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3816                if (rw & (REQ_WRITE | REQ_DISCARD))
3817                        num_stripes = map->num_stripes;
3818                else if (mirror_num)
3819                        stripe_index = mirror_num - 1;
3820                else {
3821                        stripe_index = find_live_mirror(map, 0,
3822                                            map->num_stripes,
3823                                            current->pid % map->num_stripes);
3824                        mirror_num = stripe_index + 1;
3825                }
3826
3827        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3828                if (rw & (REQ_WRITE | REQ_DISCARD)) {
3829                        num_stripes = map->num_stripes;
3830                } else if (mirror_num) {
3831                        stripe_index = mirror_num - 1;
3832                } else {
3833                        mirror_num = 1;
3834                }
3835
3836        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3837                int factor = map->num_stripes / map->sub_stripes;
3838
3839                stripe_index = do_div(stripe_nr, factor);
3840                stripe_index *= map->sub_stripes;
3841
3842                if (rw & REQ_WRITE)
3843                        num_stripes = map->sub_stripes;
3844                else if (rw & REQ_DISCARD)
3845                        num_stripes = min_t(u64, map->sub_stripes *
3846                                            (stripe_nr_end - stripe_nr_orig),
3847                                            map->num_stripes);
3848                else if (mirror_num)
3849                        stripe_index += mirror_num - 1;
3850                else {
3851                        int old_stripe_index = stripe_index;
3852                        stripe_index = find_live_mirror(map, stripe_index,
3853                                              map->sub_stripes, stripe_index +
3854                                              current->pid % map->sub_stripes);
3855                        mirror_num = stripe_index - old_stripe_index + 1;
3856                }
3857        } else {
3858                /*
3859                 * after this do_div call, stripe_nr is the number of stripes
3860                 * on this device we have to walk to find the data, and
3861                 * stripe_index is the number of our device in the stripe array
3862                 */
3863                stripe_index = do_div(stripe_nr, map->num_stripes);
3864                mirror_num = stripe_index + 1;
3865        }
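            /*
             * Example (illustrative): RAID10 with num_stripes = 4 and
             * sub_stripes = 2 has factor = 2.  For stripe_nr = 5, the
             * do_div() leaves stripe_nr = 2 with remainder 1, so
             * stripe_index = 1 * 2 = 2 and a read is served from one of
             * the two mirrors at indexes 2 and 3.
             */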
3866        BUG_ON(stripe_index >= map->num_stripes);
3867
3868        bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3869        if (!bbio) {
3870                ret = -ENOMEM;
3871                goto out;
3872        }
3873        atomic_set(&bbio->error, 0);
3874
3875        if (rw & REQ_DISCARD) {
3876                int factor = 0;
3877                int sub_stripes = 0;
3878                u64 stripes_per_dev = 0;
3879                u32 remaining_stripes = 0;
3880                u32 last_stripe = 0;
3881
3882                if (map->type &
3883                    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3884                        if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3885                                sub_stripes = 1;
3886                        else
3887                                sub_stripes = map->sub_stripes;
3888
3889                        factor = map->num_stripes / sub_stripes;
3890                        stripes_per_dev = div_u64_rem(stripe_nr_end -
3891                                                      stripe_nr_orig,
3892                                                      factor,
3893                                                      &remaining_stripes);
3894                        div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
3895                        last_stripe *= sub_stripes;
3896                }
3897
3898                for (i = 0; i < num_stripes; i++) {
3899                        bbio->stripes[i].physical =
3900                                map->stripes[stripe_index].physical +
3901                                stripe_offset + stripe_nr * map->stripe_len;
3902                        bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3903
3904                        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3905                                         BTRFS_BLOCK_GROUP_RAID10)) {
3906                                bbio->stripes[i].length = stripes_per_dev *
3907                                                          map->stripe_len;
3908
3909                                if (i / sub_stripes < remaining_stripes)
3910                                        bbio->stripes[i].length +=
3911                                                map->stripe_len;
3912
3913                                /*
3914                                 * Special for the first stripe and
3915                                 * the last stripe:
3916                                 *
3917                                 * |-------|...|-------|
3918                                 *     |----------|
3919                                 *    off     end_off
3920                                 */
3921                                if (i < sub_stripes)
3922                                        bbio->stripes[i].length -=
3923                                                stripe_offset;
3924
3925                                if (stripe_index >= last_stripe &&
3926                                    stripe_index <= (last_stripe +
3927                                                     sub_stripes - 1))
3928                                        bbio->stripes[i].length -=
3929                                                stripe_end_offset;
3930
3931                                if (i == sub_stripes - 1)
3932                                        stripe_offset = 0;
3933                        } else
3934                                bbio->stripes[i].length = *length;
3935
3936                        stripe_index++;
3937                        if (stripe_index == map->num_stripes) {
3938                                /* This could only happen for RAID0/10 */
3939                                stripe_index = 0;
3940                                stripe_nr++;
3941                        }
3942                }
3943        } else {
3944                for (i = 0; i < num_stripes; i++) {
3945                        bbio->stripes[i].physical =
3946                                map->stripes[stripe_index].physical +
3947                                stripe_offset +
3948                                stripe_nr * map->stripe_len;
3949                        bbio->stripes[i].dev =
3950                                map->stripes[stripe_index].dev;
3951                        stripe_index++;
3952                }
3953        }
3954
3955        if (rw & REQ_WRITE) {
3956                if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3957                                 BTRFS_BLOCK_GROUP_RAID10 |
3958                                 BTRFS_BLOCK_GROUP_DUP)) {
3959                        max_errors = 1;
3960                }
3961        }
3962
3963        *bbio_ret = bbio;
3964        bbio->num_stripes = num_stripes;
3965        bbio->max_errors = max_errors;
3966        bbio->mirror_num = mirror_num;
3967out:
3968        free_extent_map(em);
3969        return ret;
3970}
3971
3972int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3973                      u64 logical, u64 *length,
3974                      struct btrfs_bio **bbio_ret, int mirror_num)
3975{
3976        return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3977                                 mirror_num);
3978}
3979
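    /*
     * Reverse mapping for a chunk: given a physical byte offset on a
     * device (optionally filtered by @devid), collect every logical
     * address inside the chunk at @chunk_start that maps to it.
     * Several stripes can resolve to the same logical address, hence
     * the duplicate check before each bytenr is stored.
     */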
3980int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3981                     u64 chunk_start, u64 physical, u64 devid,
3982                     u64 **logical, int *naddrs, int *stripe_len)
3983{
3984        struct extent_map_tree *em_tree = &map_tree->map_tree;
3985        struct extent_map *em;
3986        struct map_lookup *map;
3987        u64 *buf;
3988        u64 bytenr;
3989        u64 length;
3990        u64 stripe_nr;
3991        int i, j, nr = 0;
3992
3993        read_lock(&em_tree->lock);
3994        em = lookup_extent_mapping(em_tree, chunk_start, 1);
3995        read_unlock(&em_tree->lock);
3996
3997        BUG_ON(!em || em->start != chunk_start);
3998        map = (struct map_lookup *)em->bdev;
3999
4000        length = em->len;
4001        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4002                do_div(length, map->num_stripes / map->sub_stripes);
4003        else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4004                do_div(length, map->num_stripes);
4005
4006        buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4007        BUG_ON(!buf); /* -ENOMEM */
4008
4009        for (i = 0; i < map->num_stripes; i++) {
4010                if (devid && map->stripes[i].dev->devid != devid)
4011                        continue;
4012                if (map->stripes[i].physical > physical ||
4013                    map->stripes[i].physical + length <= physical)
4014                        continue;
4015
4016                stripe_nr = physical - map->stripes[i].physical;
4017                do_div(stripe_nr, map->stripe_len);
4018
4019                if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
4020                        stripe_nr = stripe_nr * map->num_stripes + i;
4021                        do_div(stripe_nr, map->sub_stripes);
4022                } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4023                        stripe_nr = stripe_nr * map->num_stripes + i;
4024                }
4025                bytenr = chunk_start + stripe_nr * map->stripe_len;
4026                WARN_ON(nr >= map->num_stripes);
4027                for (j = 0; j < nr; j++) {
4028                        if (buf[j] == bytenr)
4029                                break;
4030                }
4031                if (j == nr) {
4032                        WARN_ON(nr >= map->num_stripes);
4033                        buf[nr++] = bytenr;
4034                }
4035        }
4036
4037        *logical = buf;
4038        *naddrs = nr;
4039        *stripe_len = map->stripe_len;
4040
4041        free_extent_map(em);
4042        return 0;
4043}
4044
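    /*
     * The next three helpers pack a small stripe index into the low
     * bits of the bio's private pointer instead of allocating a
     * wrapper struct.  Round trip (illustrative): the BUG_ON below
     * guarantees the pointer is at least 4-byte aligned, so after
     * p = merge_stripe_index_into_bio_private(bbio, 2),
     * extract_bbio_from_bio_private(p) returns bbio and
     * extract_stripe_index_from_bio_private(p) returns 2.
     */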
4045static void *merge_stripe_index_into_bio_private(void *bi_private,
4046                                                 unsigned int stripe_index)
4047{
4048        /*
4049         * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
4050         * at most 1.
4051         * The alternative solution (instead of stealing bits from the
4052         * pointer) would be to allocate an intermediate structure
4053         * that contains the old private pointer plus the stripe_index.
4054         */
4055        BUG_ON((((uintptr_t)bi_private) & 3) != 0);
4056        BUG_ON(stripe_index > 3);
4057        return (void *)(((uintptr_t)bi_private) | stripe_index);
4058}
4059
4060static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
4061{
4062        return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
4063}
4064
4065static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
4066{
4067        return (unsigned int)((uintptr_t)bi_private) & 3;
4068}
4069
4070static void btrfs_end_bio(struct bio *bio, int err)
4071{
4072        struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
4073        int is_orig_bio = 0;
4074
4075        if (err) {
4076                atomic_inc(&bbio->error);
4077                if (err == -EIO || err == -EREMOTEIO) {
4078                        unsigned int stripe_index =
4079                                extract_stripe_index_from_bio_private(
4080                                        bio->bi_private);
4081                        struct btrfs_device *dev;
4082
4083                        BUG_ON(stripe_index >= bbio->num_stripes);
4084                        dev = bbio->stripes[stripe_index].dev;
4085                        if (dev->bdev) {
4086                                if (bio->bi_rw & WRITE)
4087                                        btrfs_dev_stat_inc(dev,
4088                                                BTRFS_DEV_STAT_WRITE_ERRS);
4089                                else
4090                                        btrfs_dev_stat_inc(dev,
4091                                                BTRFS_DEV_STAT_READ_ERRS);
4092                                if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4093                                        btrfs_dev_stat_inc(dev,
4094                                                BTRFS_DEV_STAT_FLUSH_ERRS);
4095                                btrfs_dev_stat_print_on_error(dev);
4096                        }
4097                }
4098        }
4099
4100        if (bio == bbio->orig_bio)
4101                is_orig_bio = 1;
4102
4103        if (atomic_dec_and_test(&bbio->stripes_pending)) {
4104                if (!is_orig_bio) {
4105                        bio_put(bio);
4106                        bio = bbio->orig_bio;
4107                }
4108                bio->bi_private = bbio->private;
4109                bio->bi_end_io = bbio->end_io;
4110                bio->bi_bdev = (struct block_device *)
4111                                        (unsigned long)bbio->mirror_num;
4112                /* only send an error to the higher layers if it is
4113                 * beyond the tolerance of the multi-bio
4114                 */
4115                if (atomic_read(&bbio->error) > bbio->max_errors) {
4116                        err = -EIO;
4117                } else {
4118                        /*
4119                         * this bio is actually up to date, we didn't
4120                         * go over the max number of errors
4121                         */
4122                        set_bit(BIO_UPTODATE, &bio->bi_flags);
4123                        err = 0;
4124                }
4125                kfree(bbio);
4126
4127                bio_endio(bio, err);
4128        } else if (!is_orig_bio) {
4129                bio_put(bio);
4130        }
4131}
4132
4133struct async_sched {
4134        struct bio *bio;
4135        int rw;
4136        struct btrfs_fs_info *info;
4137        struct btrfs_work work;
4138};
4139
4140/*
4141 * see run_scheduled_bios for a description of why bios are collected for
4142 * async submit.
4143 *
4144 * This will add one bio to the pending list for a device and make sure
4145 * the work struct is scheduled.
4146 */
4147static noinline void schedule_bio(struct btrfs_root *root,
4148                                 struct btrfs_device *device,
4149                                 int rw, struct bio *bio)
4150{
4151        int should_queue = 1;
4152        struct btrfs_pending_bios *pending_bios;
4153
4154        /* don't bother with additional async steps for reads, right now */
4155        if (!(rw & REQ_WRITE)) {
4156                bio_get(bio);
4157                btrfsic_submit_bio(rw, bio);
4158                bio_put(bio);
4159                return;
4160        }
4161
4162        /*
4163         * nr_async_bios allows us to reliably return congestion to the
4164         * higher layers.  Otherwise, the async bio makes it appear we have
4165         * made progress against dirty pages when we've really just put it
4166         * on a queue for later
4167         */
4168        atomic_inc(&root->fs_info->nr_async_bios);
4169        WARN_ON(bio->bi_next);
4170        bio->bi_next = NULL;
4171        bio->bi_rw |= rw;
4172
4173        spin_lock(&device->io_lock);
4174        if (bio->bi_rw & REQ_SYNC)
4175                pending_bios = &device->pending_sync_bios;
4176        else
4177                pending_bios = &device->pending_bios;
4178
4179        if (pending_bios->tail)
4180                pending_bios->tail->bi_next = bio;
4181
4182        pending_bios->tail = bio;
4183        if (!pending_bios->head)
4184                pending_bios->head = bio;
4185        if (device->running_pending)
4186                should_queue = 0;
4187
4188        spin_unlock(&device->io_lock);
4189
4190        if (should_queue)
4191                btrfs_queue_worker(&root->fs_info->submit_workers,
4192                                   &device->work);
4193}
4194
4195int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4196                  int mirror_num, int async_submit)
4197{
4198        struct btrfs_mapping_tree *map_tree;
4199        struct btrfs_device *dev;
4200        struct bio *first_bio = bio;
4201        u64 logical = (u64)bio->bi_sector << 9;
4202        u64 length = 0;
4203        u64 map_length;
4204        int ret;
4205        int dev_nr = 0;
4206        int total_devs = 1;
4207        struct btrfs_bio *bbio = NULL;
4208
4209        length = bio->bi_size;
4210        map_tree = &root->fs_info->mapping_tree;
4211        map_length = length;
4212
4213        ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4214                              mirror_num);
4215        if (ret) /* -ENOMEM */
4216                return ret;
4217
4218        total_devs = bbio->num_stripes;
4219        if (map_length < length) {
4220                printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4221                       "len %llu\n", (unsigned long long)logical,
4222                       (unsigned long long)length,
4223                       (unsigned long long)map_length);
4224                BUG();
4225        }
4226
4227        bbio->orig_bio = first_bio;
4228        bbio->private = first_bio->bi_private;
4229        bbio->end_io = first_bio->bi_end_io;
4230        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4231
4232        while (dev_nr < total_devs) {
4233                if (dev_nr < total_devs - 1) {
4234                        bio = bio_clone(first_bio, GFP_NOFS);
4235                        BUG_ON(!bio); /* -ENOMEM */
4236                } else {
4237                        bio = first_bio;
4238                }
4239                bio->bi_private = bbio;
4240                bio->bi_private = merge_stripe_index_into_bio_private(
4241                                bio->bi_private, (unsigned int)dev_nr);
4242                bio->bi_end_io = btrfs_end_bio;
4243                bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4244                dev = bbio->stripes[dev_nr].dev;
4245                if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4246#ifdef DEBUG
4247                        struct rcu_string *name;
4248
4249                        rcu_read_lock();
4250                        name = rcu_dereference(dev->name);
4251                        pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4252                                 "(%s id %llu), size=%u\n", rw,
4253                                 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4254                                 name->str, dev->devid, bio->bi_size);
4255                        rcu_read_unlock();
4256#endif
4257                        bio->bi_bdev = dev->bdev;
4258                        if (async_submit)
4259                                schedule_bio(root, dev, rw, bio);
4260                        else
4261                                btrfsic_submit_bio(rw, bio);
4262                } else {
4263                        bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4264                        bio->bi_sector = logical >> 9;
4265                        bio_endio(bio, -EIO);
4266                }
4267                dev_nr++;
4268        }
4269        return 0;
4270}
4271
4272struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4273                                       u8 *uuid, u8 *fsid)
4274{
4275        struct btrfs_device *device;
4276        struct btrfs_fs_devices *cur_devices;
4277
4278        cur_devices = root->fs_info->fs_devices;
4279        while (cur_devices) {
4280                if (!fsid ||
4281                    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4282                        device = __find_device(&cur_devices->devices,
4283                                               devid, uuid);
4284                        if (device)
4285                                return device;
4286                }
4287                cur_devices = cur_devices->seed;
4288        }
4289        return NULL;
4290}
4291
4292static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4293                                            u64 devid, u8 *dev_uuid)
4294{
4295        struct btrfs_device *device;
4296        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4297
4298        device = kzalloc(sizeof(*device), GFP_NOFS);
4299        if (!device)
4300                return NULL;
4301        list_add(&device->dev_list,
4302                 &fs_devices->devices);
4303        device->dev_root = root->fs_info->dev_root;
4304        device->devid = devid;
4305        device->work.func = pending_bios_fn;
4306        device->fs_devices = fs_devices;
4307        device->missing = 1;
4308        fs_devices->num_devices++;
4309        fs_devices->missing_devices++;
4310        spin_lock_init(&device->io_lock);
4311        INIT_LIST_HEAD(&device->dev_alloc_list);
4312        memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4313        return device;
4314}
4315
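    /*
     * Read one chunk item from a chunk tree leaf and insert the
     * resulting logical->physical mapping into the mapping tree.  A
     * stripe whose device is absent is fatal (-EIO) unless the
     * filesystem is mounted degraded, in which case a placeholder
     * device is created via add_missing_dev().
     */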
4316static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4317                          struct extent_buffer *leaf,
4318                          struct btrfs_chunk *chunk)
4319{
4320        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4321        struct map_lookup *map;
4322        struct extent_map *em;
4323        u64 logical;
4324        u64 length;
4325        u64 devid;
4326        u8 uuid[BTRFS_UUID_SIZE];
4327        int num_stripes;
4328        int ret;
4329        int i;
4330
4331        logical = key->offset;
4332        length = btrfs_chunk_length(leaf, chunk);
4333
4334        read_lock(&map_tree->map_tree.lock);
4335        em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
4336        read_unlock(&map_tree->map_tree.lock);
4337
4338        /* already mapped? */
4339        if (em && em->start <= logical && em->start + em->len > logical) {
4340                free_extent_map(em);
4341                return 0;
4342        } else if (em) {
4343                free_extent_map(em);
4344        }
4345
4346        em = alloc_extent_map();
4347        if (!em)
4348                return -ENOMEM;
4349        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
4350        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4351        if (!map) {
4352                free_extent_map(em);
4353                return -ENOMEM;
4354        }
4355
4356        em->bdev = (struct block_device *)map;
4357        em->start = logical;
4358        em->len = length;
4359        em->block_start = 0;
4360        em->block_len = em->len;
4361
4362        map->num_stripes = num_stripes;
4363        map->io_width = btrfs_chunk_io_width(leaf, chunk);
4364        map->io_align = btrfs_chunk_io_align(leaf, chunk);
4365        map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
4366        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
4367        map->type = btrfs_chunk_type(leaf, chunk);
4368        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
4369        for (i = 0; i < num_stripes; i++) {
4370                map->stripes[i].physical =
4371                        btrfs_stripe_offset_nr(leaf, chunk, i);
4372                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
4373                read_extent_buffer(leaf, uuid, (unsigned long)
4374                                   btrfs_stripe_dev_uuid_nr(chunk, i),
4375                                   BTRFS_UUID_SIZE);
4376                map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
4377                                                        NULL);
4378                if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4379                        kfree(map);
4380                        free_extent_map(em);
4381                        return -EIO;
4382                }
4383                if (!map->stripes[i].dev) {
4384                        map->stripes[i].dev =
4385                                add_missing_dev(root, devid, uuid);
4386                        if (!map->stripes[i].dev) {
4387                                kfree(map);
4388                                free_extent_map(em);
4389                                return -EIO;
4390                        }
4391                }
4392                map->stripes[i].dev->in_fs_metadata = 1;
4393        }
4394
4395        write_lock(&map_tree->map_tree.lock);
4396        ret = add_extent_mapping(&map_tree->map_tree, em);
4397        write_unlock(&map_tree->map_tree.lock);
4398        BUG_ON(ret); /* Tree corruption */
4399        free_extent_map(em);
4400
4401        return 0;
4402}
4403
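    /*
     * Copy the on-disk fields of a dev item (devid, sizes, io
     * geometry and uuid) from @leaf into the in-memory device.
     */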
4404static void fill_device_from_item(struct extent_buffer *leaf,
4405                                 struct btrfs_dev_item *dev_item,
4406                                 struct btrfs_device *device)
4407{
4408        unsigned long ptr;
4409
4410        device->devid = btrfs_device_id(leaf, dev_item);
4411        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
4412        device->total_bytes = device->disk_total_bytes;
4413        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
4414        device->type = btrfs_device_type(leaf, dev_item);
4415        device->io_align = btrfs_device_io_align(leaf, dev_item);
4416        device->io_width = btrfs_device_io_width(leaf, dev_item);
4417        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
4418
4419        ptr = (unsigned long)btrfs_device_uuid(dev_item);
4420        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
4421}
4422
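    /*
     * Make the devices of the seed filesystem with the given @fsid
     * available: clone its fs_devices, open them read-only and chain
     * the clone onto fs_info->fs_devices->seed.  Returns 0 if that
     * seed fs is already open, -ENOENT if no scanned devices carry
     * @fsid, and -EINVAL if the filesystem found is not a seed fs.
     * The caller must hold uuid_mutex.
     */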
4423static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
4424{
4425        struct btrfs_fs_devices *fs_devices;
4426        int ret;
4427
4428        BUG_ON(!mutex_is_locked(&uuid_mutex));
4429
4430        fs_devices = root->fs_info->fs_devices->seed;
4431        while (fs_devices) {
4432                if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4433                        ret = 0;
4434                        goto out;
4435                }
4436                fs_devices = fs_devices->seed;
4437        }
4438
4439        fs_devices = find_fsid(fsid);
4440        if (!fs_devices) {
4441                ret = -ENOENT;
4442                goto out;
4443        }
4444
4445        fs_devices = clone_fs_devices(fs_devices);
4446        if (IS_ERR(fs_devices)) {
4447                ret = PTR_ERR(fs_devices);
4448                goto out;
4449        }
4450
4451        ret = __btrfs_open_devices(fs_devices, FMODE_READ,
4452                                   root->fs_info->bdev_holder);
4453        if (ret) {
4454                free_fs_devices(fs_devices);
4455                goto out;
4456        }
4457
4458        if (!fs_devices->seeding) {
4459                __btrfs_close_devices(fs_devices);
4460                free_fs_devices(fs_devices);
4461                ret = -EINVAL;
4462                goto out;
4463        }
4464
4465        fs_devices->seed = root->fs_info->fs_devices->seed;
4466        root->fs_info->fs_devices->seed = fs_devices;
4467out:
4468        return ret;
4469}
4470
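    /*
     * Read one dev item from the chunk tree.  A mismatched fsid means
     * the device belongs to a seed filesystem, whose devices are
     * opened first.  Missing devices are tolerated only with
     * -o degraded, in which case a placeholder device is created.  On
     * success the in-memory device is filled from the item and, for
     * writable devices, its unallocated space is added to the
     * free-chunk accounting.
     */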
4471static int read_one_dev(struct btrfs_root *root,
4472                        struct extent_buffer *leaf,
4473                        struct btrfs_dev_item *dev_item)
4474{
4475        struct btrfs_device *device;
4476        u64 devid;
4477        int ret;
4478        u8 fs_uuid[BTRFS_UUID_SIZE];
4479        u8 dev_uuid[BTRFS_UUID_SIZE];
4480
4481        devid = btrfs_device_id(leaf, dev_item);
4482        read_extent_buffer(leaf, dev_uuid,
4483                           (unsigned long)btrfs_device_uuid(dev_item),
4484                           BTRFS_UUID_SIZE);
4485        read_extent_buffer(leaf, fs_uuid,
4486                           (unsigned long)btrfs_device_fsid(dev_item),
4487                           BTRFS_UUID_SIZE);
4488
4489        if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
4490                ret = open_seed_devices(root, fs_uuid);
4491                if (ret && !btrfs_test_opt(root, DEGRADED))
4492                        return ret;
4493        }
4494
4495        device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
4496        if (!device || !device->bdev) {
4497                if (!btrfs_test_opt(root, DEGRADED))
4498                        return -EIO;
4499
4500                if (!device) {
4501                        printk(KERN_WARNING "btrfs: devid %llu missing\n",
4502                               (unsigned long long)devid);
4503                        device = add_missing_dev(root, devid, dev_uuid);
4504                        if (!device)
4505                                return -ENOMEM;
4506                } else if (!device->missing) {
4507                        /*
4508                         * This happens when a device that was properly set up
4509                         * in the device info lists suddenly goes bad.
4510                         * device->bdev is NULL, so we have to set
4511                         * device->missing to one here.
4512                         */
4513                        root->fs_info->fs_devices->missing_devices++;
4514                        device->missing = 1;
4515                }
4516        }
4517
4518        if (device->fs_devices != root->fs_info->fs_devices) {
4519                BUG_ON(device->writeable);
4520                if (device->generation !=
4521                    btrfs_device_generation(leaf, dev_item))
4522                        return -EINVAL;
4523        }
4524
4525        fill_device_from_item(leaf, dev_item, device);
4526        device->dev_root = root->fs_info->dev_root;
4527        device->in_fs_metadata = 1;
4528        if (device->writeable) {
4529                device->fs_devices->total_rw_bytes += device->total_bytes;
4530                spin_lock(&root->fs_info->free_chunk_lock);
4531                root->fs_info->free_chunk_space += device->total_bytes -
4532                        device->bytes_used;
4533                spin_unlock(&root->fs_info->free_chunk_lock);
4534        }
4535        return 0;
4537}
4538
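    /*
     * Bootstrap the chunk mappings needed to read the chunk tree
     * itself: copy the superblock's sys_chunk_array into a temporary
     * extent buffer and feed each chunk item in it to
     * read_one_chunk().  Only BTRFS_CHUNK_ITEM_KEY entries may appear
     * in the array; anything else means the superblock is corrupted.
     */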
4539int btrfs_read_sys_array(struct btrfs_root *root)
4540{
4541        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4542        struct extent_buffer *sb;
4543        struct btrfs_disk_key *disk_key;
4544        struct btrfs_chunk *chunk;
4545        u8 *ptr;
4546        unsigned long sb_ptr;
4547        int ret = 0;
4548        u32 num_stripes;
4549        u32 array_size;
4550        u32 len = 0;
4551        u32 cur;
4552        struct btrfs_key key;
4553
4554        sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4555                                          BTRFS_SUPER_INFO_SIZE);
4556        if (!sb)
4557                return -ENOMEM;
4558        btrfs_set_buffer_uptodate(sb);
4559        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4560        /*
4561         * The sb extent buffer is artificial and only used to read the
4562         * system array.  btrfs_set_buffer_uptodate() does not properly mark
4563         * all of its pages up-to-date when the page is larger: the extent
4564         * does not cover the whole page, so check_page_uptodate does not
4565         * find all the page's extents up-to-date (the hole beyond sb) and
4566         * write_extent_buffer then triggers a WARN_ON.
4567         *
4568         * Regular short extents go through the mark_extent_buffer_dirty/
4569         * writeback cycle, but sb spans only this function.  Add an explicit
4570         * SetPageUptodate call to silence the warning, e.g. on PowerPC 64.
4571         */
4572        if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
4573                SetPageUptodate(sb->pages[0]);
4574
4575        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4576        array_size = btrfs_super_sys_array_size(super_copy);
4577
4578        ptr = super_copy->sys_chunk_array;
4579        sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
4580        cur = 0;
4581
4582        while (cur < array_size) {
4583                disk_key = (struct btrfs_disk_key *)ptr;
4584                btrfs_disk_key_to_cpu(&key, disk_key);
4585
4586                len = sizeof(*disk_key); ptr += len;
4587                sb_ptr += len;
4588                cur += len;
4589
4590                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
4591                        chunk = (struct btrfs_chunk *)sb_ptr;
4592                        ret = read_one_chunk(root, &key, sb, chunk);
4593                        if (ret)
4594                                break;
4595                        num_stripes = btrfs_chunk_num_stripes(sb, chunk);
4596                        len = btrfs_chunk_item_size(num_stripes);
4597                } else {
4598                        ret = -EIO;
4599                        break;
4600                }
4601                ptr += len;
4602                sb_ptr += len;
4603                cur += len;
4604        }
4605        free_extent_buffer(sb);
4606        return ret;
4607}
4608
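    /*
     * Read all device items and chunk items from the chunk tree at
     * mount time, in two passes (see the comment below), holding
     * uuid_mutex and the chunk mutex throughout.
     */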
4609int btrfs_read_chunk_tree(struct btrfs_root *root)
4610{
4611        struct btrfs_path *path;
4612        struct extent_buffer *leaf;
4613        struct btrfs_key key;
4614        struct btrfs_key found_key;
4615        int ret;
4616        int slot;
4617
4618        root = root->fs_info->chunk_root;
4619
4620        path = btrfs_alloc_path();
4621        if (!path)
4622                return -ENOMEM;
4623
4624        mutex_lock(&uuid_mutex);
4625        lock_chunks(root);
4626
4627        /* first we search for all of the device items, and then we
4628         * read in all of the chunk items.  This way we can create chunk
4629         * mappings that reference all of the devices that are found
4630         */
4631        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4632        key.offset = 0;
4633        key.type = 0;
4634again:
4635        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4636        if (ret < 0)
4637                goto error;
4638        while (1) {
4639                leaf = path->nodes[0];
4640                slot = path->slots[0];
4641                if (slot >= btrfs_header_nritems(leaf)) {
4642                        ret = btrfs_next_leaf(root, path);
4643                        if (ret == 0)
4644                                continue;
4645                        if (ret < 0)
4646                                goto error;
4647                        break;
4648                }
4649                btrfs_item_key_to_cpu(leaf, &found_key, slot);
4650                if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4651                        if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4652                                break;
4653                        if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4654                                struct btrfs_dev_item *dev_item;
4655                                dev_item = btrfs_item_ptr(leaf, slot,
4656                                                  struct btrfs_dev_item);
4657                                ret = read_one_dev(root, leaf, dev_item);
4658                                if (ret)
4659                                        goto error;
4660                        }
4661                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4662                        struct btrfs_chunk *chunk;
4663                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4664                        ret = read_one_chunk(root, &found_key, leaf, chunk);
4665                        if (ret)
4666                                goto error;
4667                }
4668                path->slots[0]++;
4669        }
4670        if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4671                key.objectid = 0;
4672                btrfs_release_path(path);
4673                goto again;
4674        }
4675        ret = 0;
4676error:
4677        unlock_chunks(root);
4678        mutex_unlock(&uuid_mutex);
4679
4680        btrfs_free_path(path);
4681        return ret;
4682}
4683
4684static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
4685{
4686        int i;
4687
4688        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4689                btrfs_dev_stat_reset(dev, i);
4690}
4691
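    /*
     * Load the persistent statistics of every device at mount time.
     * They are stored in the device tree as (0, BTRFS_DEV_STATS_KEY,
     * devid) items; a device without such an item starts out with
     * zeroed counters.
     */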
4692int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4693{
4694        struct btrfs_key key;
4695        struct btrfs_key found_key;
4696        struct btrfs_root *dev_root = fs_info->dev_root;
4697        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4698        struct extent_buffer *eb;
4699        int slot;
4700        int ret = 0;
4701        struct btrfs_device *device;
4702        struct btrfs_path *path = NULL;
4703        int i;
4704
4705        path = btrfs_alloc_path();
4706        if (!path) {
4707                ret = -ENOMEM;
4708                goto out;
4709        }
4710
4711        mutex_lock(&fs_devices->device_list_mutex);
4712        list_for_each_entry(device, &fs_devices->devices, dev_list) {
4713                int item_size;
4714                struct btrfs_dev_stats_item *ptr;
4715
4716                key.objectid = 0;
4717                key.type = BTRFS_DEV_STATS_KEY;
4718                key.offset = device->devid;
4719                ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4720                if (ret) {
4721                        __btrfs_reset_dev_stats(device);
4722                        device->dev_stats_valid = 1;
4723                        btrfs_release_path(path);
4724                        continue;
4725                }
4726                slot = path->slots[0];
4727                eb = path->nodes[0];
4728                btrfs_item_key_to_cpu(eb, &found_key, slot);
4729                item_size = btrfs_item_size_nr(eb, slot);
4730
4731                ptr = btrfs_item_ptr(eb, slot,
4732                                     struct btrfs_dev_stats_item);
4733
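                    /* the item may hold fewer than BTRFS_DEV_STAT_VALUES_MAX counters; reset the rest */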
4734                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4735                        if (item_size >= (1 + i) * sizeof(__le64))
4736                                btrfs_dev_stat_set(device, i,
4737                                        btrfs_dev_stats_value(eb, ptr, i));
4738                        else
4739                                btrfs_dev_stat_reset(device, i);
4740                }
4741
4742                device->dev_stats_valid = 1;
4743                btrfs_dev_stat_print_on_load(device);
4744                btrfs_release_path(path);
4745        }
4746        mutex_unlock(&fs_devices->device_list_mutex);
4747
4748out:
4749        btrfs_free_path(path);
4750        return ret < 0 ? ret : 0;
4751}
4752
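    /*
     * Write the in-memory error counters of @device back to its
     * dev_stats item.  An existing item that is smaller than the
     * current format is deleted and re-inserted at full size.
     */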
4753static int update_dev_stat_item(struct btrfs_trans_handle *trans,
4754                                struct btrfs_root *dev_root,
4755                                struct btrfs_device *device)
4756{
4757        struct btrfs_path *path;
4758        struct btrfs_key key;
4759        struct extent_buffer *eb;
4760        struct btrfs_dev_stats_item *ptr;
4761        int ret;
4762        int i;
4763
4764        key.objectid = 0;
4765        key.type = BTRFS_DEV_STATS_KEY;
4766        key.offset = device->devid;
4767
4768        path = btrfs_alloc_path();
4769        if (!path)
4770                return -ENOMEM;
4770        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
4771        if (ret < 0) {
4772                printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
4773                              ret, rcu_str_deref(device->name));
4774                goto out;
4775        }
4776
4777        if (ret == 0 &&
4778            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
4779                /* need to delete old one and insert a new one */
4780                ret = btrfs_del_item(trans, dev_root, path);
4781                if (ret != 0) {
4782                        printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
4783                                      rcu_str_deref(device->name), ret);
4784                        goto out;
4785                }
4786                ret = 1;
4787        }
4788
4789        if (ret == 1) {
4790                /* need to insert a new item */
4791                btrfs_release_path(path);
4792                ret = btrfs_insert_empty_item(trans, dev_root, path,
4793                                              &key, sizeof(*ptr));
4794                if (ret < 0) {
4795                        printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
4796                                      rcu_str_deref(device->name), ret);
4797                        goto out;
4798                }
4799        }
4800
4801        eb = path->nodes[0];
4802        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
4803        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4804                btrfs_set_dev_stats_value(eb, ptr, i,
4805                                          btrfs_dev_stat_read(device, i));
4806        btrfs_mark_buffer_dirty(eb);
4807
4808out:
4809        btrfs_free_path(path);
4810        return ret;
4811}
4812
4813/*
4814 * Called from commit_transaction.  Writes all changed device stats to disk.
4815 */
4816int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
4817                        struct btrfs_fs_info *fs_info)
4818{
4819        struct btrfs_root *dev_root = fs_info->dev_root;
4820        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4821        struct btrfs_device *device;
4822        int ret = 0;
4823
4824        mutex_lock(&fs_devices->device_list_mutex);
4825        list_for_each_entry(device, &fs_devices->devices, dev_list) {
4826                if (!device->dev_stats_valid || !device->dev_stats_dirty)
4827                        continue;
4828
4829                ret = update_dev_stat_item(trans, dev_root, device);
4830                if (!ret)
4831                        device->dev_stats_dirty = 0;
4832        }
4833        mutex_unlock(&fs_devices->device_list_mutex);
4834
4835        return ret;
4836}
4837
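    /* bump one error counter and emit the rate-limited error summary */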
4838void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
4839{
4840        btrfs_dev_stat_inc(dev, index);
4841        btrfs_dev_stat_print_on_error(dev);
4842}
4843
4844void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4845{
4846        if (!dev->dev_stats_valid)
4847                return;
4848        printk_ratelimited_in_rcu(KERN_ERR
4849                           "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4850                           rcu_str_deref(dev->name),
4851                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4852                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4853                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4854                           btrfs_dev_stat_read(dev,
4855                                               BTRFS_DEV_STAT_CORRUPTION_ERRS),
4856                           btrfs_dev_stat_read(dev,
4857                                               BTRFS_DEV_STAT_GENERATION_ERRS));
4858}
4859
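    /*
     * Print the error summary once at mount time; devices whose
     * counters are all zero stay silent.
     */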
4860static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4861{
4862        int i;
4863
4864        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4865                if (btrfs_dev_stat_read(dev, i) != 0)
4866                        break;
4867        if (i == BTRFS_DEV_STAT_VALUES_MAX)
4868                return; /* all values == 0, suppress message */
4869
4870        printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4871               rcu_str_deref(dev->name),
4872               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4873               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4874               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4875               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
4876               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
4877}
4878
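    /*
     * Helper for the dev stats ioctl: copy the counters of the device
     * named by stats->devid into @stats, resetting them on the way
     * out when BTRFS_DEV_STATS_RESET is set in stats->flags.
     */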
4879int btrfs_get_dev_stats(struct btrfs_root *root,
4880                        struct btrfs_ioctl_get_dev_stats *stats)
4881{
4882        struct btrfs_device *dev;
4883        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4884        int i;
4885
4886        mutex_lock(&fs_devices->device_list_mutex);
4887        dev = btrfs_find_device(root, stats->devid, NULL, NULL);
4888        mutex_unlock(&fs_devices->device_list_mutex);
4889
4890        if (!dev) {
4891                printk(KERN_WARNING
4892                       "btrfs: get dev_stats failed, device not found\n");
4893                return -ENODEV;
4894        } else if (!dev->dev_stats_valid) {
4895                printk(KERN_WARNING
4896                       "btrfs: get dev_stats failed, not yet valid\n");
4897                return -ENODEV;
4898        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
4899                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4900                        if (stats->nr_items > i)
4901                                stats->values[i] =
4902                                        btrfs_dev_stat_read_and_reset(dev, i);
4903                        else
4904                                btrfs_dev_stat_reset(dev, i);
4905                }
4906        } else {
4907                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4908                        if (stats->nr_items > i)
4909                                stats->values[i] = btrfs_dev_stat_read(dev, i);
4910        }
4911        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
4912                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4913        return 0;
4914}
4915