linux/mm/backing-dev.c

#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

struct backing_dev_info default_backing_dev_info = {
        .name           = "default",
        .ra_pages       = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
        .state          = 0,
        .capabilities   = BDI_CAP_MAP_COPY,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

struct backing_dev_info noop_backing_dev_info = {
        .name           = "noop",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;

/*
 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
 * locking.
 */
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);

static struct task_struct *sync_supers_tsk;
static struct timer_list sync_supers_timer;

static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);

void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
        if (wb1 < wb2) {
                spin_lock(&wb1->list_lock);
                spin_lock_nested(&wb2->list_lock, 1);
        } else {
                spin_lock(&wb2->list_lock);
                spin_lock_nested(&wb1->list_lock, 1);
        }
}
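
/*
 * Note (comment added, not in the original source): the two list_locks are
 * always taken in ascending address order, so concurrent callers locking the
 * same pair of bdi_writeback structures cannot deadlock against each other.
 * Callers release both locks with plain spin_unlock(), as bdi_destroy()
 * does below.
 */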

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
        bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        struct bdi_writeback *wb = &bdi->wb;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        unsigned long bdi_thresh;
        unsigned long nr_dirty, nr_io, nr_more_io;
        struct inode *inode;

        nr_dirty = nr_io = nr_more_io = 0;
        spin_lock(&wb->list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_wb_list)
                nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                nr_more_io++;
        spin_unlock(&wb->list_lock);

        global_dirty_limits(&background_thresh, &dirty_thresh);
        bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);

#define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
                   "BdiWriteback:       %10lu kB\n"
                   "BdiReclaimable:     %10lu kB\n"
                   "BdiDirtyThresh:     %10lu kB\n"
                   "DirtyThresh:        %10lu kB\n"
                   "BackgroundThresh:   %10lu kB\n"
                   "BdiWritten:         %10lu kB\n"
                   "BdiWriteBandwidth:  %10lu kBps\n"
                   "b_dirty:            %10lu\n"
                   "b_io:               %10lu\n"
                   "b_more_io:          %10lu\n"
                   "bdi_list:           %10u\n"
                   "state:              %10lx\n",
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
                   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
                   K(bdi_thresh),
                   K(dirty_thresh),
                   K(background_thresh),
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
                   (unsigned long) K(bdi->write_bandwidth),
                   nr_dirty,
                   nr_io,
                   nr_more_io,
                   !list_empty(&bdi->bdi_list), bdi->state);
#undef K

        return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
        return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
        .open           = bdi_debug_stats_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
        bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
        bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
                                               bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
        debugfs_remove(bdi->debug_stats);
        debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
                                      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

static ssize_t read_ahead_kb_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned long read_ahead_kb;
        ssize_t ret = -EINVAL;

        read_ahead_kb = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
                ret = count;
        }
        return ret;
}

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

#define BDI_SHOW(name, expr)                                            \
static ssize_t name##_show(struct device *dev,                          \
                           struct device_attribute *attr, char *page)   \
{                                                                       \
        struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
                                                                        \
        return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);  \
}

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
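
/*
 * For reference (comment added, not in the original source): the
 * BDI_SHOW(read_ahead_kb, ...) line above expands to roughly the following
 * show routine, which __ATTR_RW() below pairs with read_ahead_kb_store():
 *
 *      static ssize_t read_ahead_kb_show(struct device *dev,
 *                                        struct device_attribute *attr,
 *                                        char *page)
 *      {
 *              struct backing_dev_info *bdi = dev_get_drvdata(dev);
 *
 *              return snprintf(page, PAGE_SIZE-1, "%lld\n",
 *                              (long long)K(bdi->ra_pages));
 *      }
 */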

static ssize_t min_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_min_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_max_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

static struct device_attribute bdi_dev_attrs[] = {
        __ATTR_RW(read_ahead_kb),
        __ATTR_RW(min_ratio),
        __ATTR_RW(max_ratio),
        __ATTR_NULL,
};
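
/*
 * Note (comment added, not in the original source): once the "bdi" class is
 * created below, each registered bdi exposes these attributes under
 * /sys/class/bdi/<name>/, e.g. /sys/class/bdi/8:0/read_ahead_kb for a block
 * device bdi registered via bdi_register_dev().
 */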

static __init int bdi_class_init(void)
{
        bdi_class = class_create(THIS_MODULE, "bdi");
        if (IS_ERR(bdi_class))
                return PTR_ERR(bdi_class);

        bdi_class->dev_attrs = bdi_dev_attrs;
        bdi_debug_init();
        return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
        int err;

        sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
        BUG_ON(IS_ERR(sync_supers_tsk));

        setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
        bdi_arm_supers_timer();

        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
        err = bdi_init(&noop_backing_dev_info);

        return err;
}
subsys_initcall(default_bdi_init);

int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        return wb_has_dirty_io(&bdi->wb);
}

/*
 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
 * or we risk deadlocking on ->s_umount. The longer term solution would be
 * to implement sync_supers_bdi() or similar and simply do it from the
 * bdi writeback thread individually.
 */
static int bdi_sync_supers(void *unused)
{
        set_user_nice(current, 0);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();

                /*
                 * Do this periodically, like kupdated() did before.
                 */
                sync_supers();
        }

        return 0;
}

void bdi_arm_supers_timer(void)
{
        unsigned long next;

        if (!dirty_writeback_interval)
                return;

        next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
        mod_timer(&sync_supers_timer, round_jiffies_up(next));
}

static void sync_supers_timer_fn(unsigned long unused)
{
        wake_up_process(sync_supers_tsk);
        bdi_arm_supers_timer();
}

static void wakeup_timer_fn(unsigned long data)
{
        struct backing_dev_info *bdi = (struct backing_dev_info *)data;

        spin_lock_bh(&bdi->wb_lock);
        if (bdi->wb.task) {
                trace_writeback_wake_thread(bdi);
                wake_up_process(bdi->wb.task);
        } else {
                /*
                 * When bdi tasks are inactive for a long time, they are
                 * killed. In this case we have to wake up the forker thread
                 * which should create and run the bdi thread.
                 */
                trace_writeback_wake_forker_thread(bdi);
                wake_up_process(default_backing_dev_info.wb.task);
        }
        spin_unlock_bh(&bdi->wb_lock);
}

/*
 * This function is used when the first inode for this bdi is marked dirty. It
 * wakes up the corresponding bdi thread, which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 */
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
        unsigned long timeout;

        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
        mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}

/*
 * Calculate the longest interval (jiffies) bdi threads are allowed to be
 * inactive.
 */
static unsigned long bdi_longest_inactive(void)
{
        unsigned long interval;

        interval = msecs_to_jiffies(dirty_writeback_interval * 10);
        return max(5UL * 60 * HZ, interval);
}
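
/*
 * Worked example (comment added, not in the original source): with the usual
 * default of dirty_writeback_interval == 500 centisecs, interval above is
 * msecs_to_jiffies(5000), i.e. 5 seconds, so the max() means an idle bdi
 * thread is only considered for killing after 5 minutes of inactivity.
 */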

/*
 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
 * shutdown
 */
static void bdi_clear_pending(struct backing_dev_info *bdi)
{
        clear_bit(BDI_pending, &bdi->state);
        smp_mb__after_clear_bit();
        wake_up_bit(&bdi->state, BDI_pending);
}

static int bdi_forker_thread(void *ptr)
{
        struct bdi_writeback *me = ptr;

        current->flags |= PF_SWAPWRITE;
        set_freezable();

        /*
         * Our parent may run at a different priority, just set us to normal
         */
        set_user_nice(current, 0);

        for (;;) {
                struct task_struct *task = NULL;
                struct backing_dev_info *bdi;
                enum {
                        NO_ACTION,   /* Nothing to do */
                        FORK_THREAD, /* Fork bdi thread */
                        KILL_THREAD, /* Kill inactive bdi thread */
                } action = NO_ACTION;

                /*
                 * Temporary measure, we want to make sure we don't see
                 * dirty data on the default backing_dev_info
                 */
                if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
                        del_timer(&me->wakeup_timer);
                        wb_do_writeback(me, 0);
                }

                spin_lock_bh(&bdi_lock);
                /*
                 * In the following loop we are going to check whether we have
                 * some work to do without any synchronization with tasks
                 * waking us up to do work for them. So we have to set task
                 * state already here so that we don't miss wakeups coming
                 * after we verify some condition.
                 */
                set_current_state(TASK_INTERRUPTIBLE);

                list_for_each_entry(bdi, &bdi_list, bdi_list) {
                        bool have_dirty_io;

                        if (!bdi_cap_writeback_dirty(bdi) ||
                             bdi_cap_flush_forker(bdi))
                                continue;

                        WARN(!test_bit(BDI_registered, &bdi->state),
                             "bdi %p/%s is not registered!\n", bdi, bdi->name);

                        have_dirty_io = !list_empty(&bdi->work_list) ||
                                        wb_has_dirty_io(&bdi->wb);

                        /*
                         * If the bdi has work to do, but the thread does not
                         * exist - create it.
                         */
                        if (!bdi->wb.task && have_dirty_io) {
                                /*
                                 * Set the pending bit - if someone tries to
                                 * unregister this bdi, it will wait on this
                                 * bit.
                                 */
                                set_bit(BDI_pending, &bdi->state);
                                action = FORK_THREAD;
                                break;
                        }

                        spin_lock(&bdi->wb_lock);

                        /*
                         * If there is no work to do and the bdi thread was
                         * inactive long enough - kill it. The wb_lock is taken
                         * to make sure no-one adds more work to this bdi and
                         * wakes the bdi thread up.
                         */
                        if (bdi->wb.task && !have_dirty_io &&
                            time_after(jiffies, bdi->wb.last_active +
                                                bdi_longest_inactive())) {
                                task = bdi->wb.task;
                                bdi->wb.task = NULL;
                                spin_unlock(&bdi->wb_lock);
                                set_bit(BDI_pending, &bdi->state);
                                action = KILL_THREAD;
                                break;
                        }
                        spin_unlock(&bdi->wb_lock);
                }
                spin_unlock_bh(&bdi_lock);

                /* Keep working if default bdi still has things to do */
                if (!list_empty(&me->bdi->work_list))
                        __set_current_state(TASK_RUNNING);

                switch (action) {
                case FORK_THREAD:
                        __set_current_state(TASK_RUNNING);
                        task = kthread_create(bdi_writeback_thread, &bdi->wb,
                                              "flush-%s", dev_name(bdi->dev));
                        if (IS_ERR(task)) {
                                /*
                                 * If thread creation fails, force writeout of
                                 * the bdi from the thread. Hopefully 1024 is
                                 * large enough for efficient IO.
                                 */
                                writeback_inodes_wb(&bdi->wb, 1024);
                        } else {
                                /*
                                 * The spinlock makes sure we do not lose
                                 * wake-ups when racing with 'bdi_queue_work()'.
                                 * And as soon as the bdi thread is visible, we
                                 * can start it.
                                 */
                                spin_lock_bh(&bdi->wb_lock);
                                bdi->wb.task = task;
                                spin_unlock_bh(&bdi->wb_lock);
                                wake_up_process(task);
                        }
                        bdi_clear_pending(bdi);
                        break;

                case KILL_THREAD:
                        __set_current_state(TASK_RUNNING);
                        kthread_stop(task);
                        bdi_clear_pending(bdi);
                        break;

                case NO_ACTION:
                        if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
                                /*
                                 * There is no dirty data. The only thing we
                                 * should now care about is checking for
                                 * inactive bdi threads and killing them. Thus,
                                 * let's sleep for a longer time, save energy
                                 * and be friendly to battery-powered devices.
                                 */
                                schedule_timeout(bdi_longest_inactive());
                        else
                                schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
                        try_to_freeze();
                        break;
                }
        }

        return 0;
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
        spin_lock_bh(&bdi_lock);
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);

        synchronize_rcu_expedited();
}

int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...)
{
        va_list args;
        struct device *dev;

        if (bdi->dev)   /* The driver needs to use separate queues per device */
                return 0;

        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        bdi->dev = dev;

        /*
         * Just start the forker thread for our default backing_dev_info,
         * and add other bdi's to the list. They will get a thread created
         * on-demand when they need it.
         */
        if (bdi_cap_flush_forker(bdi)) {
                struct bdi_writeback *wb = &bdi->wb;

                wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
                                                dev_name(dev));
                if (IS_ERR(wb->task))
                        return PTR_ERR(wb->task);
        }

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);

        spin_lock_bh(&bdi_lock);
        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
        return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
        if (!bdi_cap_writeback_dirty(bdi))
                return;

        /*
         * Make sure nobody finds us on the bdi_list anymore
         */
        bdi_remove_from_list(bdi);

        /*
         * If setup is pending, wait for that to complete first
         */
        wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
                        TASK_UNINTERRUPTIBLE);

        /*
         * Finally, kill the kernel thread. We don't need to be RCU
         * safe anymore, since the bdi is gone from visibility. Force
         * unfreeze of the thread before calling kthread_stop(), otherwise
         * it would never exit if it is currently stuck in the refrigerator.
         */
        if (bdi->wb.task) {
                thaw_process(bdi->wb.task);
                kthread_stop(bdi->wb.task);
        }
}

/*
 * This bdi is going away now, make sure that no super_blocks point to it
 */
static void bdi_prune_sb(struct backing_dev_info *bdi)
{
        struct super_block *sb;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (sb->s_bdi == bdi)
                        sb->s_bdi = &default_backing_dev_info;
        }
        spin_unlock(&sb_lock);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
        if (bdi->dev) {
                bdi_set_min_ratio(bdi, 0);
                trace_writeback_bdi_unregister(bdi);
                bdi_prune_sb(bdi);
                del_timer_sync(&bdi->wb.wakeup_timer);

                if (!bdi_cap_flush_forker(bdi))
                        bdi_wb_shutdown(bdi);
                bdi_debug_unregister(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }
}
EXPORT_SYMBOL(bdi_unregister);

static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
        memset(wb, 0, sizeof(*wb));

        wb->bdi = bdi;
        wb->last_old_flush = jiffies;
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        spin_lock_init(&wb->list_lock);
        setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW         (100 << (20 - PAGE_SHIFT))
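
/*
 * Worked example (comment added, not in the original source): with 4 KiB
 * pages (PAGE_SHIFT == 12), INIT_BW is 100 << 8 == 25600 pages per second,
 * and 25600 * 4 KiB == 100 MB/s, matching the comment above.
 */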

int bdi_init(struct backing_dev_info *bdi)
{
        int i, err;

        bdi->dev = NULL;

        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = PROP_FRAC_BASE;
        spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
        INIT_LIST_HEAD(&bdi->work_list);

        bdi_wb_init(&bdi->wb, bdi);

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
                err = percpu_counter_init(&bdi->bdi_stat[i], 0);
                if (err)
                        goto err;
        }

        bdi->dirty_exceeded = 0;

        bdi->bw_time_stamp = jiffies;
        bdi->written_stamp = 0;

        bdi->write_bandwidth = INIT_BW;
        bdi->avg_write_bandwidth = INIT_BW;

        err = prop_local_init_percpu(&bdi->completions);

        if (err) {
err:
                while (i--)
                        percpu_counter_destroy(&bdi->bdi_stat[i]);
        }

        return err;
}
EXPORT_SYMBOL(bdi_init);

void bdi_destroy(struct backing_dev_info *bdi)
{
        int i;

        /*
         * Splice our entries to the default_backing_dev_info, if this
         * bdi disappears
         */
        if (bdi_has_dirty_io(bdi)) {
                struct bdi_writeback *dst = &default_backing_dev_info.wb;

                bdi_lock_two(&bdi->wb, dst);
                list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                list_splice(&bdi->wb.b_io, &dst->b_io);
                list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
                spin_unlock(&bdi->wb.list_lock);
                spin_unlock(&dst->list_lock);
        }

        bdi_unregister(bdi);

        /*
         * If bdi_unregister() had already been called earlier, the
         * wakeup_timer could still be armed because bdi_prune_sb()
         * can race with the bdi_wakeup_thread_delayed() calls from
         * __mark_inode_dirty().
         */
        del_timer_sync(&bdi->wb.wakeup_timer);

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                percpu_counter_destroy(&bdi->bdi_stat[i]);

        prop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

/*
 * For use from filesystems to quickly init and register a bdi associated
 * with dirty writeback
 */
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
                           unsigned int cap)
{
        char tmp[32];
        int err;

        bdi->name = name;
        bdi->capabilities = cap;
        err = bdi_init(bdi);
        if (err)
                return err;

        sprintf(tmp, "%.28s%s", name, "-%d");
        err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
        if (err) {
                bdi_destroy(bdi);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL(bdi_setup_and_register);
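
/*
 * Illustrative usage sketch (comment added, not in the original source): a
 * filesystem would typically embed a backing_dev_info in its per-mount data
 * and pair the two calls at setup and teardown time roughly as below. The
 * example_fs_info structure and function names are made up for the example;
 * only bdi_setup_and_register(), bdi_destroy(), BDI_CAP_MAP_COPY and
 * sb->s_bdi come from the kernel.
 *
 *      struct example_fs_info {
 *              struct backing_dev_info bdi;
 *      };
 *
 *      static int example_fill_super(struct super_block *sb,
 *                                    struct example_fs_info *fsi)
 *      {
 *              int err = bdi_setup_and_register(&fsi->bdi, "example",
 *                                               BDI_CAP_MAP_COPY);
 *              if (err)
 *                      return err;
 *              sb->s_bdi = &fsi->bdi;
 *              return 0;
 *      }
 *
 *      static void example_put_super(struct example_fs_info *fsi)
 *      {
 *              bdi_destroy(&fsi->bdi);
 *      }
 */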

static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
static atomic_t nr_bdi_congested[2];

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        if (test_and_clear_bit(bit, &bdi->state))
                atomic_dec(&nr_bdi_congested[sync]);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))
                wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        if (!test_and_set_bit(bit, &bdi->state))
                atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
        long ret;
        unsigned long start = jiffies;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);

        trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
                                        jiffies_to_usecs(jiffies - start));

        return ret;
}
EXPORT_SYMBOL(congestion_wait);
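
/*
 * Illustrative usage sketch (comment added, not in the original source):
 * callers such as the page allocator and vmscan typically back off on
 * congestion roughly like this; the loop condition and the HZ/50 timeout are
 * assumptions made for the example, while bdi_write_congested() and
 * BLK_RW_ASYNC come from <linux/backing-dev.h>.
 *
 *      while (bdi_write_congested(bdi))
 *              congestion_wait(BLK_RW_ASYNC, HZ / 50);
 */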

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
 * @zone: A zone to check if it is heavily congested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * If some backing_dev (any backing_dev) is congested and the given @zone has
 * experienced recent congestion, this waits for up to @timeout jiffies for
 * either a BDI to exit congestion on the given @sync queue or for a write to
 * complete.
 *
 * In the absence of zone congestion, cond_resched() is called to yield
 * the processor if necessary but otherwise does not sleep.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(struct zone *zone, int sync, long timeout)
{
        long ret;
        unsigned long start = jiffies;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        /*
         * If there is no congestion, or heavy congestion is not being
         * encountered in the current zone, yield if necessary instead
         * of sleeping on the congestion queue
         */
        if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
                        !zone_is_reclaim_congested(zone)) {
                cond_resched();

                /* In case we scheduled, work out time remaining */
                ret = timeout - (jiffies - start);
                if (ret < 0)
                        ret = 0;

                goto out;
        }

        /* Sleep until uncongested or a write happens */
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);

out:
        trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
                                        jiffies_to_usecs(jiffies - start));

        return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
