linux/mm/backing-dev.c
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);
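
/*
 * Illustrative reader-side sketch (not part of this file): walking bdi_list
 * only needs rcu_read_lock(), while additions and removals above are
 * serialized by bdi_lock:
 *
 *	rcu_read_lock();
 *	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
 *		...;
 *	rcu_read_unlock();
 */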

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

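/* convert a page count to kilobytes (a page is 1 << PAGE_SHIFT bytes) */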
#define K(x) ((x) << (PAGE_SHIFT - 10))

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long wb_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_io_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
		nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
		if (inode->i_state & I_DIRTY_TIME)
			nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	wb_thresh = wb_calc_thresh(wb, dirty_thresh);

	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "b_dirty_time:       %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
		   K(wb_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
		   (unsigned long) K(wb->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

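/*
 * Create <debugfs>/bdi/<name>/stats for this bdi (normally
 * /sys/kernel/debug/bdi/<name>/stats), backed by bdi_debug_stats_show().
 */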
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
			    &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove_recursive(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

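/*
 * The device attributes below are exported under /sys/class/bdi/<dev>/
 * (e.g. /sys/class/bdi/8:0/read_ahead_kb for a block device bdi).
 */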
static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

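	/* read_ahead_kb is in KiB, ra_pages is in PAGE_SIZE units */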
	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

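/*
 * BDI_SHOW(name, expr) generates the sysfs <name>_show() method that prints
 * @expr for this bdi; DEVICE_ATTR_RW(name) pairs it with the matching
 * <name>_store() defined separately for each attribute.
 */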
#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *buf)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return sysfs_emit(buf, "%lld\n", (long long)expr);		\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_stable_pages_required.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_groups = bdi_dev_groups;
	bdi_debug_init();

	return 0;
}
postcore_initcall(bdi_class_init);

static int bdi_init(struct backing_dev_info *bdi);

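/*
 * Create the shared bdi_wq workqueue that runs all writeback work items and
 * initialise noop_backing_dev_info, the stand-in bdi used where no real
 * writeback-capable device exists.
 */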
static int __init default_bdi_init(void)
{
	int err;

	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;

	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

/*
 * This function is used when the first inode for this wb is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
void wb_wakeup_delayed(struct bdi_writeback *wb)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
	spin_unlock_bh(&wb->work_lock);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

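/*
 * Initialise a bdi_writeback: its dirty inode lists, bandwidth and ratelimit
 * estimates (in pages per second), per-cpu writeback counters and the delayed
 * writeback work. For cgroup writebacks (wb != &bdi->wb) a reference on @bdi
 * is taken and dropped again on failure or in wb_exit().
 */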
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
		   gfp_t gfp)
{
	int i, err;

	memset(wb, 0, sizeof(*wb));

	if (wb != &bdi->wb)
		bdi_get(bdi);
	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);

	wb->bw_time_stamp = jiffies;
	wb->balanced_dirty_ratelimit = INIT_BW;
	wb->dirty_ratelimit = INIT_BW;
	wb->write_bandwidth = INIT_BW;
	wb->avg_write_bandwidth = INIT_BW;

	spin_lock_init(&wb->work_lock);
	INIT_LIST_HEAD(&wb->work_list);
	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
	wb->dirty_sleep = jiffies;

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		goto out_put_bdi;

	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
		err = percpu_counter_init(&wb->stat[i], 0, gfp);
		if (err)
			goto out_destroy_stat;
	}

	return 0;

out_destroy_stat:
	while (i--)
		percpu_counter_destroy(&wb->stat[i]);
	fprop_local_destroy_percpu(&wb->completions);
out_put_bdi:
	if (wb != &bdi->wb)
		bdi_put(bdi);
	return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
	/* Make sure nobody queues further work */
	spin_lock_bh(&wb->work_lock);
	if (!test_and_clear_bit(WB_registered, &wb->state)) {
		spin_unlock_bh(&wb->work_lock);
		return;
	}
	spin_unlock_bh(&wb->work_lock);

	cgwb_remove_from_bdi_list(wb);
	/*
	 * Drain work list and shutdown the delayed_work.  !WB_registered
	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
	flush_delayed_work(&wb->dwork);
	WARN_ON(!list_empty(&wb->work_list));
}

static void wb_exit(struct bdi_writeback *wb)
{
	int i;

	WARN_ON(delayed_work_pending(&wb->dwork));

	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
		percpu_counter_destroy(&wb->stat[i]);

	fprop_local_destroy_percpu(&wb->completions);
	if (wb != &wb->bdi->wb)
		bdi_put(wb->bdi);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
 * bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static void cgwb_release_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);

	css_put(wb->memcg_css);
	css_put(wb->blkcg_css);
	mutex_unlock(&wb->bdi->cgwb_release_mutex);

	/* triggers blkg destruction if no online users left */
	blkcg_unpin_online(blkcg);

	fprop_local_destroy_percpu(&wb->memcg_completions);
	percpu_ref_exit(&wb->refcnt);
	wb_exit(wb);
	kfree_rcu(wb, rcu);
}

static void cgwb_release(struct percpu_ref *refcnt)
{
	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
						refcnt);
	queue_work(cgwb_release_wq, &wb->release_work);
}

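/*
 * Unhook @wb from its bdi's cgwb_tree and from the memcg/blkcg lists and kill
 * its percpu refcount; the final reference drop queues cgwb_release_workfn()
 * via cgwb_release(). Caller must hold cgwb_lock.
 */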
static void cgwb_kill(struct bdi_writeback *wb)
{
	lockdep_assert_held(&cgwb_lock);

	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
	list_del(&wb->memcg_node);
	list_del(&wb->blkcg_node);
	percpu_ref_kill(&wb->refcnt);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	spin_lock_irq(&cgwb_lock);
	list_del_rcu(&wb->bdi_node);
	spin_unlock_irq(&cgwb_lock);
}

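/*
 * Create and install the bdi_writeback for @memcg_css on @bdi. If another
 * creator wins the race, the freshly built wb is discarded and 0 is returned
 * so that the caller (wb_get_create()) simply retries the lookup.
 */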
static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct blkcg *blkcg;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	blkcg = css_to_blkcg(blkcg_css);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = &blkcg->cgwb_list;

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online.  Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		if (ret == -EEXIST)
			ret = 0;
		goto err_fprop_exit;
	}
	goto out_put;

err_fprop_exit:
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation.  IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough.  try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup.  On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css)
{
	struct bdi_writeback *wb;

	if (!memcg_css->parent)
		return &bdi->wb;

	rcu_read_lock();
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb) {
		struct cgroup_subsys_state *blkcg_css;

		/* see whether the blkcg association has changed */
		blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
		if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
			wb = NULL;
		css_put(blkcg_css);
	}
	rcu_read_unlock();

	return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
 * create one.  See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_alloc(gfp);

	if (!memcg_css->parent)
		return &bdi->wb;

	do {
		wb = wb_get_lookup(bdi, memcg_css);
	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));

	return wb;
}

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
	}
	return ret;
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
	struct radix_tree_iter iter;
	void **slot;
	struct bdi_writeback *wb;

	WARN_ON(test_bit(WB_registered, &bdi->wb.state));

	spin_lock_irq(&cgwb_lock);
	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
		cgwb_kill(*slot);
	spin_unlock_irq(&cgwb_lock);

	mutex_lock(&bdi->cgwb_release_mutex);
	spin_lock_irq(&cgwb_lock);
	while (!list_empty(&bdi->wb_list)) {
		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
				      bdi_node);
		spin_unlock_irq(&cgwb_lock);
		wb_shutdown(wb);
		spin_lock_irq(&cgwb_lock);
	}
	spin_unlock_irq(&cgwb_lock);
	mutex_unlock(&bdi->cgwb_release_mutex);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @blkcg: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @blkcg.
 */
void wb_blkcg_offline(struct blkcg *blkcg)
{
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
		cgwb_kill(wb);
	blkcg->cgwb_list.next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	spin_lock_irq(&cgwb_lock);
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
	spin_unlock_irq(&cgwb_lock);
}

static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq.  Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
	if (!cgwb_release_wq)
		return -ENOMEM;

	return 0;
}
subsys_initcall(cgwb_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	list_del_rcu(&wb->bdi_node);
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

static int bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	bdi->dev = NULL;

	kref_init(&bdi->refcnt);
	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->wb_list);
	init_waitqueue_head(&bdi->wb_waitq);

	ret = cgwb_bdi_init(bdi);

	return ret;
}

struct backing_dev_info *bdi_alloc(int node_id)
{
	struct backing_dev_info *bdi;

	bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
	if (!bdi)
		return NULL;

	if (bdi_init(bdi)) {
		kfree(bdi);
		return NULL;
	}
	bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
	bdi->ra_pages = VM_READAHEAD_PAGES;
	bdi->io_pages = VM_READAHEAD_PAGES;
	return bdi;
}
EXPORT_SYMBOL(bdi_alloc);

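/*
 * Find the rb_tree slot for @id in bdi_tree.  Returns the link pointer to use
 * for insertion (or the slot of the matching node if @id exists) and, if
 * @parentp is non-NULL, stores the parent node there.  Caller must hold
 * bdi_lock.
 */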
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **p = &bdi_tree.rb_node;
	struct rb_node *parent = NULL;
	struct backing_dev_info *bdi;

	lockdep_assert_held(&bdi_lock);

	while (*p) {
		parent = *p;
		bdi = rb_entry(parent, struct backing_dev_info, rb_node);

		if (bdi->id > id)
			p = &(*p)->rb_left;
		else if (bdi->id < id)
			p = &(*p)->rb_right;
		else
			break;
	}

	if (parentp)
		*parentp = parent;
	return p;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it.  Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
	struct backing_dev_info *bdi = NULL;
	struct rb_node **p;

	spin_lock_bh(&bdi_lock);
	p = bdi_lookup_rb_node(id, NULL);
	if (*p) {
		bdi = rb_entry(*p, struct backing_dev_info, rb_node);
		bdi_get(bdi);
	}
	spin_unlock_bh(&bdi_lock);

	return bdi;
}

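/*
 * Register @bdi under the name built from @fmt/@args: create its device in
 * the "bdi" class, hook up debugfs, assign a unique id and insert the bdi
 * into bdi_tree and bdi_list.
 */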
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = bdi_register_va(bdi, fmt, args);
	va_end(args);
	return ret;
}
EXPORT_SYMBOL(bdi_register);

void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
	WARN_ON_ONCE(bdi->owner);
	bdi->owner = owner;
	get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	rb_erase(&bdi->rb_node, &bdi_tree);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	/* make sure nobody finds us on the bdi_list anymore */
	bdi_remove_from_list(bdi);
	wb_shutdown(&bdi->wb);
	cgwb_bdi_unregister(bdi);

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	if (bdi->owner) {
		put_device(bdi->owner);
		bdi->owner = NULL;
	}
}

static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
			container_of(ref, struct backing_dev_info, refcnt);

	if (test_bit(WB_registered, &bdi->wb.state))
		bdi_unregister(bdi);
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
}

void bdi_put(struct backing_dev_info *bdi)
{
	kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
		return bdi_unknown_name;
	return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);

static wait_queue_head_t congestion_wqh[2] = {
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
	};
static atomic_t nr_wb_congested[2];

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	wait_queue_head_t *wqh = &congestion_wqh[sync];
	enum wb_congested_state bit;

	bit = sync ? WB_sync_congested : WB_async_congested;
	if (test_and_clear_bit(bit, &bdi->wb.congested))
		atomic_dec(&nr_wb_congested[sync]);
	smp_mb__after_atomic();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum wb_congested_state bit;

	bit = sync ? WB_sync_congested : WB_async_congested;
	if (!test_and_set_bit(bit, &bdi->wb.congested))
		atomic_inc(&nr_wb_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(congestion_wait);
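
/*
 * Typical caller pattern (illustrative, e.g. from direct reclaim): back off
 * for up to 100ms while async writeback is congested:
 *
 *	congestion_wait(BLK_RW_ASYNC, HZ/10);
 */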

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * In the event of a congested backing_dev (any backing_dev) this waits
 * for up to @timeout jiffies for either a BDI to exit congestion of the
 * given @sync queue or a write to complete.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	/*
	 * If there is no congestion, yield if necessary instead
	 * of sleeping on the congestion queue
	 */
	if (atomic_read(&nr_wb_congested[sync]) == 0) {
		cond_resched();

		/* In case we scheduled, work out time remaining */
		ret = timeout - (jiffies - start);
		if (ret < 0)
			ret = 0;

		goto out;
	}

	/* Sleep until uncongested or a write happens */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

out:
	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(wait_iff_congested);