linux/mm/page-writeback.c
   1/*
   2 * mm/page-writeback.c
   3 *
   4 * Copyright (C) 2002, Linus Torvalds.
   5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   6 *
   7 * Contains functions related to writing back dirty pages at the
   8 * address_space level.
   9 *
  10 * 10Apr2002    Andrew Morton
  11 *              Initial version
  12 */
  13
  14#include <linux/kernel.h>
  15#include <linux/export.h>
  16#include <linux/spinlock.h>
  17#include <linux/fs.h>
  18#include <linux/mm.h>
  19#include <linux/swap.h>
  20#include <linux/slab.h>
  21#include <linux/pagemap.h>
  22#include <linux/writeback.h>
  23#include <linux/init.h>
  24#include <linux/backing-dev.h>
  25#include <linux/task_io_accounting_ops.h>
  26#include <linux/blkdev.h>
  27#include <linux/mpage.h>
  28#include <linux/rmap.h>
  29#include <linux/percpu.h>
  30#include <linux/notifier.h>
  31#include <linux/smp.h>
  32#include <linux/sysctl.h>
  33#include <linux/cpu.h>
  34#include <linux/syscalls.h>
  35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
  36#include <linux/pagevec.h>
  37#include <trace/events/writeback.h>
  38
  39/*
  40 * Sleep at most 200ms at a time in balance_dirty_pages().
  41 */
  42#define MAX_PAUSE               max(HZ/5, 1)
  43
  44/*
  45 * Try to keep balance_dirty_pages() call intervals higher than this many pages
   46 * by raising the pause time to max_pause when the interval falls below it.
  47 */
  48#define DIRTY_POLL_THRESH       (128 >> (PAGE_SHIFT - 10))
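/*
 * Worked example (illustrative, assuming a 4KB page size, i.e. PAGE_SHIFT = 12):
 * DIRTY_POLL_THRESH = 128 >> (12 - 10) = 128 >> 2 = 32 pages, i.e. 128KB of
 * dirtied data between polls.
 */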
  49
  50/*
  51 * Estimate write bandwidth at 200ms intervals.
  52 */
  53#define BANDWIDTH_INTERVAL      max(HZ/5, 1)
  54
  55#define RATELIMIT_CALC_SHIFT    10
  56
  57/*
  58 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  59 * will look to see if it needs to force writeback or throttling.
  60 */
  61static long ratelimit_pages = 32;
  62
  63/* The following parameters are exported via /proc/sys/vm */
  64
  65/*
  66 * Start background writeback (via writeback threads) at this percentage
  67 */
  68int dirty_background_ratio = 10;
  69
  70/*
  71 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
  72 * dirty_background_ratio * the amount of dirtyable memory
  73 */
  74unsigned long dirty_background_bytes;
  75
  76/*
  77 * free highmem will not be subtracted from the total free memory
  78 * for calculating free ratios if vm_highmem_is_dirtyable is true
  79 */
  80int vm_highmem_is_dirtyable;
  81
  82/*
  83 * The generator of dirty data starts writeback at this percentage
  84 */
  85int vm_dirty_ratio = 20;
  86
  87/*
  88 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
  89 * vm_dirty_ratio * the amount of dirtyable memory
  90 */
  91unsigned long vm_dirty_bytes;
  92
  93/*
  94 * The interval between `kupdate'-style writebacks
  95 */
  96unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
  97
  98/*
  99 * The longest time for which data is allowed to remain dirty
 100 */
 101unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 102
 103/*
 104 * Flag that makes the machine dump writes/reads and block dirtyings.
 105 */
 106int block_dump;
 107
 108/*
 109 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 110 * a full sync is triggered after this time elapses without any disk activity.
 111 */
 112int laptop_mode;
 113
 114EXPORT_SYMBOL(laptop_mode);
 115
 116/* End of sysctl-exported parameters */
 117
 118unsigned long global_dirty_limit;
 119
 120/*
 121 * Scale the writeback cache size proportional to the relative writeout speeds.
 122 *
 123 * We do this by keeping a floating proportion between BDIs, based on page
 124 * writeback completions [end_page_writeback()]. Those devices that write out
 125 * pages fastest will get the larger share, while the slower will get a smaller
 126 * share.
 127 *
 128 * We use page writeout completions because we are interested in getting rid of
 129 * dirty pages. Having them written out is the primary goal.
 130 *
 131 * We introduce a concept of time, a period over which we measure these events,
 132 * because demand can/will vary over time. The length of this period itself is
 133 * measured in page writeback completions.
 134 *
 135 */
 136static struct prop_descriptor vm_completions;
 137
 138/*
 139 * Work out the current dirty-memory clamping and background writeout
 140 * thresholds.
 141 *
  142 * The main aim here is to lower them aggressively if there is a lot of mapped
  143 * memory around, to avoid stressing page reclaim with lots of unreclaimable
  144 * pages.  It is better to clamp down on writers than to start swapping and
  145 * performing lots of scanning.
 146 *
 147 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
 148 *
 149 * We don't permit the clamping level to fall below 5% - that is getting rather
 150 * excessive.
 151 *
 152 * We make sure that the background writeout level is below the adjusted
 153 * clamping level.
 154 */
 155
 156/*
 157 * In a memory zone, there is a certain amount of pages we consider
 158 * available for the page cache, which is essentially the number of
 159 * free and reclaimable pages, minus some zone reserves to protect
 160 * lowmem and the ability to uphold the zone's watermarks without
 161 * requiring writeback.
 162 *
  163 * This number of dirtyable pages is the base value from which the
  164 * user-configurable dirty ratio determines the effective number of pages
  165 * that are allowed to be actually dirtied, either per individual zone or
  166 * globally by using the sum of dirtyable pages over all zones.
 167 *
 168 * Because the user is allowed to specify the dirty limit globally as
 169 * absolute number of bytes, calculating the per-zone dirty limit can
 170 * require translating the configured limit into a percentage of
 171 * global dirtyable memory first.
 172 */
 173
 174static unsigned long highmem_dirtyable_memory(unsigned long total)
 175{
 176#ifdef CONFIG_HIGHMEM
 177        int node;
 178        unsigned long x = 0;
 179
 180        for_each_node_state(node, N_HIGH_MEMORY) {
 181                struct zone *z =
 182                        &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
 183
 184                x += zone_page_state(z, NR_FREE_PAGES) +
 185                     zone_reclaimable_pages(z) - z->dirty_balance_reserve;
 186        }
 187        /*
 188         * Make sure that the number of highmem pages is never larger
  189         * than the total amount of dirtyable memory. This can only
 190         * occur in very strange VM situations but we want to make sure
 191         * that this does not occur.
 192         */
 193        return min(x, total);
 194#else
 195        return 0;
 196#endif
 197}
 198
 199/**
 200 * global_dirtyable_memory - number of globally dirtyable pages
 201 *
 202 * Returns the global number of pages potentially available for dirty
 203 * page cache.  This is the base value for the global dirty limits.
 204 */
 205unsigned long global_dirtyable_memory(void)
 206{
 207        unsigned long x;
 208
 209        x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
 210            dirty_balance_reserve;
 211
 212        if (!vm_highmem_is_dirtyable)
 213                x -= highmem_dirtyable_memory(x);
 214
 215        return x + 1;   /* Ensure that we never return 0 */
 216}
 217
 218/*
 219 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 220 *
 221 * Calculate the dirty thresholds based on sysctl parameters
 222 * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
 223 * - vm.dirty_ratio             or  vm.dirty_bytes
  224 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (i.e. nfsd) and
 225 * real-time tasks.
 226 */
 227void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 228{
 229        unsigned long background;
 230        unsigned long dirty;
 231        unsigned long uninitialized_var(available_memory);
 232        struct task_struct *tsk;
 233
 234        if (!vm_dirty_bytes || !dirty_background_bytes)
 235                available_memory = global_dirtyable_memory();
 236
 237        if (vm_dirty_bytes)
 238                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
 239        else
 240                dirty = (vm_dirty_ratio * available_memory) / 100;
 241
 242        if (dirty_background_bytes)
 243                background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
 244        else
 245                background = (dirty_background_ratio * available_memory) / 100;
 246
 247        if (background >= dirty)
 248                background = dirty / 2;
 249        tsk = current;
 250        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
 251                background += background / 4;
 252                dirty += dirty / 4;
 253        }
 254        *pbackground = background;
 255        *pdirty = dirty;
 256        trace_global_dirty_state(background, dirty);
 257}
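/*
 * Worked example (illustrative numbers only): with 4GB of dirtyable memory
 * (1048576 pages of 4KB), vm_dirty_bytes == dirty_background_bytes == 0,
 * vm_dirty_ratio = 20 and dirty_background_ratio = 10, a normal task gets
 *      dirty      = 20 * 1048576 / 100 = 209715 pages (~800MB)
 *      background = 10 * 1048576 / 100 = 104857 pages (~400MB)
 * A PF_LESS_THROTTLE or real-time task gets both values lifted by 1/4.
 */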
 258
 259/**
 260 * zone_dirtyable_memory - number of dirtyable pages in a zone
 261 * @zone: the zone
 262 *
 263 * Returns the zone's number of pages potentially available for dirty
 264 * page cache.  This is the base value for the per-zone dirty limits.
 265 */
 266static unsigned long zone_dirtyable_memory(struct zone *zone)
 267{
 268        /*
 269         * The effective global number of dirtyable pages may exclude
 270         * highmem as a big-picture measure to keep the ratio between
 271         * dirty memory and lowmem reasonable.
 272         *
 273         * But this function is purely about the individual zone and a
 274         * highmem zone can hold its share of dirty pages, so we don't
 275         * care about vm_highmem_is_dirtyable here.
 276         */
 277        return zone_page_state(zone, NR_FREE_PAGES) +
 278               zone_reclaimable_pages(zone) -
 279               zone->dirty_balance_reserve;
 280}
 281
 282/**
 283 * zone_dirty_limit - maximum number of dirty pages allowed in a zone
 284 * @zone: the zone
 285 *
 286 * Returns the maximum number of dirty pages allowed in a zone, based
 287 * on the zone's dirtyable memory.
 288 */
 289static unsigned long zone_dirty_limit(struct zone *zone)
 290{
 291        unsigned long zone_memory = zone_dirtyable_memory(zone);
 292        struct task_struct *tsk = current;
 293        unsigned long dirty;
 294
 295        if (vm_dirty_bytes)
 296                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
 297                        zone_memory / global_dirtyable_memory();
 298        else
 299                dirty = vm_dirty_ratio * zone_memory / 100;
 300
 301        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
 302                dirty += dirty / 4;
 303
 304        return dirty;
 305}
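/*
 * Worked example (illustrative numbers only): for a zone with 100000 dirtyable
 * pages out of 1000000 globally dirtyable pages, vm_dirty_bytes == 0 and
 * vm_dirty_ratio = 20, the per-zone limit is 20 * 100000 / 100 = 20000 pages.
 * With vm_dirty_bytes = 1GB (262144 pages of 4KB) instead, it would be
 * 262144 * 100000 / 1000000 = 26214 pages, i.e. the zone's share of the limit.
 */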
 306
 307/**
 308 * zone_dirty_ok - tells whether a zone is within its dirty limits
 309 * @zone: the zone to check
 310 *
 311 * Returns %true when the dirty pages in @zone are within the zone's
 312 * dirty limit, %false if the limit is exceeded.
 313 */
 314bool zone_dirty_ok(struct zone *zone)
 315{
 316        unsigned long limit = zone_dirty_limit(zone);
 317
 318        return zone_page_state(zone, NR_FILE_DIRTY) +
 319               zone_page_state(zone, NR_UNSTABLE_NFS) +
 320               zone_page_state(zone, NR_WRITEBACK) <= limit;
 321}
 322
 323/*
 324 * couple the period to the dirty_ratio:
 325 *
 326 *   period/2 ~ roundup_pow_of_two(dirty limit)
 327 */
 328static int calc_period_shift(void)
 329{
 330        unsigned long dirty_total;
 331
 332        if (vm_dirty_bytes)
 333                dirty_total = vm_dirty_bytes / PAGE_SIZE;
 334        else
 335                dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
 336                                100;
 337        return 2 + ilog2(dirty_total - 1);
 338}
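/*
 * Worked example (illustrative): for a dirty limit of 200000 pages,
 * ilog2(199999) = 17, so the shift is 2 + 17 = 19 and the period is
 * 2^19 = 524288 completions; period/2 = 262144 = roundup_pow_of_two(200000),
 * matching the rule of thumb in the comment above.
 */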
 339
 340/*
 341 * update the period when the dirty threshold changes.
 342 */
 343static void update_completion_period(void)
 344{
 345        int shift = calc_period_shift();
 346        prop_change_shift(&vm_completions, shift);
 347
 348        writeback_set_ratelimit();
 349}
 350
 351int dirty_background_ratio_handler(struct ctl_table *table, int write,
 352                void __user *buffer, size_t *lenp,
 353                loff_t *ppos)
 354{
 355        int ret;
 356
 357        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 358        if (ret == 0 && write)
 359                dirty_background_bytes = 0;
 360        return ret;
 361}
 362
 363int dirty_background_bytes_handler(struct ctl_table *table, int write,
 364                void __user *buffer, size_t *lenp,
 365                loff_t *ppos)
 366{
 367        int ret;
 368
 369        ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 370        if (ret == 0 && write)
 371                dirty_background_ratio = 0;
 372        return ret;
 373}
 374
 375int dirty_ratio_handler(struct ctl_table *table, int write,
 376                void __user *buffer, size_t *lenp,
 377                loff_t *ppos)
 378{
 379        int old_ratio = vm_dirty_ratio;
 380        int ret;
 381
 382        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 383        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
 384                update_completion_period();
 385                vm_dirty_bytes = 0;
 386        }
 387        return ret;
 388}
 389
 390int dirty_bytes_handler(struct ctl_table *table, int write,
 391                void __user *buffer, size_t *lenp,
 392                loff_t *ppos)
 393{
 394        unsigned long old_bytes = vm_dirty_bytes;
 395        int ret;
 396
 397        ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 398        if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
 399                update_completion_period();
 400                vm_dirty_ratio = 0;
 401        }
 402        return ret;
 403}
 404
 405/*
 406 * Increment the BDI's writeout completion count and the global writeout
 407 * completion count. Called from test_clear_page_writeback().
 408 */
 409static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 410{
 411        __inc_bdi_stat(bdi, BDI_WRITTEN);
 412        __prop_inc_percpu_max(&vm_completions, &bdi->completions,
 413                              bdi->max_prop_frac);
 414}
 415
 416void bdi_writeout_inc(struct backing_dev_info *bdi)
 417{
 418        unsigned long flags;
 419
 420        local_irq_save(flags);
 421        __bdi_writeout_inc(bdi);
 422        local_irq_restore(flags);
 423}
 424EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 425
 426/*
 427 * Obtain an accurate fraction of the BDI's portion.
 428 */
 429static void bdi_writeout_fraction(struct backing_dev_info *bdi,
 430                long *numerator, long *denominator)
 431{
 432        prop_fraction_percpu(&vm_completions, &bdi->completions,
 433                                numerator, denominator);
 434}
 435
 436/*
 437 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 438 * registered backing devices, which, for obvious reasons, can not
 439 * exceed 100%.
 440 */
 441static unsigned int bdi_min_ratio;
 442
 443int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 444{
 445        int ret = 0;
 446
 447        spin_lock_bh(&bdi_lock);
 448        if (min_ratio > bdi->max_ratio) {
 449                ret = -EINVAL;
 450        } else {
 451                min_ratio -= bdi->min_ratio;
 452                if (bdi_min_ratio + min_ratio < 100) {
 453                        bdi_min_ratio += min_ratio;
 454                        bdi->min_ratio += min_ratio;
 455                } else {
 456                        ret = -EINVAL;
 457                }
 458        }
 459        spin_unlock_bh(&bdi_lock);
 460
 461        return ret;
 462}
 463
 464int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 465{
 466        int ret = 0;
 467
 468        if (max_ratio > 100)
 469                return -EINVAL;
 470
 471        spin_lock_bh(&bdi_lock);
 472        if (bdi->min_ratio > max_ratio) {
 473                ret = -EINVAL;
 474        } else {
 475                bdi->max_ratio = max_ratio;
 476                bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
 477        }
 478        spin_unlock_bh(&bdi_lock);
 479
 480        return ret;
 481}
 482EXPORT_SYMBOL(bdi_set_max_ratio);
 483
 484static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 485                                           unsigned long bg_thresh)
 486{
 487        return (thresh + bg_thresh) / 2;
 488}
 489
 490static unsigned long hard_dirty_limit(unsigned long thresh)
 491{
 492        return max(thresh, global_dirty_limit);
 493}
 494
 495/**
 496 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
 497 * @bdi: the backing_dev_info to query
 498 * @dirty: global dirty limit in pages
 499 *
 500 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
 501 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 502 *
 503 * Note that balance_dirty_pages() will only seriously take it as a hard limit
 504 * when sleeping max_pause per page is not enough to keep the dirty pages under
 505 * control. For example, when the device is completely stalled due to some error
 506 * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
 507 * In the other normal situations, it acts more gently by throttling the tasks
 508 * more (rather than completely block them) when the bdi dirty pages go high.
 509 *
 510 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 511 * - starving fast devices
 512 * - piling up dirty pages (that will take long time to sync) on slow devices
 513 *
 514 * The bdi's share of dirty limit will be adapting to its throughput and
 515 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 516 */
 517unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 518{
 519        u64 bdi_dirty;
 520        long numerator, denominator;
 521
 522        /*
 523         * Calculate this BDI's share of the dirty ratio.
 524         */
 525        bdi_writeout_fraction(bdi, &numerator, &denominator);
 526
 527        bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
 528        bdi_dirty *= numerator;
 529        do_div(bdi_dirty, denominator);
 530
 531        bdi_dirty += (dirty * bdi->min_ratio) / 100;
 532        if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
 533                bdi_dirty = dirty * bdi->max_ratio / 100;
 534
 535        return bdi_dirty;
 536}
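/*
 * Worked example (illustrative numbers only): with a global dirty limit of
 * 100000 pages, bdi_min_ratio = 0, bdi->min_ratio = 0, bdi->max_ratio = 100,
 * and a bdi that accounts for 3/5 of the recent writeout completions
 * (numerator/denominator = 3/5):
 *      bdi_dirty = 100000 * (100 - 0) / 100 * 3 / 5 = 60000 pages
 * If bdi->max_ratio were 50, the result would be clamped to 50000 pages.
 */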
 537
 538/*
 539 * Dirty position control.
 540 *
 541 * (o) global/bdi setpoints
 542 *
 543 * We want the dirty pages be balanced around the global/bdi setpoints.
 544 * When the number of dirty pages is higher/lower than the setpoint, the
 545 * dirty position control ratio (and hence task dirty ratelimit) will be
 546 * decreased/increased to bring the dirty pages back to the setpoint.
 547 *
 548 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 549 *
 550 *     if (dirty < setpoint) scale up   pos_ratio
 551 *     if (dirty > setpoint) scale down pos_ratio
 552 *
 553 *     if (bdi_dirty < bdi_setpoint) scale up   pos_ratio
 554 *     if (bdi_dirty > bdi_setpoint) scale down pos_ratio
 555 *
 556 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 557 *
 558 * (o) global control line
 559 *
 560 *     ^ pos_ratio
 561 *     |
 562 *     |            |<===== global dirty control scope ======>|
 563 * 2.0 .............*
 564 *     |            .*
 565 *     |            . *
 566 *     |            .   *
 567 *     |            .     *
 568 *     |            .        *
 569 *     |            .            *
 570 * 1.0 ................................*
 571 *     |            .                  .     *
 572 *     |            .                  .          *
 573 *     |            .                  .              *
 574 *     |            .                  .                 *
 575 *     |            .                  .                    *
 576 *   0 +------------.------------------.----------------------*------------->
 577 *           freerun^          setpoint^                 limit^   dirty pages
 578 *
 579 * (o) bdi control line
 580 *
 581 *     ^ pos_ratio
 582 *     |
 583 *     |            *
 584 *     |              *
 585 *     |                *
 586 *     |                  *
 587 *     |                    * |<=========== span ============>|
 588 * 1.0 .......................*
 589 *     |                      . *
 590 *     |                      .   *
 591 *     |                      .     *
 592 *     |                      .       *
 593 *     |                      .         *
 594 *     |                      .           *
 595 *     |                      .             *
 596 *     |                      .               *
 597 *     |                      .                 *
 598 *     |                      .                   *
 599 *     |                      .                     *
 600 * 1/4 ...............................................* * * * * * * * * * * *
 601 *     |                      .                         .
 602 *     |                      .                           .
 603 *     |                      .                             .
 604 *   0 +----------------------.-------------------------------.------------->
 605 *                bdi_setpoint^                    x_intercept^
 606 *
 607 * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
 608 * be smoothly throttled down to normal if it starts high in situations like
 609 * - start writing to a slow SD card and a fast disk at the same time. The SD
 610 *   card's bdi_dirty may rush to many times higher than bdi_setpoint.
  611 * - the bdi dirty thresh drops quickly due to a change in the JBOD workload
 612 */
 613static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 614                                        unsigned long thresh,
 615                                        unsigned long bg_thresh,
 616                                        unsigned long dirty,
 617                                        unsigned long bdi_thresh,
 618                                        unsigned long bdi_dirty)
 619{
 620        unsigned long write_bw = bdi->avg_write_bandwidth;
 621        unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
 622        unsigned long limit = hard_dirty_limit(thresh);
 623        unsigned long x_intercept;
 624        unsigned long setpoint;         /* dirty pages' target balance point */
 625        unsigned long bdi_setpoint;
 626        unsigned long span;
 627        long long pos_ratio;            /* for scaling up/down the rate limit */
 628        long x;
 629
 630        if (unlikely(dirty >= limit))
 631                return 0;
 632
 633        /*
 634         * global setpoint
 635         *
 636         *                           setpoint - dirty 3
 637         *        f(dirty) := 1.0 + (----------------)
 638         *                           limit - setpoint
 639         *
  640         * it's a 3rd order polynomial that is subject to
  641         *
  642         * (1) f(freerun)  = 2.0 => ramp up dirty_ratelimit reasonably fast
 643         * (2) f(setpoint) = 1.0 => the balance point
 644         * (3) f(limit)    = 0   => the hard limit
 645         * (4) df/dx      <= 0   => negative feedback control
 646         * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 647         *     => fast response on large errors; small oscillation near setpoint
 648         */
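        /*
         * Worked example (illustrative, RATELIMIT_CALC_SHIFT = 10): with
         * freerun = 200 and limit = 400 pages, setpoint = 300.  At dirty = 250,
         * x = (300 - 250) * 1024 / 101 = 506, so
         * pos_ratio = 1024 + 506^3 / 1024^2 ~= 1024 + 123 = 1147, i.e. ~1.12,
         * gently speeding up tasks that are below the setpoint.
         */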
 649        setpoint = (freerun + limit) / 2;
 650        x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
 651                    limit - setpoint + 1);
 652        pos_ratio = x;
 653        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
 654        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
 655        pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
 656
 657        /*
 658         * We have computed basic pos_ratio above based on global situation. If
 659         * the bdi is over/under its share of dirty pages, we want to scale
 660         * pos_ratio further down/up. That is done by the following mechanism.
 661         */
 662
 663        /*
 664         * bdi setpoint
 665         *
 666         *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
 667         *
 668         *                        x_intercept - bdi_dirty
 669         *                     := --------------------------
 670         *                        x_intercept - bdi_setpoint
 671         *
  672         * The main bdi control line is a linear function that is subject to
 673         *
 674         * (1) f(bdi_setpoint) = 1.0
 675         * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
 676         *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
 677         *
  678         * In the single bdi case, the dirty pages are observed to fluctuate
  679         * regularly within the range
  680         *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
  681         * for various filesystems, where (2) yields a reasonable 12.5%
  682         * fluctuation range for pos_ratio.
 683         *
 684         * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
 685         * own size, so move the slope over accordingly and choose a slope that
 686         * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
 687         */
 688        if (unlikely(bdi_thresh > thresh))
 689                bdi_thresh = thresh;
 690        /*
  691         * It's very possible that bdi_thresh is close to 0 not because the
  692         * device is slow, but because it has remained inactive for a long time.
  693         * Grant such devices a reasonably good (hopefully IO efficient)
  694         * threshold, so that occasional writes won't be blocked and active
  695         * writes can ramp up the threshold quickly.
 696         */
 697        bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
 698        /*
 699         * scale global setpoint to bdi's:
 700         *      bdi_setpoint = setpoint * bdi_thresh / thresh
 701         */
 702        x = div_u64((u64)bdi_thresh << 16, thresh + 1);
 703        bdi_setpoint = setpoint * (u64)x >> 16;
 704        /*
 705         * Use span=(8*write_bw) in single bdi case as indicated by
 706         * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
 707         *
 708         *        bdi_thresh                    thresh - bdi_thresh
 709         * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
 710         *          thresh                            thresh
 711         */
 712        span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
 713        x_intercept = bdi_setpoint + span;
 714
 715        if (bdi_dirty < x_intercept - span / 4) {
 716                pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
 717                                    x_intercept - bdi_setpoint + 1);
 718        } else
 719                pos_ratio /= 4;
 720
 721        /*
 722         * bdi reserve area, safeguard against dirty pool underrun and disk idle
 723         * It may push the desired control point of global dirty pages higher
 724         * than setpoint.
 725         */
 726        x_intercept = bdi_thresh / 2;
 727        if (bdi_dirty < x_intercept) {
 728                if (bdi_dirty > x_intercept / 8)
 729                        pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
 730                else
 731                        pos_ratio *= 8;
 732        }
 733
 734        return pos_ratio;
 735}
 736
 737static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
 738                                       unsigned long elapsed,
 739                                       unsigned long written)
 740{
 741        const unsigned long period = roundup_pow_of_two(3 * HZ);
 742        unsigned long avg = bdi->avg_write_bandwidth;
 743        unsigned long old = bdi->write_bandwidth;
 744        u64 bw;
 745
 746        /*
 747         * bw = written * HZ / elapsed
 748         *
 749         *                   bw * elapsed + write_bandwidth * (period - elapsed)
 750         * write_bandwidth = ---------------------------------------------------
 751         *                                          period
 752         */
 753        bw = written - bdi->written_stamp;
 754        bw *= HZ;
 755        if (unlikely(elapsed > period)) {
 756                do_div(bw, elapsed);
 757                avg = bw;
 758                goto out;
 759        }
 760        bw += (u64)bdi->write_bandwidth * (period - elapsed);
 761        bw >>= ilog2(period);
 762
 763        /*
 764         * one more level of smoothing, for filtering out sudden spikes
 765         */
 766        if (avg > old && old >= (unsigned long)bw)
 767                avg -= (avg - old) >> 3;
 768
 769        if (avg < old && old <= (unsigned long)bw)
 770                avg += (old - avg) >> 3;
 771
 772out:
 773        bdi->write_bandwidth = bw;
 774        bdi->avg_write_bandwidth = avg;
 775}
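/*
 * Worked example (illustrative, assuming HZ = 250, so period =
 * roundup_pow_of_two(750) = 1024 jiffies): if 1000 pages were written during
 * elapsed = 50 jiffies and the previous write_bandwidth was 3000 pages/s,
 *      bw = 1000 * 250 = 250000
 *      bw += 3000 * (1024 - 50) = 250000 + 2922000 = 3172000
 *      bw >>= 10  =>  ~3097 pages/s
 * so the estimate moves part of the way towards the instantaneous
 * 1000 * 250 / 50 = 5000 pages/s rate rather than jumping straight to it.
 */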
 776
 777/*
 778 * The global dirtyable memory and dirty threshold could be suddenly knocked
 779 * down by a large amount (eg. on the startup of KVM in a swapless system).
 780 * This may throw the system into deep dirty exceeded state and throttle
 781 * heavy/light dirtiers alike. To retain good responsiveness, maintain
 782 * global_dirty_limit for tracking slowly down to the knocked down dirty
 783 * threshold.
 784 */
 785static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
 786{
 787        unsigned long limit = global_dirty_limit;
 788
 789        /*
 790         * Follow up in one step.
 791         */
 792        if (limit < thresh) {
 793                limit = thresh;
 794                goto update;
 795        }
 796
 797        /*
 798         * Follow down slowly. Use the higher one as the target, because thresh
 799         * may drop below dirty. This is exactly the reason to introduce
 800         * global_dirty_limit which is guaranteed to lie above the dirty pages.
 801         */
 802        thresh = max(thresh, dirty);
 803        if (limit > thresh) {
 804                limit -= (limit - thresh) >> 5;
 805                goto update;
 806        }
 807        return;
 808update:
 809        global_dirty_limit = limit;
 810}
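/*
 * Worked example (illustrative): with global_dirty_limit = 100000 and a
 * suddenly knocked down thresh = 60000 (dirty below that), one call does
 *      limit -= (100000 - 60000) >> 5 = 1250  =>  98750
 * Repeated every BANDWIDTH_INTERVAL (~200ms), the limit decays towards the
 * new threshold over a number of seconds instead of dropping instantly.
 */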
 811
 812static void global_update_bandwidth(unsigned long thresh,
 813                                    unsigned long dirty,
 814                                    unsigned long now)
 815{
 816        static DEFINE_SPINLOCK(dirty_lock);
 817        static unsigned long update_time;
 818
 819        /*
 820         * check locklessly first to optimize away locking for the most time
 821         */
 822        if (time_before(now, update_time + BANDWIDTH_INTERVAL))
 823                return;
 824
 825        spin_lock(&dirty_lock);
 826        if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
 827                update_dirty_limit(thresh, dirty);
 828                update_time = now;
 829        }
 830        spin_unlock(&dirty_lock);
 831}
 832
 833/*
 834 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
 835 *
  836 * Normal bdi tasks will be curbed at or below it in the long term.
 837 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 838 */
 839static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 840                                       unsigned long thresh,
 841                                       unsigned long bg_thresh,
 842                                       unsigned long dirty,
 843                                       unsigned long bdi_thresh,
 844                                       unsigned long bdi_dirty,
 845                                       unsigned long dirtied,
 846                                       unsigned long elapsed)
 847{
 848        unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
 849        unsigned long limit = hard_dirty_limit(thresh);
 850        unsigned long setpoint = (freerun + limit) / 2;
 851        unsigned long write_bw = bdi->avg_write_bandwidth;
 852        unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
 853        unsigned long dirty_rate;
 854        unsigned long task_ratelimit;
 855        unsigned long balanced_dirty_ratelimit;
 856        unsigned long pos_ratio;
 857        unsigned long step;
 858        unsigned long x;
 859
 860        /*
  861         * The dirty rate will match the writeout rate in the long term, except
 862         * when dirty pages are truncated by userspace or re-dirtied by FS.
 863         */
 864        dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
 865
 866        pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
 867                                       bdi_thresh, bdi_dirty);
 868        /*
 869         * task_ratelimit reflects each dd's dirty rate for the past 200ms.
 870         */
 871        task_ratelimit = (u64)dirty_ratelimit *
 872                                        pos_ratio >> RATELIMIT_CALC_SHIFT;
  873        task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
 874
 875        /*
 876         * A linear estimation of the "balanced" throttle rate. The theory is,
 877         * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
 878         * dirty_rate will be measured to be (N * task_ratelimit). So the below
 879         * formula will yield the balanced rate limit (write_bw / N).
 880         *
 881         * Note that the expanded form is not a pure rate feedback:
 882         *      rate_(i+1) = rate_(i) * (write_bw / dirty_rate)              (1)
 883         * but also takes pos_ratio into account:
 884         *      rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
 885         *
 886         * (1) is not realistic because pos_ratio also takes part in balancing
 887         * the dirty rate.  Consider the state
 888         *      pos_ratio = 0.5                                              (3)
 889         *      rate = 2 * (write_bw / N)                                    (4)
  890         * If (1) is used, it will get stuck in that state! Because each dd will
 891         * be throttled at
 892         *      task_ratelimit = pos_ratio * rate = (write_bw / N)           (5)
 893         * yielding
 894         *      dirty_rate = N * task_ratelimit = write_bw                   (6)
 895         * put (6) into (1) we get
 896         *      rate_(i+1) = rate_(i)                                        (7)
 897         *
 898         * So we end up using (2) to always keep
 899         *      rate_(i+1) ~= (write_bw / N)                                 (8)
 900         * regardless of the value of pos_ratio. As long as (8) is satisfied,
 901         * pos_ratio is able to drive itself to 1.0, which is not only where
  902         * the dirty count meets the setpoint, but also where the slope of
  903         * pos_ratio is flattest and hence task_ratelimit fluctuates the least.
 904         */
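        /*
         * Worked example (illustrative): N = 4 dd tasks on a bdi with
         * write_bw = 10000 pages/s.  If dirty_ratelimit = 5000 and
         * pos_ratio = 0.5, each task runs at task_ratelimit = 2500 and the
         * measured dirty_rate is 4 * 2500 = 10000, so
         *      balanced_dirty_ratelimit = 2500 * 10000 / 10000 = 2500
         * which is exactly write_bw / N, independent of pos_ratio.
         */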
 905        balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
 906                                           dirty_rate | 1);
 907        /*
 908         * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
 909         */
 910        if (unlikely(balanced_dirty_ratelimit > write_bw))
 911                balanced_dirty_ratelimit = write_bw;
 912
 913        /*
 914         * We could safely do this and return immediately:
 915         *
 916         *      bdi->dirty_ratelimit = balanced_dirty_ratelimit;
 917         *
  918         * However, to get a more stable dirty_ratelimit, the elaborated code
  919         * below makes use of task_ratelimit to filter out singular points and
  920         * limit the step size.
 921         *
 922         * The below code essentially only uses the relative value of
 923         *
 924         *      task_ratelimit - dirty_ratelimit
 925         *      = (pos_ratio - 1) * dirty_ratelimit
 926         *
 927         * which reflects the direction and size of dirty position error.
 928         */
 929
 930        /*
 931         * dirty_ratelimit will follow balanced_dirty_ratelimit iff
 932         * task_ratelimit is on the same side of dirty_ratelimit, too.
 933         * For example, when
 934         * - dirty_ratelimit > balanced_dirty_ratelimit
 935         * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
 936         * lowering dirty_ratelimit will help meet both the position and rate
 937         * control targets. Otherwise, don't update dirty_ratelimit if it will
  938         * only help meet the rate target. After all, what users ultimately
  939         * feel and care about are a stable dirty rate and a small position error.
 940         *
 941         * |task_ratelimit - dirty_ratelimit| is used to limit the step size
  942         * and filter out the singular points of balanced_dirty_ratelimit, which
  943         * keeps jumping around randomly and can even leap far away at times
 944         * due to the small 200ms estimation period of dirty_rate (we want to
 945         * keep that period small to reduce time lags).
 946         */
 947        step = 0;
 948        if (dirty < setpoint) {
 949                x = min(bdi->balanced_dirty_ratelimit,
 950                         min(balanced_dirty_ratelimit, task_ratelimit));
 951                if (dirty_ratelimit < x)
 952                        step = x - dirty_ratelimit;
 953        } else {
 954                x = max(bdi->balanced_dirty_ratelimit,
 955                         max(balanced_dirty_ratelimit, task_ratelimit));
 956                if (dirty_ratelimit > x)
 957                        step = dirty_ratelimit - x;
 958        }
 959
 960        /*
 961         * Don't pursue 100% rate matching. It's impossible since the balanced
  962         * rate itself is constantly fluctuating. So decrease the tracking speed
  963         * when it gets close to the target. This helps eliminate pointless tremors.
 964         */
 965        step >>= dirty_ratelimit / (2 * step + 1);
 966        /*
 967         * Limit the tracking speed to avoid overshooting.
 968         */
 969        step = (step + 7) / 8;
 970
 971        if (dirty_ratelimit < balanced_dirty_ratelimit)
 972                dirty_ratelimit += step;
 973        else
 974                dirty_ratelimit -= step;
 975
 976        bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
 977        bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 978
 979        trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
 980}
 981
 982void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 983                            unsigned long thresh,
 984                            unsigned long bg_thresh,
 985                            unsigned long dirty,
 986                            unsigned long bdi_thresh,
 987                            unsigned long bdi_dirty,
 988                            unsigned long start_time)
 989{
 990        unsigned long now = jiffies;
 991        unsigned long elapsed = now - bdi->bw_time_stamp;
 992        unsigned long dirtied;
 993        unsigned long written;
 994
 995        /*
 996         * rate-limit, only update once every 200ms.
 997         */
 998        if (elapsed < BANDWIDTH_INTERVAL)
 999                return;
1000
1001        dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
1002        written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
1003
1004        /*
1005         * Skip quiet periods when disk bandwidth is under-utilized.
1006         * (at least 1s idle time between two flusher runs)
1007         */
1008        if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
1009                goto snapshot;
1010
1011        if (thresh) {
1012                global_update_bandwidth(thresh, dirty, now);
1013                bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
1014                                           bdi_thresh, bdi_dirty,
1015                                           dirtied, elapsed);
1016        }
1017        bdi_update_write_bandwidth(bdi, elapsed, written);
1018
1019snapshot:
1020        bdi->dirtied_stamp = dirtied;
1021        bdi->written_stamp = written;
1022        bdi->bw_time_stamp = now;
1023}
1024
1025static void bdi_update_bandwidth(struct backing_dev_info *bdi,
1026                                 unsigned long thresh,
1027                                 unsigned long bg_thresh,
1028                                 unsigned long dirty,
1029                                 unsigned long bdi_thresh,
1030                                 unsigned long bdi_dirty,
1031                                 unsigned long start_time)
1032{
1033        if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
1034                return;
1035        spin_lock(&bdi->wb.list_lock);
1036        __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
1037                               bdi_thresh, bdi_dirty, start_time);
1038        spin_unlock(&bdi->wb.list_lock);
1039}
1040
1041/*
 1042 * After a task has dirtied this many pages, balance_dirty_pages_ratelimited_nr()
1043 * will look to see if it needs to start dirty throttling.
1044 *
1045 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
1046 * global_page_state() too often. So scale it near-sqrt to the safety margin
1047 * (the number of pages we may dirty without exceeding the dirty limits).
1048 */
1049static unsigned long dirty_poll_interval(unsigned long dirty,
1050                                         unsigned long thresh)
1051{
1052        if (thresh > dirty)
1053                return 1UL << (ilog2(thresh - dirty) >> 1);
1054
1055        return 1;
1056}
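/*
 * Worked example (illustrative): with a safety margin of
 * thresh - dirty = 10000 pages, ilog2(10000) = 13, so the poll interval is
 * 1 << (13 >> 1) = 1 << 6 = 64 pages, roughly tracking sqrt(10000) = 100.
 */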
1057
1058static long bdi_max_pause(struct backing_dev_info *bdi,
1059                          unsigned long bdi_dirty)
1060{
1061        long bw = bdi->avg_write_bandwidth;
1062        long t;
1063
1064        /*
 1065         * Limit pause time for small memory systems. If we sleep for too
 1066         * long, a small pool of dirty/writeback pages may go empty and the
 1067         * disk may go idle.
1068         *
1069         * 8 serves as the safety ratio.
1070         */
1071        t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
1072        t++;
1073
1074        return min_t(long, t, MAX_PAUSE);
1075}
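/*
 * Worked example (illustrative, assuming HZ = 1000): for a bdi with
 * avg_write_bandwidth = 25000 pages/s (~100MB/s) and bdi_dirty = 5000 pages,
 *      roundup_pow_of_two(1 + HZ / 8) = 128
 *      t = 5000 / (1 + 25000 / 128) + 1 = 26 jiffies
 * which is well under MAX_PAUSE = 200 jiffies, so the maximum pause comes out
 * at roughly 26ms: long sleeps are avoided while only ~5000 dirty pages remain.
 */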
1076
1077static long bdi_min_pause(struct backing_dev_info *bdi,
1078                          long max_pause,
1079                          unsigned long task_ratelimit,
1080                          unsigned long dirty_ratelimit,
1081                          int *nr_dirtied_pause)
1082{
1083        long hi = ilog2(bdi->avg_write_bandwidth);
1084        long lo = ilog2(bdi->dirty_ratelimit);
1085        long t;         /* target pause */
1086        long pause;     /* estimated next pause */
1087        int pages;      /* target nr_dirtied_pause */
1088
1089        /* target for 10ms pause on 1-dd case */
1090        t = max(1, HZ / 100);
1091
1092        /*
1093         * Scale up pause time for concurrent dirtiers in order to reduce CPU
1094         * overheads.
1095         *
1096         * (N * 10ms) on 2^N concurrent tasks.
1097         */
1098        if (hi > lo)
1099                t += (hi - lo) * (10 * HZ) / 1024;
1100
1101        /*
1102         * This is a bit convoluted. We try to base the next nr_dirtied_pause
1103         * on the much more stable dirty_ratelimit. However the next pause time
 1104         * will be computed based on task_ratelimit, and the two rate limits may
 1105         * diverge considerably at times. Especially if task_ratelimit goes
1106         * below dirty_ratelimit/2 and the target pause is max_pause, the next
1107         * pause time will be max_pause*2 _trimmed down_ to max_pause.  As a
 1108         * result, task_ratelimit won't be honored faithfully, which could
1109         * eventually bring down dirty_ratelimit.
1110         *
1111         * We apply two rules to fix it up:
1112         * 1) try to estimate the next pause time and if necessary, use a lower
1113         *    nr_dirtied_pause so as not to exceed max_pause. When this happens,
1114         *    nr_dirtied_pause will be "dancing" with task_ratelimit.
1115         * 2) limit the target pause time to max_pause/2, so that the normal
1116         *    small fluctuations of task_ratelimit won't trigger rule (1) and
1117         *    nr_dirtied_pause will remain as stable as dirty_ratelimit.
1118         */
1119        t = min(t, 1 + max_pause / 2);
1120        pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1121
1122        /*
1123         * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
1124         * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
1125         * When the 16 consecutive reads are often interrupted by some dirty
 1126         * throttling pause during the async writes, cfq will go idle
 1127         * (deadline is fine). So push nr_dirtied_pause as high as possible
 1128         * until it reaches DIRTY_POLL_THRESH=32 pages.
1129         */
1130        if (pages < DIRTY_POLL_THRESH) {
1131                t = max_pause;
1132                pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
1133                if (pages > DIRTY_POLL_THRESH) {
1134                        pages = DIRTY_POLL_THRESH;
1135                        t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
1136                }
1137        }
1138
1139        pause = HZ * pages / (task_ratelimit + 1);
1140        if (pause > max_pause) {
1141                t = max_pause;
1142                pages = task_ratelimit * t / roundup_pow_of_two(HZ);
1143        }
1144
1145        *nr_dirtied_pause = pages;
1146        /*
1147         * The minimal pause time will normally be half the target pause time.
1148         */
1149        return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1150}
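/*
 * Worked example (illustrative, assuming HZ = 1000 and max_pause = 200): for
 * avg_write_bandwidth = 16384 pages/s and dirty_ratelimit = 1024 pages/s
 * (hi - lo = 14 - 10 = 4), the target pause is
 *      t = 10 + 4 * 10000 / 1024 = 49 jiffies, trimmed to min(49, 101) = 49
 *      pages = 1024 * 49 / 1024 = 49  (above DIRTY_POLL_THRESH, no bump needed)
 * With task_ratelimit close to dirty_ratelimit the estimated pause stays near
 * t, so the function returns about 1 + 49 / 2 = 25 jiffies as the min pause.
 */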
1151
1152/*
1153 * balance_dirty_pages() must be called by processes which are generating dirty
1154 * data.  It looks at the number of dirty pages in the machine and will force
1155 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
1156 * If we're over `background_thresh' then the writeback threads are woken to
1157 * perform some writeout.
1158 */
1159static void balance_dirty_pages(struct address_space *mapping,
1160                                unsigned long pages_dirtied)
1161{
1162        unsigned long nr_reclaimable;   /* = file_dirty + unstable_nfs */
1163        unsigned long bdi_reclaimable;
1164        unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
1165        unsigned long bdi_dirty;
1166        unsigned long freerun;
1167        unsigned long background_thresh;
1168        unsigned long dirty_thresh;
1169        unsigned long bdi_thresh;
1170        long period;
1171        long pause;
1172        long max_pause;
1173        long min_pause;
1174        int nr_dirtied_pause;
1175        bool dirty_exceeded = false;
1176        unsigned long task_ratelimit;
1177        unsigned long dirty_ratelimit;
1178        unsigned long pos_ratio;
1179        struct backing_dev_info *bdi = mapping->backing_dev_info;
1180        unsigned long start_time = jiffies;
1181
1182        for (;;) {
1183                unsigned long now = jiffies;
1184
1185                /*
1186                 * Unstable writes are a feature of certain networked
 1187                 * filesystems (e.g. NFS) in which data may have been
1188                 * written to the server's write cache, but has not yet
1189                 * been flushed to permanent storage.
1190                 */
1191                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
1192                                        global_page_state(NR_UNSTABLE_NFS);
1193                nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
1194
1195                global_dirty_limits(&background_thresh, &dirty_thresh);
1196
1197                /*
1198                 * Throttle it only when the background writeback cannot
1199                 * catch-up. This avoids (excessively) small writeouts
1200                 * when the bdi limits are ramping up.
1201                 */
1202                freerun = dirty_freerun_ceiling(dirty_thresh,
1203                                                background_thresh);
1204                if (nr_dirty <= freerun) {
1205                        current->dirty_paused_when = now;
1206                        current->nr_dirtied = 0;
1207                        current->nr_dirtied_pause =
1208                                dirty_poll_interval(nr_dirty, dirty_thresh);
1209                        break;
1210                }
1211
1212                if (unlikely(!writeback_in_progress(bdi)))
1213                        bdi_start_background_writeback(bdi);
1214
1215                /*
 1216                 * bdi_thresh is not treated as a hard limiting factor like
 1217                 * dirty_thresh, for these reasons:
1218                 * - in JBOD setup, bdi_thresh can fluctuate a lot
1219                 * - in a system with HDD and USB key, the USB key may somehow
1220                 *   go into state (bdi_dirty >> bdi_thresh) either because
1221                 *   bdi_dirty starts high, or because bdi_thresh drops low.
1222                 *   In this case we don't want to hard throttle the USB key
1223                 *   dirtiers for 100 seconds until bdi_dirty drops under
1224                 *   bdi_thresh. Instead the auxiliary bdi control line in
1225                 *   bdi_position_ratio() will let the dirtier task progress
1226                 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1227                 */
1228                bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1229
1230                /*
1231                 * In order to avoid the stacked BDI deadlock we need
1232                 * to ensure we accurately count the 'dirty' pages when
1233                 * the threshold is low.
1234                 *
1235                 * Otherwise it would be possible to get thresh+n pages
1236                 * reported dirty, even though there are thresh-m pages
1237                 * actually dirty; with m+n sitting in the percpu
1238                 * deltas.
1239                 */
1240                if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
1241                        bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1242                        bdi_dirty = bdi_reclaimable +
1243                                    bdi_stat_sum(bdi, BDI_WRITEBACK);
1244                } else {
1245                        bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1246                        bdi_dirty = bdi_reclaimable +
1247                                    bdi_stat(bdi, BDI_WRITEBACK);
1248                }
1249
1250                dirty_exceeded = (bdi_dirty > bdi_thresh) &&
1251                                  (nr_dirty > dirty_thresh);
1252                if (dirty_exceeded && !bdi->dirty_exceeded)
1253                        bdi->dirty_exceeded = 1;
1254
1255                bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
1256                                     nr_dirty, bdi_thresh, bdi_dirty,
1257                                     start_time);
1258
1259                dirty_ratelimit = bdi->dirty_ratelimit;
1260                pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
1261                                               background_thresh, nr_dirty,
1262                                               bdi_thresh, bdi_dirty);
1263                task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
1264                                                        RATELIMIT_CALC_SHIFT;
1265                max_pause = bdi_max_pause(bdi, bdi_dirty);
1266                min_pause = bdi_min_pause(bdi, max_pause,
1267                                          task_ratelimit, dirty_ratelimit,
1268                                          &nr_dirtied_pause);
1269
1270                if (unlikely(task_ratelimit == 0)) {
1271                        period = max_pause;
1272                        pause = max_pause;
1273                        goto pause;
1274                }
1275                period = HZ * pages_dirtied / task_ratelimit;
1276                pause = period;
1277                if (current->dirty_paused_when)
1278                        pause -= now - current->dirty_paused_when;
1279                /*
 1280                 * For less than 1s of think time (ext3/4 may block the dirtier
 1281                 * for up to 800ms from time to time on 1-HDD; so does xfs,
 1282                 * though much less frequently), try to compensate for it in
1283                 * future periods by updating the virtual time; otherwise just
1284                 * do a reset, as it may be a light dirtier.
1285                 */
1286                if (pause < min_pause) {
1287                        trace_balance_dirty_pages(bdi,
1288                                                  dirty_thresh,
1289                                                  background_thresh,
1290                                                  nr_dirty,
1291                                                  bdi_thresh,
1292                                                  bdi_dirty,
1293                                                  dirty_ratelimit,
1294                                                  task_ratelimit,
1295                                                  pages_dirtied,
1296                                                  period,
1297                                                  min(pause, 0L),
1298                                                  start_time);
1299                        if (pause < -HZ) {
1300                                current->dirty_paused_when = now;
1301                                current->nr_dirtied = 0;
1302                        } else if (period) {
1303                                current->dirty_paused_when += period;
1304                                current->nr_dirtied = 0;
1305                        } else if (current->nr_dirtied_pause <= pages_dirtied)
1306                                current->nr_dirtied_pause += pages_dirtied;
1307                        break;
1308                }
1309                if (unlikely(pause > max_pause)) {
1310                        /* for occasional dropped task_ratelimit */
1311                        now += min(pause - max_pause, max_pause);
1312                        pause = max_pause;
1313                }
1314
1315pause:
1316                trace_balance_dirty_pages(bdi,
1317                                          dirty_thresh,
1318                                          background_thresh,
1319                                          nr_dirty,
1320                                          bdi_thresh,
1321                                          bdi_dirty,
1322                                          dirty_ratelimit,
1323                                          task_ratelimit,
1324                                          pages_dirtied,
1325                                          period,
1326                                          pause,
1327                                          start_time);
1328                __set_current_state(TASK_KILLABLE);
1329                io_schedule_timeout(pause);
1330
1331                current->dirty_paused_when = now + pause;
1332                current->nr_dirtied = 0;
1333                current->nr_dirtied_pause = nr_dirtied_pause;
1334
1335                /*
1336                 * This is typically equal to (nr_dirty < dirty_thresh) and can
1337                 * also keep "1000+ dd on a slow USB stick" under control.
1338                 */
1339                if (task_ratelimit)
1340                        break;
1341
1342                /*
 1343                 * In the case of an unresponsive NFS server whose NFS dirty
 1344                 * pages exceed dirty_thresh, give the other good bdis a pipe
 1345                 * to go through, so that tasks on them still remain responsive.
 1346                 *
 1347                 * In theory 1 page is enough to keep the consumer-producer
1348                 * pipe going: the flusher cleans 1 page => the task dirties 1
1349                 * more page. However bdi_dirty has accounting errors.  So use
1350                 * the larger and more IO friendly bdi_stat_error.
1351                 */
1352                if (bdi_dirty <= bdi_stat_error(bdi))
1353                        break;
1354
1355                if (fatal_signal_pending(current))
1356                        break;
1357        }
1358
1359        if (!dirty_exceeded && bdi->dirty_exceeded)
1360                bdi->dirty_exceeded = 0;
1361
1362        if (writeback_in_progress(bdi))
1363                return;
1364
1365        /*
1366         * In laptop mode, we wait until hitting the higher threshold before
1367         * starting background writeout, and then write out all the way down
1368         * to the lower threshold.  So slow writers cause minimal disk activity.
1369         *
1370         * In normal mode, we start background writeout at the lower
1371         * background_thresh, to keep the amount of dirty memory low.
1372         */
1373        if (laptop_mode)
1374                return;
1375
1376        if (nr_reclaimable > background_thresh)
1377                bdi_start_background_writeback(bdi);
1378}
1379
1380void set_page_dirty_balance(struct page *page, int page_mkwrite)
1381{
1382        if (set_page_dirty(page) || page_mkwrite) {
1383                struct address_space *mapping = page_mapping(page);
1384
1385                if (mapping)
1386                        balance_dirty_pages_ratelimited(mapping);
1387        }
1388}
1389
1390static DEFINE_PER_CPU(int, bdp_ratelimits);
1391
1392/*
1393 * Normal tasks are throttled by
1394 *      loop {
1395 *              dirty tsk->nr_dirtied_pause pages;
1396 *              take a nap in balance_dirty_pages();
1397 *      }
1398 * However there is a worst case: if every task exits immediately after dirtying
1399 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
1400 * called to throttle the page dirties. The solution is to save the not yet
1401 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
1402 * randomly to the running tasks. This works well for the above worst case,
1403 * as the new task will pick up and accumulate the old task's leaked dirty
1404 * count and eventually get throttled.
1405 */
1406DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1407
1408/**
1409 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
1410 * @mapping: address_space which was dirtied
1411 * @nr_pages_dirtied: number of pages which the caller has just dirtied
1412 *
1413 * Processes which are dirtying memory should call in here once for each page
1414 * which was newly dirtied.  The function will periodically check the system's
1415 * dirty state and will initiate writeback if needed.
1416 *
1417 * On really big machines, get_writeback_state is expensive, so try to avoid
1418 * calling it too often (ratelimiting).  But once we're over the dirty memory
1419 * limit we decrease the ratelimiting by a lot, to prevent individual processes
1420 * from overshooting the limit by (ratelimit_pages) each.
1421 */
1422void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1423                                        unsigned long nr_pages_dirtied)
1424{
1425        struct backing_dev_info *bdi = mapping->backing_dev_info;
1426        int ratelimit;
1427        int *p;
1428
1429        if (!bdi_cap_account_dirty(bdi))
1430                return;
1431
1432        ratelimit = current->nr_dirtied_pause;
1433        if (bdi->dirty_exceeded)
1434                ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
1435
1436        preempt_disable();
1437        /*
1438         * This prevents one CPU from accumulating too many dirtied pages without
1439         * calling into balance_dirty_pages(), which can happen when there are
1440         * 1000+ tasks, all of which start dirtying pages at exactly the same
1441         * time and hence all honour an overly large initial task->nr_dirtied_pause.
1442         */
1443        p = &__get_cpu_var(bdp_ratelimits);
1444        if (unlikely(current->nr_dirtied >= ratelimit))
1445                *p = 0;
1446        else if (unlikely(*p >= ratelimit_pages)) {
1447                *p = 0;
1448                ratelimit = 0;
1449        }
1450        /*
1451         * Pick up the dirtied pages left behind by exited tasks. This avoids lots
1452         * of short-lived tasks (e.g. gcc invocations in a kernel build) escaping
1453         * the dirty throttling and livelocking other long-running dirtiers.
1454         */
1455        p = &__get_cpu_var(dirty_throttle_leaks);
1456        if (*p > 0 && current->nr_dirtied < ratelimit) {
1457                nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1458                *p -= nr_pages_dirtied;
1459                current->nr_dirtied += nr_pages_dirtied;
1460        }
1461        preempt_enable();
1462
1463        if (unlikely(current->nr_dirtied >= ratelimit))
1464                balance_dirty_pages(mapping, current->nr_dirtied);
1465}
1466EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
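/*
 * Usage sketch (not part of this file): a buffered-write path is expected
 * to call balance_dirty_pages_ratelimited() once for every page it dirties,
 * roughly like the loop below.  The copy/commit steps are elided and the
 * surrounding names are hypothetical; only the throttling call is the point.
 *
 *	while (bytes_left) {
 *		...copy one page of user data and mark it dirty...
 *		balance_dirty_pages_ratelimited(mapping);
 *	}
 *
 * balance_dirty_pages_ratelimited(mapping) is a thin inline wrapper that
 * ends up here via balance_dirty_pages_ratelimited_nr(mapping, 1).
 */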
1467
1468void throttle_vm_writeout(gfp_t gfp_mask)
1469{
1470        unsigned long background_thresh;
1471        unsigned long dirty_thresh;
1472
1473        for ( ; ; ) {
1474                global_dirty_limits(&background_thresh, &dirty_thresh);
1475
1476                /*
1477                 * Boost the allowable dirty threshold a bit for page
1478                 * allocators so they don't get DoS'ed by heavy writers
1479                 */
1480                dirty_thresh += dirty_thresh / 10;      /* wheeee... */
1481
1482                if (global_page_state(NR_UNSTABLE_NFS) +
1483                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
1484                                break;
1485                congestion_wait(BLK_RW_ASYNC, HZ/10);
1486
1487                /*
1488                 * The caller might hold locks which can prevent IO completion
1489                 * or progress in the filesystem.  So we cannot just sit here
1490                 * waiting for IO to complete.
1491                 */
1492                if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
1493                        break;
1494        }
1495}
1496
1497/*
1498 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
1499 */
1500int dirty_writeback_centisecs_handler(ctl_table *table, int write,
1501        void __user *buffer, size_t *length, loff_t *ppos)
1502{
1503        proc_dointvec(table, write, buffer, length, ppos);
1504        bdi_arm_supers_timer();
1505        return 0;
1506}
1507
1508#ifdef CONFIG_BLOCK
1509void laptop_mode_timer_fn(unsigned long data)
1510{
1511        struct request_queue *q = (struct request_queue *)data;
1512        int nr_pages = global_page_state(NR_FILE_DIRTY) +
1513                global_page_state(NR_UNSTABLE_NFS);
1514
1515        /*
1516         * We want to write everything out, not just down to the dirty
1517         * threshold
1518         */
1519        if (bdi_has_dirty_io(&q->backing_dev_info))
1520                bdi_start_writeback(&q->backing_dev_info, nr_pages,
1521                                        WB_REASON_LAPTOP_TIMER);
1522}
1523
1524/*
1525 * We've spun up the disk and we're in laptop mode: schedule writeback
1526 * of all dirty data a few seconds from now.  If the flush is already scheduled
1527 * then push it back - the user is still using the disk.
1528 */
1529void laptop_io_completion(struct backing_dev_info *info)
1530{
1531        mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
1532}
1533
1534/*
1535 * We're in laptop mode and we've just synced. The sync's writes will have
1536 * caused another writeback to be scheduled by laptop_io_completion.
1537 * Nothing needs to be written back anymore, so we unschedule the writeback.
1538 */
1539void laptop_sync_completion(void)
1540{
1541        struct backing_dev_info *bdi;
1542
1543        rcu_read_lock();
1544
1545        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
1546                del_timer(&bdi->laptop_mode_wb_timer);
1547
1548        rcu_read_unlock();
1549}
1550#endif
1551
1552/*
1553 * If ratelimit_pages is too high then we can get into dirty-data overload
1554 * if a large number of processes all perform writes at the same time.
1555 * If it is too low then SMP machines will call the (expensive)
1556 * get_writeback_state too often.
1557 *
1558 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
1559 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
1560 * thresholds.
1561 */
1562
1563void writeback_set_ratelimit(void)
1564{
1565        unsigned long background_thresh;
1566        unsigned long dirty_thresh;
1567        global_dirty_limits(&background_thresh, &dirty_thresh);
1568        ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
1569        if (ratelimit_pages < 16)
1570                ratelimit_pages = 16;
1571}
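/*
 * Worked example (illustrative numbers only): with dirty_thresh at
 * 102400 pages (400MB of 4KB pages) and 4 online CPUs, the formula above
 * gives ratelimit_pages = 102400 / (4 * 32) = 800 pages, i.e. each CPU
 * re-checks the global dirty state after dirtying roughly 3MB.
 */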
1572
1573static int __cpuinit
1574ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
1575{
1576        writeback_set_ratelimit();
1577        return NOTIFY_DONE;
1578}
1579
1580static struct notifier_block __cpuinitdata ratelimit_nb = {
1581        .notifier_call  = ratelimit_handler,
1582        .next           = NULL,
1583};
1584
1585/*
1586 * Called early on to tune the page writeback dirty limits.
1587 *
1588 * We used to scale the dirty limits according to how total memory
1589 * related to pages that could be allocated for buffers (by
1590 * comparing nr_free_buffer_pages() to vm_total_pages).
1591 *
1592 * However, that was when we used "dirty_ratio" to scale with
1593 * all memory, and we don't do that any more. "dirty_ratio"
1594 * is now applied to total non-HIGHMEM memory (by subtracting
1595 * totalhigh_pages from vm_total_pages), and as such we can't
1596 * get into the old insane situation any more where we had
1597 * large amounts of dirty pages compared to a small amount of
1598 * non-HIGHMEM memory.
1599 *
1600 * But we might still want to scale the dirty_ratio by how
1601 * much memory the box has.
1602 */
1603void __init page_writeback_init(void)
1604{
1605        int shift;
1606
1607        writeback_set_ratelimit();
1608        register_cpu_notifier(&ratelimit_nb);
1609
1610        shift = calc_period_shift();
1611        prop_descriptor_init(&vm_completions, shift);
1612}
1613
1614/**
1615 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
1616 * @mapping: address space structure to write
1617 * @start: starting page index
1618 * @end: ending page index (inclusive)
1619 *
1620 * This function scans the page range from @start to @end (inclusive) and tags
1621 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
1622 * that write_cache_pages (or whoever calls this function) will then use
1623 * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
1624 * used to avoid livelocking of writeback by a process steadily creating new
1625 * dirty pages in the file (thus it is important for this function to be quick
1626 * so that it can tag pages faster than a dirtying process can create them).
1627 */
1628/*
1629 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
1630 */
1631void tag_pages_for_writeback(struct address_space *mapping,
1632                             pgoff_t start, pgoff_t end)
1633{
1634#define WRITEBACK_TAG_BATCH 4096
1635        unsigned long tagged;
1636
1637        do {
1638                spin_lock_irq(&mapping->tree_lock);
1639                tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
1640                                &start, end, WRITEBACK_TAG_BATCH,
1641                                PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
1642                spin_unlock_irq(&mapping->tree_lock);
1643                WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
1644                cond_resched();
1645                /* We check 'start' to handle wrapping when end == ~0UL */
1646        } while (tagged >= WRITEBACK_TAG_BATCH && start);
1647}
1648EXPORT_SYMBOL(tag_pages_for_writeback);
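/*
 * Calling-pattern sketch: write_cache_pages() below is the canonical user,
 * but a filesystem's own ->writepages may follow the same shape.  The range
 * is tagged first and then only the TOWRITE tag is walked, so pages dirtied
 * after the tagging pass cannot livelock the walk:
 *
 *	tag_pages_for_writeback(mapping, index, end);
 *	while (pagevec_lookup_tag(&pvec, mapping, &index,
 *				  PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE)) {
 *		...lock, clear dirty and write out each page...
 *		pagevec_release(&pvec);
 *	}
 */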
1649
1650/**
1651 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
1652 * @mapping: address space structure to write
1653 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1654 * @writepage: function called for each page
1655 * @data: data passed to writepage function
1656 *
1657 * If a page is already under I/O, write_cache_pages() skips it, even
1658 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
1659 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
1660 * and msync() need to guarantee that all the data which was dirty at the time
1661 * the call was made gets new I/O started against it.  If wbc->sync_mode is
1662 * WB_SYNC_ALL then we were called for data integrity and we must wait for
1663 * existing IO to complete.
1664 *
1665 * To avoid livelocks (when another process dirties new pages), we first tag
1666 * pages which should be written back with TOWRITE tag and only then start
1667 * writing them. For data-integrity sync we have to be careful so that we do
1668 * not miss some pages (e.g., because some other process has cleared TOWRITE
1669 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
1670 * by the process clearing the DIRTY tag (and submitting the page for IO).
1671 */
1672int write_cache_pages(struct address_space *mapping,
1673                      struct writeback_control *wbc, writepage_t writepage,
1674                      void *data)
1675{
1676        int ret = 0;
1677        int done = 0;
1678        struct pagevec pvec;
1679        int nr_pages;
1680        pgoff_t uninitialized_var(writeback_index);
1681        pgoff_t index;
1682        pgoff_t end;            /* Inclusive */
1683        pgoff_t done_index;
1684        int cycled;
1685        int range_whole = 0;
1686        int tag;
1687
1688        pagevec_init(&pvec, 0);
1689        if (wbc->range_cyclic) {
1690                writeback_index = mapping->writeback_index; /* prev offset */
1691                index = writeback_index;
1692                if (index == 0)
1693                        cycled = 1;
1694                else
1695                        cycled = 0;
1696                end = -1;
1697        } else {
1698                index = wbc->range_start >> PAGE_CACHE_SHIFT;
1699                end = wbc->range_end >> PAGE_CACHE_SHIFT;
1700                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
1701                        range_whole = 1;
1702                cycled = 1; /* ignore range_cyclic tests */
1703        }
1704        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
1705                tag = PAGECACHE_TAG_TOWRITE;
1706        else
1707                tag = PAGECACHE_TAG_DIRTY;
1708retry:
1709        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
1710                tag_pages_for_writeback(mapping, index, end);
1711        done_index = index;
1712        while (!done && (index <= end)) {
1713                int i;
1714
1715                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
1716                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1717                if (nr_pages == 0)
1718                        break;
1719
1720                for (i = 0; i < nr_pages; i++) {
1721                        struct page *page = pvec.pages[i];
1722
1723                        /*
1724                         * At this point, the page may be truncated or
1725                         * invalidated (changing page->mapping to NULL), or
1726                         * even swizzled back from swapper_space to tmpfs file
1727                         * mapping. However, page->index will not change
1728                         * because we have a reference on the page.
1729                         */
1730                        if (page->index > end) {
1731                                /*
1732                                 * can't be range_cyclic (1st pass) because
1733                                 * end == -1 in that case.
1734                                 */
1735                                done = 1;
1736                                break;
1737                        }
1738
1739                        done_index = page->index;
1740
1741                        lock_page(page);
1742
1743                        /*
1744                         * Page truncated or invalidated. We can freely skip it
1745                         * then, even for data integrity operations: the page
1746                         * has disappeared concurrently, so there could be no
1747                         * real expectation of this data integrity operation
1748                         * even if there is now a new, dirty page at the same
1749                         * pagecache address.
1750                         */
1751                        if (unlikely(page->mapping != mapping)) {
1752continue_unlock:
1753                                unlock_page(page);
1754                                continue;
1755                        }
1756
1757                        if (!PageDirty(page)) {
1758                                /* someone wrote it for us */
1759                                goto continue_unlock;
1760                        }
1761
1762                        if (PageWriteback(page)) {
1763                                if (wbc->sync_mode != WB_SYNC_NONE)
1764                                        wait_on_page_writeback(page);
1765                                else
1766                                        goto continue_unlock;
1767                        }
1768
1769                        BUG_ON(PageWriteback(page));
1770                        if (!clear_page_dirty_for_io(page))
1771                                goto continue_unlock;
1772
1773                        trace_wbc_writepage(wbc, mapping->backing_dev_info);
1774                        ret = (*writepage)(page, wbc, data);
1775                        if (unlikely(ret)) {
1776                                if (ret == AOP_WRITEPAGE_ACTIVATE) {
1777                                        unlock_page(page);
1778                                        ret = 0;
1779                                } else {
1780                                        /*
1781                                         * done_index is set past this page,
1782                                         * so media errors will not choke
1783                                         * background writeout for the entire
1784                                         * file. This has consequences for
1785                                         * range_cyclic semantics (ie. it may
1786                                         * not be suitable for data integrity
1787                                         * writeout).
1788                                         */
1789                                        done_index = page->index + 1;
1790                                        done = 1;
1791                                        break;
1792                                }
1793                        }
1794
1795                        /*
1796                         * We stop writing back only if we are not doing
1797                         * integrity sync. In case of integrity sync we have to
1798                         * keep going until we have written all the pages
1799                         * we tagged for writeback prior to entering this loop.
1800                         */
1801                        if (--wbc->nr_to_write <= 0 &&
1802                            wbc->sync_mode == WB_SYNC_NONE) {
1803                                done = 1;
1804                                break;
1805                        }
1806                }
1807                pagevec_release(&pvec);
1808                cond_resched();
1809        }
1810        if (!cycled && !done) {
1811                /*
1812                 * range_cyclic:
1813                 * We hit the last page and there is more work to be done: wrap
1814                 * back to the start of the file
1815                 */
1816                cycled = 1;
1817                index = 0;
1818                end = writeback_index - 1;
1819                goto retry;
1820        }
1821        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
1822                mapping->writeback_index = done_index;
1823
1824        return ret;
1825}
1826EXPORT_SYMBOL(write_cache_pages);
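/*
 * Usage sketch: a filesystem's ->writepages can pass its own per-call state
 * through @data rather than open-coding the tagged radix-tree walk.  The
 * callback and context names here are hypothetical:
 *
 *	static int example_writepage(struct page *page,
 *				     struct writeback_control *wbc, void *data)
 *	{
 *		struct example_wb_ctx *ctx = data;
 *		...write the page, honouring wbc...
 *	}
 *
 *	ret = write_cache_pages(mapping, wbc, example_writepage, &ctx);
 *
 * generic_writepages() below is exactly this pattern, with the plain
 * ->writepage method as the callback.
 */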
1827
1828/*
1829 * Function used by generic_writepages to call the real writepage
1830 * function and set the mapping flags on error
1831 */
1832static int __writepage(struct page *page, struct writeback_control *wbc,
1833                       void *data)
1834{
1835        struct address_space *mapping = data;
1836        int ret = mapping->a_ops->writepage(page, wbc);
1837        mapping_set_error(mapping, ret);
1838        return ret;
1839}
1840
1841/**
1842 * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
1843 * @mapping: address space structure to write
1844 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1845 *
1846 * This is a library function, which implements the writepages()
1847 * address_space_operation.
1848 */
1849int generic_writepages(struct address_space *mapping,
1850                       struct writeback_control *wbc)
1851{
1852        struct blk_plug plug;
1853        int ret;
1854
1855        /* deal with chardevs and other special file */
1856        if (!mapping->a_ops->writepage)
1857                return 0;
1858
1859        blk_start_plug(&plug);
1860        ret = write_cache_pages(mapping, wbc, __writepage, mapping);
1861        blk_finish_plug(&plug);
1862        return ret;
1863}
1864
1865EXPORT_SYMBOL(generic_writepages);
1866
1867int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
1868{
1869        int ret;
1870
1871        if (wbc->nr_to_write <= 0)
1872                return 0;
1873        if (mapping->a_ops->writepages)
1874                ret = mapping->a_ops->writepages(mapping, wbc);
1875        else
1876                ret = generic_writepages(mapping, wbc);
1877        return ret;
1878}
1879
1880/**
1881 * write_one_page - write out a single page and optionally wait on I/O
1882 * @page: the page to write
1883 * @wait: if true, wait on writeout
1884 *
1885 * The page must be locked by the caller and will be unlocked upon return.
1886 *
1887 * write_one_page() returns a negative error code if I/O failed.
1888 */
1889int write_one_page(struct page *page, int wait)
1890{
1891        struct address_space *mapping = page->mapping;
1892        int ret = 0;
1893        struct writeback_control wbc = {
1894                .sync_mode = WB_SYNC_ALL,
1895                .nr_to_write = 1,
1896        };
1897
1898        BUG_ON(!PageLocked(page));
1899
1900        if (wait)
1901                wait_on_page_writeback(page);
1902
1903        if (clear_page_dirty_for_io(page)) {
1904                page_cache_get(page);
1905                ret = mapping->a_ops->writepage(page, &wbc);
1906                if (ret == 0 && wait) {
1907                        wait_on_page_writeback(page);
1908                        if (PageError(page))
1909                                ret = -EIO;
1910                }
1911                page_cache_release(page);
1912        } else {
1913                unlock_page(page);
1914        }
1915        return ret;
1916}
1917EXPORT_SYMBOL(write_one_page);
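/*
 * Usage sketch: simple filesystems use this to flush a just-modified
 * metadata page (e.g. a directory block) synchronously:
 *
 *	lock_page(page);
 *	...modify and dirty the page...
 *	err = write_one_page(page, 1);
 *
 * With wait == 0 the write is only started; completion is not waited for.
 */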
1918
1919/*
1920 * For address_spaces which neither use buffers nor write back.
1921 */
1922int __set_page_dirty_no_writeback(struct page *page)
1923{
1924        if (!PageDirty(page))
1925                return !TestSetPageDirty(page);
1926        return 0;
1927}
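/*
 * Usage sketch: RAM-backed filesystems that never write pages back wire
 * this helper straight into their address_space_operations, along the
 * lines of (the struct name is hypothetical):
 *
 *	static const struct address_space_operations example_ram_aops = {
 *		.readpage	= simple_readpage,
 *		.set_page_dirty	= __set_page_dirty_no_writeback,
 *	};
 */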
1928
1929/*
1930 * Helper function for set_page_dirty family.
1931 * NOTE: This relies on being atomic wrt interrupts.
1932 */
1933void account_page_dirtied(struct page *page, struct address_space *mapping)
1934{
1935        if (mapping_cap_account_dirty(mapping)) {
1936                __inc_zone_page_state(page, NR_FILE_DIRTY);
1937                __inc_zone_page_state(page, NR_DIRTIED);
1938                __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1939                __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1940                task_io_account_write(PAGE_CACHE_SIZE);
1941                current->nr_dirtied++;
1942                this_cpu_inc(bdp_ratelimits);
1943        }
1944}
1945EXPORT_SYMBOL(account_page_dirtied);
1946
1947/*
1948 * Helper function for set_page_writeback family.
1949 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1950 * wrt interrupts.
1951 */
1952void account_page_writeback(struct page *page)
1953{
1954        inc_zone_page_state(page, NR_WRITEBACK);
1955}
1956EXPORT_SYMBOL(account_page_writeback);
1957
1958/*
1959 * For address_spaces which do not use buffers.  Just tag the page as dirty in
1960 * its radix tree.
1961 *
1962 * This is also used when a single buffer is being dirtied: we want to set the
1963 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
1964 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
1965 *
1966 * Most callers have locked the page, which pins the address_space in memory.
1967 * zap_pte_range(), however, does not lock the page; in that case the
1968 * mapping is pinned by the vma's ->vm_file reference.
1969 *
1970 * We take care to handle the case where the page was truncated from the
1971 * mapping by re-checking page_mapping() inside tree_lock.
1972 */
1973int __set_page_dirty_nobuffers(struct page *page)
1974{
1975        if (!TestSetPageDirty(page)) {
1976                struct address_space *mapping = page_mapping(page);
1977                struct address_space *mapping2;
1978
1979                if (!mapping)
1980                        return 1;
1981
1982                spin_lock_irq(&mapping->tree_lock);
1983                mapping2 = page_mapping(page);
1984                if (mapping2) { /* Race with truncate? */
1985                        BUG_ON(mapping2 != mapping);
1986                        WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
1987                        account_page_dirtied(page, mapping);
1988                        radix_tree_tag_set(&mapping->page_tree,
1989                                page_index(page), PAGECACHE_TAG_DIRTY);
1990                }
1991                spin_unlock_irq(&mapping->tree_lock);
1992                if (mapping->host) {
1993                        /* !PageAnon && !swapper_space */
1994                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1995                }
1996                return 1;
1997        }
1998        return 0;
1999}
2000EXPORT_SYMBOL(__set_page_dirty_nobuffers);
2001
2002/*
2003 * Call this whenever redirtying a page, to de-account the dirty counters
2004 * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
2005 * counters (NR_WRITTEN, BDI_WRITTEN) in the long run. Mismatches would lead to
2006 * systematic errors in balanced_dirty_ratelimit and the dirty pages position
2007 * control.
2008 */
2009void account_page_redirty(struct page *page)
2010{
2011        struct address_space *mapping = page->mapping;
2012        if (mapping && mapping_cap_account_dirty(mapping)) {
2013                current->nr_dirtied--;
2014                dec_zone_page_state(page, NR_DIRTIED);
2015                dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
2016        }
2017}
2018EXPORT_SYMBOL(account_page_redirty);
2019
2020/*
2021 * When a writepage implementation decides that it doesn't want to write this
2022 * page for some reason, it should redirty the locked page via
2023 * redirty_page_for_writepage(), and it should then unlock the page and return 0.
2024 */
2025int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
2026{
2027        wbc->pages_skipped++;
2028        account_page_redirty(page);
2029        return __set_page_dirty_nobuffers(page);
2030}
2031EXPORT_SYMBOL(redirty_page_for_writepage);
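/*
 * Usage sketch for the rule above, inside a ->writepage implementation
 * that cannot make progress right now (the condition is hypothetical):
 *
 *	if (cannot_write_now) {
 *		redirty_page_for_writepage(wbc, page);
 *		unlock_page(page);
 *		return 0;
 *	}
 */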
2032
2033/*
2034 * Dirty a page.
2035 *
2036 * For pages with a mapping this should be done under the page lock
2037 * for the benefit of asynchronous memory errors, which prefer a consistent
2038 * dirty state. This rule can be broken in some special cases,
2039 * but it is better not to.
2040 *
2041 * If the mapping doesn't provide a set_page_dirty a_op, then
2042 * just fall through and assume that it wants buffer_heads.
2043 */
2044int set_page_dirty(struct page *page)
2045{
2046        struct address_space *mapping = page_mapping(page);
2047
2048        if (likely(mapping)) {
2049                int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
2050                /*
2051                 * readahead/lru_deactivate_page could leave PG_readahead/PG_reclaim
2052                 * set due to a race with end_page_writeback.
2053                 * For readahead, if the page is written, the flag will be
2054                 * reset, so there is no problem.
2055                 * For lru_deactivate_page, if the page is redirtied, the flag
2056                 * will be reset, so again no problem; but if the page is later
2057                 * used for readahead it will confuse readahead and make it
2058                 * restart the size ramp-up process. That is a trivial problem, though.
2059                 */
2060                ClearPageReclaim(page);
2061#ifdef CONFIG_BLOCK
2062                if (!spd)
2063                        spd = __set_page_dirty_buffers;
2064#endif
2065                return (*spd)(page);
2066        }
2067        if (!PageDirty(page)) {
2068                if (!TestSetPageDirty(page))
2069                        return 1;
2070        }
2071        return 0;
2072}
2073EXPORT_SYMBOL(set_page_dirty);
2074
2075/*
2076 * set_page_dirty() is racy if the caller has no reference against
2077 * page->mapping->host, and if the page is unlocked.  This is because another
2078 * CPU could truncate the page off the mapping and then free the mapping.
2079 *
2080 * Usually, the page _is_ locked, or the caller is a user-space process which
2081 * holds a reference on the inode by having an open file.
2082 *
2083 * In other cases, the page should be locked before running set_page_dirty().
2084 */
2085int set_page_dirty_lock(struct page *page)
2086{
2087        int ret;
2088
2089        lock_page(page);
2090        ret = set_page_dirty(page);
2091        unlock_page(page);
2092        return ret;
2093}
2094EXPORT_SYMBOL(set_page_dirty_lock);
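/*
 * Usage sketch: drivers that DMA into pages obtained with get_user_pages()
 * typically use this on I/O completion, since those pages are not locked
 * at that point:
 *
 *	for (i = 0; i < nr_pages; i++) {
 *		set_page_dirty_lock(pages[i]);
 *		page_cache_release(pages[i]);
 *	}
 */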
2095
2096/*
2097 * Clear a page's dirty flag, while caring for dirty memory accounting.
2098 * Returns true if the page was previously dirty.
2099 *
2100 * This is for preparing to put the page under writeout.  We leave the page
2101 * tagged as dirty in the radix tree so that a concurrent write-for-sync
2102 * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
2103 * implementation will run either set_page_writeback() or set_page_dirty(),
2104 * at which stage we bring the page's dirty flag and radix-tree dirty tag
2105 * back into sync.
2106 *
2107 * This incoherency between the page's dirty flag and radix-tree tag is
2108 * unfortunate, but it only exists while the page is locked.
2109 */
2110int clear_page_dirty_for_io(struct page *page)
2111{
2112        struct address_space *mapping = page_mapping(page);
2113
2114        BUG_ON(!PageLocked(page));
2115
2116        if (mapping && mapping_cap_account_dirty(mapping)) {
2117                /*
2118                 * Yes, Virginia, this is indeed insane.
2119                 *
2120                 * We use this sequence to make sure that
2121                 *  (a) we account for dirty stats properly
2122                 *  (b) we tell the low-level filesystem to
2123                 *      mark the whole page dirty if it was
2124                 *      dirty in a pagetable. Only to then
2125                 *  (c) clean the page again and return 1 to
2126                 *      cause the writeback.
2127                 *
2128                 * This way we avoid all nasty races with the
2129                 * dirty bit in multiple places and clearing
2130                 * them concurrently from different threads.
2131                 *
2132                 * Note! Normally the "set_page_dirty(page)"
2133                 * has no effect on the actual dirty bit - since
2134                 * that will already usually be set. But we
2135                 * need the side effects, and it can help us
2136                 * avoid races.
2137                 *
2138                 * We basically use the page "master dirty bit"
2139                 * as a serialization point for all the different
2140                 * threads doing their things.
2141                 */
2142                if (page_mkclean(page))
2143                        set_page_dirty(page);
2144                /*
2145                 * We carefully synchronise fault handlers against
2146                 * installing a dirty pte and marking the page dirty
2147                 * at this point. We do this by having them hold the
2148                 * page lock at some point after installing their
2149                 * pte, but before marking the page dirty.
2150                 * Pages are always locked coming in here, so we get
2151                 * the desired exclusion. See mm/memory.c:do_wp_page()
2152                 * for more comments.
2153                 */
2154                if (TestClearPageDirty(page)) {
2155                        dec_zone_page_state(page, NR_FILE_DIRTY);
2156                        dec_bdi_stat(mapping->backing_dev_info,
2157                                        BDI_RECLAIMABLE);
2158                        return 1;
2159                }
2160                return 0;
2161        }
2162        return TestClearPageDirty(page);
2163}
2164EXPORT_SYMBOL(clear_page_dirty_for_io);
2165
2166int test_clear_page_writeback(struct page *page)
2167{
2168        struct address_space *mapping = page_mapping(page);
2169        int ret;
2170
2171        if (mapping) {
2172                struct backing_dev_info *bdi = mapping->backing_dev_info;
2173                unsigned long flags;
2174
2175                spin_lock_irqsave(&mapping->tree_lock, flags);
2176                ret = TestClearPageWriteback(page);
2177                if (ret) {
2178                        radix_tree_tag_clear(&mapping->page_tree,
2179                                                page_index(page),
2180                                                PAGECACHE_TAG_WRITEBACK);
2181                        if (bdi_cap_account_writeback(bdi)) {
2182                                __dec_bdi_stat(bdi, BDI_WRITEBACK);
2183                                __bdi_writeout_inc(bdi);
2184                        }
2185                }
2186                spin_unlock_irqrestore(&mapping->tree_lock, flags);
2187        } else {
2188                ret = TestClearPageWriteback(page);
2189        }
2190        if (ret) {
2191                dec_zone_page_state(page, NR_WRITEBACK);
2192                inc_zone_page_state(page, NR_WRITTEN);
2193        }
2194        return ret;
2195}
2196
2197int test_set_page_writeback(struct page *page)
2198{
2199        struct address_space *mapping = page_mapping(page);
2200        int ret;
2201
2202        if (mapping) {
2203                struct backing_dev_info *bdi = mapping->backing_dev_info;
2204                unsigned long flags;
2205
2206                spin_lock_irqsave(&mapping->tree_lock, flags);
2207                ret = TestSetPageWriteback(page);
2208                if (!ret) {
2209                        radix_tree_tag_set(&mapping->page_tree,
2210                                                page_index(page),
2211                                                PAGECACHE_TAG_WRITEBACK);
2212                        if (bdi_cap_account_writeback(bdi))
2213                                __inc_bdi_stat(bdi, BDI_WRITEBACK);
2214                }
2215                if (!PageDirty(page))
2216                        radix_tree_tag_clear(&mapping->page_tree,
2217                                                page_index(page),
2218                                                PAGECACHE_TAG_DIRTY);
2219                radix_tree_tag_clear(&mapping->page_tree,
2220                                     page_index(page),
2221                                     PAGECACHE_TAG_TOWRITE);
2222                spin_unlock_irqrestore(&mapping->tree_lock, flags);
2223        } else {
2224                ret = TestSetPageWriteback(page);
2225        }
2226        if (!ret)
2227                account_page_writeback(page);
2228        return ret;
2229
2230}
2231EXPORT_SYMBOL(test_set_page_writeback);
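/*
 * Usage sketch: ->writepage implementations call the set_page_writeback()
 * wrapper (which resolves to test_set_page_writeback()) right before
 * submitting the I/O, and the completion path ends it:
 *
 *	BUG_ON(PageWriteback(page));
 *	set_page_writeback(page);
 *	unlock_page(page);
 *	...submit the I/O; the completion handler calls end_page_writeback(page);
 */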
2232
2233/*
2234 * Return true if any of the pages in the mapping are marked with the
2235 * passed tag.
2236 */
2237int mapping_tagged(struct address_space *mapping, int tag)
2238{
2239        return radix_tree_tagged(&mapping->page_tree, tag);
2240}
2241EXPORT_SYMBOL(mapping_tagged);
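/*
 * Usage sketch: callers treat this as a cheap "anything to do?" check
 * before walking the tree, e.g.
 *
 *	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 *		...kick off or continue writeback for this mapping...
 */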
2242