linux-bk/mm/page-writeback.c
<<
>>
Prefs
   1/*
   2 * mm/page-writeback.c.
   3 *
   4 * Copyright (C) 2002, Linus Torvalds.
   5 *
   6 * Contains functions related to writing back dirty pages at the
   7 * address_space level.
   8 *
   9 * 10Apr2002    akpm@zip.com.au
  10 *              Initial version
  11 */
  12
  13#include <linux/kernel.h>
  14#include <linux/module.h>
  15#include <linux/spinlock.h>
  16#include <linux/fs.h>
  17#include <linux/mm.h>
  18#include <linux/slab.h>
  19#include <linux/pagemap.h>
  20#include <linux/writeback.h>
  21#include <linux/init.h>
  22#include <linux/sysrq.h>
  23#include <linux/backing-dev.h>
  24#include <linux/mpage.h>
  25#include <linux/notifier.h>
  26#include <linux/smp.h>
  27
  28/*
  29 * The maximum number of pages to writeout in a single bdflush/kupdate
  30 * operation.  We do this so we don't hold I_LOCK against an inode for
  31 * enormous amounts of time, which would block a userspace task which has
  32 * been forced to throttle against that inode.  Also, the code reevaluates
  33 * the dirty each time it has written this many pages.
  34 */
  35#define MAX_WRITEBACK_PAGES     1024
  36
  37/*
  38 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  39 * will look to see if it needs to force writeback or throttling.
  40 */
  41static long ratelimit_pages = 32;
  42
  43/*
  44 * The total number of pages in the machine.
  45 */
  46static long total_pages;
  47
  48/*
  49 * When balance_dirty_pages decides that the caller needs to perform some
  50 * non-background writeback, this is how many pages it will attempt to write.
  51 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
  52 * large amounts of I/O are submitted.
  53 */
  54static inline long sync_writeback_pages(void)
  55{
  56        return ratelimit_pages + ratelimit_pages / 2;
  57}
  58
  59/* The following parameters are exported via /proc/sys/vm */
  60
  61/*
  62 * Dirty memory thresholds, in percentages
  63 */
  64
  65/*
  66 * Start background writeback (via pdflush) at this level
  67 */
  68int dirty_background_ratio = 10;
  69
  70/*
  71 * The generator of dirty data starts async writeback at this level
  72 */
  73int dirty_async_ratio = 40;
  74
  75/*
  76 * The interval between `kupdate'-style writebacks, in centiseconds
  77 * (hundredths of a second)
  78 */
  79int dirty_writeback_centisecs = 5 * 100;
  80
  81/*
  82 * The longest amount of time for which data is allowed to remain dirty
  83 */
  84int dirty_expire_centisecs = 30 * 100;
  85
  86/* End of sysctl-exported parameters */
  87
  88
  89static void background_writeout(unsigned long _min_pages);
  90
  91/*
  92 * balance_dirty_pages() must be called by processes which are
  93 * generating dirty data.  It looks at the number of dirty pages
  94 * in the machine and either:
  95 *
  96 * - Starts background writeback or
  97 * - Causes the caller to perform async writeback or
  98 * - Causes the caller to perform synchronous writeback, then
  99 *   tells a pdflush thread to perform more writeback or
 100 * - Does nothing at all.
 101 *
 102 * balance_dirty_pages() can sleep.
 103 */
 104void balance_dirty_pages(struct address_space *mapping)
 105{
 106        struct page_state ps;
 107        long background_thresh, async_thresh;
 108        unsigned long dirty_and_writeback;
 109        struct backing_dev_info *bdi;
 110
 111        get_page_state(&ps);
 112        dirty_and_writeback = ps.nr_dirty + ps.nr_writeback;
 113
 114        background_thresh = (dirty_background_ratio * total_pages) / 100;
 115        async_thresh = (dirty_async_ratio * total_pages) / 100;
 116        bdi = mapping->backing_dev_info;
 117
 118        if (dirty_and_writeback > async_thresh) {
 119                struct writeback_control wbc = {
 120                        .bdi            = bdi,
 121                        .sync_mode      = WB_SYNC_NONE,
 122                        .older_than_this = NULL,
 123                        .nr_to_write    = sync_writeback_pages(),
 124                };
 125
 126                writeback_inodes(&wbc);
 127                get_page_state(&ps);
 128        }
 129
 130        if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
 131                pdflush_operation(background_writeout, 0);
 132}
 133EXPORT_SYMBOL_GPL(balance_dirty_pages);
 134
 135/**
 136 * balance_dirty_pages_ratelimited - balance dirty memory state
 137 * @mapping - address_space which was dirtied
 138 *
 139 * Processes which are dirtying memory should call in here once for each page
 140 * which was newly dirtied.  The function will periodically check the system's
 141 * dirty state and will initiate writeback if needed.
 142 *
 143 * balance_dirty_pages_ratelimited() may sleep.
 144 */
 145void balance_dirty_pages_ratelimited(struct address_space *mapping)
 146{
 147        static struct rate_limit_struct {
 148                int count;
 149        } ____cacheline_aligned ratelimits[NR_CPUS];
 150        int cpu;
 151
 152        cpu = get_cpu();
 153        if (ratelimits[cpu].count++ >= ratelimit_pages) {
 154                ratelimits[cpu].count = 0;
 155                put_cpu();
 156                balance_dirty_pages(mapping);
 157                return;
 158        }
 159        put_cpu();
 160}
 161
 162/*
 163 * writeback at least _min_pages, and keep writing until the amount of dirty
 164 * memory is less than the background threshold, or until we're all clean.
 165 */
 166static void background_writeout(unsigned long _min_pages)
 167{
 168        long min_pages = _min_pages;
 169        long background_thresh;
 170        struct writeback_control wbc = {
 171                .bdi            = NULL,
 172                .sync_mode      = WB_SYNC_NONE,
 173                .older_than_this = NULL,
 174                .nr_to_write    = 0,
 175        };
 176
 177        CHECK_EMERGENCY_SYNC
 178
 179        background_thresh = (dirty_background_ratio * total_pages) / 100;
 180
 181        do {
 182                struct page_state ps;
 183                get_page_state(&ps);
 184                if (ps.nr_dirty < background_thresh && min_pages <= 0)
 185                        break;
 186                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 187                writeback_inodes(&wbc);
 188                min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 189        } while (wbc.nr_to_write <= 0);
 190        blk_run_queues();
 191}
 192
 193/*
 194 * Start heavy writeback of everything.
 195 */
 196void wakeup_bdflush(void)
 197{
 198        struct page_state ps;
 199
 200        get_page_state(&ps);
 201        pdflush_operation(background_writeout, ps.nr_dirty);
 202}
 203
 204static struct timer_list wb_timer;
 205
 206/*
 207 * Periodic writeback of "old" data.
 208 *
 209 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 210 * dirtying-time in the inode's address_space.  So this periodic writeback code
 211 * just walks the superblock inode list, writing back any inodes which are
 212 * older than a specific point in time.
 213 *
 214 * Try to run once per dirty_writeback_centisecs.  But if a writeback event
 215 * takes longer than a dirty_writeback_centisecs interval, then leave a
 216 * one-second gap.
 217 *
 218 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 219 * all dirty pages if they are all attached to "old" mappings.
 220 */
 221static void wb_kupdate(unsigned long arg)
 222{
 223        unsigned long oldest_jif;
 224        unsigned long start_jif;
 225        unsigned long next_jif;
 226        struct page_state ps;
 227        struct writeback_control wbc = {
 228                .bdi            = NULL,
 229                .sync_mode      = WB_SYNC_NONE,
 230                .older_than_this = &oldest_jif,
 231                .nr_to_write    = 0,
 232        };
 233
 234        sync_supers();
 235        get_page_state(&ps);
 236
 237        oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 238        start_jif = jiffies;
 239        next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
 240        wbc.nr_to_write = ps.nr_dirty;
 241        writeback_inodes(&wbc);
 242        blk_run_queues();
 243        yield();
 244
 245        if (time_before(next_jif, jiffies + HZ))
 246                next_jif = jiffies + HZ;
 247        mod_timer(&wb_timer, next_jif);
 248}
 249
 250static void wb_timer_fn(unsigned long unused)
 251{
 252        if (pdflush_operation(wb_kupdate, 0) < 0)
 253                mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
 254
 255}
 256
 257/*
 258 * If ratelimit_pages is too high then we can get into dirty-data overload
 259 * if a large number of processes all perform writes at the same time.
 260 * If it is too low then SMP machines will call the (expensive) get_page_state
 261 * too often.
 262 *
 263 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 264 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 265 * thresholds before writeback cuts in.
 266 *
 267 * But the limit should not be set too high.  Because it also controls the
 268 * amount of memory which the balance_dirty_pages() caller has to write back.
 269 * If this is too large then the caller will block on the IO queue all the
 270 * time.  So limit it to four megabytes - the balance_dirty_pages() caller
 271 * will write six megabyte chunks, max.
 272 */
 273
 274static void set_ratelimit(void)
 275{
 276        ratelimit_pages = total_pages / (num_online_cpus() * 32);
 277        if (ratelimit_pages < 16)
 278                ratelimit_pages = 16;
 279        if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
 280                ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 281}
 282
 283static int
 284ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 285{
 286        set_ratelimit();
 287        return 0;
 288}
 289
 290static struct notifier_block ratelimit_nb = {
 291        .notifier_call  = ratelimit_handler,
 292        .next           = NULL,
 293};
 294
 295/*
 296 * If the machine has a large highmem:lowmem ratio then scale back the default
 297 * dirty memory thresholds: allowing too much dirty highmem pins an excessive
 298 * number of buffer_heads.
 299 */
 300static int __init page_writeback_init(void)
 301{
 302        long buffer_pages = nr_free_buffer_pages();
 303        long correction;
 304
 305        total_pages = nr_free_pagecache_pages();
 306
 307        correction = (100 * 4 * buffer_pages) / total_pages;
 308
 309        if (correction < 100) {
 310                dirty_background_ratio *= correction;
 311                dirty_background_ratio /= 100;
 312                dirty_async_ratio *= correction;
 313                dirty_async_ratio /= 100;
 314        }
 315
 316        init_timer(&wb_timer);
 317        wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
 318        wb_timer.data = 0;
 319        wb_timer.function = wb_timer_fn;
 320        add_timer(&wb_timer);
 321        set_ratelimit();
 322        register_cpu_notifier(&ratelimit_nb);
 323        return 0;
 324}
 325module_init(page_writeback_init);
 326
 327/*
 328 * A library function, which implements the vm_writeback a_op.  It's fairly
 329 * lame at this time.  The idea is: the VM wants to liberate this page,
 330 * so we pass the page to the address_space and give the fs the opportunity
 331 * to write out lots of pages around this one.  It allows extent-based
 332 * filesytems to do intelligent things.  It lets delayed-allocate filesystems
 333 * perform better file layout.  It lets the address_space opportunistically
 334 * write back disk-contiguous pages which are in other zones.
 335 *
 336 * FIXME: the VM wants to start I/O against *this* page.  Because its zone
 337 * is under pressure.  But this function may start writeout against a
 338 * totally different set of pages.  Unlikely to be a huge problem, but if it
 339 * is, we could just writepage the page if it is still (PageDirty &&
 340 * !PageWriteback) (See below).
 341 *
 342 * Another option is to just reposition page->mapping->dirty_pages so we
 343 * *know* that the page will be written.  That will work fine, but seems
 344 * unpleasant.  (If the page is not for-sure on ->dirty_pages we're dead).
 345 * Plus it assumes that the address_space is performing writeback in
 346 * ->dirty_pages order.
 347 *
 348 * So.  The proper fix is to leave the page locked-and-dirty and to pass
 349 * it all the way down.
 350 */
 351int generic_vm_writeback(struct page *page, struct writeback_control *wbc)
 352{
 353        struct inode *inode = page->mapping->host;
 354
 355        /*
 356         * We don't own this inode, and we don't want the address_space
 357         * vanishing while writeback is walking its pages.
 358         */
 359        inode = igrab(inode);
 360        unlock_page(page);
 361
 362        if (inode) {
 363                do_writepages(inode->i_mapping, wbc);
 364
 365                /*
 366                 * This iput() will internally call ext2_discard_prealloc(),
 367                 * which is rather bogus.  But there is no other way of
 368                 * dropping our ref to the inode.  However, there's no harm
 369                 * in dropping the prealloc, because there probably isn't any.
 370                 * Just a waste of cycles.
 371                 */
 372                iput(inode);
 373#if 0
 374                if (!PageWriteback(page) && PageDirty(page)) {
 375                        lock_page(page);
 376                        if (!PageWriteback(page)&&test_clear_page_dirty(page)) {
 377                                int ret;
 378
 379                                ret = page->mapping->a_ops->writepage(page);
 380                                if (ret == -EAGAIN)
 381                                        __set_page_dirty_nobuffers(page);
 382                        } else {
 383                                unlock_page(page);
 384                        }
 385                }
 386#endif
 387        }
 388        return 0;
 389}
 390EXPORT_SYMBOL(generic_vm_writeback);
 391
 392int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 393{
 394        if (mapping->a_ops->writepages)
 395                return mapping->a_ops->writepages(mapping, wbc);
 396        return generic_writepages(mapping, wbc);
 397}
 398
 399/**
 400 * write_one_page - write out a single page and optionally wait on I/O
 401 *
 402 * @page - the page to write
 403 * @wait - if true, wait on writeout
 404 *
 405 * The page must be locked by the caller and will be unlocked upon return.
 406 *
 407 * write_one_page() returns a negative error code if I/O failed.
 408 */
 409int write_one_page(struct page *page, int wait)
 410{
 411        struct address_space *mapping = page->mapping;
 412        int ret = 0;
 413
 414        BUG_ON(!PageLocked(page));
 415
 416        if (wait && PageWriteback(page))
 417                wait_on_page_writeback(page);
 418
 419        write_lock(&mapping->page_lock);
 420        list_del(&page->list);
 421        if (test_clear_page_dirty(page)) {
 422                list_add(&page->list, &mapping->locked_pages);
 423                page_cache_get(page);
 424                write_unlock(&mapping->page_lock);
 425                ret = mapping->a_ops->writepage(page);
 426                if (ret == -EAGAIN) {
 427                        __set_page_dirty_nobuffers(page);
 428                        ret = 0;
 429                }
 430                if (ret == 0 && wait) {
 431                        wait_on_page_writeback(page);
 432                        if (PageError(page))
 433                                ret = -EIO;
 434                }
 435                page_cache_release(page);
 436        } else {
 437                list_add(&page->list, &mapping->clean_pages);
 438                write_unlock(&mapping->page_lock);
 439                unlock_page(page);
 440        }
 441        return ret;
 442}
 443EXPORT_SYMBOL(write_one_page);
 444
 445/*
 446 * Add a page to the dirty page list.
 447 *
 448 * It is a sad fact of life that this function is called from several places
 449 * deeply under spinlocking.  It may not sleep.
 450 *
 451 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 452 * dirty-state coherency between the page and the buffers.  It the page does
 453 * not have buffers then when they are later attached they will all be set
 454 * dirty.
 455 *
 456 * The buffers are dirtied before the page is dirtied.  There's a small race
 457 * window in which a writepage caller may see the page cleanness but not the
 458 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 459 * before the buffers, a concurrent writepage caller could clear the page dirty
 460 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 461 * page on the dirty page list.
 462 *
 463 * There is also a small window where the page is dirty, and not on dirty_pages.
 464 * Also a possibility that by the time the page is added to dirty_pages, it has
 465 * been set clean.  The page lists are somewhat approximate in this regard.
 466 * It's better to have clean pages accidentally attached to dirty_pages than to
 467 * leave dirty pages attached to clean_pages.
 468 *
 469 * We use private_lock to lock against try_to_free_buffers while using the
 470 * page's buffer list.  Also use this to protect against clean buffers being
 471 * added to the page after it was set dirty.
 472 *
 473 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 474 * address_space though.
 475 *
 476 * For now, we treat swapper_space specially.  It doesn't use the normal
 477 * block a_ops.
 478 *
 479 * FIXME: this should move over to fs/buffer.c - buffer_heads have no business in mm/
 480 */
 481#include <linux/buffer_head.h>
 482int __set_page_dirty_buffers(struct page *page)
 483{
 484        struct address_space * const mapping = page->mapping;
 485        int ret = 0;
 486
 487        if (mapping == NULL) {
 488                SetPageDirty(page);
 489                goto out;
 490        }
 491
 492        if (!PageUptodate(page))
 493                buffer_error();
 494
 495        spin_lock(&mapping->private_lock);
 496
 497        if (page_has_buffers(page)) {
 498                struct buffer_head *head = page_buffers(page);
 499                struct buffer_head *bh = head;
 500
 501                do {
 502                        if (buffer_uptodate(bh))
 503                                set_buffer_dirty(bh);
 504                        else
 505                                buffer_error();
 506                        bh = bh->b_this_page;
 507                } while (bh != head);
 508        }
 509
 510        if (!TestSetPageDirty(page)) {
 511                write_lock(&mapping->page_lock);
 512                if (page->mapping) {    /* Race with truncate? */
 513                        if (!mapping->backing_dev_info->memory_backed)
 514                                inc_page_state(nr_dirty);
 515                        list_del(&page->list);
 516                        list_add(&page->list, &mapping->dirty_pages);
 517                }
 518                write_unlock(&mapping->page_lock);
 519                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 520        }
 521        
 522        spin_unlock(&mapping->private_lock);
 523out:
 524        return ret;
 525}
 526EXPORT_SYMBOL(__set_page_dirty_buffers);
 527
 528/*
 529 * For address_spaces which do not use buffers.  Just set the page's dirty bit
 530 * and move it to the dirty_pages list.  Also perform space reservation if
 531 * required.
 532 *
 533 * __set_page_dirty_nobuffers() may return -ENOSPC.  But if it does, the page
 534 * is still safe, as long as it actually manages to find some blocks at
 535 * writeback time.
 536 *
 537 * This is also used when a single buffer is being dirtied: we want to set the
 538 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 539 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 540 */
 541int __set_page_dirty_nobuffers(struct page *page)
 542{
 543        int ret = 0;
 544
 545        if (!TestSetPageDirty(page)) {
 546                struct address_space *mapping = page->mapping;
 547
 548                if (mapping) {
 549                        write_lock(&mapping->page_lock);
 550                        if (page->mapping) {    /* Race with truncate? */
 551                                if (!mapping->backing_dev_info->memory_backed)
 552                                        inc_page_state(nr_dirty);
 553                                list_del(&page->list);
 554                                list_add(&page->list, &mapping->dirty_pages);
 555                        }
 556                        write_unlock(&mapping->page_lock);
 557                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 558                }
 559        }
 560        return ret;
 561}
 562EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 563
 564/*
 565 * Clear a page's dirty flag, while caring for dirty memory accounting. 
 566 * Returns true if the page was previously dirty.
 567 */
 568int test_clear_page_dirty(struct page *page)
 569{
 570        if (TestClearPageDirty(page)) {
 571                struct address_space *mapping = page->mapping;
 572
 573                if (mapping && !mapping->backing_dev_info->memory_backed)
 574                        dec_page_state(nr_dirty);
 575                return 1;
 576        }
 577        return 0;
 578}
 579
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.