linux-bk/mm/vmscan.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmscan.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *
   6 *  Swap reorganised 29.12.95, Stephen Tweedie.
   7 *  kswapd added: 7.1.96  sct
   8 *  Removed kswapd_ctl limits, and swap out as many pages as needed
   9 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  10 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  11 *  Multiqueue VM started 5.8.00, Rik van Riel.
  12 */
  13
  14#include <linux/mm.h>
  15#include <linux/slab.h>
  16#include <linux/kernel_stat.h>
  17#include <linux/swap.h>
  18#include <linux/swapctl.h>
  19#include <linux/smp_lock.h>
  20#include <linux/pagemap.h>
  21#include <linux/init.h>
  22#include <linux/highmem.h>
  23#include <linux/file.h>
  24#include <linux/writeback.h>
  25#include <linux/suspend.h>
  26#include <linux/buffer_head.h>          /* for try_to_release_page() */
  27
  28#include <asm/pgalloc.h>
  29#include <asm/tlbflush.h>
  30#include <linux/swapops.h>
  31
  32/*
  33 * The "priority" of VM scanning is how much of the queues we
  34 * will scan in one go. A value of 6 for DEF_PRIORITY implies
  35 * that we'll scan 1/64th of the queues ("queue_length >> 6")
  36 * during a normal aging round.
  37 */
  38#define DEF_PRIORITY (6)
  39
  40static inline int is_page_cache_freeable(struct page * page)
  41{
  42        return page_count(page) - !!PagePrivate(page) == 1;
  43}
  44
  45/* Must be called with page's pte_chain_lock held. */
  46static inline int page_mapping_inuse(struct page * page)
  47{
  48        struct address_space *mapping = page->mapping;
  49
  50        /* Page is in somebody's page tables. */
  51        if (page->pte.chain)
  52                return 1;
  53
  54        /* XXX: does this happen ? */
  55        if (!mapping)
  56                return 0;
  57
  58        /* File is mmap'd by somebody. */
  59        if (!list_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_shared))
  60                return 1;
  61
  62        return 0;
  63}
  64
  65static int
  66shrink_cache(int nr_pages, zone_t *classzone,
  67                unsigned int gfp_mask, int priority, int max_scan)
  68{
  69        struct list_head * entry;
  70        struct address_space *mapping;
  71
  72        spin_lock(&pagemap_lru_lock);
  73        while (--max_scan >= 0 &&
  74                        (entry = inactive_list.prev) != &inactive_list) {
  75                struct page *page;
  76                int may_enter_fs;
  77
  78                if (need_resched()) {
  79                        spin_unlock(&pagemap_lru_lock);
  80                        __set_current_state(TASK_RUNNING);
  81                        schedule();
  82                        spin_lock(&pagemap_lru_lock);
  83                        continue;
  84                }
  85
  86                page = list_entry(entry, struct page, lru);
  87
  88                if (unlikely(!PageLRU(page)))
  89                        BUG();
  90                if (unlikely(PageActive(page)))
  91                        BUG();
  92
  93                list_del(entry);
  94                list_add(entry, &inactive_list);
  95                KERNEL_STAT_INC(pgscan);
  96
  97                /*
  98                 * Zero page counts can happen because we unlink the pages
  99                 * _after_ decrementing the usage count..
 100                 */
 101                if (unlikely(!page_count(page)))
 102                        continue;
 103
 104                if (!memclass(page_zone(page), classzone))
 105                        continue;
 106
 107                /*
 108                 * swap activity never enters the filesystem and is safe
 109                 * for GFP_NOFS allocations.
 110                 */
 111                may_enter_fs = (gfp_mask & __GFP_FS) ||
 112                                (PageSwapCache(page) && (gfp_mask & __GFP_IO));
 113
 114                /*
 115                 * IO in progress? Leave it at the back of the list.
 116                 */
 117                if (unlikely(PageWriteback(page))) {
 118                        if (may_enter_fs) {
 119                                page_cache_get(page);
 120                                spin_unlock(&pagemap_lru_lock);
 121                                wait_on_page_writeback(page);
 122                                page_cache_release(page);
 123                                spin_lock(&pagemap_lru_lock);
 124                        }
 125                        continue;
 126                }
 127
 128                if (TestSetPageLocked(page))
 129                        continue;
 130
 131                if (PageWriteback(page)) {      /* The non-racy check */
 132                        unlock_page(page);
 133                        continue;
 134                }
 135
 136                /*
 137                 * The page is in active use or really unfreeable. Move to
 138                 * the active list.
 139                 */
 140                pte_chain_lock(page);
 141                if (page_referenced(page) && page_mapping_inuse(page)) {
 142                        del_page_from_inactive_list(page);
 143                        add_page_to_active_list(page);
 144                        pte_chain_unlock(page);
 145                        unlock_page(page);
 146                        KERNEL_STAT_INC(pgactivate);
 147                        continue;
 148                }
 149
 150                /*
 151                 * Anonymous process memory without backing store. Try to
 152                 * allocate it some swap space here.
 153                 *
 154                 * XXX: implement swap clustering ?
 155                 */
 156                if (page->pte.chain && !page->mapping && !PagePrivate(page)) {
 157                        page_cache_get(page);
 158                        pte_chain_unlock(page);
 159                        spin_unlock(&pagemap_lru_lock);
 160                        if (!add_to_swap(page)) {
 161                                activate_page(page);
 162                                unlock_page(page);
 163                                page_cache_release(page);
 164                                spin_lock(&pagemap_lru_lock);
 165                                continue;
 166                        }
 167                        page_cache_release(page);
 168                        spin_lock(&pagemap_lru_lock);
 169                        pte_chain_lock(page);
 170                }
 171
 172                /*
 173                 * The page is mapped into the page tables of one or more
 174                 * processes. Try to unmap it here.
 175                 */
 176                if (page->pte.chain) {
 177                        switch (try_to_unmap(page)) {
 178                                case SWAP_ERROR:
 179                                case SWAP_FAIL:
 180                                        goto page_active;
 181                                case SWAP_AGAIN:
 182                                        pte_chain_unlock(page);
 183                                        unlock_page(page);
 184                                        continue;
 185                                case SWAP_SUCCESS:
 186                                        ; /* try to free the page below */
 187                        }
 188                }
 189                pte_chain_unlock(page);
 190                mapping = page->mapping;
 191
 192                if (PageDirty(page) && is_page_cache_freeable(page) &&
 193                                page->mapping && may_enter_fs) {
 194                        /*
 195                         * It is not critical here to write it only if
 196                         * the page is unmapped beause any direct writer
 197                         * like O_DIRECT would set the page's dirty bitflag
 198                         * on the physical page after having successfully
 199                         * pinned it and after the I/O to the page is finished,
 200                         * so the direct writes to the page cannot get lost.
 201                         */
 202                        int (*writeback)(struct page *, int *);
 203                        const int nr_pages = SWAP_CLUSTER_MAX;
 204                        int nr_to_write = nr_pages;
 205
 206                        writeback = mapping->a_ops->vm_writeback;
 207                        if (writeback == NULL)
 208                                writeback = generic_vm_writeback;
 209                        page_cache_get(page);
 210                        spin_unlock(&pagemap_lru_lock);
 211                        (*writeback)(page, &nr_to_write);
 212                        max_scan -= (nr_pages - nr_to_write);
 213                        page_cache_release(page);
 214                        spin_lock(&pagemap_lru_lock);
 215                        continue;
 216                }
 217
 218                /*
 219                 * If the page has buffers, try to free the buffer mappings
 220                 * associated with this page. If we succeed we try to free
 221                 * the page as well.
 222                 *
 223                 * We do this even if the page is PageDirty().
 224                 * try_to_release_page() does not perform I/O, but it is
 225                 * possible for a page to have PageDirty set, but it is actually
 226                 * clean (all its buffers are clean).  This happens if the
 227                 * buffers were written out directly, with submit_bh(). ext3
 228                 * will do this, as well as the blockdev mapping. 
 229                 * try_to_release_page() will discover that cleanness and will
 230                 * drop the buffers and mark the page clean - it can be freed.
 231                 */
 232                if (PagePrivate(page)) {
 233                        spin_unlock(&pagemap_lru_lock);
 234
 235                        /* avoid to free a locked page */
 236                        page_cache_get(page);
 237
 238                        if (try_to_release_page(page, gfp_mask)) {
 239                                if (!mapping) {
 240                                        /* effectively free the page here */
 241                                        unlock_page(page);
 242                                        page_cache_release(page);
 243
 244                                        spin_lock(&pagemap_lru_lock);
 245                                        if (--nr_pages)
 246                                                continue;
 247                                        break;
 248                                } else {
 249                                        /*
 250                                         * The page is still in pagecache so undo the stuff
 251                                         * before the try_to_release_page since we've not
 252                                         * finished and we can now try the next step.
 253                                         */
 254                                        page_cache_release(page);
 255
 256                                        spin_lock(&pagemap_lru_lock);
 257                                }
 258                        } else {
 259                                /* failed to drop the buffers so stop here */
 260                                unlock_page(page);
 261                                page_cache_release(page);
 262
 263                                spin_lock(&pagemap_lru_lock);
 264                                continue;
 265                        }
 266                }
 267
 268                /*
 269                 * This is the non-racy check for busy page.
 270                 */
 271                if (mapping) {
 272                        write_lock(&mapping->page_lock);
 273                        if (is_page_cache_freeable(page))
 274                                goto page_freeable;
 275                        write_unlock(&mapping->page_lock);
 276                }
 277                unlock_page(page);
 278                continue;
 279page_freeable:
 280                /*
 281                 * It is critical to check PageDirty _after_ we made sure
 282                 * the page is freeable* so not in use by anybody.
 283                 */
 284                if (PageDirty(page)) {
 285                        write_unlock(&mapping->page_lock);
 286                        unlock_page(page);
 287                        continue;
 288                }
 289
 290                /* point of no return */
 291                if (likely(!PageSwapCache(page))) {
 292                        __remove_inode_page(page);
 293                        write_unlock(&mapping->page_lock);
 294                } else {
 295                        swp_entry_t swap;
 296                        swap.val = page->index;
 297                        __delete_from_swap_cache(page);
 298                        write_unlock(&mapping->page_lock);
 299                        swap_free(swap);
 300                }
 301
 302                __lru_cache_del(page);
 303                unlock_page(page);
 304
 305                /* effectively free the page here */
 306                page_cache_release(page);
 307                KERNEL_STAT_INC(pgsteal);
 308                if (--nr_pages)
 309                        continue;
 310                goto out;
 311page_active:
 312                /*
 313                 * OK, we don't know what to do with the page.
 314                 * It's no use keeping it here, so we move it to
 315                 * the active list.
 316                 */
 317                del_page_from_inactive_list(page);
 318                add_page_to_active_list(page);
 319                pte_chain_unlock(page);
 320                unlock_page(page);
 321                KERNEL_STAT_INC(pgactivate);
 322        }
 323out:    spin_unlock(&pagemap_lru_lock);
 324        return nr_pages;
 325}
 326
 327/*
 328 * This moves pages from the active list to
 329 * the inactive list.
 330 *
 331 * We move them the other way if the page is 
 332 * referenced by one or more processes, from rmap
 333 */
 334static void refill_inactive(int nr_pages)
 335{
 336        struct list_head * entry;
 337
 338        spin_lock(&pagemap_lru_lock);
 339        entry = active_list.prev;
 340        while (nr_pages-- && entry != &active_list) {
 341                struct page * page;
 342
 343                page = list_entry(entry, struct page, lru);
 344                entry = entry->prev;
 345
 346                KERNEL_STAT_INC(pgscan);
 347
 348                pte_chain_lock(page);
 349                if (page->pte.chain && page_referenced(page)) {
 350                        list_del(&page->lru);
 351                        list_add(&page->lru, &active_list);
 352                        pte_chain_unlock(page);
 353                        continue;
 354                }
 355                del_page_from_active_list(page);
 356                add_page_to_inactive_list(page);
 357                pte_chain_unlock(page);
 358                KERNEL_STAT_INC(pgdeactivate);
 359        }
 360        spin_unlock(&pagemap_lru_lock);
 361}
 362
 363static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
 364static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
 365{
 366        int chunk_size = nr_pages;
 367        unsigned long ratio;
 368        struct page_state ps;
 369        int max_scan;
 370
 371        nr_pages -= kmem_cache_reap(gfp_mask);
 372        if (nr_pages <= 0)
 373                return 0;
 374
 375        nr_pages = chunk_size;
 376
 377        /*
 378         * Try to keep the active list 2/3 of the size of the cache
 379         */
 380        get_page_state(&ps);
 381        ratio = (unsigned long)nr_pages * ps.nr_active /
 382                                ((ps.nr_inactive | 1) * 2);
 383        refill_inactive(ratio);
 384        max_scan = ps.nr_inactive / priority;
 385        nr_pages = shrink_cache(nr_pages, classzone,
 386                                gfp_mask, priority, max_scan);
 387        if (nr_pages <= 0)
 388                return 0;
 389
 390        wakeup_bdflush();
 391
 392        shrink_dcache_memory(priority, gfp_mask);
 393
 394        /* After shrinking the dcache, get rid of unused inodes too .. */
 395        shrink_icache_memory(1, gfp_mask);
 396#ifdef CONFIG_QUOTA
 397        shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 398#endif
 399
 400        return nr_pages;
 401}
 402
 403int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
 404{
 405        int priority = DEF_PRIORITY;
 406        int nr_pages = SWAP_CLUSTER_MAX;
 407
 408        KERNEL_STAT_INC(pageoutrun);
 409
 410        do {
 411                nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
 412                if (nr_pages <= 0)
 413                        return 1;
 414        } while (--priority);
 415
 416        /*
 417         * Hmm.. Cache shrink failed - time to kill something?
 418         * Mhwahahhaha! This is the part I really like. Giggle.
 419         */
 420        out_of_memory();
 421        return 0;
 422}
 423
/*
 * Wait queue that kswapd() puts itself on before going to sleep.
 * Not static, so presumably the wakeups come from outside this file -
 * confirm against the callers.
 */
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 425
 426static int check_classzone_need_balance(zone_t * classzone)
 427{
 428        zone_t * first_classzone;
 429
 430        first_classzone = classzone->zone_pgdat->node_zones;
 431        while (classzone >= first_classzone) {
 432                if (classzone->free_pages > classzone->pages_high)
 433                        return 0;
 434                classzone--;
 435        }
 436        return 1;
 437}
 438
 439static int kswapd_balance_pgdat(pg_data_t * pgdat)
 440{
 441        int need_more_balance = 0, i;
 442        zone_t * zone;
 443
 444        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 445                zone = pgdat->node_zones + i;
 446                cond_resched();
 447                if (!zone->need_balance)
 448                        continue;
 449                if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
 450                        zone->need_balance = 0;
 451                        __set_current_state(TASK_INTERRUPTIBLE);
 452                        schedule_timeout(HZ);
 453                        continue;
 454                }
 455                if (check_classzone_need_balance(zone))
 456                        need_more_balance = 1;
 457                else
 458                        zone->need_balance = 0;
 459        }
 460
 461        return need_more_balance;
 462}
 463
 464static void kswapd_balance(void)
 465{
 466        int need_more_balance;
 467        pg_data_t * pgdat;
 468
 469        do {
 470                need_more_balance = 0;
 471                pgdat = pgdat_list;
 472                do
 473                        need_more_balance |= kswapd_balance_pgdat(pgdat);
 474                while ((pgdat = pgdat->node_next));
 475        } while (need_more_balance);
 476}
 477
 478static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
 479{
 480        zone_t * zone;
 481        int i;
 482
 483        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 484                zone = pgdat->node_zones + i;
 485                if (!zone->need_balance)
 486                        continue;
 487                return 0;
 488        }
 489
 490        return 1;
 491}
 492
 493static int kswapd_can_sleep(void)
 494{
 495        pg_data_t * pgdat;
 496
 497        pgdat = pgdat_list;
 498        do {
 499                if (kswapd_can_sleep_pgdat(pgdat))
 500                        continue;
 501                return 0;
 502        } while ((pgdat = pgdat->node_next));
 503
 504        return 1;
 505}
 506
 507/*
 508 * The background pageout daemon, started as a kernel thread
 509 * from the init process. 
 510 *
 511 * This basically trickles out pages so that we have _some_
 512 * free memory available even if there is no other activity
 513 * that frees anything up. This is needed for things like routing
 514 * etc, where we otherwise might have all activity going on in
 515 * asynchronous contexts that cannot page things out.
 516 *
 517 * If there are applications that are active memory-allocators
 518 * (most normal use), this basically shouldn't matter.
 519 */
 520int kswapd(void *unused)
 521{
 522        struct task_struct *tsk = current;
 523        DECLARE_WAITQUEUE(wait, tsk);
 524
 525        daemonize();
 526        strcpy(tsk->comm, "kswapd");
 527        sigfillset(&tsk->blocked);
 528        
 529        /*
 530         * Tell the memory management that we're a "memory allocator",
 531         * and that if we need more memory we should get access to it
 532         * regardless (see "__alloc_pages()"). "kswapd" should
 533         * never get caught in the normal page freeing logic.
 534         *
 535         * (Kswapd normally doesn't need memory anyway, but sometimes
 536         * you need a small amount of memory in order to be able to
 537         * page out something else, and this flag essentially protects
 538         * us from recursively trying to free more memory as we're
 539         * trying to free the first piece of memory in the first place).
 540         */
 541        tsk->flags |= PF_MEMALLOC;
 542
 543        /*
 544         * Kswapd main loop.
 545         */
 546        for (;;) {
 547                if (current->flags & PF_FREEZE)
 548                        refrigerator(PF_IOTHREAD);
 549                __set_current_state(TASK_INTERRUPTIBLE);
 550                add_wait_queue(&kswapd_wait, &wait);
 551
 552                mb();
 553                if (kswapd_can_sleep())
 554                        schedule();
 555
 556                __set_current_state(TASK_RUNNING);
 557                remove_wait_queue(&kswapd_wait, &wait);
 558
 559                /*
 560                 * If we actually get into a low-memory situation,
 561                 * the processes needing more memory will wake us
 562                 * up on a more timely basis.
 563                 */
 564                kswapd_balance();
 565                blk_run_queues();
 566        }
 567}
 568
 569static int __init kswapd_init(void)
 570{
 571        printk("Starting kswapd\n");
 572        swap_setup();
 573        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 574        return 0;
 575}
 576
 577module_init(kswapd_init)
 578
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.