linux-bk/mm/vmscan.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmscan.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *
   6 *  Swap reorganised 29.12.95, Stephen Tweedie.
   7 *  kswapd added: 7.1.96  sct
   8 *  Removed kswapd_ctl limits, and swap out as many pages as needed
   9 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  10 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  11 *  Multiqueue VM started 5.8.00, Rik van Riel.
  12 */
  13
  14#include <linux/mm.h>
  15#include <linux/slab.h>
  16#include <linux/kernel_stat.h>
  17#include <linux/swap.h>
  18#include <linux/smp_lock.h>
  19#include <linux/pagemap.h>
  20#include <linux/init.h>
  21#include <linux/highmem.h>
  22#include <linux/file.h>
  23#include <linux/writeback.h>
  24#include <linux/suspend.h>
  25#include <linux/buffer_head.h>          /* for try_to_release_page() */
  26
  27#include <asm/pgalloc.h>
  28#include <asm/tlbflush.h>
  29#include <linux/swapops.h>
  30
  31/*
  32 * The "priority" of VM scanning is how much of the queues we
  33 * will scan in one go. A value of 6 for DEF_PRIORITY implies
  34 * that we'll scan 1/64th of the queues ("queue_length >> 6")
  35 * during a normal aging round.
     *
     * In this file try_to_free_pages() starts at DEF_PRIORITY and counts
     * the priority down to 1, and shrink_caches() passes DEF_PRIORITY to
     * shrink_dqcache_memory().
     *
     * NOTE(review): shrink_caches() derives its scan budget as
     * "nr_inactive / priority" (a division, not a shift), so the
     * "1/64th" figure above does not match the visible code - confirm
     * which of the two is intended.
  36 */
  37#define DEF_PRIORITY (6)
  38
  39static inline int is_page_cache_freeable(struct page * page)
  40{
  41        return page_count(page) - !!PagePrivate(page) == 1;
  42}
  43
  44/* Must be called with page's pte_chain_lock held. */
  45static inline int page_mapping_inuse(struct page * page)
  46{
  47        struct address_space *mapping = page->mapping;
  48
  49        /* Page is in somebody's page tables. */
  50        if (page->pte.chain)
  51                return 1;
  52
  53        /* XXX: does this happen ? */
  54        if (!mapping)
  55                return 0;
  56
  57        /* File is mmap'd by somebody. */
  58        if (!list_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_shared))
  59                return 1;
  60
  61        return 0;
  62}
  63
  64static int
  65shrink_cache(int nr_pages, zone_t *classzone,
  66                unsigned int gfp_mask, int priority, int max_scan)
  67{
            /*
             * shrink_cache - try to free pages from the tail of the
             * inactive list.
             *
             * nr_pages:  number of pages the caller still wants freed; it
             *            is decremented for every page released here.
             * classzone: pages whose zone fails memclass() against this
             *            zone are skipped.
             * gfp_mask:  __GFP_FS / __GFP_IO decide whether this pass may
             *            wait for or start writeback (may_enter_fs); the
             *            mask is also handed to try_to_release_page().
             * priority:  not referenced anywhere in this function body.
             * max_scan:  scan budget; one unit is consumed per evaluation
             *            of the loop condition, and the writeback path
             *            charges extra against it.
             *
             * Returns the remaining nr_pages (0 once the target was met).
             *
             * pagemap_lru_lock is taken on entry and released on exit; it
             * is dropped and re-taken around every operation below that
             * can sleep or that takes the lock itself.
             */
  68        struct list_head * entry;
  69        struct address_space *mapping;
  70
  71        spin_lock(&pagemap_lru_lock);
            /* Always look at the current tail of the inactive list. */
  72        while (--max_scan >= 0 &&
  73                        (entry = inactive_list.prev) != &inactive_list) {
  74                struct page *page;
  75                int may_enter_fs;
  76
                    /*
                     * Give up the CPU if asked to.  The "continue"
                     * re-evaluates the loop condition, so this costs one
                     * unit of max_scan and re-reads the list tail.
                     */
  77                if (need_resched()) {
  78                        spin_unlock(&pagemap_lru_lock);
  79                        __set_current_state(TASK_RUNNING);
  80                        schedule();
  81                        spin_lock(&pagemap_lru_lock);
  82                        continue;
  83                }
  84
  85                page = list_entry(entry, struct page, lru);
  86
  87                if (unlikely(!PageLRU(page)))
  88                        BUG();
  89                if (unlikely(PageActive(page)))
  90                        BUG();
  91
                    /*
                     * Rotate the page to the head of the inactive list, so
                     * that every "continue" below leaves it at the far end
                     * from the tail being scanned.
                     */
  92                list_del(entry);
  93                list_add(entry, &inactive_list);
  94                KERNEL_STAT_INC(pgscan);
  95
  96                /*
  97                 * Zero page counts can happen because we unlink the pages
  98                 * _after_ decrementing the usage count..
  99                 */
 100                if (unlikely(!page_count(page)))
 101                        continue;
 102
                    /* Only pages belonging to the requested classzone count. */
 103                if (!memclass(page_zone(page), classzone))
 104                        continue;
 105
 106                /*
 107                 * swap activity never enters the filesystem and is safe
 108                 * for GFP_NOFS allocations.
 109                 */
 110                may_enter_fs = (gfp_mask & __GFP_FS) ||
 111                                (PageSwapCache(page) && (gfp_mask & __GFP_IO));
 112
 113                /*
 114                 * IO in progress? Leave it at the back of the list.
 115                 * When allowed, wait for the writeback to finish first,
 116                 * holding a page reference while the LRU lock is dropped.
 117                 */
 116                if (unlikely(PageWriteback(page))) {
 117                        if (may_enter_fs) {
 118                                page_cache_get(page);
 119                                spin_unlock(&pagemap_lru_lock);
 120                                wait_on_page_writeback(page);
 121                                page_cache_release(page);
 122                                spin_lock(&pagemap_lru_lock);
 123                        }
 124                        continue;
 125                }
 126
                    /*
                     * Non-blocking attempt to take the page lock; the page
                     * is skipped when the attempt fails.  Every path below
                     * that gives up on the page calls unlock_page(), except
                     * the writeback path (see the note there).
                     */
 127                if (TestSetPageLocked(page))
 128                        continue;
 129
 130                if (PageWriteback(page)) {      /* The non-racy check */
 131                        unlock_page(page);
 132                        continue;
 133                }
 134
 135                /*
 136                 * The page is in active use or really unfreeable. Move to
 137                 * the active list.
 138                 */
 139                pte_chain_lock(page);
 140                if (page_referenced(page) && page_mapping_inuse(page)) {
 141                        del_page_from_inactive_list(page);
 142                        add_page_to_active_list(page);
 143                        pte_chain_unlock(page);
 144                        unlock_page(page);
 145                        KERNEL_STAT_INC(pgactivate);
 146                        continue;
 147                }
 148
 149                /*
 150                 * Anonymous process memory without backing store. Try to
 151                 * allocate it some swap space here.
 152                 *
 153                 * XXX: implement swap clustering ?
 154                 *
 155                 * Both the pte chain lock and the LRU lock are dropped
 156                 * around add_to_swap(); the extra page reference keeps the
 157                 * page alive meanwhile.  If add_to_swap() fails the page is
 158                 * activated and skipped.
 159                 */
 155                if (page->pte.chain && !page->mapping && !PagePrivate(page)) {
 156                        page_cache_get(page);
 157                        pte_chain_unlock(page);
 158                        spin_unlock(&pagemap_lru_lock);
 159                        if (!add_to_swap(page)) {
 160                                activate_page(page);
 161                                unlock_page(page);
 162                                page_cache_release(page);
 163                                spin_lock(&pagemap_lru_lock);
 164                                continue;
 165                        }
 166                        page_cache_release(page);
 167                        spin_lock(&pagemap_lru_lock);
 168                        pte_chain_lock(page);
 169                }
 170
 171                /*
 172                 * The page is mapped into the page tables of one or more
 173                 * processes. Try to unmap it here.
 174                 *
 175                 * SWAP_ERROR / SWAP_FAIL move the page to the active list
 176                 * (page_active, entered with the pte chain lock still
 177                 * held); SWAP_AGAIN leaves it on the inactive list.
 178                 */
 175                if (page->pte.chain) {
 176                        switch (try_to_unmap(page)) {
 177                                case SWAP_ERROR:
 178                                case SWAP_FAIL:
 179                                        goto page_active;
 180                                case SWAP_AGAIN:
 181                                        pte_chain_unlock(page);
 182                                        unlock_page(page);
 183                                        continue;
 184                                case SWAP_SUCCESS:
 185                                        ; /* try to free the page below */
 186                        }
 187                }
 188                pte_chain_unlock(page);
                    /* Sampled once here and reused for the rest of the iteration. */
 189                mapping = page->mapping;
 190
 191                if (PageDirty(page) && is_page_cache_freeable(page) &&
 192                                page->mapping && may_enter_fs) {
 193                        /*
 194                         * It is not critical here to write it only if
 195                         * the page is unmapped because any direct writer
 196                         * like O_DIRECT would set the page's dirty bitflag
 197                         * on the physical page after having successfully
 198                         * pinned it and after the I/O to the page is finished,
 199                         * so the direct writes to the page cannot get lost.
 200                         *
 201                         * The mapping's vm_writeback operation is used when
 202                         * it is set, generic_vm_writeback otherwise.  The
 203                         * amount by which the callee lowers nr_to_write is
 204                         * charged against max_scan.
 205                         *
 206                         * NOTE(review): the page lock taken above is not
 207                         * released on this path; presumably the writeback
 208                         * routine unlocks the page - confirm.
 209                         */
 201                        int (*writeback)(struct page *, int *);
 202                        const int cluster_size = SWAP_CLUSTER_MAX;
 203                        int nr_to_write = cluster_size;
 204
 205                        writeback = mapping->a_ops->vm_writeback;
 206                        if (writeback == NULL)
 207                                writeback = generic_vm_writeback;
 208                        page_cache_get(page);
 209                        spin_unlock(&pagemap_lru_lock);
 210                        (*writeback)(page, &nr_to_write);
 211                        max_scan -= (cluster_size - nr_to_write);
 212                        page_cache_release(page);
 213                        spin_lock(&pagemap_lru_lock);
 214                        continue;
 215                }
 216
 217                /*
 218                 * If the page has buffers, try to free the buffer mappings
 219                 * associated with this page. If we succeed we try to free
 220                 * the page as well.
 221                 *
 222                 * We do this even if the page is PageDirty().
 223                 * try_to_release_page() does not perform I/O, but it is
 224                 * possible for a page to have PageDirty set, but it is actually
 225                 * clean (all its buffers are clean).  This happens if the
 226                 * buffers were written out directly, with submit_bh(). ext3
 227                 * will do this, as well as the blockdev mapping.
 228                 * try_to_release_page() will discover that cleanness and will
 229                 * drop the buffers and mark the page clean - it can be freed.
 230                 */
 231                if (PagePrivate(page)) {
 232                        spin_unlock(&pagemap_lru_lock);
 233
 234                        /* avoid to free a locked page */
 235                        page_cache_get(page);
 236
 237                        if (try_to_release_page(page, gfp_mask)) {
 238                                if (!mapping) {
 239                                        /*
 240                                         * effectively free the page here
 241                                         *
 242                                         * NOTE(review): nr_pages is
 243                                         * decremented but pgsteal is not
 244                                         * bumped here, unlike the final
 245                                         * free path at the bottom of the
 246                                         * loop - confirm this is intended.
 247                                         */
 240                                        unlock_page(page);
 241                                        page_cache_release(page);
 242
 243                                        spin_lock(&pagemap_lru_lock);
 244                                        if (--nr_pages)
 245                                                continue;
 246                                        break;
 247                                } else {
 248                                        /*
 249                                         * The page is still in pagecache so undo the stuff
 250                                         * before the try_to_release_page since we've not
 251                                         * finished and we can now try the next step.
 252                                         */
 253                                        page_cache_release(page);
 254
 255                                        spin_lock(&pagemap_lru_lock);
 256                                }
 257                        } else {
 258                                /* failed to drop the buffers so stop here */
 259                                unlock_page(page);
 260                                page_cache_release(page);
 261
 262                                spin_lock(&pagemap_lru_lock);
 263                                continue;
 264                        }
 265                }
 266
 267                /*
 268                 * This is the non-racy check for busy page.
 269                 * On success we jump to page_freeable with
 270                 * mapping->page_lock write-held; a page without a mapping,
 271                 * or one that is not freeable, is unlocked and skipped.
 272                 */
 270                if (mapping) {
 271                        write_lock(&mapping->page_lock);
 272                        if (is_page_cache_freeable(page))
 273                                goto page_freeable;
 274                        write_unlock(&mapping->page_lock);
 275                }
 276                unlock_page(page);
 277                continue;
 278page_freeable:
 279                /*
 280                 * It is critical to check PageDirty _after_ we made sure
 281                 * the page is freeable* so not in use by anybody.
 282                 */
 283                if (PageDirty(page)) {
 284                        write_unlock(&mapping->page_lock);
 285                        unlock_page(page);
 286                        continue;
 287                }
 288
 289                /* point of no return */
 290                if (likely(!PageSwapCache(page))) {
 291                        __remove_inode_page(page);
 292                        write_unlock(&mapping->page_lock);
 293                } else {
                            /*
                             * Save the swap entry (kept in page->index)
                             * before the page leaves the swap cache; it is
                             * released with swap_free() once page_lock has
                             * been dropped.
                             */
 294                        swp_entry_t swap;
 295                        swap.val = page->index;
 296                        __delete_from_swap_cache(page);
 297                        write_unlock(&mapping->page_lock);
 298                        swap_free(swap);
 299                }
 300
 301                __lru_cache_del(page);
 302                unlock_page(page);
 303
 304                /* effectively free the page here */
 305                page_cache_release(page);
 306                KERNEL_STAT_INC(pgsteal);
 307                if (--nr_pages)
 308                        continue;
 309                goto out;
 310page_active:
 311                /*
 312                 * OK, we don't know what to do with the page.
 313                 * It's no use keeping it here, so we move it to
 314                 * the active list.
 315                 *
 316                 * Only reached by goto, with the page lock and the pte
 317                 * chain lock both held.
 318                 */
 316                del_page_from_inactive_list(page);
 317                add_page_to_active_list(page);
 318                pte_chain_unlock(page);
 319                unlock_page(page);
 320                KERNEL_STAT_INC(pgactivate);
 321        }
 322out:    spin_unlock(&pagemap_lru_lock);
 323        return nr_pages;
 324}
 325
 326/*
 327 * This moves pages from the active list to
 328 * the inactive list.
 329 *
 330 * We move them the other way if the page is 
 331 * referenced by one or more processes, from rmap
 332 */
 333static void refill_inactive(int nr_pages)
 334{
 335        struct list_head * entry;
 336
 337        spin_lock(&pagemap_lru_lock);
 338        entry = active_list.prev;
 339        while (nr_pages-- && entry != &active_list) {
 340                struct page * page;
 341
 342                page = list_entry(entry, struct page, lru);
 343                entry = entry->prev;
 344
 345                KERNEL_STAT_INC(pgscan);
 346
 347                pte_chain_lock(page);
 348                if (page->pte.chain && page_referenced(page)) {
 349                        list_del(&page->lru);
 350                        list_add(&page->lru, &active_list);
 351                        pte_chain_unlock(page);
 352                        continue;
 353                }
 354                del_page_from_active_list(page);
 355                add_page_to_inactive_list(page);
 356                pte_chain_unlock(page);
 357                KERNEL_STAT_INC(pgdeactivate);
 358        }
 359        spin_unlock(&pagemap_lru_lock);
 360}
 361
 362static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
 363static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
 364{
 365        int chunk_size = nr_pages;
 366        unsigned long ratio;
 367        struct page_state ps;
 368        int max_scan;
 369
 370        nr_pages -= kmem_cache_reap(gfp_mask);
 371        if (nr_pages <= 0)
 372                return 0;
 373
 374        nr_pages = chunk_size;
 375
 376        /*
 377         * Try to keep the active list 2/3 of the size of the cache
 378         */
 379        get_page_state(&ps);
 380        ratio = (unsigned long)nr_pages * ps.nr_active /
 381                                ((ps.nr_inactive | 1) * 2);
 382        refill_inactive(ratio);
 383        max_scan = ps.nr_inactive / priority;
 384        nr_pages = shrink_cache(nr_pages, classzone,
 385                                gfp_mask, priority, max_scan);
 386        if (nr_pages <= 0)
 387                return 0;
 388
 389        wakeup_bdflush();
 390
 391        shrink_dcache_memory(priority, gfp_mask);
 392
 393        /* After shrinking the dcache, get rid of unused inodes too .. */
 394        shrink_icache_memory(1, gfp_mask);
 395#ifdef CONFIG_QUOTA
 396        shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 397#endif
 398
 399        return nr_pages;
 400}
 401
 402int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
 403{
 404        int priority = DEF_PRIORITY;
 405        int nr_pages = SWAP_CLUSTER_MAX;
 406
 407        KERNEL_STAT_INC(pageoutrun);
 408
 409        do {
 410                nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
 411                if (nr_pages <= 0)
 412                        return 1;
 413        } while (--priority);
 414
 415        /*
 416         * Hmm.. Cache shrink failed - time to kill something?
 417         * Mhwahahhaha! This is the part I really like. Giggle.
 418         */
 419        out_of_memory();
 420        return 0;
 421}
 422
 423DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);       /* wait queue the kswapd() thread adds itself to before it sleeps */
 424
 425static int check_classzone_need_balance(zone_t * classzone)
 426{
 427        zone_t * first_classzone;
 428
 429        first_classzone = classzone->zone_pgdat->node_zones;
 430        while (classzone >= first_classzone) {
 431                if (classzone->free_pages > classzone->pages_high)
 432                        return 0;
 433                classzone--;
 434        }
 435        return 1;
 436}
 437
 438static int kswapd_balance_pgdat(pg_data_t * pgdat)
 439{
 440        int need_more_balance = 0, i;
 441        zone_t * zone;
 442
 443        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 444                zone = pgdat->node_zones + i;
 445                cond_resched();
 446                if (!zone->need_balance)
 447                        continue;
 448                if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
 449                        zone->need_balance = 0;
 450                        __set_current_state(TASK_INTERRUPTIBLE);
 451                        schedule_timeout(HZ);
 452                        continue;
 453                }
 454                if (check_classzone_need_balance(zone))
 455                        need_more_balance = 1;
 456                else
 457                        zone->need_balance = 0;
 458        }
 459
 460        return need_more_balance;
 461}
 462
 463static void kswapd_balance(void)
 464{
 465        int need_more_balance;
 466        pg_data_t * pgdat;
 467
 468        do {
 469                need_more_balance = 0;
 470                pgdat = pgdat_list;
 471                do
 472                        need_more_balance |= kswapd_balance_pgdat(pgdat);
 473                while ((pgdat = pgdat->pgdat_next));
 474        } while (need_more_balance);
 475}
 476
 477static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
 478{
 479        zone_t * zone;
 480        int i;
 481
 482        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 483                zone = pgdat->node_zones + i;
 484                if (!zone->need_balance)
 485                        continue;
 486                return 0;
 487        }
 488
 489        return 1;
 490}
 491
 492static int kswapd_can_sleep(void)
 493{
 494        pg_data_t * pgdat;
 495
 496        pgdat = pgdat_list;
 497        do {
 498                if (kswapd_can_sleep_pgdat(pgdat))
 499                        continue;
 500                return 0;
 501        } while ((pgdat = pgdat->pgdat_next));
 502
 503        return 1;
 504}
 505
 506/*
 507 * The background pageout daemon, started as a kernel thread
 508 * from the init process. 
 509 *
 510 * This basically trickles out pages so that we have _some_
 511 * free memory available even if there is no other activity
 512 * that frees anything up. This is needed for things like routing
 513 * etc, where we otherwise might have all activity going on in
 514 * asynchronous contexts that cannot page things out.
 515 *
 516 * If there are applications that are active memory-allocators
 517 * (most normal use), this basically shouldn't matter.
 518 */
 519int kswapd(void *unused)
 520{
 521        struct task_struct *tsk = current;
 522        DECLARE_WAITQUEUE(wait, tsk);
 523
 524        daemonize();
 525        strcpy(tsk->comm, "kswapd");
 526        sigfillset(&tsk->blocked);
 527        
 528        /*
 529         * Tell the memory management that we're a "memory allocator",
 530         * and that if we need more memory we should get access to it
 531         * regardless (see "__alloc_pages()"). "kswapd" should
 532         * never get caught in the normal page freeing logic.
 533         *
 534         * (Kswapd normally doesn't need memory anyway, but sometimes
 535         * you need a small amount of memory in order to be able to
 536         * page out something else, and this flag essentially protects
 537         * us from recursively trying to free more memory as we're
 538         * trying to free the first piece of memory in the first place).
 539         */
 540        tsk->flags |= PF_MEMALLOC;
 541
 542        /*
 543         * Kswapd main loop.
 544         */
 545        for (;;) {
 546                if (current->flags & PF_FREEZE)
 547                        refrigerator(PF_IOTHREAD);
 548                __set_current_state(TASK_INTERRUPTIBLE);
 549                add_wait_queue(&kswapd_wait, &wait);
 550
 551                mb();
 552                if (kswapd_can_sleep())
 553                        schedule();
 554
 555                __set_current_state(TASK_RUNNING);
 556                remove_wait_queue(&kswapd_wait, &wait);
 557
 558                /*
 559                 * If we actually get into a low-memory situation,
 560                 * the processes needing more memory will wake us
 561                 * up on a more timely basis.
 562                 */
 563                kswapd_balance();
 564                blk_run_queues();
 565        }
 566}
 567
 568static int __init kswapd_init(void)
 569{
 570        printk("Starting kswapd\n");
 571        swap_setup();
 572        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 573        return 0;
 574}
 575
 576module_init(kswapd_init)
 577
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.