linux-bk/mm/vmscan.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmscan.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *
   6 *  Swap reorganised 29.12.95, Stephen Tweedie.
   7 *  kswapd added: 7.1.96  sct
   8 *  Removed kswapd_ctl limits, and swap out as many pages as needed
   9 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  10 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  11 *  Multiqueue VM started 5.8.00, Rik van Riel.
  12 */
  13
  14#include <linux/mm.h>
  15#include <linux/slab.h>
  16#include <linux/kernel_stat.h>
  17#include <linux/swap.h>
  18#include <linux/pagemap.h>
  19#include <linux/init.h>
  20#include <linux/highmem.h>
  21#include <linux/file.h>
  22#include <linux/writeback.h>
  23#include <linux/suspend.h>
  24#include <linux/buffer_head.h>          /* for try_to_release_page() */
  25#include <linux/mm_inline.h>
  26#include <linux/pagevec.h>
  27#include <linux/rmap-locking.h>
  28
  29#include <asm/pgalloc.h>
  30#include <asm/tlbflush.h>
  31#include <linux/swapops.h>
  32
  33/*
  34 * The "priority" of VM scanning is how much of the queues we
  35 * will scan in one go.  try_to_free_pages() starts at DEF_PRIORITY
  36 * and counts down to 1, and shrink_zone() scans at most
  37 * "nr_inactive / priority" pages of the inactive list per pass.
     *
     * NOTE(review): this comment used to promise 1/64th of the queues
     * ("queue_length >> 6") for a DEF_PRIORITY of 6, but the divide in
     * shrink_zone() gives 1/6th.  Confirm which of the two is intended.
  38 */
  39#define DEF_PRIORITY (6)
  40
  /*
   * prefetch_prev_lru_page - prefetch one field of the page that precedes
   * _page on an LRU list.
   *
   * @_page:  page whose ->lru.prev neighbour is wanted.  It is evaluated
   *          twice, so it must not have side effects.
   * @_base:  head of the list being walked; nothing is prefetched when the
   *          previous entry is the head itself.
   * @_field: name of the struct page member whose address is handed to
   *          prefetch().
   *
   * Every use of a macro argument that is an expression is parenthesised so
   * that callers may pass things like "&zone->inactive_list" or "p + 1"
   * without precedence surprises.
   *
   * Expands to an empty statement when the architecture does not define
   * ARCH_HAS_PREFETCH; the arguments are then not evaluated at all.
   */
  #ifdef ARCH_HAS_PREFETCH
  #define prefetch_prev_lru_page(_page, _base, _field)                    \
          do {                                                            \
                  if ((_page)->lru.prev != (_base)) {                     \
                          struct page *prev;                              \
                                                                          \
                          prev = list_entry((_page)->lru.prev,            \
                                          struct page, lru);              \
                          prefetch(&prev->_field);                        \
                  }                                                       \
          } while (0)
  #else
  #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  55
  /*
   * prefetchw_prev_lru_page - like prefetch_prev_lru_page(), but the field's
   * address is handed to prefetchw() instead of prefetch().
   *
   * @_page:  page whose ->lru.prev neighbour is wanted.  It is evaluated
   *          twice, so it must not have side effects.
   * @_base:  head of the list being walked; nothing is prefetched when the
   *          previous entry is the head itself.
   * @_field: name of the struct page member whose address is handed to
   *          prefetchw().
   *
   * Every use of a macro argument that is an expression is parenthesised so
   * that callers may pass things like "&zone->inactive_list" or "p + 1"
   * without precedence surprises.
   *
   * Expands to an empty statement when the architecture does not define
   * ARCH_HAS_PREFETCHW; the arguments are then not evaluated at all.
   */
  #ifdef ARCH_HAS_PREFETCHW
  #define prefetchw_prev_lru_page(_page, _base, _field)                   \
          do {                                                            \
                  if ((_page)->lru.prev != (_base)) {                     \
                          struct page *prev;                              \
                                                                          \
                          prev = list_entry((_page)->lru.prev,            \
                                          struct page, lru);              \
                          prefetchw(&prev->_field);                       \
                  }                                                       \
          } while (0)
  #else
  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
  #endif
  70
  71/* Must be called with page's pte_chain_lock held. */
  72static inline int page_mapping_inuse(struct page * page)
  73{
  74        struct address_space *mapping = page->mapping;
  75
  76        /* Page is in somebody's page tables. */
  77        if (page_mapped(page))
  78                return 1;
  79
  80        /* XXX: does this happen ? */
  81        if (!mapping)
  82                return 0;
  83
  84        /* File is mmap'd by somebody. */
  85        if (!list_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_shared))
  86                return 1;
  87
  88        return 0;
  89}
  90
  /*
   * is_page_cache_freeable - does the page have only the expected references?
   *
   * Returns non-zero when page_count() is exactly 2, or exactly 3 for a page
   * with PagePrivate set.
   *
   * NOTE(review): presumably the two references are the pagecache's and the
   * caller's, and private data (buffers) pins one more - confirm.
   */
  static inline int is_page_cache_freeable(struct page *page)
  {
          int expected_refs = 2;

          if (PagePrivate(page))
                  expected_refs++;

          return page_count(page) == expected_refs;
  }
  95
  96static /* inline */ int
  97shrink_list(struct list_head *page_list, int nr_pages,
  98                unsigned int gfp_mask, int priority, int *max_scan)
  99{
 100        struct address_space *mapping;
 101        LIST_HEAD(ret_pages);
 102        struct pagevec freed_pvec;
 103        const int nr_pages_in = nr_pages;
 104        int pgactivate = 0;
 105
 106        pagevec_init(&freed_pvec);
 107        while (!list_empty(page_list)) {
 108                struct page *page;
 109                int may_enter_fs;
 110
 111                page = list_entry(page_list->prev, struct page, lru);
 112                list_del(&page->lru);
 113
 114                if (TestSetPageLocked(page))
 115                        goto keep;
 116
 117                BUG_ON(PageActive(page));
 118                may_enter_fs = (gfp_mask & __GFP_FS) ||
 119                                (PageSwapCache(page) && (gfp_mask & __GFP_IO));
 120                if (PageWriteback(page)) {
 121                        if (may_enter_fs)
 122                                wait_on_page_writeback(page);  /* throttling */
 123                        else
 124                                goto keep_locked;
 125                }
 126
 127                pte_chain_lock(page);
 128                if (page_referenced(page) && page_mapping_inuse(page)) {
 129                        /* In active use or really unfreeable.  Activate it. */
 130                        pte_chain_unlock(page);
 131                        goto activate_locked;
 132                }
 133
 134                mapping = page->mapping;
 135
 136                /*
 137                 * Anonymous process memory without backing store. Try to
 138                 * allocate it some swap space here.
 139                 *
 140                 * XXX: implement swap clustering ?
 141                 */
 142                if (page_mapped(page) && !mapping && !PagePrivate(page)) {
 143                        pte_chain_unlock(page);
 144                        if (!add_to_swap(page))
 145                                goto activate_locked;
 146                        pte_chain_lock(page);
 147                        mapping = page->mapping;
 148                }
 149
 150                /*
 151                 * The page is mapped into the page tables of one or more
 152                 * processes. Try to unmap it here.
 153                 */
 154                if (page_mapped(page) && mapping) {
 155                        switch (try_to_unmap(page)) {
 156                        case SWAP_ERROR:
 157                        case SWAP_FAIL:
 158                                pte_chain_unlock(page);
 159                                goto activate_locked;
 160                        case SWAP_AGAIN:
 161                                pte_chain_unlock(page);
 162                                goto keep_locked;
 163                        case SWAP_SUCCESS:
 164                                ; /* try to free the page below */
 165                        }
 166                }
 167                pte_chain_unlock(page);
 168
 169                /*
 170                 * FIXME: this is CPU-inefficient for shared mappings.
 171                 * try_to_unmap() will set the page dirty and ->vm_writeback
 172                 * will write it.  So we're back to page-at-a-time writepage
 173                 * in LRU order.
 174                 */
 175                if (PageDirty(page) && is_page_cache_freeable(page) &&
 176                                        mapping && may_enter_fs) {
 177                        int (*writeback)(struct page *,
 178                                        struct writeback_control *);
 179                        const int cluster_size = SWAP_CLUSTER_MAX;
 180                        struct writeback_control wbc = {
 181                                .nr_to_write = cluster_size,
 182                        };
 183
 184                        writeback = mapping->a_ops->vm_writeback;
 185                        if (writeback == NULL)
 186                                writeback = generic_vm_writeback;
 187                        (*writeback)(page, &wbc);
 188                        *max_scan -= (cluster_size - wbc.nr_to_write);
 189                        goto keep;
 190                }
 191
 192                /*
 193                 * If the page has buffers, try to free the buffer mappings
 194                 * associated with this page. If we succeed we try to free
 195                 * the page as well.
 196                 *
 197                 * We do this even if the page is PageDirty().
 198                 * try_to_release_page() does not perform I/O, but it is
 199                 * possible for a page to have PageDirty set, but it is actually
 200                 * clean (all its buffers are clean).  This happens if the
 201                 * buffers were written out directly, with submit_bh(). ext3
 202                 * will do this, as well as the blockdev mapping. 
 203                 * try_to_release_page() will discover that cleanness and will
 204                 * drop the buffers and mark the page clean - it can be freed.
 205                 *
 206                 * Rarely, pages can have buffers and no ->mapping.  These are
 207                 * the pages which were not successfully invalidated in
 208                 * truncate_complete_page().  We try to drop those buffers here
 209                 * and if that worked, and the page is no longer mapped into
 210                 * process address space (page_count == 0) it can be freed.
 211                 * Otherwise, leave the page on the LRU so it is swappable.
 212                 */
 213                if (PagePrivate(page)) {
 214                        if (!try_to_release_page(page, gfp_mask))
 215                                goto keep_locked;
 216                        if (!mapping && page_count(page) == 1)
 217                                goto free_it;
 218                }
 219
 220                if (!mapping)
 221                        goto keep_locked;       /* truncate got there first */
 222
 223                write_lock(&mapping->page_lock);
 224
 225                /*
 226                 * The non-racy check for busy page.  It is critical to check
 227                 * PageDirty _after_ making sure that the page is freeable and
 228                 * not in use by anybody.       (pagecache + us == 2)
 229                 */
 230                if (page_count(page) != 2 || PageDirty(page)) {
 231                        write_unlock(&mapping->page_lock);
 232                        goto keep_locked;
 233                }
 234
 235                if (PageSwapCache(page)) {
 236                        swp_entry_t swap = { .val = page->index };
 237                        __delete_from_swap_cache(page);
 238                        write_unlock(&mapping->page_lock);
 239                        swap_free(swap);
 240                } else {
 241                        __remove_from_page_cache(page);
 242                        write_unlock(&mapping->page_lock);
 243                }
 244                __put_page(page);       /* The pagecache ref */
 245free_it:
 246                unlock_page(page);
 247                nr_pages--;
 248                if (!pagevec_add(&freed_pvec, page))
 249                        __pagevec_release_nonlru(&freed_pvec);
 250                continue;
 251
 252activate_locked:
 253                SetPageActive(page);
 254                pgactivate++;
 255keep_locked:
 256                unlock_page(page);
 257keep:
 258                list_add(&page->lru, &ret_pages);
 259                BUG_ON(PageLRU(page));
 260        }
 261        list_splice(&ret_pages, page_list);
 262        if (pagevec_count(&freed_pvec))
 263                __pagevec_release_nonlru(&freed_pvec);
 264        KERNEL_STAT_ADD(pgsteal, nr_pages_in - nr_pages);
 265        KERNEL_STAT_ADD(pgactivate, pgactivate);
 266        return nr_pages;
 267}
 268
 269/*
 270 * zone->lru_lock is heavily contented.  We relieve it by quickly privatising
 271 * a batch of pages and working on them outside the lock.  Any pages which were
 272 * not freed will be added back to the LRU.
 273 *
 274 * shrink_cache() is passed the number of pages to try to free, and returns
 275 * the number which are yet-to-free.
 276 *
 277 * For pagecache intensive workloads, the first loop here is the hottest spot
 278 * in the kernel (apart from the copy_*_user functions).
 279 */
 280static /* inline */ int
 281shrink_cache(int nr_pages, struct zone *zone,
 282                unsigned int gfp_mask, int priority, int max_scan)
 283{
 284        LIST_HEAD(page_list);
 285        struct pagevec pvec;
 286        int nr_to_process;
 287
 288        /*
 289         * Try to ensure that we free `nr_pages' pages in one pass of the loop.
 290         */
 291        nr_to_process = nr_pages;
 292        if (nr_to_process < SWAP_CLUSTER_MAX)
 293                nr_to_process = SWAP_CLUSTER_MAX;
 294
 295        pagevec_init(&pvec);
 296
 297        lru_add_drain();
 298        spin_lock_irq(&zone->lru_lock);
 299        while (max_scan > 0 && nr_pages > 0) {
 300                struct page *page;
 301                int n = 0;
 302
 303                while (n < nr_to_process && !list_empty(&zone->inactive_list)) {
 304                        page = list_entry(zone->inactive_list.prev,
 305                                        struct page, lru);
 306
 307                        prefetchw_prev_lru_page(page,
 308                                                &zone->inactive_list, flags);
 309
 310                        if (!TestClearPageLRU(page))
 311                                BUG();
 312                        list_del(&page->lru);
 313                        if (page_count(page) == 0) {
 314                                /* It is currently in pagevec_release() */
 315                                SetPageLRU(page);
 316                                list_add(&page->lru, &zone->inactive_list);
 317                                continue;
 318                        }
 319                        list_add(&page->lru, &page_list);
 320                        page_cache_get(page);
 321                        n++;
 322                }
 323                zone->nr_inactive -= n;
 324                spin_unlock_irq(&zone->lru_lock);
 325
 326                if (list_empty(&page_list))
 327                        goto done;
 328
 329                max_scan -= n;
 330                KERNEL_STAT_ADD(pgscan, n);
 331                nr_pages = shrink_list(&page_list, nr_pages,
 332                                        gfp_mask, priority, &max_scan);
 333
 334                if (nr_pages <= 0 && list_empty(&page_list))
 335                        goto done;
 336
 337                spin_lock_irq(&zone->lru_lock);
 338                /*
 339                 * Put back any unfreeable pages.
 340                 */
 341                while (!list_empty(&page_list)) {
 342                        page = list_entry(page_list.prev, struct page, lru);
 343                        if (TestSetPageLRU(page))
 344                                BUG();
 345                        list_del(&page->lru);
 346                        if (PageActive(page))
 347                                add_page_to_active_list(zone, page);
 348                        else
 349                                add_page_to_inactive_list(zone, page);
 350                        if (!pagevec_add(&pvec, page)) {
 351                                spin_unlock_irq(&zone->lru_lock);
 352                                __pagevec_release(&pvec);
 353                                spin_lock_irq(&zone->lru_lock);
 354                        }
 355                }
 356        }
 357        spin_unlock_irq(&zone->lru_lock);
 358done:
 359        pagevec_release(&pvec);
 360        return nr_pages;        
 361}
 362
 363/*
 364 * This moves pages from the active list to the inactive list.
 365 *
 366 * We move them the other way if the page is referenced by one or more
 367 * processes, from rmap.
 368 *
 369 * If the pages are mostly unmapped, the processing is fast and it is
 370 * appropriate to hold zone->lru_lock across the whole operation.  But if
 371 * the pages are mapped, the processing is slow (page_referenced()) so we
 372 * should drop zone->lru_lock around each page.  It's impossible to balance
 373 * this, so instead we remove the pages from the LRU while processing them.
 374 * It is safe to rely on PG_active against the non-LRU pages in here because
 375 * nobody will play with that bit on a non-LRU page.
 376 *
 377 * The downside is that we have to touch page->count against each page.
 378 * But we had to alter page->flags anyway.
 379 */
 380static /* inline */ void
 381refill_inactive_zone(struct zone *zone, const int nr_pages_in)
 382{
 383        int pgdeactivate = 0;
 384        int nr_pages = nr_pages_in;
 385        LIST_HEAD(l_hold);      /* The pages which were snipped off */
 386        LIST_HEAD(l_inactive);  /* Pages to go onto the inactive_list */
 387        LIST_HEAD(l_active);    /* Pages to go onto the active_list */
 388        struct page *page;
 389        struct pagevec pvec;
 390
 391        lru_add_drain();
 392        spin_lock_irq(&zone->lru_lock);
 393        while (nr_pages && !list_empty(&zone->active_list)) {
 394                page = list_entry(zone->active_list.prev, struct page, lru);
 395                prefetchw_prev_lru_page(page, &zone->active_list, flags);
 396                if (!TestClearPageLRU(page))
 397                        BUG();
 398                list_del(&page->lru);
 399                if (page_count(page) == 0) {
 400                        /* It is currently in pagevec_release() */
 401                        SetPageLRU(page);
 402                        list_add(&page->lru, &zone->active_list);
 403                        continue;
 404                }
 405                page_cache_get(page);
 406                list_add(&page->lru, &l_hold);
 407                nr_pages--;
 408        }
 409        spin_unlock_irq(&zone->lru_lock);
 410
 411        while (!list_empty(&l_hold)) {
 412                page = list_entry(l_hold.prev, struct page, lru);
 413                list_del(&page->lru);
 414                if (page_mapped(page)) {
 415                        pte_chain_lock(page);
 416                        if (page_mapped(page) && page_referenced(page)) {
 417                                pte_chain_unlock(page);
 418                                list_add(&page->lru, &l_active);
 419                                continue;
 420                        }
 421                        pte_chain_unlock(page);
 422                }
 423                list_add(&page->lru, &l_inactive);
 424                pgdeactivate++;
 425        }
 426
 427        pagevec_init(&pvec);
 428        spin_lock_irq(&zone->lru_lock);
 429        while (!list_empty(&l_inactive)) {
 430                page = list_entry(l_inactive.prev, struct page, lru);
 431                prefetchw_prev_lru_page(page, &l_inactive, flags);
 432                if (TestSetPageLRU(page))
 433                        BUG();
 434                if (!TestClearPageActive(page))
 435                        BUG();
 436                list_move(&page->lru, &zone->inactive_list);
 437                if (!pagevec_add(&pvec, page)) {
 438                        spin_unlock_irq(&zone->lru_lock);
 439                        if (buffer_heads_over_limit)
 440                                pagevec_strip(&pvec);
 441                        __pagevec_release(&pvec);
 442                        spin_lock_irq(&zone->lru_lock);
 443                }
 444        }
 445        if (buffer_heads_over_limit) {
 446                spin_unlock_irq(&zone->lru_lock);
 447                pagevec_strip(&pvec);
 448                spin_lock_irq(&zone->lru_lock);
 449        }
 450        while (!list_empty(&l_active)) {
 451                page = list_entry(l_active.prev, struct page, lru);
 452                prefetchw_prev_lru_page(page, &l_active, flags);
 453                if (TestSetPageLRU(page))
 454                        BUG();
 455                BUG_ON(!PageActive(page));
 456                list_move(&page->lru, &zone->active_list);
 457                if (!pagevec_add(&pvec, page)) {
 458                        spin_unlock_irq(&zone->lru_lock);
 459                        __pagevec_release(&pvec);
 460                        spin_lock_irq(&zone->lru_lock);
 461                }
 462        }
 463        zone->nr_active -= pgdeactivate;
 464        zone->nr_inactive += pgdeactivate;
 465        spin_unlock_irq(&zone->lru_lock);
 466        pagevec_release(&pvec);
 467
 468        KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages);
 469        KERNEL_STAT_ADD(pgdeactivate, pgdeactivate);
 470}
 471
 472static /* inline */ int
 473shrink_zone(struct zone *zone, int priority,
 474        unsigned int gfp_mask, int nr_pages)
 475{
 476        unsigned long ratio;
 477        int max_scan;
 478
 479        /* This is bogus for ZONE_HIGHMEM? */
 480        if (kmem_cache_reap(gfp_mask) >= nr_pages)
 481                return 0;
 482
 483        /*
 484         * Try to keep the active list 2/3 of the size of the cache.  And
 485         * make sure that refill_inactive is given a decent number of pages.
 486         *
 487         * The "ratio+1" here is important.  With pagecache-intensive workloads
 488         * the inactive list is huge, and `ratio' evaluates to zero all the
 489         * time.  Which pins the active list memory.  So we add one to `ratio'
 490         * just to make sure that the kernel will slowly sift through the
 491         * active list.
 492         */
 493        ratio = (unsigned long)nr_pages * zone->nr_active /
 494                                ((zone->nr_inactive | 1) * 2);
 495        atomic_add(ratio+1, &zone->refill_counter);
 496        while (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
 497                atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
 498                refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
 499        }
 500
 501        max_scan = zone->nr_inactive / priority;
 502        nr_pages = shrink_cache(nr_pages, zone,
 503                                gfp_mask, priority, max_scan);
 504
 505        if (nr_pages <= 0)
 506                return 0;
 507
 508        wakeup_bdflush();
 509
 510        shrink_dcache_memory(priority, gfp_mask);
 511
 512        /* After shrinking the dcache, get rid of unused inodes too .. */
 513        shrink_icache_memory(1, gfp_mask);
 514#ifdef CONFIG_QUOTA
 515        shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 516#endif
 517
 518        return nr_pages;
 519}
 520
 521static int
 522shrink_caches(struct zone *classzone, int priority,
 523                int gfp_mask, int nr_pages)
 524{
 525        struct zone *first_classzone;
 526        struct zone *zone;
 527
 528        first_classzone = classzone->zone_pgdat->node_zones;
 529        zone = classzone;
 530        while (zone >= first_classzone && nr_pages > 0) {
 531                if (zone->free_pages <= zone->pages_high) {
 532                        nr_pages = shrink_zone(zone, priority,
 533                                        gfp_mask, nr_pages);
 534                }
 535                zone--;
 536        }
 537        return nr_pages;
 538}
 539
 540/*
 541 * This is the main entry point to page reclaim.
 542 *
 543 * If a full scan of the inactive list fails to free enough memory then we
 544 * are "out of memory" and something needs to be killed.
 545 *
 546 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 547 * high - the zone may be full of dirty or under-writeback pages, which this
 548 * caller can't do much about.  So for !__GFP_FS callers, we just perform a
 549 * small LRU walk and if that didn't work out, fail the allocation back to the
 550 * caller.  GFP_NOFS allocators need to know how to deal with it.  Kicking
 551 * bdflush, waiting and retrying will work.
 552 *
 553 * This is a fairly lame algorithm - it can result in excessive CPU burning and
 554 * excessive rotation of the inactive list, which is _supposed_ to be an LRU,
 555 * yes?
 556 */
 557int
 558try_to_free_pages(struct zone *classzone,
 559                unsigned int gfp_mask, unsigned int order)
 560{
 561        int priority = DEF_PRIORITY;
 562        int nr_pages = SWAP_CLUSTER_MAX;
 563
 564        KERNEL_STAT_INC(pageoutrun);
 565
 566        for (priority = DEF_PRIORITY; priority; priority--) {
 567                nr_pages = shrink_caches(classzone, priority,
 568                                        gfp_mask, nr_pages);
 569                if (nr_pages <= 0)
 570                        return 1;
 571                if (!(gfp_mask & __GFP_FS))
 572                        break;
 573        }
 574        if (gfp_mask & __GFP_FS)
 575                out_of_memory();
 576        return 0;
 577}
 578
 /* Wait queue on which kswapd() sleeps when kswapd_can_sleep() says so. */
 579DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
 580
 581static int check_classzone_need_balance(struct zone *classzone)
 582{
 583        struct zone *first_classzone;
 584
 585        first_classzone = classzone->zone_pgdat->node_zones;
 586        while (classzone >= first_classzone) {
 587                if (classzone->free_pages > classzone->pages_high)
 588                        return 0;
 589                classzone--;
 590        }
 591        return 1;
 592}
 593
 594static int kswapd_balance_pgdat(pg_data_t * pgdat)
 595{
 596        int need_more_balance = 0, i;
 597        struct zone *zone;
 598
 599        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 600                zone = pgdat->node_zones + i;
 601                cond_resched();
 602                if (!zone->need_balance)
 603                        continue;
 604                if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
 605                        zone->need_balance = 0;
 606                        __set_current_state(TASK_INTERRUPTIBLE);
 607                        schedule_timeout(HZ);
 608                        continue;
 609                }
 610                if (check_classzone_need_balance(zone))
 611                        need_more_balance = 1;
 612                else
 613                        zone->need_balance = 0;
 614        }
 615
 616        return need_more_balance;
 617}
 618
 619static void kswapd_balance(void)
 620{
 621        int need_more_balance;
 622        pg_data_t * pgdat;
 623
 624        do {
 625                need_more_balance = 0;
 626                pgdat = pgdat_list;
 627                do
 628                        need_more_balance |= kswapd_balance_pgdat(pgdat);
 629                while ((pgdat = pgdat->pgdat_next));
 630        } while (need_more_balance);
 631}
 632
 633static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
 634{
 635        struct zone *zone;
 636        int i;
 637
 638        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 639                zone = pgdat->node_zones + i;
 640                if (!zone->need_balance)
 641                        continue;
 642                return 0;
 643        }
 644
 645        return 1;
 646}
 647
 648static int kswapd_can_sleep(void)
 649{
 650        pg_data_t * pgdat;
 651
 652        pgdat = pgdat_list;
 653        do {
 654                if (kswapd_can_sleep_pgdat(pgdat))
 655                        continue;
 656                return 0;
 657        } while ((pgdat = pgdat->pgdat_next));
 658
 659        return 1;
 660}
 661
 662/*
 663 * The background pageout daemon, started as a kernel thread
 664 * from the init process. 
 665 *
 666 * This basically trickles out pages so that we have _some_
 667 * free memory available even if there is no other activity
 668 * that frees anything up. This is needed for things like routing
 669 * etc, where we otherwise might have all activity going on in
 670 * asynchronous contexts that cannot page things out.
 671 *
 672 * If there are applications that are active memory-allocators
 673 * (most normal use), this basically shouldn't matter.
 674 */
 675int kswapd(void *unused)
 676{
 677        struct task_struct *tsk = current;
 678        DECLARE_WAITQUEUE(wait, tsk);
 679
 680        daemonize();
 681        strcpy(tsk->comm, "kswapd");
 682        sigfillset(&tsk->blocked);
 683        
 684        /*
 685         * Tell the memory management that we're a "memory allocator",
 686         * and that if we need more memory we should get access to it
 687         * regardless (see "__alloc_pages()"). "kswapd" should
 688         * never get caught in the normal page freeing logic.
 689         *
 690         * (Kswapd normally doesn't need memory anyway, but sometimes
 691         * you need a small amount of memory in order to be able to
 692         * page out something else, and this flag essentially protects
 693         * us from recursively trying to free more memory as we're
 694         * trying to free the first piece of memory in the first place).
 695         */
 696        tsk->flags |= PF_MEMALLOC;
 697
 698        /*
 699         * Kswapd main loop.
 700         */
 701        for (;;) {
 702                if (current->flags & PF_FREEZE)
 703                        refrigerator(PF_IOTHREAD);
 704                __set_current_state(TASK_INTERRUPTIBLE);
 705                add_wait_queue(&kswapd_wait, &wait);
 706
 707                mb();
 708                if (kswapd_can_sleep())
 709                        schedule();
 710
 711                __set_current_state(TASK_RUNNING);
 712                remove_wait_queue(&kswapd_wait, &wait);
 713
 714                /*
 715                 * If we actually get into a low-memory situation,
 716                 * the processes needing more memory will wake us
 717                 * up on a more timely basis.
 718                 */
 719                kswapd_balance();
 720                blk_run_queues();
 721        }
 722}
 723
 724static int __init kswapd_init(void)
 725{
 726        printk("Starting kswapd\n");
 727        swap_setup();
 728        kernel_thread(kswapd, NULL, CLONE_KERNEL);
 729        return 0;
 730}
 731
 732module_init(kswapd_init)
 733
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.