linux-old/mm/vmscan.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmscan.c
   3 *
   4 *  The pageout daemon, decides which pages to evict (swap out) and
   5 *  does the actual work of freeing them.
   6 *
   7 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8 *
   9 *  Swap reorganised 29.12.95, Stephen Tweedie.
  10 *  kswapd added: 7.1.96  sct
  11 *  Removed kswapd_ctl limits, and swap out as many pages as needed
  12 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  13 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  14 *  Multiqueue VM started 5.8.00, Rik van Riel.
  15 */
  16
  17#include <linux/slab.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/swap.h>
  20#include <linux/swapctl.h>
  21#include <linux/smp_lock.h>
  22#include <linux/pagemap.h>
  23#include <linux/init.h>
  24#include <linux/highmem.h>
  25#include <linux/file.h>
  26
  27#include <asm/pgalloc.h>
  28
  29/*
  30 * "vm_passes" is the number of vm passes before failing the
  31 * memory balancing. Take into account 3 passes are needed
  32 * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio
  33 * of the LRU pages at each pass.
     *
     * It seeds the retry counter of try_to_free_pages_zone().
  34 */
  35int vm_passes = 60;
  36
  37/*
  38 * "vm_cache_scan_ratio" is how much of the LRU queues we will scan
  39 * in one go. shrink_cache() examines at most
  40 * (nr_inactive_pages + nr_active_pages) / vm_cache_scan_ratio pages of
     * the classzone per call, so a value of 6 means 1/6 of that total.
     *
     * NOTE(review): the value is used as a divisor; whether it is
     * range-checked against zero is not visible in this file.
  41 */
  42int vm_cache_scan_ratio = 6;
  43
  44/*
  45 * "vm_mapped_ratio" controls the pageout rate, the smaller, the earlier
  46 * we'll start to pageout. shrink_cache() tolerates
     * vm_mapped_ratio * nr_pages pages it cannot free before it falls back
     * to shrinking the slab/VFS caches and calling swap_out().
  47 */
  48int vm_mapped_ratio = 100;
  49
  50/*
  51 * "vm_lru_balance_ratio" controls the balance between active and
  52 * inactive cache. The bigger vm_lru_balance_ratio is, the easier the
  53 * active cache will grow, because we'll rotate the active list
  54 * slowly. A value of 2 means we'll go towards a balance of
  55 * 1/3 of the cache being inactive. It is the inactive-side multiplier
     * in the divisor computed by refill_inactive().
  56 */
  57int vm_lru_balance_ratio = 2;
  58
  59/*
  60 * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan
  61 * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of
  62 * the unused-inode, dentry and dquot caches will be freed during a normal
  63 * aging round. The value is handed unchanged to shrink_dcache_memory(),
     * shrink_icache_memory() and shrink_dqcache_memory().
  64 */
  65int vm_vfs_scan_ratio = 6;
  66
  67/*
  68 * try_to_swap_out() tries to unmap the single page mapped by *page_table.
  69 *
  70 * It returns 1 when the pte was dropped and the page looked freeable at
  71 * that point (see "freeable" below), and 0 in every other case, including
  72 * when nothing was done.  The caller subtracts the result from the number
  73 * of pages it still has to process.
     *
     * rss may decrease because pages are shared, but this
     * doesn't count as having freed a page.
  74 */
  75
  76/* mm->page_table_lock is held. mmap_sem is not held */
  77static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
  78{
  79        pte_t pte;
  80        swp_entry_t entry;
  81
  82        /* Don't look at this pte if it's locked or has been accessed recently. */
  83        if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
  84                mark_page_accessed(page);
  85                return 0;
  86        }
  87
  88        /* Don't bother unmapping pages that are active */
  89        if (PageActive(page))
  90                return 0;
  91
  92        /* Don't bother replenishing zones not under pressure.. */
  93        if (!memclass(page_zone(page), classzone))
  94                return 0;
  95
            /* Somebody else holds the page lock: skip rather than wait. */
  96        if (TryLockPage(page))
  97                return 0;
  98
  99        /* From this point on, the odds are that we're going to
 100         * nuke this pte, so read and clear the pte.  This hook
 101         * is needed on CPUs which update the accessed and dirty
 102         * bits in hardware.
 103         */
 104        flush_cache_page(vma, address);
 105        pte = ptep_get_and_clear(page_table);
 106        flush_tlb_page(vma, address);
 107
            /* Carry the dirty bit of the cleared pte over to the page. */
 108        if (pte_dirty(pte))
 109                set_page_dirty(page);
 110
 111        /*
 112         * Is the page already in the swap cache? If so, then
 113         * we can just drop our reference to it without doing
 114         * any IO - it's already up-to-date on disk.
 115         */
 116        if (PageSwapCache(page)) {
 117                entry.val = page->index;
 118                swap_duplicate(entry);
                    /*
                     * The two labels below are also entered by goto from
                     * outside this if-body.
                     */
 119set_swap_pte:
 120                set_pte(page_table, swp_entry_to_pte(entry));
 121drop_pte:
 122                mm->rss--;
 123                UnlockPage(page);
 124                {
                            /*
                             * Sampled before our own reference is released:
                             * the page counts as freeable when at most two
                             * references remain once a buffer reference (if
                             * the page has buffers) is discounted.
                             */
 125                        int freeable = page_count(page) - !!page->buffers <= 2;
 126                        page_cache_release(page);
 127                        return freeable;
 128                }
 129        }
 130
 131        /*
 132         * Is it a clean page? Then it must be recoverable
 133         * by just paging it in again, and we can just drop
 134         * it..  or if it's dirty but has backing store,
 135         * just mark the page dirty and drop it.
 136         *
 137         * However, this won't actually free any real
 138         * memory, as the page will just be in the page cache
 139         * somewhere, and as such we should just continue
 140         * our scan.
 141         *
 142         * Basically, this just makes it possible for us to do
 143         * some real work in the future in "refill_inactive()".
 144         */
 145        if (page->mapping)
 146                goto drop_pte;
 147        if (!PageDirty(page))
 148                goto drop_pte;
 149
 150        /*
 151         * Anonymous buffercache pages can be left behind by
 152         * concurrent truncate and pagefault.
 153         */
 154        if (page->buffers)
 155                goto preserve;
 156
 157        /*
 158         * This is a dirty, swappable page.  First of all,
 159         * get a suitable swap entry for it, and make sure
 160         * we have the swap cache set up to associate the
 161         * page with that swap entry.
 162         */
 163        for (;;) {
 164                entry = get_swap_page();
 165                if (!entry.val)
 166                        break;
 167                /* Add it to the swap cache and mark it dirty
 168                 * (adding to the page cache will clear the dirty
 169                 * and uptodate bits, so we need to do it again)
 170                 */
 171                if (add_to_swap_cache(page, entry) == 0) {
 172                        SetPageUptodate(page);
 173                        set_page_dirty(page);
 174                        goto set_swap_pte;
 175                }
 176                /* Raced with "speculative" read_swap_cache_async */
 177                swap_free(entry);
 178        }
 179
 180        /* No swap space left, or the page still has buffers: put the pte back */
 181preserve:
 182        set_pte(page_table, pte);
 183        UnlockPage(page);
 184        return 0;
 185}
 186
 187/*
     * Walk the ptes below one pmd entry, from `address' up to `end' (clamped
     * to the end of this pmd), and call try_to_swap_out() on every present,
     * valid, non-reserved page.
     *
     * `count' is the number of pages still wanted; the remaining count is
     * returned.  Unless the pmd is empty or bad, mm->swap_address is set to
     * the address where the walk stopped.
     *
     * mm->page_table_lock is held. mmap_sem is not held.
     */
 188static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
 189{
 190        pte_t * pte;
 191        unsigned long pmd_end;
 192
 193        if (pmd_none(*dir))
 194                return count;
 195        if (pmd_bad(*dir)) {
 196                pmd_ERROR(*dir);
 197                pmd_clear(dir);
 198                return count;
 199        }
 200        
 201        pte = pte_offset(dir, address);
 202        
            /* Never walk past the range covered by this pmd entry. */
 203        pmd_end = (address + PMD_SIZE) & PMD_MASK;
 204        if (end > pmd_end)
 205                end = pmd_end;
 206
 207        do {
 208                if (pte_present(*pte)) {
 209                        struct page *page = pte_page(*pte);
 210
 211                        if (VALID_PAGE(page) && !PageReserved(page)) {
 212                                count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
 213                                if (!count) {
                                            /* Quota met: record the next address as the resume point. */
 214                                        address += PAGE_SIZE;
 215                                        break;
 216                                }
 217                        }
 218                }
 219                address += PAGE_SIZE;
 220                pte++;
            /* "address &&" stops the loop if the address wrapped around to 0. */
 221        } while (address && (address < end));
 222        mm->swap_address = address;
 223        return count;
 224}
 225
 226/* mm->page_table_lock is held. mmap_sem is not held */
 227static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
 228{
 229        pmd_t * pmd;
 230        unsigned long pgd_end;
 231
 232        if (pgd_none(*dir))
 233                return count;
 234        if (pgd_bad(*dir)) {
 235                pgd_ERROR(*dir);
 236                pgd_clear(dir);
 237                return count;
 238        }
 239
 240        pmd = pmd_offset(dir, address);
 241
 242        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 243        if (pgd_end && (end > pgd_end))
 244                end = pgd_end;
 245        
 246        do {
 247                count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
 248                if (!count)
 249                        break;
 250                address = (address + PMD_SIZE) & PMD_MASK;
 251                pmd++;
 252        } while (address && (address < end));
 253        return count;
 254}
 255
 256/* mm->page_table_lock is held. mmap_sem is not held */
 257static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
 258{
 259        pgd_t *pgdir;
 260        unsigned long end;
 261
 262        /* Don't swap out areas which are reserved */
 263        if (vma->vm_flags & VM_RESERVED)
 264                return count;
 265
 266        pgdir = pgd_offset(mm, address);
 267
 268        end = vma->vm_end;
 269        BUG_ON(address >= end);
 270        do {
 271                count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
 272                if (!count)
 273                        break;
 274                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 275                pgdir++;
 276        } while (address && (address < end));
 277        return count;
 278}
 279
 280/*
     * Cursor into the mm list: swap_out() starts from this mm and advances
     * it as address spaces are exhausted.  Starts out at init_mm.
     * Placeholder for swap_out(): may be updated by fork.c:mmput().
     */
 281struct mm_struct *swap_mm = &init_mm;
 282
 283/*
 284 * Scan one mm, resuming at mm->swap_address, and try to swap out up to
     * `count' pages from its vmas.
     *
     * Returns remaining count of pages to be swapped out by followup call.
     * *mmcounter is incremented when the mm turns out to be stale, so that
     * the caller's loop does not charge this attempt against its budget.
 285 */
 286static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
 287{
 288        unsigned long address;
 289        struct vm_area_struct* vma;
 290
 291        /*
 292         * Find the proper vm-area after freezing the vma chain
 293         * and ptes.
 294         */
 295        spin_lock(&mm->page_table_lock);
 296        address = mm->swap_address;
 297        if (address == TASK_SIZE || swap_mm != mm) {
 298                /* We raced: don't count this mm but try again */
 299                ++*mmcounter;
 300                goto out_unlock;
 301        }
 302        vma = find_vma(mm, address);
 303        if (vma) {
                    /* The resume address may fall in a hole before the vma. */
 304                if (address < vma->vm_start)
 305                        address = vma->vm_start;
 306
 307                for (;;) {
 308                        count = swap_out_vma(mm, vma, address, count, classzone);
 309                        vma = vma->vm_next;
                            /*
                             * Last vma: fall out and mark the mm finished,
                             * even if count reached zero on this vma.
                             */
 310                        if (!vma)
 311                                break;
                            /* Quota met: keep the swap_address left by the walk. */
 312                        if (!count)
 313                                goto out_unlock;
 314                        address = vma->vm_start;
 315                }
 316        }
 317        /* Indicate that we reached the end of address space */
 318        mm->swap_address = TASK_SIZE;
 319
 320out_unlock:
 321        spin_unlock(&mm->page_table_lock);
 322        return count;
 323}
 324
 325static int FASTCALL(swap_out(zone_t * classzone));
    /*
     * Try to swap out SWAP_CLUSTER_MAX pages, visiting address spaces in
     * mm-list order starting at swap_mm.
     *
     * Returns 1 once swap_out_mm() reports that nothing is left to do, and
     * 0 if the attempt budget (twice mmlist_nr, plus one extra attempt for
     * every stale mm) runs out or no mm with work left could be found.
     */
 326static int swap_out(zone_t * classzone)
 327{
 328        int counter, nr_pages = SWAP_CLUSTER_MAX;
 329        struct mm_struct *mm;
 330
 331        counter = mmlist_nr << 1;
 332        do {
 333                if (unlikely(current->need_resched)) {
 334                        __set_current_state(TASK_RUNNING);
 335                        schedule();
 336                }
 337
 338                spin_lock(&mmlist_lock);
 339                mm = swap_mm;
                    /*
                     * Skip init_mm and every mm that was already scanned to
                     * the end, rewinding each skipped mm for the next lap.
                     * Coming back to the starting mm means there is no
                     * candidate left.
                     */
 340                while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
 341                        mm->swap_address = 0;
 342                        mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
 343                        if (mm == swap_mm)
 344                                goto empty;
 345                        swap_mm = mm;
 346                }
 347
 348                /* Make sure the mm doesn't disappear when we drop the lock.. */
 349                atomic_inc(&mm->mm_users);
 350                spin_unlock(&mmlist_lock);
 351
 352                nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
 353
                    /* Drop the reference taken above. */
 354                mmput(mm);
 355
 356                if (!nr_pages)
 357                        return 1;
 358        } while (--counter >= 0);
 359
 360        return 0;
 361
 362empty:
 363        spin_unlock(&mmlist_lock);
 364        return 0;
 365}
 366
 367static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
 368static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
    /*
     * Scan the inactive list from its tail and try to free `nr_pages' pages
     * that belong to `classzone'.
     *
     * Locking: entered with pagemap_lru_lock held (shrink_caches() takes it)
     * and returns with it released on every path.
     *
     * Returns the number of pages still to be freed (<= 0 means done).
     * *failed_swapout is set once swap_out() has failed, so that it is not
     * retried by this function afterwards.
     *
     * NOTE(review): vm_cache_scan_ratio is used as a divisor below; whether
     * it can ever be zero is not visible in this file.
     */
 369static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
 370{
 371        struct list_head * entry;
            /* Budget of classzone pages to examine in this call. */
 372        int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
            /* Budget of unfreeable pages tolerated before the fallback below. */
 373        int max_mapped = vm_mapped_ratio * nr_pages;
 374
 375        while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
 376                struct page * page;
 377
 378                if (unlikely(current->need_resched)) {
 379                        spin_unlock(&pagemap_lru_lock);
 380                        __set_current_state(TASK_RUNNING);
 381                        schedule();
 382                        spin_lock(&pagemap_lru_lock);
 383                        continue;
 384                }
 385
 386                page = list_entry(entry, struct page, lru);
 387
 388                BUG_ON(!PageLRU(page));
 389                BUG_ON(PageActive(page));
 390
                    /*
                     * Rotate the page from the tail to the head of the
                     * inactive list, so that every "continue" below moves
                     * on to a different page.
                     */
 391                list_del(entry);
 392                list_add(entry, &inactive_list);
 393
 394                /*
 395                 * Zero page counts can happen because we unlink the pages
 396                 * _after_ decrementing the usage count..
 397                 */
 398                if (unlikely(!page_count(page)))
 399                        continue;
 400
                    /* Pages outside the classzone do not use up scan budget. */
 401                if (!memclass(page_zone(page), classzone))
 402                        continue;
 403
 404                max_scan--;
 405
 406                /* Racy check to avoid trylocking when not worthwhile */
 407                if (!page->buffers && (page_count(page) != 1 || !page->mapping))
 408                        goto page_mapped;
 409
 410                /*
 411                 * The page is locked. IO in progress?
 412                 * Move it to the back of the list.
 413                 */
 414                if (unlikely(TryLockPage(page))) {
 415                        if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
                                    /* Pin the page while we sleep without the lru lock. */
 416                                page_cache_get(page);
 417                                spin_unlock(&pagemap_lru_lock);
 418                                wait_on_page(page);
 419                                page_cache_release(page);
 420                                spin_lock(&pagemap_lru_lock);
 421                        }
 422                        continue;
 423                }
 424
 425                if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
 426                        /*
 427                         * It is not critical here to write it only if
 428                         * the page is unmapped because any direct writer
 429                         * like O_DIRECT would set the PG_dirty bitflag
 430                         * on the physical page after having successfully
 431                         * pinned it and after the I/O to the page is finished,
 432                         * so the direct writes to the page cannot get lost.
 433                         */
 434                        int (*writepage)(struct page *);
 435
 436                        writepage = page->mapping->a_ops->writepage;
 437                        if ((gfp_mask & __GFP_FS) && writepage) {
 438                                ClearPageDirty(page);
 439                                SetPageLaunder(page);
 440                                page_cache_get(page);
 441                                spin_unlock(&pagemap_lru_lock);
 442
                                    /*
                                     * The page is not unlocked on this path;
                                     * presumably writepage() takes care of
                                     * that -- TODO confirm.
                                     */
 443                                writepage(page);
 444                                page_cache_release(page);
 445
 446                                spin_lock(&pagemap_lru_lock);
 447                                continue;
 448                        }
 449                }
 450
 451                /*
 452                 * If the page has buffers, try to free the buffer mappings
 453                 * associated with this page. If we succeed we try to free
 454                 * the page as well.
 455                 */
 456                if (page->buffers) {
 457                        spin_unlock(&pagemap_lru_lock);
 458
 459                        /* avoid to free a locked page */
 460                        page_cache_get(page);
 461
 462                        if (try_to_release_page(page, gfp_mask)) {
 463                                if (!page->mapping) {
 464                                        /*
 465                                         * We must not allow an anon page
 466                                         * with no buffers to be visible on
 467                                         * the LRU, so we unlock the page after
 468                                         * taking the lru lock
 469                                         */
 470                                        spin_lock(&pagemap_lru_lock);
 471                                        UnlockPage(page);
 472                                        __lru_cache_del(page);
 473
 474                                        /* effectively free the page here */
 475                                        page_cache_release(page);
 476
 477                                        if (--nr_pages)
 478                                                continue;
 479                                        break;
 480                                } else {
 481                                        /*
 482                                         * The page is still in pagecache so undo the stuff
 483                                         * before the try_to_release_page since we've not
 484                                         * finished and we can now try the next step.
 485                                         */
 486                                        page_cache_release(page);
 487
 488                                        spin_lock(&pagemap_lru_lock);
 489                                }
 490                        } else {
 491                                /* failed to drop the buffers so stop here */
 492                                UnlockPage(page);
 493                                page_cache_release(page);
 494
 495                                spin_lock(&pagemap_lru_lock);
 496                                continue;
 497                        }
 498                }
 499
 500                spin_lock(&pagecache_lock);
 501
 502                /*
 503                 * This is the non-racy check for busy page.
 504                 * It is critical to check PageDirty _after_ we made sure
 505                 * the page is freeable so not in use by anybody.
 506                 * At this point we're guaranteed that page->buffers is NULL,
 507                 * nobody can refill page->buffers under us because we still
 508                 * hold the page lock.
 509                 */
 510                if (!page->mapping || page_count(page) > 1) {
 511                        spin_unlock(&pagecache_lock);
 512                        UnlockPage(page);
                    /*
                     * Also reached by goto from the racy check above, in
                     * which case the page was never locked.
                     */
 513page_mapped:
 514                        if (--max_mapped < 0) {
                                    /*
                                     * Too many unfreeable pages seen: drop
                                     * the lru lock and try the slab and VFS
                                     * caches and swap_out() instead.
                                     */
 515                                spin_unlock(&pagemap_lru_lock);
 516
 517                                nr_pages -= kmem_cache_reap(gfp_mask);
 518                                if (nr_pages <= 0)
 519                                        goto out;
 520
 521                                shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
 522                                shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
 523#ifdef CONFIG_QUOTA
 524                                shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
 525#endif
 526
 527                                if (!*failed_swapout)
 528                                        *failed_swapout = !swap_out(classzone);
 529
                                    /* Start a fresh budget for the pages still wanted. */
 530                                max_mapped = nr_pages * vm_mapped_ratio;
 531
 532                                spin_lock(&pagemap_lru_lock);
 533                                refill_inactive(nr_pages, classzone);
 534                        }
 535                        continue;
 536                        
 537                }
                    /* Freeable but dirty: leave it on the list for a later pass. */
 538                if (PageDirty(page)) {
 539                        spin_unlock(&pagecache_lock);
 540                        UnlockPage(page);
 541                        continue;
 542                }
 543
 544                __lru_cache_del(page);
 545
 546                /* point of no return */
 547                if (likely(!PageSwapCache(page))) {
 548                        __remove_inode_page(page);
 549                        spin_unlock(&pagecache_lock);
 550                } else {
 551                        swp_entry_t swap;
                            /* Read the swap entry before the page leaves the swap cache. */
 552                        swap.val = page->index;
 553                        __delete_from_swap_cache(page);
 554                        spin_unlock(&pagecache_lock);
 555                        swap_free(swap);
 556                }
 557
 558                UnlockPage(page);
 559
 560                /* effectively free the page here */
 561                page_cache_release(page);
 562
 563                if (--nr_pages)
 564                        continue;
 565                break;
 566        }
 567        spin_unlock(&pagemap_lru_lock);
 568
    /* The "goto out" path has already released pagemap_lru_lock. */
 569 out:
 570        return nr_pages;
 571}
 572
 573/*
 574 * This moves pages from the active list to
 575 * the inactive list.
 576 *
 577 * We move them the other way when we see the
 578 * reference bit on the page.
     *
     * The number of pages moved is bounded by
     *   nr_pages * nr_active_pages / (nr_inactive_pages * vm_lru_balance_ratio + 1)
     * taken from `classzone'; the "+ 1" keeps the divisor non-zero.
     *
     * Both callers in this file hold pagemap_lru_lock across the call.
 579 */
 580static void refill_inactive(int nr_pages, zone_t * classzone)
 581{
 582        struct list_head * entry;
 583        unsigned long ratio;
 584
 585        ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);
 586
 587        entry = active_list.prev;
 588        while (ratio && entry != &active_list) {
 589                struct page * page;
 590
 591                page = list_entry(entry, struct page, lru);
                    /* Step first: the page is unlinked from this list below. */
 592                entry = entry->prev;
 593                if (PageTestandClearReferenced(page)) {
                            /*
                             * Referenced: give it another round by moving it
                             * to the head; this does not use up the budget.
                             */
 594                        list_del(&page->lru);
 595                        list_add(&page->lru, &active_list);
 596                        continue;
 597                }
 598
 599                ratio--;
 600
 601                del_page_from_active_list(page);
 602                add_page_to_inactive_list(page);
 603                SetPageReferenced(page);
 604        }
 605
            /*
             * Stopped before completing a full lap: re-link the list head
             * right after `entry', so that `entry' becomes the tail and the
             * next scan resumes where this one stopped.
             */
 606        if (entry != &active_list) {
 607                list_del(&active_list);
 608                list_add(&active_list, entry);
 609        }
 610}
 611
 612static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
 613static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
 614{
 615        nr_pages -= kmem_cache_reap(gfp_mask);
 616        if (nr_pages <= 0)
 617                goto out;
 618
 619        spin_lock(&pagemap_lru_lock);
 620        refill_inactive(nr_pages, classzone);
 621
 622        nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);
 623
 624out:
 625        return nr_pages;
 626}
 627
 628static int check_classzone_need_balance(zone_t * classzone);
 629
 630int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
 631{
 632        gfp_mask = pf_gfp_mask(gfp_mask);
 633
 634        for (;;) {
 635                int tries = vm_passes;
 636                int failed_swapout = !(gfp_mask & __GFP_IO);
 637                int nr_pages = SWAP_CLUSTER_MAX;
 638
 639                do {
 640                        nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
 641                        if (nr_pages <= 0)
 642                                return 1;
 643                        shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
 644                        shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
 645#ifdef CONFIG_QUOTA
 646                        shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
 647#endif
 648                        if (!failed_swapout)
 649                                failed_swapout = !swap_out(classzone);
 650                } while (--tries);
 651
 652#ifdef  CONFIG_OOM_KILLER
 653        out_of_memory();
 654#else
 655        if (likely(current->pid != 1))
 656                break;
 657        if (!check_classzone_need_balance(classzone))
 658                break;
 659
 660        __set_current_state(TASK_RUNNING);
 661        yield();
 662#endif
 663        }
 664
 665        return 0;
 666}
 667
 668int try_to_free_pages(unsigned int gfp_mask)
 669{
 670        pg_data_t *pgdat;
 671        zonelist_t *zonelist;
 672        unsigned long pf_free_pages;
 673        int error = 0;
 674
 675        pf_free_pages = current->flags & PF_FREE_PAGES;
 676        current->flags &= ~PF_FREE_PAGES;
 677
 678        for_each_pgdat(pgdat) {
 679                zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
 680                error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
 681        }
 682
 683        current->flags |= pf_free_pages;
 684        return error;
 685}
 686
 687DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);	/* wait queue kswapd() sleeps on between balancing rounds */
 688
 689static int check_classzone_need_balance(zone_t * classzone)
 690{
 691        zone_t * first_zone;
 692        int class_idx = zone_idx(classzone);
 693
 694        first_zone = classzone->zone_pgdat->node_zones;
 695        while (classzone >= first_zone) {
 696                if (classzone->free_pages > classzone->watermarks[class_idx].high)
 697                        return 0;
 698                classzone--;
 699        }
 700        return 1;
 701}
 702
 703static int kswapd_balance_pgdat(pg_data_t * pgdat)
 704{
 705        int need_more_balance = 0, i;
 706        zone_t * zone;
 707
 708        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 709                zone = pgdat->node_zones + i;
 710                if (unlikely(current->need_resched))
 711                        schedule();
 712                if (!zone->need_balance || !zone->size)
 713                        continue;
 714                if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
 715                        zone->need_balance = 0;
 716                        __set_current_state(TASK_INTERRUPTIBLE);
 717                        schedule_timeout(HZ*5);
 718                        continue;
 719                }
 720                if (check_classzone_need_balance(zone))
 721                        need_more_balance = 1;
 722                else
 723                        zone->need_balance = 0;
 724        }
 725
 726        return need_more_balance;
 727}
 728
 729static void kswapd_balance(void)
 730{
 731        int need_more_balance;
 732        pg_data_t * pgdat;
 733
 734        do {
 735                need_more_balance = 0;
 736
 737                for_each_pgdat(pgdat)
 738                        need_more_balance |= kswapd_balance_pgdat(pgdat);
 739        } while (need_more_balance);
 740}
 741
 742static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
 743{
 744        zone_t * zone;
 745        int i;
 746
 747        for (i = pgdat->nr_zones-1; i >= 0; i--) {
 748                zone = pgdat->node_zones + i;
 749                if (!zone->need_balance || !zone->size)
 750                        continue;
 751                return 0;
 752        }
 753
 754        return 1;
 755}
 756
 757static int kswapd_can_sleep(void)
 758{
 759        pg_data_t * pgdat;
 760
 761        for_each_pgdat(pgdat) {
 762                if (!kswapd_can_sleep_pgdat(pgdat))
 763                        return 0;
 764        }
 765
 766        return 1;
 767}
 768
 769/*
 770 * The background pageout daemon, started as a kernel thread
 771 * from the init process.
 772 *
 773 * This basically trickles out pages so that we have _some_
 774 * free memory available even if there is no other activity
 775 * that frees anything up. This is needed for things like routing
 776 * etc, where we otherwise might have all activity going on in
 777 * asynchronous contexts that cannot page things out.
 778 *
 779 * If there are applications that are active memory-allocators
 780 * (most normal use), this basically shouldn't matter.
     *
     * The function never returns; `unused' is ignored.
 781 */
 782int kswapd(void *unused)
 783{
 784        struct task_struct *tsk = current;
 785        DECLARE_WAITQUEUE(wait, tsk);
 786
 787        daemonize();
 788        strcpy(tsk->comm, "kswapd");
 789        sigfillset(&tsk->blocked);
 790        
 791        /*
 792         * Tell the memory management that we're a "memory allocator",
 793         * and that if we need more memory we should get access to it
 794         * regardless (see "__alloc_pages()"). "kswapd" should
 795         * never get caught in the normal page freeing logic.
 796         *
 797         * (Kswapd normally doesn't need memory anyway, but sometimes
 798         * you need a small amount of memory in order to be able to
 799         * page out something else, and this flag essentially protects
 800         * us from recursively trying to free more memory as we're
 801         * trying to free the first piece of memory in the first place).
 802         */
 803        tsk->flags |= PF_MEMALLOC;
 804
 805        /*
 806         * Kswapd main loop.
 807         */
 808        for (;;) {
                    /*
                     * Mark ourselves sleeping and join the wait queue
                     * _before_ checking whether there is work to do; the
                     * order is presumably what keeps a wakeup arriving
                     * between the check and schedule() from being lost
                     * -- TODO confirm against the waker side.
                     */
 809                __set_current_state(TASK_INTERRUPTIBLE);
 810                add_wait_queue(&kswapd_wait, &wait);
 811
 812                mb();
 813                if (kswapd_can_sleep())
 814                        schedule();
 815
 816                __set_current_state(TASK_RUNNING);
 817                remove_wait_queue(&kswapd_wait, &wait);
 818
 819                /*
 820                 * If we actually get into a low-memory situation,
 821                 * the processes needing more memory will wake us
 822                 * up on a more timely basis.
 823                 */
 824                kswapd_balance();
 825                run_task_queue(&tq_disk);
 826        }
 827}
 828
 829static int __init kswapd_init(void)
 830{
 831        printk("Starting kswapd\n");
 832        swap_setup();
 833        kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 834        return 0;
 835}
 836
 837module_init(kswapd_init)
 838
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.