linux-old/mm/vmscan.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/vmscan.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 *
   6 *  Swap reorganised 29.12.95, Stephen Tweedie.
   7 *  kswapd added: 7.1.96  sct
   8 *  Removed kswapd_ctl limits, and swap out as many pages as needed
   9 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  10 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  11 */
  12
  13#include <linux/slab.h>
  14#include <linux/kernel_stat.h>
  15#include <linux/swap.h>
  16#include <linux/swapctl.h>
  17#include <linux/smp_lock.h>
  18#include <linux/pagemap.h>
  19#include <linux/init.h>
  20
  21#include <asm/pgtable.h>
  22
  23/*
  24 * The swap-out functions return 1 if they successfully
  25 * threw something out, and we got a free page. It returns
  26 * zero if it couldn't do anything, and any other value
  27 * indicates it decreased rss, but the page was shared.
  28 *
  29 * NOTE! If it sleeps, it *must* return 1 to make sure we
  30 * don't continue with the swap-out. Otherwise we may be
  31 * using a process that no longer actually exists (it might
  32 * have died while we slept).
  33 */
  34static int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
  35        unsigned long address, pte_t * page_table, int gfp_mask)
  36{
  37        pte_t pte;
  38        unsigned long entry;
  39        unsigned long page;
  40        struct page * page_map;
  41
  42        pte = *page_table;
  43        if (!pte_present(pte))
  44                return 0;
  45        page = pte_page(pte);
  46        if (MAP_NR(page) >= max_mapnr)
  47                return 0;
  48        page_map = mem_map + MAP_NR(page);
  49
  50        if (pte_young(pte)) {
  51                /*
  52                 * Transfer the "accessed" bit from the page
  53                 * tables to the global page map.
  54                 */
  55                set_pte(page_table, pte_mkold(pte));
  56                flush_tlb_page(vma, address);
  57                set_bit(PG_referenced, &page_map->flags);
  58                return 0;
  59        }
  60
  61        if (PageReserved(page_map)
  62            || PageLocked(page_map)
  63            || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
  64                return 0;
  65
  66        /*
  67         * Is the page already in the swap cache? If so, then
  68         * we can just drop our reference to it without doing
  69         * any IO - it's already up-to-date on disk.
  70         *
  71         * Return 0, as we didn't actually free any real
  72         * memory, and we should just continue our scan.
  73         */
  74        if (PageSwapCache(page_map)) {
  75                entry = page_map->offset;
  76                swap_duplicate(entry);
  77                set_pte(page_table, __pte(entry));
  78drop_pte:
  79                vma->vm_mm->rss--;
  80                flush_tlb_page(vma, address);
  81                __free_page(page_map);
  82                return 0;
  83        }
  84
  85        /*
  86         * Is it a clean page? Then it must be recoverable
  87         * by just paging it in again, and we can just drop
  88         * it..
  89         *
  90         * However, this won't actually free any real
  91         * memory, as the page will just be in the page cache
  92         * somewhere, and as such we should just continue
  93         * our scan.
  94         *
  95         * Basically, this just makes it possible for us to do
  96         * some real work in the future in "shrink_mmap()".
  97         */
  98        if (!pte_dirty(pte)) {
  99                if (page_map->inode && pgcache_under_min())
 100                        /* unmapping this page would be useless */
 101                        return 0;
 102                flush_cache_page(vma, address);
 103                pte_clear(page_table);
 104                goto drop_pte;
 105        }
 106
 107        /*
 108         * Don't go down into the swap-out stuff if
 109         * we cannot do I/O! Avoid recursing on FS
 110         * locks etc.
 111         */
 112        if (!(gfp_mask & __GFP_IO) || current->fs_locks)
 113                return 0;
 114
 115        /*
 116         * Ok, it's really dirty. That means that
 117         * we should either create a new swap cache
 118         * entry for it, or we should write it back
 119         * to its own backing store.
 120         *
 121         * Note that in neither case do we actually
 122         * know that we make a page available, but
 123         * as we potentially sleep we can no longer
 124         * continue scanning, so we migth as well
 125         * assume we free'd something.
 126         *
 127         * NOTE NOTE NOTE! This should just set a
 128         * dirty bit in page_map, and just drop the
 129         * pte. All the hard work would be done by
 130         * shrink_mmap().
 131         *
 132         * That would get rid of a lot of problems.
 133         */
 134        flush_cache_page(vma, address);
 135        if (vma->vm_ops && vma->vm_ops->swapout) {
 136                pid_t pid = tsk->pid;
 137                pte_clear(page_table);
 138                flush_tlb_page(vma, address);
 139                vma->vm_mm->rss--;
 140                
 141                if (vma->vm_ops->swapout(vma, page_map))
 142                        kill_proc(pid, SIGBUS, 1);
 143                __free_page(page_map);
 144                return 1;
 145        }
 146
 147        /*
 148         * This is a dirty, swappable page.  First of all,
 149         * get a suitable swap entry for it, and make sure
 150         * we have the swap cache set up to associate the
 151         * page with that swap entry.
 152         */
 153        entry = get_swap_page();
 154        if (!entry)
 155                return 0; /* No swap space left */
 156                
 157        vma->vm_mm->rss--;
 158        tsk->nswap++;
 159        set_pte(page_table, __pte(entry));
 160        flush_tlb_page(vma, address);
 161        swap_duplicate(entry);  /* One for the process, one for the swap cache */
 162        add_to_swap_cache(page_map, entry);
 163        /* We checked we were unlocked way up above, and we
 164           have been careful not to stall until here */
 165        set_bit(PG_locked, &page_map->flags);
 166
 167        /* OK, do a physical asynchronous write to swap.  */
 168        rw_swap_page(WRITE, entry, (char *) page, 0);
 169
 170        __free_page(page_map);
 171        return 1;
 172}
 173
 174/*
 175 * A new implementation of swap_out().  We do not swap complete processes,
 176 * but only a small number of blocks, before we continue with the next
 177 * process.  The number of blocks actually swapped is determined on the
 178 * number of page faults, that this process actually had in the last time,
 179 * so we won't swap heavily used processes all the time ...
 180 *
  181 * Note: the priority argument is a hint on how much CPU to waste with the
  182 *       swap block search, not a hint of how many blocks to swap with
  183 *       each process.
 184 *
 185 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 186 */
 187
 188static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
 189        pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 190{
 191        pte_t * pte;
 192        unsigned long pmd_end;
 193
 194        if (pmd_none(*dir))
 195                return 0;
 196        if (pmd_bad(*dir)) {
 197                printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
 198                pmd_clear(dir);
 199                return 0;
 200        }
 201        
 202        pte = pte_offset(dir, address);
 203        
 204        pmd_end = (address + PMD_SIZE) & PMD_MASK;
 205        if (end > pmd_end)
 206                end = pmd_end;
 207
 208        do {
 209                int result;
 210                tsk->mm->swap_address = address + PAGE_SIZE;
 211                result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 212                if (result)
 213                        return result;
 214                if (current->need_resched)
 215                        return 2;
 216                address += PAGE_SIZE;
 217                pte++;
 218        } while (address < end);
 219        return 0;
 220}
 221
 222static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
 223        pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
 224{
 225        pmd_t * pmd;
 226        unsigned long pgd_end;
 227
 228        if (pgd_none(*dir))
 229                return 0;
 230        if (pgd_bad(*dir)) {
 231                printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
 232                pgd_clear(dir);
 233                return 0;
 234        }
 235
 236        pmd = pmd_offset(dir, address);
 237
 238        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
 239        if (end > pgd_end)
 240                end = pgd_end;
 241        
 242        do {
 243                int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
 244                if (result)
 245                        return result;
 246                address = (address + PMD_SIZE) & PMD_MASK;
 247                pmd++;
 248        } while (address < end);
 249        return 0;
 250}
 251
 252static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
 253        unsigned long address, int gfp_mask)
 254{
 255        pgd_t *pgdir;
 256        unsigned long end;
 257
 258        /* Don't swap out areas which are locked down */
 259        if (vma->vm_flags & VM_LOCKED)
 260                return 0;
 261
 262        pgdir = pgd_offset(tsk->mm, address);
 263
 264        end = vma->vm_end;
 265        while (address < end) {
 266                int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
 267                if (result)
 268                        return result;
 269                address = (address + PGDIR_SIZE) & PGDIR_MASK;
 270                pgdir++;
 271        }
 272        return 0;
 273}
 274
 275static int swap_out_process(struct task_struct * p, int gfp_mask)
 276{
 277        unsigned long address;
 278        struct vm_area_struct* vma;
 279
 280        /*
 281         * Go through process' page directory.
 282         */
 283        address = p->mm->swap_address;
 284
 285        /*
 286         * Find the proper vm-area
 287         */
 288        vma = find_vma(p->mm, address);
 289        if (vma) {
 290                if (address < vma->vm_start)
 291                        address = vma->vm_start;
 292
 293                for (;;) {
 294                        int result = swap_out_vma(p, vma, address, gfp_mask);
 295                        if (result)
 296                                return result;
 297                        vma = vma->vm_next;
 298                        if (!vma)
 299                                break;
 300                        address = vma->vm_start;
 301                }
 302        }
 303
 304        /* We didn't find anything for the process */
 305        p->mm->swap_cnt = 0;
 306        p->mm->swap_address = 0;
 307        return 0;
 308}
 309
 310/*
 311 * Select the task with maximal swap_cnt and try to swap out a page.
 312 * N.B. This function returns only 0 or 1.  Return values != 1 from
 313 * the lower level routines result in continued processing.
 314 */
 315static int swap_out(unsigned int priority, int gfp_mask)
 316{
 317        struct task_struct * p, * pbest;
 318        int assign = 0, counter;
 319        unsigned long max_cnt;
 320
 321        /* 
 322         * We make one or two passes through the task list, indexed by 
 323         * assign = {0, 1}:
 324         *   Pass 1: select the swappable task with maximal RSS that has
 325         *         not yet been swapped out. 
 326         *   Pass 2: re-assign rss swap_cnt values, then select as above.
 327         *
 328         * With this approach, there's no need to remember the last task
 329         * swapped out.  If the swap-out fails, we clear swap_cnt so the 
 330         * task won't be selected again until all others have been tried.
 331         *
 332         * Think of swap_cnt as a "shadow rss" - it tells us which process
 333         * we want to page out (always try largest first).
 334         */
 335        counter = nr_tasks / priority;
 336        if (counter < 1)
 337                counter = 1;
 338
 339        for (; counter >= 0; counter--) {
 340                max_cnt = 0;
 341                pbest = NULL;
 342        select:
 343                read_lock(&tasklist_lock);
 344                p = init_task.next_task;
 345                for (; p != &init_task; p = p->next_task) {
 346                        if (!p->swappable)
 347                                continue;
 348                        if (p->mm->rss <= 0)
 349                                continue;
 350                        /* Refresh swap_cnt? */
 351                        if (assign == 1)
 352                                p->mm->swap_cnt = p->mm->rss;
 353                        if (p->mm->swap_cnt > max_cnt) {
 354                                max_cnt = p->mm->swap_cnt;
 355                                pbest = p;
 356                        }
 357                }
 358                read_unlock(&tasklist_lock);
 359                if (assign == 1)
 360                        assign = 2;
 361                if (!pbest) {
 362                        if (!assign) {
 363                                assign = 1;
 364                                goto select;
 365                        }
 366                        goto out;
 367                }
 368
 369                switch (swap_out_process(pbest, gfp_mask)) {
 370                case 1:
 371                        return 1;
 372                case 2:
 373                        current->state = TASK_RUNNING;
 374                        schedule();
 375                }
 376        }
 377out:
 378        return 0;
 379}
 380
 381/*
 382 * We need to make the locks finer granularity, but right
 383 * now we need this so that we can do page allocations
 384 * without holding the kernel lock etc.
 385 *
 386 * We want to try to free "count" pages, and we need to 
 387 * cluster them so that we get good swap-out behaviour. See
 388 * the "free_memory()" macro for details.
 389 */
 390int try_to_free_pages(unsigned int gfp_mask)
 391{
 392        int priority;
 393        int count = SWAP_CLUSTER_MAX;
 394
 395        lock_kernel();
 396
 397        /* Always trim SLAB caches when memory gets low. */
 398        kmem_cache_reap(gfp_mask);
 399
 400        priority = 5;
 401        do {
 402                /* Always pick on the dcache even if we didnt need to,
 403                   without this some workloads cause excessive dcache
 404                   growth */
 405
 406                shrink_dcache_memory(priority, gfp_mask);
 407        
 408                while (shrink_mmap(priority, gfp_mask)) {
 409                        if (!--count)
 410                                goto done;
 411                }
 412
 413                /* Try to get rid of some shared memory pages.. */
 414                if (gfp_mask & __GFP_IO && !current->fs_locks) {
 415                        while (shm_swap(priority, gfp_mask)) {
 416                                if (!--count)
 417                                        goto done;
 418                        }
 419                }
 420
 421                /* Then, try to page stuff out.. */
 422                while (swap_out(priority, gfp_mask)) {
 423                        if (!--count)
 424                                goto done;
 425                }
 426
 427        } while (--priority > 0);
 428done:
 429        unlock_kernel();
 430
 431        /* Return success if we freed a page. */
 432        return priority > 0;
 433}
 434
 435/*
 436 * Before we start the kernel thread, print out the 
 437 * kswapd initialization message (otherwise the init message 
 438 * may be printed in the middle of another driver's init 
 439 * message).  It looks very bad when that happens.
 440 */
 441void __init kswapd_setup(void)
 442{
 443       int i;
 444       char *revision="$Revision: 1.5 $", *s, *e;
 445
 446       swap_setup();
 447       
 448       if ((s = strchr(revision, ':')) &&
 449           (e = strchr(s, '$')))
 450               s++, i = e - s;
 451       else
 452               s = revision, i = -1;
 453       printk ("Starting kswapd v%.*s\n", i, s);
 454}
 455
 456struct wait_queue * kswapd_wait;
 457
 458/*
 459 * The background pageout daemon, started as a kernel thread
 460 * from the init process. 
 461 *
 462 * This basically executes once a second, trickling out pages
 463 * so that we have _some_ free memory available even if there
 464 * is no other activity that frees anything up. This is needed
 465 * for things like routing etc, where we otherwise might have
 466 * all activity going on in asynchronous contexts that cannot
 467 * page things out.
 468 *
 469 * If there are applications that are active memory-allocators
 470 * (most normal use), this basically shouldn't matter.
 471 */
 472int kswapd(void *unused)
 473{
 474        struct task_struct *tsk = current;
 475
 476        tsk->session = 1;
 477        tsk->pgrp = 1;
 478        strcpy(tsk->comm, "kswapd");
 479        sigfillset(&tsk->blocked);
 480        
 481        /*
 482         * Tell the memory management that we're a "memory allocator",
 483         * and that if we need more memory we should get access to it
 484         * regardless (see "__get_free_pages()"). "kswapd" should
 485         * never get caught in the normal page freeing logic.
 486         *
 487         * (Kswapd normally doesn't need memory anyway, but sometimes
 488         * you need a small amount of memory in order to be able to
 489         * page out something else, and this flag essentially protects
 490         * us from recursively trying to free more memory as we're
 491         * trying to free the first piece of memory in the first place).
 492         */
 493        tsk->flags |= PF_MEMALLOC;
 494
 495        while (1) {
 496                /*
 497                 * Wake up once a second to see if we need to make
 498                 * more memory available.
 499                 *
 500                 * If we actually get into a low-memory situation,
 501                 * the processes needing more memory will wake us
 502                 * up on a more timely basis.
 503                 */
 504                interruptible_sleep_on(&kswapd_wait);
 505
 506                while (nr_free_pages < freepages.high)
 507                {
 508                        if (try_to_free_pages(GFP_KSWAPD))
 509                        {
 510                                if (tsk->need_resched)
 511                                        schedule();
 512                                continue;
 513                        }
 514                        tsk->state = TASK_INTERRUPTIBLE;
 515                        schedule_timeout(10*HZ);
 516                }
 517        }
 518}
 519
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.