linux/mm/swap.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/swap.c
   3 *
   4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5 */
   6
   7/*
   8 * This file contains the default values for the operation of the
   9 * Linux VM subsystem. Fine-tuning documentation can be found in
  10 * Documentation/sysctl/vm.txt.
  11 * Started 18.12.91
  12 * Swap aging added 23.2.95, Stephen Tweedie.
  13 * Buffermem limits added 12.3.98, Rik van Riel.
  14 */
  15
  16#include <linux/mm.h>
  17#include <linux/sched.h>
  18#include <linux/kernel_stat.h>
  19#include <linux/swap.h>
  20#include <linux/mman.h>
  21#include <linux/pagemap.h>
  22#include <linux/pagevec.h>
  23#include <linux/init.h>
  24#include <linux/module.h>
  25#include <linux/mm_inline.h>
  26#include <linux/buffer_head.h>  /* for try_to_release_page() */
  27#include <linux/percpu_counter.h>
  28#include <linux/percpu.h>
  29#include <linux/cpu.h>
  30#include <linux/notifier.h>
  31#include <linux/backing-dev.h>
  32#include <linux/memcontrol.h>
  33
  34/* How many pages do we try to swap or page in/out together? */
  35int page_cluster;
  36
  37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
  38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
  39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
  40
  41/*
  42 * This path almost never happens for VM activity - pages are normally
  43 * freed via pagevecs.  But it gets used by networking.
  44 */
  45static void __page_cache_release(struct page *page)
  46{
  47        if (PageLRU(page)) {
  48                unsigned long flags;
  49                struct zone *zone = page_zone(page);
  50
  51                spin_lock_irqsave(&zone->lru_lock, flags);
  52                VM_BUG_ON(!PageLRU(page));
  53                __ClearPageLRU(page);
  54                del_page_from_lru(zone, page);
  55                spin_unlock_irqrestore(&zone->lru_lock, flags);
  56        }
  57        free_hot_page(page);
  58}
  59
  60static void put_compound_page(struct page *page)
  61{
  62        page = compound_head(page);
  63        if (put_page_testzero(page)) {
  64                compound_page_dtor *dtor;
  65
  66                dtor = get_compound_page_dtor(page);
  67                (*dtor)(page);
  68        }
  69}
  70
  71void put_page(struct page *page)
  72{
  73        if (unlikely(PageCompound(page)))
  74                put_compound_page(page);
  75        else if (put_page_testzero(page))
  76                __page_cache_release(page);
  77}
  78EXPORT_SYMBOL(put_page);
  79
  80/**
  81 * put_pages_list() - release a list of pages
  82 * @pages: list of pages threaded on page->lru
  83 *
  84 * Release a list of pages which are strung together on page.lru.  Currently
  85 * used by read_cache_pages() and related error recovery code.
  86 */
  87void put_pages_list(struct list_head *pages)
  88{
  89        while (!list_empty(pages)) {
  90                struct page *victim;
  91
  92                victim = list_entry(pages->prev, struct page, lru);
  93                list_del(&victim->lru);
  94                page_cache_release(victim);
  95        }
  96}
  97EXPORT_SYMBOL(put_pages_list);
  98
  99/*
 100 * pagevec_move_tail() must be called with IRQ disabled.
 101 * Otherwise this may cause nasty races.
 102 */
 103static void pagevec_move_tail(struct pagevec *pvec)
 104{
 105        int i;
 106        int pgmoved = 0;
 107        struct zone *zone = NULL;
 108
 109        for (i = 0; i < pagevec_count(pvec); i++) {
 110                struct page *page = pvec->pages[i];
 111                struct zone *pagezone = page_zone(page);
 112
 113                if (pagezone != zone) {
 114                        if (zone)
 115                                spin_unlock(&zone->lru_lock);
 116                        zone = pagezone;
 117                        spin_lock(&zone->lru_lock);
 118                }
 119                if (PageLRU(page) && !PageActive(page)) {
 120                        list_move_tail(&page->lru, &zone->inactive_list);
 121                        pgmoved++;
 122                }
 123        }
 124        if (zone)
 125                spin_unlock(&zone->lru_lock);
 126        __count_vm_events(PGROTATED, pgmoved);
 127        release_pages(pvec->pages, pvec->nr, pvec->cold);
 128        pagevec_reinit(pvec);
 129}
 130
 131/*
 132 * Writeback is about to end against a page which has been marked for immediate
 133 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 134 * inactive list.
 135 *
 136 * Returns zero if it cleared PG_writeback.
 137 */
 138int rotate_reclaimable_page(struct page *page)
 139{
 140        struct pagevec *pvec;
 141        unsigned long flags;
 142
 143        if (PageLocked(page))
 144                return 1;
 145        if (PageDirty(page))
 146                return 1;
 147        if (PageActive(page))
 148                return 1;
 149        if (!PageLRU(page))
 150                return 1;
 151
 152        page_cache_get(page);
 153        local_irq_save(flags);
 154        pvec = &__get_cpu_var(lru_rotate_pvecs);
 155        if (!pagevec_add(pvec, page))
 156                pagevec_move_tail(pvec);
 157        local_irq_restore(flags);
 158
 159        if (!test_clear_page_writeback(page))
 160                BUG();
 161
 162        return 0;
 163}
 164
 165/*
 166 * FIXME: speed this up?
 167 */
 168void activate_page(struct page *page)
 169{
 170        struct zone *zone = page_zone(page);
 171
 172        spin_lock_irq(&zone->lru_lock);
 173        if (PageLRU(page) && !PageActive(page)) {
 174                del_page_from_inactive_list(zone, page);
 175                SetPageActive(page);
 176                add_page_to_active_list(zone, page);
 177                __count_vm_event(PGACTIVATE);
 178                mem_cgroup_move_lists(page, true);
 179        }
 180        spin_unlock_irq(&zone->lru_lock);
 181}
 182
 183/*
 184 * Mark a page as having seen activity.
 185 *
 186 * inactive,unreferenced        ->      inactive,referenced
 187 * inactive,referenced          ->      active,unreferenced
 188 * active,unreferenced          ->      active,referenced
 189 */
 190void mark_page_accessed(struct page *page)
 191{
 192        if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
 193                activate_page(page);
 194                ClearPageReferenced(page);
 195        } else if (!PageReferenced(page)) {
 196                SetPageReferenced(page);
 197        }
 198}
 199
 200EXPORT_SYMBOL(mark_page_accessed);
 201
 202/**
 203 * lru_cache_add: add a page to the page lists
 204 * @page: the page to add
 205 */
 206void lru_cache_add(struct page *page)
 207{
 208        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
 209
 210        page_cache_get(page);
 211        if (!pagevec_add(pvec, page))
 212                __pagevec_lru_add(pvec);
 213        put_cpu_var(lru_add_pvecs);
 214}
 215
 216void lru_cache_add_active(struct page *page)
 217{
 218        struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
 219
 220        page_cache_get(page);
 221        if (!pagevec_add(pvec, page))
 222                __pagevec_lru_add_active(pvec);
 223        put_cpu_var(lru_add_active_pvecs);
 224}
 225
 226/*
 227 * Drain pages out of the cpu's pagevecs.
 228 * Either "cpu" is the current CPU, and preemption has already been
 229 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 230 */
 231static void drain_cpu_pagevecs(int cpu)
 232{
 233        struct pagevec *pvec;
 234
 235        pvec = &per_cpu(lru_add_pvecs, cpu);
 236        if (pagevec_count(pvec))
 237                __pagevec_lru_add(pvec);
 238
 239        pvec = &per_cpu(lru_add_active_pvecs, cpu);
 240        if (pagevec_count(pvec))
 241                __pagevec_lru_add_active(pvec);
 242
 243        pvec = &per_cpu(lru_rotate_pvecs, cpu);
 244        if (pagevec_count(pvec)) {
 245                unsigned long flags;
 246
 247                /* No harm done if a racing interrupt already did this */
 248                local_irq_save(flags);
 249                pagevec_move_tail(pvec);
 250                local_irq_restore(flags);
 251        }
 252}
 253
 254void lru_add_drain(void)
 255{
 256        drain_cpu_pagevecs(get_cpu());
 257        put_cpu();
 258}
 259
 260#ifdef CONFIG_NUMA
 261static void lru_add_drain_per_cpu(struct work_struct *dummy)
 262{
 263        lru_add_drain();
 264}
 265
 266/*
 267 * Returns 0 for success
 268 */
 269int lru_add_drain_all(void)
 270{
 271        return schedule_on_each_cpu(lru_add_drain_per_cpu);
 272}
 273
 274#else
 275
 276/*
 277 * Returns 0 for success
 278 */
 279int lru_add_drain_all(void)
 280{
 281        lru_add_drain();
 282        return 0;
 283}
 284#endif
 285
 286/*
 287 * Batched page_cache_release().  Decrement the reference count on all the
 288 * passed pages.  If it fell to zero then remove the page from the LRU and
 289 * free it.
 290 *
 291 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 292 * for the remainder of the operation.
 293 *
 294 * The locking in this function is against shrink_cache(): we recheck the
 295 * page count inside the lock to see whether shrink_cache grabbed the page
 296 * via the LRU.  If it did, give up: shrink_cache will free it.
 297 */
 298void release_pages(struct page **pages, int nr, int cold)
 299{
 300        int i;
 301        struct pagevec pages_to_free;
 302        struct zone *zone = NULL;
 303        unsigned long uninitialized_var(flags);
 304
 305        pagevec_init(&pages_to_free, cold);
 306        for (i = 0; i < nr; i++) {
 307                struct page *page = pages[i];
 308
 309                if (unlikely(PageCompound(page))) {
 310                        if (zone) {
 311                                spin_unlock_irqrestore(&zone->lru_lock, flags);
 312                                zone = NULL;
 313                        }
 314                        put_compound_page(page);
 315                        continue;
 316                }
 317
 318                if (!put_page_testzero(page))
 319                        continue;
 320
 321                if (PageLRU(page)) {
 322                        struct zone *pagezone = page_zone(page);
 323                        if (pagezone != zone) {
 324                                if (zone)
 325                                        spin_unlock_irqrestore(&zone->lru_lock,
 326                                                                        flags);
 327                                zone = pagezone;
 328                                spin_lock_irqsave(&zone->lru_lock, flags);
 329                        }
 330                        VM_BUG_ON(!PageLRU(page));
 331                        __ClearPageLRU(page);
 332                        del_page_from_lru(zone, page);
 333                }
 334
 335                if (!pagevec_add(&pages_to_free, page)) {
 336                        if (zone) {
 337                                spin_unlock_irqrestore(&zone->lru_lock, flags);
 338                                zone = NULL;
 339                        }
 340                        __pagevec_free(&pages_to_free);
 341                        pagevec_reinit(&pages_to_free);
 342                }
 343        }
 344        if (zone)
 345                spin_unlock_irqrestore(&zone->lru_lock, flags);
 346
 347        pagevec_free(&pages_to_free);
 348}
 349
 350/*
 351 * The pages which we're about to release may be in the deferred lru-addition
 352 * queues.  That would prevent them from really being freed right now.  That's
 353 * OK from a correctness point of view but is inefficient - those pages may be
 354 * cache-warm and we want to give them back to the page allocator ASAP.
 355 *
 356 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 357 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 358 * mutual recursion.
 359 */
 360void __pagevec_release(struct pagevec *pvec)
 361{
 362        lru_add_drain();
 363        release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
 364        pagevec_reinit(pvec);
 365}
 366
 367EXPORT_SYMBOL(__pagevec_release);
 368
 369/*
 370 * pagevec_release() for pages which are known to not be on the LRU
 371 *
 372 * This function reinitialises the caller's pagevec.
 373 */
 374void __pagevec_release_nonlru(struct pagevec *pvec)
 375{
 376        int i;
 377        struct pagevec pages_to_free;
 378
 379        pagevec_init(&pages_to_free, pvec->cold);
 380        for (i = 0; i < pagevec_count(pvec); i++) {
 381                struct page *page = pvec->pages[i];
 382
 383                VM_BUG_ON(PageLRU(page));
 384                if (put_page_testzero(page))
 385                        pagevec_add(&pages_to_free, page);
 386        }
 387        pagevec_free(&pages_to_free);
 388        pagevec_reinit(pvec);
 389}
 390
 391/*
 392 * Add the passed pages to the LRU, then drop the caller's refcount
 393 * on them.  Reinitialises the caller's pagevec.
 394 */
 395void __pagevec_lru_add(struct pagevec *pvec)
 396{
 397        int i;
 398        struct zone *zone = NULL;
 399
 400        for (i = 0; i < pagevec_count(pvec); i++) {
 401                struct page *page = pvec->pages[i];
 402                struct zone *pagezone = page_zone(page);
 403
 404                if (pagezone != zone) {
 405                        if (zone)
 406                                spin_unlock_irq(&zone->lru_lock);
 407                        zone = pagezone;
 408                        spin_lock_irq(&zone->lru_lock);
 409                }
 410                VM_BUG_ON(PageLRU(page));
 411                SetPageLRU(page);
 412                add_page_to_inactive_list(zone, page);
 413        }
 414        if (zone)
 415                spin_unlock_irq(&zone->lru_lock);
 416        release_pages(pvec->pages, pvec->nr, pvec->cold);
 417        pagevec_reinit(pvec);
 418}
 419
 420EXPORT_SYMBOL(__pagevec_lru_add);
 421
 422void __pagevec_lru_add_active(struct pagevec *pvec)
 423{
 424        int i;
 425        struct zone *zone = NULL;
 426
 427        for (i = 0; i < pagevec_count(pvec); i++) {
 428                struct page *page = pvec->pages[i];
 429                struct zone *pagezone = page_zone(page);
 430
 431                if (pagezone != zone) {
 432                        if (zone)
 433                                spin_unlock_irq(&zone->lru_lock);
 434                        zone = pagezone;
 435                        spin_lock_irq(&zone->lru_lock);
 436                }
 437                VM_BUG_ON(PageLRU(page));
 438                SetPageLRU(page);
 439                VM_BUG_ON(PageActive(page));
 440                SetPageActive(page);
 441                add_page_to_active_list(zone, page);
 442        }
 443        if (zone)
 444                spin_unlock_irq(&zone->lru_lock);
 445        release_pages(pvec->pages, pvec->nr, pvec->cold);
 446        pagevec_reinit(pvec);
 447}
 448
 449/*
 450 * Try to drop buffers from the pages in a pagevec
 451 */
 452void pagevec_strip(struct pagevec *pvec)
 453{
 454        int i;
 455
 456        for (i = 0; i < pagevec_count(pvec); i++) {
 457                struct page *page = pvec->pages[i];
 458
 459                if (PagePrivate(page) && !TestSetPageLocked(page)) {
 460                        if (PagePrivate(page))
 461                                try_to_release_page(page, 0);
 462                        unlock_page(page);
 463                }
 464        }
 465}
 466
 467/**
 468 * pagevec_lookup - gang pagecache lookup
 469 * @pvec:       Where the resulting pages are placed
 470 * @mapping:    The address_space to search
 471 * @start:      The starting page index
 472 * @nr_pages:   The maximum number of pages
 473 *
 474 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 475 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 476 * reference against the pages in @pvec.
 477 *
 478 * The search returns a group of mapping-contiguous pages with ascending
 479 * indexes.  There may be holes in the indices due to not-present pages.
 480 *
 481 * pagevec_lookup() returns the number of pages which were found.
 482 */
 483unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 484                pgoff_t start, unsigned nr_pages)
 485{
 486        pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 487        return pagevec_count(pvec);
 488}
 489
 490EXPORT_SYMBOL(pagevec_lookup);
 491
 492unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 493                pgoff_t *index, int tag, unsigned nr_pages)
 494{
 495        pvec->nr = find_get_pages_tag(mapping, index, tag,
 496                                        nr_pages, pvec->pages);
 497        return pagevec_count(pvec);
 498}
 499
 500EXPORT_SYMBOL(pagevec_lookup_tag);
 501
 502#ifdef CONFIG_SMP
 503/*
 504 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 505 * CPUs
 506 */
 507#define ACCT_THRESHOLD  max(16, NR_CPUS * 2)
 508
 509static DEFINE_PER_CPU(long, committed_space) = 0;
 510
 511void vm_acct_memory(long pages)
 512{
 513        long *local;
 514
 515        preempt_disable();
 516        local = &__get_cpu_var(committed_space);
 517        *local += pages;
 518        if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
 519                atomic_add(*local, &vm_committed_space);
 520                *local = 0;
 521        }
 522        preempt_enable();
 523}
 524
 525#ifdef CONFIG_HOTPLUG_CPU
 526
 527/* Drop the CPU's cached committed space back into the central pool. */
 528static int cpu_swap_callback(struct notifier_block *nfb,
 529                             unsigned long action,
 530                             void *hcpu)
 531{
 532        long *committed;
 533
 534        committed = &per_cpu(committed_space, (long)hcpu);
 535        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
 536                atomic_add(*committed, &vm_committed_space);
 537                *committed = 0;
 538                drain_cpu_pagevecs((long)hcpu);
 539        }
 540        return NOTIFY_OK;
 541}
 542#endif /* CONFIG_HOTPLUG_CPU */
 543#endif /* CONFIG_SMP */
 544
 545/*
 546 * Perform any setup for the swap system
 547 */
 548void __init swap_setup(void)
 549{
 550        unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 551
 552#ifdef CONFIG_SWAP
 553        bdi_init(swapper_space.backing_dev_info);
 554#endif
 555
 556        /* Use a smaller cluster for small-memory machines */
 557        if (megs < 16)
 558                page_cluster = 2;
 559        else
 560                page_cluster = 3;
 561        /*
 562         * Right now other parts of the system means that we
 563         * _really_ don't want to cluster much more
 564         */
 565#ifdef CONFIG_HOTPLUG_CPU
 566        hotcpu_notifier(cpu_swap_callback, 0);
 567#endif
 568}
 569
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.