linux/mm/compaction.c
<<
>>
Prefs
   1/*
   2 * linux/mm/compaction.c
   3 *
   4 * Memory compaction for the reduction of external fragmentation. Note that
   5 * this heavily depends upon page migration to do all the real heavy
   6 * lifting
   7 *
   8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
   9 */
  10#include <linux/swap.h>
  11#include <linux/migrate.h>
  12#include <linux/compaction.h>
  13#include <linux/mm_inline.h>
  14#include <linux/backing-dev.h>
  15#include <linux/sysctl.h>
  16#include <linux/sysfs.h>
  17#include "internal.h"
  18
  19/*
  20 * compact_control is used to track pages being migrated and the free pages
  21 * they are being migrated to during memory compaction. The free_pfn starts
  22 * at the end of a zone and migrate_pfn begins at the start. Movable pages
  23 * are moved to the end of a zone during a compaction run and the run
  24 * completes when free_pfn <= migrate_pfn
  25 */
  26struct compact_control {
  27        struct list_head freepages;     /* List of free pages to migrate to */
  28        struct list_head migratepages;  /* List of pages being migrated */
  29        unsigned long nr_freepages;     /* Number of isolated free pages */
  30        unsigned long nr_migratepages;  /* Number of pages to migrate */
  31        unsigned long free_pfn;         /* isolate_freepages search base */
  32        unsigned long migrate_pfn;      /* isolate_migratepages search base */
  33
  34        /* Account for isolated anon and file pages */
  35        unsigned long nr_anon;
  36        unsigned long nr_file;
  37
  38        unsigned int order;             /* order a direct compactor needs */
  39        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
  40        struct zone *zone;
  41};
  42
  43static unsigned long release_freepages(struct list_head *freelist)
  44{
  45        struct page *page, *next;
  46        unsigned long count = 0;
  47
  48        list_for_each_entry_safe(page, next, freelist, lru) {
  49                list_del(&page->lru);
  50                __free_page(page);
  51                count++;
  52        }
  53
  54        return count;
  55}
  56
  57/* Isolate free pages onto a private freelist. Must hold zone->lock */
  58static unsigned long isolate_freepages_block(struct zone *zone,
  59                                unsigned long blockpfn,
  60                                struct list_head *freelist)
  61{
  62        unsigned long zone_end_pfn, end_pfn;
  63        int total_isolated = 0;
  64        struct page *cursor;
  65
  66        /* Get the last PFN we should scan for free pages at */
  67        zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
  68        end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
  69
  70        /* Find the first usable PFN in the block to initialse page cursor */
  71        for (; blockpfn < end_pfn; blockpfn++) {
  72                if (pfn_valid_within(blockpfn))
  73                        break;
  74        }
  75        cursor = pfn_to_page(blockpfn);
  76
  77        /* Isolate free pages. This assumes the block is valid */
  78        for (; blockpfn < end_pfn; blockpfn++, cursor++) {
  79                int isolated, i;
  80                struct page *page = cursor;
  81
  82                if (!pfn_valid_within(blockpfn))
  83                        continue;
  84
  85                if (!PageBuddy(page))
  86                        continue;
  87
  88                /* Found a free page, break it into order-0 pages */
  89                isolated = split_free_page(page);
  90                total_isolated += isolated;
  91                for (i = 0; i < isolated; i++) {
  92                        list_add(&page->lru, freelist);
  93                        page++;
  94                }
  95
  96                /* If a page was split, advance to the end of it */
  97                if (isolated) {
  98                        blockpfn += isolated - 1;
  99                        cursor += isolated - 1;
 100                }
 101        }
 102
 103        return total_isolated;
 104}
 105
 106/* Returns true if the page is within a block suitable for migration to */
 107static bool suitable_migration_target(struct page *page)
 108{
 109
 110        int migratetype = get_pageblock_migratetype(page);
 111
 112        /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
 113        if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
 114                return false;
 115
 116        /* If the page is a large free page, then allow migration */
 117        if (PageBuddy(page) && page_order(page) >= pageblock_order)
 118                return true;
 119
 120        /* If the block is MIGRATE_MOVABLE, allow migration */
 121        if (migratetype == MIGRATE_MOVABLE)
 122                return true;
 123
 124        /* Otherwise skip the block */
 125        return false;
 126}
 127
 128/*
 129 * Based on information in the current compact_control, find blocks
 130 * suitable for isolating free pages from and then isolate them.
 131 */
 132static void isolate_freepages(struct zone *zone,
 133                                struct compact_control *cc)
 134{
 135        struct page *page;
 136        unsigned long high_pfn, low_pfn, pfn;
 137        unsigned long flags;
 138        int nr_freepages = cc->nr_freepages;
 139        struct list_head *freelist = &cc->freepages;
 140
 141        pfn = cc->free_pfn;
 142        low_pfn = cc->migrate_pfn + pageblock_nr_pages;
 143        high_pfn = low_pfn;
 144
 145        /*
 146         * Isolate free pages until enough are available to migrate the
 147         * pages on cc->migratepages. We stop searching if the migrate
 148         * and free page scanners meet or enough free pages are isolated.
 149         */
 150        spin_lock_irqsave(&zone->lock, flags);
 151        for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
 152                                        pfn -= pageblock_nr_pages) {
 153                unsigned long isolated;
 154
 155                if (!pfn_valid(pfn))
 156                        continue;
 157
 158                /*
 159                 * Check for overlapping nodes/zones. It's possible on some
 160                 * configurations to have a setup like
 161                 * node0 node1 node0
 162                 * i.e. it's possible that all pages within a zones range of
 163                 * pages do not belong to a single zone.
 164                 */
 165                page = pfn_to_page(pfn);
 166                if (page_zone(page) != zone)
 167                        continue;
 168
 169                /* Check the block is suitable for migration */
 170                if (!suitable_migration_target(page))
 171                        continue;
 172
 173                /* Found a block suitable for isolating free pages from */
 174                isolated = isolate_freepages_block(zone, pfn, freelist);
 175                nr_freepages += isolated;
 176
 177                /*
 178                 * Record the highest PFN we isolated pages from. When next
 179                 * looking for free pages, the search will restart here as
 180                 * page migration may have returned some pages to the allocator
 181                 */
 182                if (isolated)
 183                        high_pfn = max(high_pfn, pfn);
 184        }
 185        spin_unlock_irqrestore(&zone->lock, flags);
 186
 187        /* split_free_page does not map the pages */
 188        list_for_each_entry(page, freelist, lru) {
 189                arch_alloc_page(page, 0);
 190                kernel_map_pages(page, 1, 1);
 191        }
 192
 193        cc->free_pfn = high_pfn;
 194        cc->nr_freepages = nr_freepages;
 195}
 196
 197/* Update the number of anon and file isolated pages in the zone */
 198static void acct_isolated(struct zone *zone, struct compact_control *cc)
 199{
 200        struct page *page;
 201        unsigned int count[NR_LRU_LISTS] = { 0, };
 202
 203        list_for_each_entry(page, &cc->migratepages, lru) {
 204                int lru = page_lru_base_type(page);
 205                count[lru]++;
 206        }
 207
 208        cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
 209        cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
 210        __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
 211        __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
 212}
 213
 214/* Similar to reclaim, but different enough that they don't share logic */
 215static bool too_many_isolated(struct zone *zone)
 216{
 217        unsigned long active, inactive, isolated;
 218
 219        inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
 220                                        zone_page_state(zone, NR_INACTIVE_ANON);
 221        active = zone_page_state(zone, NR_ACTIVE_FILE) +
 222                                        zone_page_state(zone, NR_ACTIVE_ANON);
 223        isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
 224                                        zone_page_state(zone, NR_ISOLATED_ANON);
 225
 226        return isolated > (inactive + active) / 2;
 227}
 228
 229/*
 230 * Isolate all pages that can be migrated from the block pointed to by
 231 * the migrate scanner within compact_control.
 232 */
 233static unsigned long isolate_migratepages(struct zone *zone,
 234                                        struct compact_control *cc)
 235{
 236        unsigned long low_pfn, end_pfn;
 237        struct list_head *migratelist = &cc->migratepages;
 238
 239        /* Do not scan outside zone boundaries */
 240        low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
 241
 242        /* Only scan within a pageblock boundary */
 243        end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
 244
 245        /* Do not cross the free scanner or scan within a memory hole */
 246        if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
 247                cc->migrate_pfn = end_pfn;
 248                return 0;
 249        }
 250
 251        /*
 252         * Ensure that there are not too many pages isolated from the LRU
 253         * list by either parallel reclaimers or compaction. If there are,
 254         * delay for some time until fewer pages are isolated
 255         */
 256        while (unlikely(too_many_isolated(zone))) {
 257                congestion_wait(BLK_RW_ASYNC, HZ/10);
 258
 259                if (fatal_signal_pending(current))
 260                        return 0;
 261        }
 262
 263        /* Time to isolate some pages for migration */
 264        spin_lock_irq(&zone->lru_lock);
 265        for (; low_pfn < end_pfn; low_pfn++) {
 266                struct page *page;
 267                if (!pfn_valid_within(low_pfn))
 268                        continue;
 269
 270                /* Get the page and skip if free */
 271                page = pfn_to_page(low_pfn);
 272                if (PageBuddy(page))
 273                        continue;
 274
 275                /* Try isolate the page */
 276                if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
 277                        continue;
 278
 279                /* Successfully isolated */
 280                del_page_from_lru_list(zone, page, page_lru(page));
 281                list_add(&page->lru, migratelist);
 282                mem_cgroup_del_lru(page);
 283                cc->nr_migratepages++;
 284
 285                /* Avoid isolating too much */
 286                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
 287                        break;
 288        }
 289
 290        acct_isolated(zone, cc);
 291
 292        spin_unlock_irq(&zone->lru_lock);
 293        cc->migrate_pfn = low_pfn;
 294
 295        return cc->nr_migratepages;
 296}
 297
 298/*
 299 * This is a migrate-callback that "allocates" freepages by taking pages
 300 * from the isolated freelists in the block we are migrating to.
 301 */
 302static struct page *compaction_alloc(struct page *migratepage,
 303                                        unsigned long data,
 304                                        int **result)
 305{
 306        struct compact_control *cc = (struct compact_control *)data;
 307        struct page *freepage;
 308
 309        /* Isolate free pages if necessary */
 310        if (list_empty(&cc->freepages)) {
 311                isolate_freepages(cc->zone, cc);
 312
 313                if (list_empty(&cc->freepages))
 314                        return NULL;
 315        }
 316
 317        freepage = list_entry(cc->freepages.next, struct page, lru);
 318        list_del(&freepage->lru);
 319        cc->nr_freepages--;
 320
 321        return freepage;
 322}
 323
 324/*
 325 * We cannot control nr_migratepages and nr_freepages fully when migration is
 326 * running as migrate_pages() has no knowledge of compact_control. When
 327 * migration is complete, we count the number of pages on the lists by hand.
 328 */
 329static void update_nr_listpages(struct compact_control *cc)
 330{
 331        int nr_migratepages = 0;
 332        int nr_freepages = 0;
 333        struct page *page;
 334
 335        list_for_each_entry(page, &cc->migratepages, lru)
 336                nr_migratepages++;
 337        list_for_each_entry(page, &cc->freepages, lru)
 338                nr_freepages++;
 339
 340        cc->nr_migratepages = nr_migratepages;
 341        cc->nr_freepages = nr_freepages;
 342}
 343
 344static int compact_finished(struct zone *zone,
 345                                                struct compact_control *cc)
 346{
 347        unsigned int order;
 348        unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
 349
 350        if (fatal_signal_pending(current))
 351                return COMPACT_PARTIAL;
 352
 353        /* Compaction run completes if the migrate and free scanner meet */
 354        if (cc->free_pfn <= cc->migrate_pfn)
 355                return COMPACT_COMPLETE;
 356
 357        /* Compaction run is not finished if the watermark is not met */
 358        if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
 359                return COMPACT_CONTINUE;
 360
 361        if (cc->order == -1)
 362                return COMPACT_CONTINUE;
 363
 364        /* Direct compactor: Is a suitable page free? */
 365        for (order = cc->order; order < MAX_ORDER; order++) {
 366                /* Job done if page is free of the right migratetype */
 367                if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
 368                        return COMPACT_PARTIAL;
 369
 370                /* Job done if allocation would set block type */
 371                if (order >= pageblock_order && zone->free_area[order].nr_free)
 372                        return COMPACT_PARTIAL;
 373        }
 374
 375        return COMPACT_CONTINUE;
 376}
 377
 378static int compact_zone(struct zone *zone, struct compact_control *cc)
 379{
 380        int ret;
 381
 382        /* Setup to move all movable pages to the end of the zone */
 383        cc->migrate_pfn = zone->zone_start_pfn;
 384        cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
 385        cc->free_pfn &= ~(pageblock_nr_pages-1);
 386
 387        migrate_prep_local();
 388
 389        while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
 390                unsigned long nr_migrate, nr_remaining;
 391
 392                if (!isolate_migratepages(zone, cc))
 393                        continue;
 394
 395                nr_migrate = cc->nr_migratepages;
 396                migrate_pages(&cc->migratepages, compaction_alloc,
 397                                                (unsigned long)cc, 0);
 398                update_nr_listpages(cc);
 399                nr_remaining = cc->nr_migratepages;
 400
 401                count_vm_event(COMPACTBLOCKS);
 402                count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
 403                if (nr_remaining)
 404                        count_vm_events(COMPACTPAGEFAILED, nr_remaining);
 405
 406                /* Release LRU pages not migrated */
 407                if (!list_empty(&cc->migratepages)) {
 408                        putback_lru_pages(&cc->migratepages);
 409                        cc->nr_migratepages = 0;
 410                }
 411
 412        }
 413
 414        /* Release free pages and check accounting */
 415        cc->nr_freepages -= release_freepages(&cc->freepages);
 416        VM_BUG_ON(cc->nr_freepages != 0);
 417
 418        return ret;
 419}
 420
 421static unsigned long compact_zone_order(struct zone *zone,
 422                                                int order, gfp_t gfp_mask)
 423{
 424        struct compact_control cc = {
 425                .nr_freepages = 0,
 426                .nr_migratepages = 0,
 427                .order = order,
 428                .migratetype = allocflags_to_migratetype(gfp_mask),
 429                .zone = zone,
 430        };
 431        INIT_LIST_HEAD(&cc.freepages);
 432        INIT_LIST_HEAD(&cc.migratepages);
 433
 434        return compact_zone(zone, &cc);
 435}
 436
 437int sysctl_extfrag_threshold = 500;
 438
 439/**
 440 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 441 * @zonelist: The zonelist used for the current allocation
 442 * @order: The order of the current allocation
 443 * @gfp_mask: The GFP mask of the current allocation
 444 * @nodemask: The allowed nodes to allocate from
 445 *
 446 * This is the main entry point for direct page compaction.
 447 */
 448unsigned long try_to_compact_pages(struct zonelist *zonelist,
 449                        int order, gfp_t gfp_mask, nodemask_t *nodemask)
 450{
 451        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 452        int may_enter_fs = gfp_mask & __GFP_FS;
 453        int may_perform_io = gfp_mask & __GFP_IO;
 454        unsigned long watermark;
 455        struct zoneref *z;
 456        struct zone *zone;
 457        int rc = COMPACT_SKIPPED;
 458
 459        /*
 460         * Check whether it is worth even starting compaction. The order check is
 461         * made because an assumption is made that the page allocator can satisfy
 462         * the "cheaper" orders without taking special steps
 463         */
 464        if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
 465                return rc;
 466
 467        count_vm_event(COMPACTSTALL);
 468
 469        /* Compact each zone in the list */
 470        for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 471                                                                nodemask) {
 472                int fragindex;
 473                int status;
 474
 475                /*
 476                 * Watermarks for order-0 must be met for compaction. Note
 477                 * the 2UL. This is because during migration, copies of
 478                 * pages need to be allocated and for a short time, the
 479                 * footprint is higher
 480                 */
 481                watermark = low_wmark_pages(zone) + (2UL << order);
 482                if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 483                        continue;
 484
 485                /*
 486                 * fragmentation index determines if allocation failures are
 487                 * due to low memory or external fragmentation
 488                 *
 489                 * index of -1 implies allocations might succeed depending
 490                 *      on watermarks
 491                 * index towards 0 implies failure is due to lack of memory
 492                 * index towards 1000 implies failure is due to fragmentation
 493                 *
 494                 * Only compact if a failure would be due to fragmentation.
 495                 */
 496                fragindex = fragmentation_index(zone, order);
 497                if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
 498                        continue;
 499
 500                if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
 501                        rc = COMPACT_PARTIAL;
 502                        break;
 503                }
 504
 505                status = compact_zone_order(zone, order, gfp_mask);
 506                rc = max(status, rc);
 507
 508                if (zone_watermark_ok(zone, order, watermark, 0, 0))
 509                        break;
 510        }
 511
 512        return rc;
 513}
 514
 515
 516/* Compact all zones within a node */
 517static int compact_node(int nid)
 518{
 519        int zoneid;
 520        pg_data_t *pgdat;
 521        struct zone *zone;
 522
 523        if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
 524                return -EINVAL;
 525        pgdat = NODE_DATA(nid);
 526
 527        /* Flush pending updates to the LRU lists */
 528        lru_add_drain_all();
 529
 530        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 531                struct compact_control cc = {
 532                        .nr_freepages = 0,
 533                        .nr_migratepages = 0,
 534                        .order = -1,
 535                };
 536
 537                zone = &pgdat->node_zones[zoneid];
 538                if (!populated_zone(zone))
 539                        continue;
 540
 541                cc.zone = zone;
 542                INIT_LIST_HEAD(&cc.freepages);
 543                INIT_LIST_HEAD(&cc.migratepages);
 544
 545                compact_zone(zone, &cc);
 546
 547                VM_BUG_ON(!list_empty(&cc.freepages));
 548                VM_BUG_ON(!list_empty(&cc.migratepages));
 549        }
 550
 551        return 0;
 552}
 553
 554/* Compact all nodes in the system */
 555static int compact_nodes(void)
 556{
 557        int nid;
 558
 559        for_each_online_node(nid)
 560                compact_node(nid);
 561
 562        return COMPACT_COMPLETE;
 563}
 564
 565/* The written value is actually unused, all memory is compacted */
 566int sysctl_compact_memory;
 567
 568/* This is the entry point for compacting all nodes via /proc/sys/vm */
 569int sysctl_compaction_handler(struct ctl_table *table, int write,
 570                        void __user *buffer, size_t *length, loff_t *ppos)
 571{
 572        if (write)
 573                return compact_nodes();
 574
 575        return 0;
 576}
 577
 578int sysctl_extfrag_handler(struct ctl_table *table, int write,
 579                        void __user *buffer, size_t *length, loff_t *ppos)
 580{
 581        proc_dointvec_minmax(table, write, buffer, length, ppos);
 582
 583        return 0;
 584}
 585
 586#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
 587ssize_t sysfs_compact_node(struct sys_device *dev,
 588                        struct sysdev_attribute *attr,
 589                        const char *buf, size_t count)
 590{
 591        compact_node(dev->id);
 592
 593        return count;
 594}
 595static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
 596
 597int compaction_register_node(struct node *node)
 598{
 599        return sysdev_create_file(&node->sysdev, &attr_compact);
 600}
 601
 602void compaction_unregister_node(struct node *node)
 603{
 604        return sysdev_remove_file(&node->sysdev, &attr_compact);
 605}
 606#endif /* CONFIG_SYSFS && CONFIG_NUMA */
 607
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.