1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include <linux/stddef.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/interrupt.h>
21#include <linux/pagemap.h>
22#include <linux/jiffies.h>
23#include <linux/bootmem.h>
24#include <linux/memblock.h>
25#include <linux/compiler.h>
26#include <linux/kernel.h>
27#include <linux/kmemcheck.h>
28#include <linux/module.h>
29#include <linux/suspend.h>
30#include <linux/pagevec.h>
31#include <linux/blkdev.h>
32#include <linux/slab.h>
33#include <linux/ratelimit.h>
34#include <linux/oom.h>
35#include <linux/notifier.h>
36#include <linux/topology.h>
37#include <linux/sysctl.h>
38#include <linux/cpu.h>
39#include <linux/cpuset.h>
40#include <linux/memory_hotplug.h>
41#include <linux/nodemask.h>
42#include <linux/vmalloc.h>
43#include <linux/vmstat.h>
44#include <linux/mempolicy.h>
45#include <linux/stop_machine.h>
46#include <linux/sort.h>
47#include <linux/pfn.h>
48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h>
51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h>
54#include <linux/compaction.h>
55#include <trace/events/kmem.h>
56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h>
58#include <linux/prefetch.h>
59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h>
61
62#include <asm/tlbflush.h>
63#include <asm/div64.h>
64#include "internal.h"
65
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
/*
 * Per-CPU integer "numa_node", exported for use outside this file.
 * NOTE(review): presumably holds the NUMA node id of each CPU -- confirm
 * against the accessors that read it.
 */
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
70
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * Per-CPU integer "_numa_mem_", only built when memoryless nodes are
 * supported, and exported for use outside this file.
 * NOTE(review): presumably caches, per CPU, a node that actually has
 * memory -- confirm against the accessors that read it.
 */
DEFINE_PER_CPU(int, _numa_mem_);
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
81
82
83
84
/*
 * One node mask per node state.  At build time every node is marked
 * possible and only bit 0 (node 0) is set in the online mask.  On
 * !CONFIG_NUMA builds bit 0 is additionally preset in the normal-memory,
 * high-memory (CONFIG_HIGHMEM only) and CPU masks.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif
};
EXPORT_SYMBOL(node_states);
97
/*
 * Global page counters.  init_cma_reserved_pageblock() below adds to
 * totalram_pages; the other writers are not visible in this part of the
 * file.
 */
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;

/*
 * NOTE(review): presumably the number of pages excluded from the dirty
 * page-cache limits -- nothing in this part of the file reads or writes it.
 */
unsigned long dirty_balance_reserve __read_mostly;

/* NOTE(review): looks like a tunable for per-cpu list sizing -- confirm. */
int percpu_pagelist_fraction;
/*
 * GFP flags currently allowed; starts out as GFP_BOOT_MASK and is narrowed
 * and restored by pm_restrict_gfp_mask()/pm_restore_gfp_mask().
 */
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
110
111#ifdef CONFIG_PM_SLEEP
112
113
114
115
116
117
118
119
120
/*
 * Copy of gfp_allowed_mask taken by pm_restrict_gfp_mask(); zero means
 * that nothing is currently saved.
 */
static gfp_t saved_gfp_mask;
122
123void pm_restore_gfp_mask(void)
124{
125 WARN_ON(!mutex_is_locked(&pm_mutex));
126 if (saved_gfp_mask) {
127 gfp_allowed_mask = saved_gfp_mask;
128 saved_gfp_mask = 0;
129 }
130}
131
132void pm_restrict_gfp_mask(void)
133{
134 WARN_ON(!mutex_is_locked(&pm_mutex));
135 WARN_ON(saved_gfp_mask);
136 saved_gfp_mask = gfp_allowed_mask;
137 gfp_allowed_mask &= ~GFP_IOFS;
138}
139
140bool pm_suspended_storage(void)
141{
142 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
143 return false;
144 return true;
145}
146#endif
147
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Pageblock order as a run-time variable on this configuration. */
int pageblock_order __read_mostly;
#endif

/* Forward declaration: free_compound_page() uses it before the definition. */
static void __free_pages_ok(struct page *page, unsigned int order);
153
154
155
156
157
158
159
160
161
162
163
164
/*
 * Reserve ratios, MAX_NR_ZONES-1 entries.  The DMA and DMA32 entries (when
 * configured) are 256, the HIGHMEM entry (when configured) and the final
 * unconditional entry are 32.
 * NOTE(review): entry order presumably follows the zone enum with the last
 * zone omitted -- confirm against the code that consumes this table.
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};

EXPORT_SYMBOL(totalram_pages);
179
/*
 * Human-readable zone names.  Optional zones only get an entry when the
 * matching CONFIG option is set, so the array index depends on the
 * configuration.
 */
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};
193
/* Defaults to 1024.  NOTE(review): unit is presumably KiB, per the name. */
int min_free_kbytes = 1024;

/* Init-time bookkeeping; not referenced in this part of the file. */
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
199
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
 * Init-time state used only with CONFIG_HAVE_MEMBLOCK_NODE_MAP: per-zone
 * pfn limits, the requested kernelcore/movablecore sizes and a per-node
 * pfn for the movable zone.
 * NOTE(review): meanings inferred from the names; the code that fills
 * these in is outside this part of the file.
 */
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];

/* Exported zone index.  NOTE(review): set elsewhere -- confirm semantics. */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif
211
#if MAX_NUMNODES > 1
/* Only defined for multi-node builds; start at MAX_NUMNODES and 1. */
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

/*
 * When non-zero, set_pageblock_migratetype() forces every pageblock to
 * MIGRATE_UNMOVABLE and __rmqueue_fallback() always steals whole blocks.
 */
int page_group_by_mobility_disabled __read_mostly;
220
221
222
223
224
225
226void set_pageblock_migratetype(struct page *page, int migratetype)
227{
228
229 if (unlikely(page_group_by_mobility_disabled))
230 migratetype = MIGRATE_UNMOVABLE;
231
232 set_pageblock_flags_group(page, (unsigned long)migratetype,
233 PB_migrate, PB_migrate_end);
234}
235
/* NOTE(review): flag consumed outside this part of the file -- see users. */
bool oom_killer_disabled __read_mostly;
237
238#ifdef CONFIG_DEBUG_VM
239static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
240{
241 int ret = 0;
242 unsigned seq;
243 unsigned long pfn = page_to_pfn(page);
244
245 do {
246 seq = zone_span_seqbegin(zone);
247 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
248 ret = 1;
249 else if (pfn < zone->zone_start_pfn)
250 ret = 1;
251 } while (zone_span_seqretry(zone, seq));
252
253 return ret;
254}
255
/*
 * Return 1 when @page has a valid pfn and really belongs to @zone,
 * 0 otherwise.  page_zone() is only consulted for valid pfns.
 */
static int page_is_consistent(struct zone *zone, struct page *page)
{
	return pfn_valid_within(page_to_pfn(page)) && zone == page_zone(page);
}
265
266
267
/*
 * Debug check (CONFIG_DEBUG_VM): return 1 if @page is outside @zone's span
 * or is not consistent with @zone, 0 if it looks fine.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	return page_outside_zone_boundaries(zone, page) ||
	       !page_is_consistent(zone, page);
}
277#else
/* !CONFIG_DEBUG_VM stub: no range checking, every page is reported as OK. */
static inline int bad_range(struct zone *zone, struct page *page)
{
	/* Arguments are intentionally unused. */
	return 0;
}
282#endif
283
284static void bad_page(struct page *page)
285{
286 static unsigned long resume;
287 static unsigned long nr_shown;
288 static unsigned long nr_unshown;
289
290
291 if (PageHWPoison(page)) {
292 reset_page_mapcount(page);
293 return;
294 }
295
296
297
298
299
300 if (nr_shown == 60) {
301 if (time_before(jiffies, resume)) {
302 nr_unshown++;
303 goto out;
304 }
305 if (nr_unshown) {
306 printk(KERN_ALERT
307 "BUG: Bad page state: %lu messages suppressed\n",
308 nr_unshown);
309 nr_unshown = 0;
310 }
311 nr_shown = 0;
312 }
313 if (nr_shown++ == 0)
314 resume = jiffies + 60 * HZ;
315
316 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
317 current->comm, page_to_pfn(page));
318 dump_page(page);
319
320 print_modules();
321 dump_stack();
322out:
323
324 reset_page_mapcount(page);
325 add_taint(TAINT_BAD_PAGE);
326}
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/*
 * Destructor installed by prep_compound_page(): frees the compound page
 * headed by @page at the order recorded in it.
 */
static void free_compound_page(struct page *page)
{
	unsigned int order = compound_order(page);

	__free_pages_ok(page, order);
}
347
348void prep_compound_page(struct page *page, unsigned long order)
349{
350 int i;
351 int nr_pages = 1 << order;
352
353 set_compound_page_dtor(page, free_compound_page);
354 set_compound_order(page, order);
355 __SetPageHead(page);
356 for (i = 1; i < nr_pages; i++) {
357 struct page *p = page + i;
358 __SetPageTail(p);
359 set_page_count(p, 0);
360 p->first_page = page;
361 }
362}
363
364
365static int destroy_compound_page(struct page *page, unsigned long order)
366{
367 int i;
368 int nr_pages = 1 << order;
369 int bad = 0;
370
371 if (unlikely(compound_order(page) != order) ||
372 unlikely(!PageHead(page))) {
373 bad_page(page);
374 bad++;
375 }
376
377 __ClearPageHead(page);
378
379 for (i = 1; i < nr_pages; i++) {
380 struct page *p = page + i;
381
382 if (unlikely(!PageTail(p) || (p->first_page != page))) {
383 bad_page(page);
384 bad++;
385 }
386 __ClearPageTail(p);
387 }
388
389 return bad;
390}
391
392static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
393{
394 int i;
395
396
397
398
399
400 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
401 for (i = 0; i < (1 << order); i++)
402 clear_highpage(page + i);
403}
404
405#ifdef CONFIG_DEBUG_PAGEALLOC
406unsigned int _debug_guardpage_minorder;
407
408static int __init debug_guardpage_minorder_setup(char *buf)
409{
410 unsigned long res;
411
412 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
413 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
414 return 0;
415 }
416 _debug_guardpage_minorder = res;
417 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
418 return 0;
419}
420__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
421
422static inline void set_page_guard_flag(struct page *page)
423{
424 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
425}
426
427static inline void clear_page_guard_flag(struct page *page)
428{
429 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
430}
431#else
/* !CONFIG_DEBUG_PAGEALLOC: guard pages do not exist, both helpers are no-ops. */
static inline void set_page_guard_flag(struct page *page)
{
}

static inline void clear_page_guard_flag(struct page *page)
{
}
434#endif
435
/*
 * Mark @page as the head of a free buddy block: store @order in the
 * page's private field first, then set the Buddy flag.
 */
static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);

	__SetPageBuddy(page);
}
441
/*
 * Inverse of set_page_order(): clear the Buddy flag first, then zero the
 * order stored in the page's private field.
 */
static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);

	set_page_private(page, 0);
}
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
/*
 * Return the index of the buddy of the block at @page_idx for the given
 * @order, i.e. @page_idx with bit @order flipped.
 */
static inline unsigned long
__find_buddy_index(unsigned long page_idx, unsigned int order)
{
	const unsigned long order_bit = 1 << order;

	return page_idx ^ order_bit;
}
470
471
472
473
474
475
476
477
478
479
480
481
482
483
/*
 * Decide whether @buddy can be merged with @page at @order.
 *
 * Returns 1 only if @buddy has a valid pfn, has the same zone id as @page,
 * is either a guard page or a free buddy page, and records exactly @order.
 * With VM debugging, a mergeable buddy with a non-zero reference count
 * trips VM_BUG_ON.  Returns 0 otherwise.
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if ((page_is_guard(buddy) || PageBuddy(buddy)) &&
	    page_order(buddy) == order) {
		VM_BUG_ON(page_count(buddy) != 0);
		return 1;
	}

	return 0;
}
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529static inline void __free_one_page(struct page *page,
530 struct zone *zone, unsigned int order,
531 int migratetype)
532{
533 unsigned long page_idx;
534 unsigned long combined_idx;
535 unsigned long uninitialized_var(buddy_idx);
536 struct page *buddy;
537
538 if (unlikely(PageCompound(page)))
539 if (unlikely(destroy_compound_page(page, order)))
540 return;
541
542 VM_BUG_ON(migratetype == -1);
543
544 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
545
546 VM_BUG_ON(page_idx & ((1 << order) - 1));
547 VM_BUG_ON(bad_range(zone, page));
548
549 while (order < MAX_ORDER-1) {
550 buddy_idx = __find_buddy_index(page_idx, order);
551 buddy = page + (buddy_idx - page_idx);
552 if (!page_is_buddy(page, buddy, order))
553 break;
554
555
556
557
558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0);
561 __mod_zone_freepage_state(zone, 1 << order,
562 migratetype);
563 } else {
564 list_del(&buddy->lru);
565 zone->free_area[order].nr_free--;
566 rmv_page_order(buddy);
567 }
568 combined_idx = buddy_idx & page_idx;
569 page = page + (combined_idx - page_idx);
570 page_idx = combined_idx;
571 order++;
572 }
573 set_page_order(page, order);
574
575
576
577
578
579
580
581
582
583 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
584 struct page *higher_page, *higher_buddy;
585 combined_idx = buddy_idx & page_idx;
586 higher_page = page + (combined_idx - page_idx);
587 buddy_idx = __find_buddy_index(combined_idx, order + 1);
588 higher_buddy = higher_page + (buddy_idx - combined_idx);
589 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
590 list_add_tail(&page->lru,
591 &zone->free_area[order].free_list[migratetype]);
592 goto out;
593 }
594 }
595
596 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
597out:
598 zone->free_area[order].nr_free++;
599}
600
601static inline int free_pages_check(struct page *page)
602{
603 if (unlikely(page_mapcount(page) |
604 (page->mapping != NULL) |
605 (atomic_read(&page->_count) != 0) |
606 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
607 (mem_cgroup_bad_page_check(page)))) {
608 bad_page(page);
609 return 1;
610 }
611 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
612 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
613 return 0;
614}
615
616
617
618
619
620
621
622
623
624
625
626
627static void free_pcppages_bulk(struct zone *zone, int count,
628 struct per_cpu_pages *pcp)
629{
630 int migratetype = 0;
631 int batch_free = 0;
632 int to_free = count;
633
634 spin_lock(&zone->lock);
635 zone->all_unreclaimable = 0;
636 zone->pages_scanned = 0;
637
638 while (to_free) {
639 struct page *page;
640 struct list_head *list;
641
642
643
644
645
646
647
648
649 do {
650 batch_free++;
651 if (++migratetype == MIGRATE_PCPTYPES)
652 migratetype = 0;
653 list = &pcp->lists[migratetype];
654 } while (list_empty(list));
655
656
657 if (batch_free == MIGRATE_PCPTYPES)
658 batch_free = to_free;
659
660 do {
661 int mt;
662
663 page = list_entry(list->prev, struct page, lru);
664
665 list_del(&page->lru);
666 mt = get_freepage_migratetype(page);
667
668 __free_one_page(page, zone, 0, mt);
669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
672 } while (--to_free && --batch_free && !list_empty(list));
673 }
674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
675 spin_unlock(&zone->lock);
676}
677
678static void free_one_page(struct zone *zone, struct page *page, int order,
679 int migratetype)
680{
681 spin_lock(&zone->lock);
682 zone->all_unreclaimable = 0;
683 zone->pages_scanned = 0;
684
685 __free_one_page(page, zone, order, migratetype);
686 if (unlikely(migratetype != MIGRATE_ISOLATE))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype);
688 spin_unlock(&zone->lock);
689}
690
691static bool free_pages_prepare(struct page *page, unsigned int order)
692{
693 int i;
694 int bad = 0;
695
696 trace_mm_page_free(page, order);
697 kmemcheck_free_shadow(page, order);
698
699 if (PageAnon(page))
700 page->mapping = NULL;
701 for (i = 0; i < (1 << order); i++)
702 bad += free_pages_check(page + i);
703 if (bad)
704 return false;
705
706 if (!PageHighMem(page)) {
707 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
708 debug_check_no_obj_freed(page_address(page),
709 PAGE_SIZE << order);
710 }
711 arch_free_page(page, order);
712 kernel_map_pages(page, 1 << order, 0);
713
714 return true;
715}
716
717static void __free_pages_ok(struct page *page, unsigned int order)
718{
719 unsigned long flags;
720 int migratetype;
721
722 if (!free_pages_prepare(page, order))
723 return;
724
725 local_irq_save(flags);
726 __count_vm_events(PGFREE, 1 << order);
727 migratetype = get_pageblock_migratetype(page);
728 set_freepage_migratetype(page, migratetype);
729 free_one_page(page_zone(page), page, order, migratetype);
730 local_irq_restore(flags);
731}
732
733void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
734{
735 unsigned int nr_pages = 1 << order;
736 unsigned int loop;
737
738 prefetchw(page);
739 for (loop = 0; loop < nr_pages; loop++) {
740 struct page *p = &page[loop];
741
742 if (loop + 1 < nr_pages)
743 prefetchw(p + 1);
744 __ClearPageReserved(p);
745 set_page_count(p, 0);
746 }
747
748 set_page_refcounted(page);
749 __free_pages(page, order);
750}
751
752#ifdef CONFIG_CMA
753
754void __init init_cma_reserved_pageblock(struct page *page)
755{
756 unsigned i = pageblock_nr_pages;
757 struct page *p = page;
758
759 do {
760 __ClearPageReserved(p);
761 set_page_count(p, 0);
762 } while (++p, --i);
763
764 set_page_refcounted(page);
765 set_pageblock_migratetype(page, MIGRATE_CMA);
766 __free_pages(page, pageblock_order);
767 totalram_pages += pageblock_nr_pages;
768}
769#endif
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785static inline void expand(struct zone *zone, struct page *page,
786 int low, int high, struct free_area *area,
787 int migratetype)
788{
789 unsigned long size = 1 << high;
790
791 while (high > low) {
792 area--;
793 high--;
794 size >>= 1;
795 VM_BUG_ON(bad_range(zone, &page[size]));
796
797#ifdef CONFIG_DEBUG_PAGEALLOC
798 if (high < debug_guardpage_minorder()) {
799
800
801
802
803
804
805 INIT_LIST_HEAD(&page[size].lru);
806 set_page_guard_flag(&page[size]);
807 set_page_private(&page[size], high);
808
809 __mod_zone_freepage_state(zone, -(1 << high),
810 migratetype);
811 continue;
812 }
813#endif
814 list_add(&page[size].lru, &area->free_list[migratetype]);
815 area->nr_free++;
816 set_page_order(&page[size], high);
817 }
818}
819
820
821
822
823static inline int check_new_page(struct page *page)
824{
825 if (unlikely(page_mapcount(page) |
826 (page->mapping != NULL) |
827 (atomic_read(&page->_count) != 0) |
828 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
829 (mem_cgroup_bad_page_check(page)))) {
830 bad_page(page);
831 return 1;
832 }
833 return 0;
834}
835
836static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
837{
838 int i;
839
840 for (i = 0; i < (1 << order); i++) {
841 struct page *p = page + i;
842 if (unlikely(check_new_page(p)))
843 return 1;
844 }
845
846 set_page_private(page, 0);
847 set_page_refcounted(page);
848
849 arch_alloc_page(page, order);
850 kernel_map_pages(page, 1 << order, 1);
851
852 if (gfp_flags & __GFP_ZERO)
853 prep_zero_page(page, order, gfp_flags);
854
855 if (order && (gfp_flags & __GFP_COMP))
856 prep_compound_page(page, order);
857
858 return 0;
859}
860
861
862
863
864
865static inline
866struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
867 int migratetype)
868{
869 unsigned int current_order;
870 struct free_area * area;
871 struct page *page;
872
873
874 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
875 area = &(zone->free_area[current_order]);
876 if (list_empty(&area->free_list[migratetype]))
877 continue;
878
879 page = list_entry(area->free_list[migratetype].next,
880 struct page, lru);
881 list_del(&page->lru);
882 rmv_page_order(page);
883 area->nr_free--;
884 expand(zone, page, order, current_order, area, migratetype);
885 return page;
886 }
887
888 return NULL;
889}
890
891
892
893
894
895
/*
 * For each migratetype, the ordered list of migratetypes to steal from
 * when its own free lists are empty.  __rmqueue_fallback() walks a row
 * until it meets MIGRATE_RESERVE, which therefore terminates every row.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
	[MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_CMA] = { MIGRATE_RESERVE },
#else
	[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
	[MIGRATE_RESERVE] = { MIGRATE_RESERVE },
	[MIGRATE_ISOLATE] = { MIGRATE_RESERVE },
};
908
909
910
911
912
913
914int move_freepages(struct zone *zone,
915 struct page *start_page, struct page *end_page,
916 int migratetype)
917{
918 struct page *page;
919 unsigned long order;
920 int pages_moved = 0;
921
922#ifndef CONFIG_HOLES_IN_ZONE
923
924
925
926
927
928
929
930 BUG_ON(page_zone(start_page) != page_zone(end_page));
931#endif
932
933 for (page = start_page; page <= end_page;) {
934
935 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
936
937 if (!pfn_valid_within(page_to_pfn(page))) {
938 page++;
939 continue;
940 }
941
942 if (!PageBuddy(page)) {
943 page++;
944 continue;
945 }
946
947 order = page_order(page);
948 list_move(&page->lru,
949 &zone->free_area[order].free_list[migratetype]);
950 set_freepage_migratetype(page, migratetype);
951 page += 1 << order;
952 pages_moved += 1 << order;
953 }
954
955 return pages_moved;
956}
957
958int move_freepages_block(struct zone *zone, struct page *page,
959 int migratetype)
960{
961 unsigned long start_pfn, end_pfn;
962 struct page *start_page, *end_page;
963
964 start_pfn = page_to_pfn(page);
965 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
966 start_page = pfn_to_page(start_pfn);
967 end_page = start_page + pageblock_nr_pages - 1;
968 end_pfn = start_pfn + pageblock_nr_pages - 1;
969
970
971 if (start_pfn < zone->zone_start_pfn)
972 start_page = page;
973 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
974 return 0;
975
976 return move_freepages(zone, start_page, end_page, migratetype);
977}
978
979static void change_pageblock_range(struct page *pageblock_page,
980 int start_order, int migratetype)
981{
982 int nr_pageblocks = 1 << (start_order - pageblock_order);
983
984 while (nr_pageblocks--) {
985 set_pageblock_migratetype(pageblock_page, migratetype);
986 pageblock_page += pageblock_nr_pages;
987 }
988}
989
990
991static inline struct page *
992__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
993{
994 struct free_area * area;
995 int current_order;
996 struct page *page;
997 int migratetype, i;
998
999
1000 for (current_order = MAX_ORDER-1; current_order >= order;
1001 --current_order) {
1002 for (i = 0;; i++) {
1003 migratetype = fallbacks[start_migratetype][i];
1004
1005
1006 if (migratetype == MIGRATE_RESERVE)
1007 break;
1008
1009 area = &(zone->free_area[current_order]);
1010 if (list_empty(&area->free_list[migratetype]))
1011 continue;
1012
1013 page = list_entry(area->free_list[migratetype].next,
1014 struct page, lru);
1015 area->nr_free--;
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029 if (!is_migrate_cma(migratetype) &&
1030 (unlikely(current_order >= pageblock_order / 2) ||
1031 start_migratetype == MIGRATE_RECLAIMABLE ||
1032 page_group_by_mobility_disabled)) {
1033 int pages;
1034 pages = move_freepages_block(zone, page,
1035 start_migratetype);
1036
1037
1038 if (pages >= (1 << (pageblock_order-1)) ||
1039 page_group_by_mobility_disabled)
1040 set_pageblock_migratetype(page,
1041 start_migratetype);
1042
1043 migratetype = start_migratetype;
1044 }
1045
1046
1047 list_del(&page->lru);
1048 rmv_page_order(page);
1049
1050
1051 if (current_order >= pageblock_order &&
1052 !is_migrate_cma(migratetype))
1053 change_pageblock_range(page, current_order,
1054 start_migratetype);
1055
1056 expand(zone, page, order, current_order, area,
1057 is_migrate_cma(migratetype)
1058 ? migratetype : start_migratetype);
1059
1060 trace_mm_page_alloc_extfrag(page, order, current_order,
1061 start_migratetype, migratetype);
1062
1063 return page;
1064 }
1065 }
1066
1067 return NULL;
1068}
1069
1070
1071
1072
1073
1074static struct page *__rmqueue(struct zone *zone, unsigned int order,
1075 int migratetype)
1076{
1077 struct page *page;
1078
1079retry_reserve:
1080 page = __rmqueue_smallest(zone, order, migratetype);
1081
1082 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1083 page = __rmqueue_fallback(zone, order, migratetype);
1084
1085
1086
1087
1088
1089
1090 if (!page) {
1091 migratetype = MIGRATE_RESERVE;
1092 goto retry_reserve;
1093 }
1094 }
1095
1096 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1097 return page;
1098}
1099
1100
1101
1102
1103
1104
1105static int rmqueue_bulk(struct zone *zone, unsigned int order,
1106 unsigned long count, struct list_head *list,
1107 int migratetype, int cold)
1108{
1109 int mt = migratetype, i;
1110
1111 spin_lock(&zone->lock);
1112 for (i = 0; i < count; ++i) {
1113 struct page *page = __rmqueue(zone, order, migratetype);
1114 if (unlikely(page == NULL))
1115 break;
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126 if (likely(cold == 0))
1127 list_add(&page->lru, list);
1128 else
1129 list_add_tail(&page->lru, list);
1130 if (IS_ENABLED(CONFIG_CMA)) {
1131 mt = get_pageblock_migratetype(page);
1132 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1133 mt = migratetype;
1134 }
1135 set_freepage_migratetype(page, mt);
1136 list = &page->lru;
1137 if (is_migrate_cma(mt))
1138 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1139 -(1 << order));
1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock);
1143 return i;
1144}
1145
1146#ifdef CONFIG_NUMA
1147
1148
1149
1150
1151
1152
1153
1154
1155void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1156{
1157 unsigned long flags;
1158 int to_drain;
1159
1160 local_irq_save(flags);
1161 if (pcp->count >= pcp->batch)
1162 to_drain = pcp->batch;
1163 else
1164 to_drain = pcp->count;
1165 if (to_drain > 0) {
1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1169 local_irq_restore(flags);
1170}
1171#endif
1172
1173
1174
1175
1176
1177
1178
1179
1180static void drain_pages(unsigned int cpu)
1181{
1182 unsigned long flags;
1183 struct zone *zone;
1184
1185 for_each_populated_zone(zone) {
1186 struct per_cpu_pageset *pset;
1187 struct per_cpu_pages *pcp;
1188
1189 local_irq_save(flags);
1190 pset = per_cpu_ptr(zone->pageset, cpu);
1191
1192 pcp = &pset->pcp;
1193 if (pcp->count) {
1194 free_pcppages_bulk(zone, pcp->count, pcp);
1195 pcp->count = 0;
1196 }
1197 local_irq_restore(flags);
1198 }
1199}
1200
1201
1202
1203
/*
 * Drain the per-cpu page lists of the CPU this runs on.  @arg is unused;
 * the signature fits the cross-CPU call in drain_all_pages().
 */
void drain_local_pages(void *arg)
{
	unsigned int this_cpu = smp_processor_id();

	drain_pages(this_cpu);
}
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218void drain_all_pages(void)
1219{
1220 int cpu;
1221 struct per_cpu_pageset *pcp;
1222 struct zone *zone;
1223
1224
1225
1226
1227
1228 static cpumask_t cpus_with_pcps;
1229
1230
1231
1232
1233
1234
1235
1236 for_each_online_cpu(cpu) {
1237 bool has_pcps = false;
1238 for_each_populated_zone(zone) {
1239 pcp = per_cpu_ptr(zone->pageset, cpu);
1240 if (pcp->pcp.count) {
1241 has_pcps = true;
1242 break;
1243 }
1244 }
1245 if (has_pcps)
1246 cpumask_set_cpu(cpu, &cpus_with_pcps);
1247 else
1248 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1249 }
1250 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1251}
1252
1253#ifdef CONFIG_HIBERNATION
1254
1255void mark_free_pages(struct zone *zone)
1256{
1257 unsigned long pfn, max_zone_pfn;
1258 unsigned long flags;
1259 int order, t;
1260 struct list_head *curr;
1261
1262 if (!zone->spanned_pages)
1263 return;
1264
1265 spin_lock_irqsave(&zone->lock, flags);
1266
1267 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1268 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1269 if (pfn_valid(pfn)) {
1270 struct page *page = pfn_to_page(pfn);
1271
1272 if (!swsusp_page_is_forbidden(page))
1273 swsusp_unset_page_free(page);
1274 }
1275
1276 for_each_migratetype_order(order, t) {
1277 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1278 unsigned long i;
1279
1280 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1281 for (i = 0; i < (1UL << order); i++)
1282 swsusp_set_page_free(pfn_to_page(pfn + i));
1283 }
1284 }
1285 spin_unlock_irqrestore(&zone->lock, flags);
1286}
1287#endif
1288
1289
1290
1291
1292
1293void free_hot_cold_page(struct page *page, int cold)
1294{
1295 struct zone *zone = page_zone(page);
1296 struct per_cpu_pages *pcp;
1297 unsigned long flags;
1298 int migratetype;
1299
1300 if (!free_pages_prepare(page, 0))
1301 return;
1302
1303 migratetype = get_pageblock_migratetype(page);
1304 set_freepage_migratetype(page, migratetype);
1305 local_irq_save(flags);
1306 __count_vm_event(PGFREE);
1307
1308
1309
1310
1311
1312
1313
1314
1315 if (migratetype >= MIGRATE_PCPTYPES) {
1316 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1317 free_one_page(zone, page, 0, migratetype);
1318 goto out;
1319 }
1320 migratetype = MIGRATE_MOVABLE;
1321 }
1322
1323 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1324 if (cold)
1325 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1326 else
1327 list_add(&page->lru, &pcp->lists[migratetype]);
1328 pcp->count++;
1329 if (pcp->count >= pcp->high) {
1330 free_pcppages_bulk(zone, pcp->batch, pcp);
1331 pcp->count -= pcp->batch;
1332 }
1333
1334out:
1335 local_irq_restore(flags);
1336}
1337
1338
1339
1340
1341void free_hot_cold_page_list(struct list_head *list, int cold)
1342{
1343 struct page *page, *next;
1344
1345 list_for_each_entry_safe(page, next, list, lru) {
1346 trace_mm_page_free_batched(page, cold);
1347 free_hot_cold_page(page, cold);
1348 }
1349}
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359void split_page(struct page *page, unsigned int order)
1360{
1361 int i;
1362
1363 VM_BUG_ON(PageCompound(page));
1364 VM_BUG_ON(!page_count(page));
1365
1366#ifdef CONFIG_KMEMCHECK
1367
1368
1369
1370
1371 if (kmemcheck_page_is_tracked(page))
1372 split_page(virt_to_page(page[0].shadow), order);
1373#endif
1374
1375 for (i = 1; i < (1 << order); i++)
1376 set_page_refcounted(page + i);
1377}
1378
1379static int __isolate_free_page(struct page *page, unsigned int order)
1380{
1381 unsigned long watermark;
1382 struct zone *zone;
1383 int mt;
1384
1385 BUG_ON(!PageBuddy(page));
1386
1387 zone = page_zone(page);
1388
1389
1390 watermark = low_wmark_pages(zone) + (1 << order);
1391 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1392 return 0;
1393
1394
1395 list_del(&page->lru);
1396 zone->free_area[order].nr_free--;
1397 rmv_page_order(page);
1398
1399 mt = get_pageblock_migratetype(page);
1400 if (unlikely(mt != MIGRATE_ISOLATE))
1401 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1402
1403
1404 if (order >= pageblock_order - 1) {
1405 struct page *endpage = page + (1 << order) - 1;
1406 for (; page < endpage; page += pageblock_nr_pages) {
1407 int mt = get_pageblock_migratetype(page);
1408 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1409 set_pageblock_migratetype(page,
1410 MIGRATE_MOVABLE);
1411 }
1412 }
1413
1414 return 1UL << order;
1415}
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
/*
 * Isolate the free buddy block headed by @page and split it into
 * individually referenced order-0 pages.
 *
 * Returns the number of pages obtained, or 0 if the block could not be
 * isolated.
 */
int split_free_page(struct page *page)
{
	unsigned int order = page_order(page);
	int nr_pages = __isolate_free_page(page, order);

	if (nr_pages) {
		/* The head needs a reference before split_page(). */
		set_page_refcounted(page);
		split_page(page, order);
	}

	return nr_pages;
}
1443
1444
1445
1446
1447
1448
1449static inline
1450struct page *buffered_rmqueue(struct zone *preferred_zone,
1451 struct zone *zone, int order, gfp_t gfp_flags,
1452 int migratetype)
1453{
1454 unsigned long flags;
1455 struct page *page;
1456 int cold = !!(gfp_flags & __GFP_COLD);
1457
1458again:
1459 if (likely(order == 0)) {
1460 struct per_cpu_pages *pcp;
1461 struct list_head *list;
1462
1463 local_irq_save(flags);
1464 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1465 list = &pcp->lists[migratetype];
1466 if (list_empty(list)) {
1467 pcp->count += rmqueue_bulk(zone, 0,
1468 pcp->batch, list,
1469 migratetype, cold);
1470 if (unlikely(list_empty(list)))
1471 goto failed;
1472 }
1473
1474 if (cold)
1475 page = list_entry(list->prev, struct page, lru);
1476 else
1477 page = list_entry(list->next, struct page, lru);
1478
1479 list_del(&page->lru);
1480 pcp->count--;
1481 } else {
1482 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493 WARN_ON_ONCE(order > 1);
1494 }
1495 spin_lock_irqsave(&zone->lock, flags);
1496 page = __rmqueue(zone, order, migratetype);
1497 spin_unlock(&zone->lock);
1498 if (!page)
1499 goto failed;
1500 __mod_zone_freepage_state(zone, -(1 << order),
1501 get_pageblock_migratetype(page));
1502 }
1503
1504 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1505 zone_statistics(preferred_zone, zone, gfp_flags);
1506 local_irq_restore(flags);
1507
1508 VM_BUG_ON(bad_range(zone, page));
1509 if (prep_new_page(page, order, gfp_flags))
1510 goto again;
1511 return page;
1512
1513failed:
1514 local_irq_restore(flags);
1515 return NULL;
1516}
1517
1518#ifdef CONFIG_FAIL_PAGE_ALLOC
1519
/*
 * Fault-injection settings for page allocation.  By default requests with
 * __GFP_WAIT or __GFP_HIGHMEM are never failed, and neither are requests
 * below order 1 (see should_fail_alloc_page()).
 */
static struct {
	struct fault_attr attr;

	u32 ignore_gfp_highmem;
	u32 ignore_gfp_wait;
	u32 min_order;
} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_wait = 1,
	.ignore_gfp_highmem = 1,
	.min_order = 1,
};
1532
1533static int __init setup_fail_page_alloc(char *str)
1534{
1535 return setup_fault_attr(&fail_page_alloc.attr, str);
1536}
1537__setup("fail_page_alloc=", setup_fail_page_alloc);
1538
1539static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1540{
1541 if (order < fail_page_alloc.min_order)
1542 return false;
1543 if (gfp_mask & __GFP_NOFAIL)
1544 return false;
1545 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1546 return false;
1547 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1548 return false;
1549
1550 return should_fail(&fail_page_alloc.attr, 1 << order);
1551}
1552
1553#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1554
1555static int __init fail_page_alloc_debugfs(void)
1556{
1557 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1558 struct dentry *dir;
1559
1560 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1561 &fail_page_alloc.attr);
1562 if (IS_ERR(dir))
1563 return PTR_ERR(dir);
1564
1565 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1566 &fail_page_alloc.ignore_gfp_wait))
1567 goto fail;
1568 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1569 &fail_page_alloc.ignore_gfp_highmem))
1570 goto fail;
1571 if (!debugfs_create_u32("min-order", mode, dir,
1572 &fail_page_alloc.min_order))
1573 goto fail;
1574
1575 return 0;
1576fail:
1577 debugfs_remove_recursive(dir);
1578
1579 return -ENOMEM;
1580}
1581
1582late_initcall(fail_page_alloc_debugfs);
1583
1584#endif
1585
1586#else
1587
1588static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1589{
1590 return false;
1591}
1592
1593#endif
1594
1595
1596
1597
1598
1599static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1600 int classzone_idx, int alloc_flags, long free_pages)
1601{
1602
1603 long min = mark;
1604 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1605 int o;
1606
1607 free_pages -= (1 << order) - 1;
1608 if (alloc_flags & ALLOC_HIGH)
1609 min -= min / 2;
1610 if (alloc_flags & ALLOC_HARDER)
1611 min -= min / 4;
1612#ifdef CONFIG_CMA
1613
1614 if (!(alloc_flags & ALLOC_CMA))
1615 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1616#endif
1617 if (free_pages <= min + lowmem_reserve)
1618 return false;
1619 for (o = 0; o < order; o++) {
1620
1621 free_pages -= z->free_area[o].nr_free << o;
1622
1623
1624 min >>= 1;
1625
1626 if (free_pages <= min)
1627 return false;
1628 }
1629 return true;
1630}
1631
#ifdef CONFIG_MEMORY_ISOLATION
/*
 * Number of pages covered by the zone's isolated pageblocks
 * (nr_pageblock_isolate * pageblock_nr_pages); 0 when none are isolated.
 */
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
{
	if (likely(!zone->nr_pageblock_isolate))
		return 0;

	return zone->nr_pageblock_isolate * pageblock_nr_pages;
}
#else
/* Without memory isolation there are never isolated pages to discount. */
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
{
	return 0;
}
#endif
1645
1646bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1647 int classzone_idx, int alloc_flags)
1648{
1649 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1650 zone_page_state(z, NR_FREE_PAGES));
1651}
1652
1653bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1654 int classzone_idx, int alloc_flags)
1655{
1656 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1657
1658 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1659 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1660
1661
1662
1663
1664
1665
1666
1667
1668 free_pages -= nr_zone_isolate_freepages(z);
1669 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1670 free_pages);
1671}
1672
1673#ifdef CONFIG_NUMA
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1697{
1698 struct zonelist_cache *zlc;
1699 nodemask_t *allowednodes;
1700
1701 zlc = zonelist->zlcache_ptr;
1702 if (!zlc)
1703 return NULL;
1704
1705 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1706 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1707 zlc->last_full_zap = jiffies;
1708 }
1709
1710 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1711 &cpuset_current_mems_allowed :
1712 &node_states[N_HIGH_MEMORY];
1713 return allowednodes;
1714}
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1739 nodemask_t *allowednodes)
1740{
1741 struct zonelist_cache *zlc;
1742 int i;
1743 int n;
1744
1745 zlc = zonelist->zlcache_ptr;
1746 if (!zlc)
1747 return 1;
1748
1749 i = z - zonelist->_zonerefs;
1750 n = zlc->z_to_n[i];
1751
1752
1753 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1754}
1755
1756
1757
1758
1759
1760
1761static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1762{
1763 struct zonelist_cache *zlc;
1764 int i;
1765
1766 zlc = zonelist->zlcache_ptr;
1767 if (!zlc)
1768 return;
1769
1770 i = z - zonelist->_zonerefs;
1771
1772 set_bit(i, zlc->fullzones);
1773}
1774
1775
1776
1777
1778
1779static void zlc_clear_zones_full(struct zonelist *zonelist)
1780{
1781 struct zonelist_cache *zlc;
1782
1783 zlc = zonelist->zlcache_ptr;
1784 if (!zlc)
1785 return;
1786
1787 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1788}
1789
1790static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1791{
1792 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1793}
1794
1795static void __paginginit init_zone_allows_reclaim(int nid)
1796{
1797 int i;
1798
1799 for_each_online_node(i)
1800 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1801 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1802 else
1803 zone_reclaim_mode = 1;
1804}
1805
1806#else
1807
1808static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1809{
1810 return NULL;
1811}
1812
1813static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1814 nodemask_t *allowednodes)
1815{
1816 return 1;
1817}
1818
1819static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1820{
1821}
1822
1823static void zlc_clear_zones_full(struct zonelist *zonelist)
1824{
1825}
1826
1827static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1828{
1829 return true;
1830}
1831
1832static inline void init_zone_allows_reclaim(int nid)
1833{
1834}
1835#endif
1836
1837
1838
1839
1840
1841static struct page *
1842get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1843 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1844 struct zone *preferred_zone, int migratetype)
1845{
1846 struct zoneref *z;
1847 struct page *page = NULL;
1848 int classzone_idx;
1849 struct zone *zone;
1850 nodemask_t *allowednodes = NULL;
1851 int zlc_active = 0;
1852 int did_zlc_setup = 0;
1853
1854 classzone_idx = zone_idx(preferred_zone);
1855zonelist_scan:
1856
1857
1858
1859
1860 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1861 high_zoneidx, nodemask) {
1862 if (NUMA_BUILD && zlc_active &&
1863 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1864 continue;
1865 if ((alloc_flags & ALLOC_CPUSET) &&
1866 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1867 continue;
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1895 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1896 goto this_zone_full;
1897
1898 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1899 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1900 unsigned long mark;
1901 int ret;
1902
1903 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1904 if (zone_watermark_ok(zone, order, mark,
1905 classzone_idx, alloc_flags))
1906 goto try_this_zone;
1907
1908 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1909
1910
1911
1912
1913
1914 allowednodes = zlc_setup(zonelist, alloc_flags);
1915 zlc_active = 1;
1916 did_zlc_setup = 1;
1917 }
1918
1919 if (zone_reclaim_mode == 0 ||
1920 !zone_allows_reclaim(preferred_zone, zone))
1921 goto this_zone_full;
1922
1923
1924
1925
1926
1927 if (NUMA_BUILD && zlc_active &&
1928 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1929 continue;
1930
1931 ret = zone_reclaim(zone, gfp_mask, order);
1932 switch (ret) {
1933 case ZONE_RECLAIM_NOSCAN:
1934
1935 continue;
1936 case ZONE_RECLAIM_FULL:
1937
1938 continue;
1939 default:
1940
1941 if (!zone_watermark_ok(zone, order, mark,
1942 classzone_idx, alloc_flags))
1943 goto this_zone_full;
1944 }
1945 }
1946
1947try_this_zone:
1948 page = buffered_rmqueue(preferred_zone, zone, order,
1949 gfp_mask, migratetype);
1950 if (page)
1951 break;
1952this_zone_full:
1953 if (NUMA_BUILD)
1954 zlc_mark_zone_full(zonelist, z);
1955 }
1956
1957 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1958
1959 zlc_active = 0;
1960 goto zonelist_scan;
1961 }
1962
1963 if (page)
1964
1965
1966
1967
1968
1969
1970
1971 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1972
1973 return page;
1974}
1975
1976
1977
1978
1979
/*
 * Whether warn_alloc_failed() should skip its show_mem() dump.
 * Only configurations with NODES_SHIFT > 8 ever suppress it, and then
 * only while in interrupt context.
 */
static inline bool should_suppress_show_mem(void)
{
#if NODES_SHIFT > 8
	return in_interrupt();
#else
	return false;
#endif
}
1989
1990static DEFINE_RATELIMIT_STATE(nopage_rs,
1991 DEFAULT_RATELIMIT_INTERVAL,
1992 DEFAULT_RATELIMIT_BURST);
1993
1994void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1995{
1996 unsigned int filter = SHOW_MEM_FILTER_NODES;
1997
1998 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1999 debug_guardpage_minorder() > 0)
2000 return;
2001
2002
2003
2004
2005
2006
2007 if (!(gfp_mask & __GFP_NOMEMALLOC))
2008 if (test_thread_flag(TIF_MEMDIE) ||
2009 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2010 filter &= ~SHOW_MEM_FILTER_NODES;
2011 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2012 filter &= ~SHOW_MEM_FILTER_NODES;
2013
2014 if (fmt) {
2015 struct va_format vaf;
2016 va_list args;
2017
2018 va_start(args, fmt);
2019
2020 vaf.fmt = fmt;
2021 vaf.va = &args;
2022
2023 pr_warn("%pV", &vaf);
2024
2025 va_end(args);
2026 }
2027
2028 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2029 current->comm, order, gfp_mask);
2030
2031 dump_stack();
2032 if (!should_suppress_show_mem())
2033 show_mem(filter);
2034}
2035
2036static inline int
2037should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2038 unsigned long did_some_progress,
2039 unsigned long pages_reclaimed)
2040{
2041
2042 if (gfp_mask & __GFP_NORETRY)
2043 return 0;
2044
2045
2046 if (gfp_mask & __GFP_NOFAIL)
2047 return 1;
2048
2049
2050
2051
2052
2053
2054 if (!did_some_progress && pm_suspended_storage())
2055 return 0;
2056
2057
2058
2059
2060
2061
2062 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2063 return 1;
2064
2065
2066
2067
2068
2069
2070
2071
2072 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2073 return 1;
2074
2075 return 0;
2076}
2077
2078static inline struct page *
2079__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2080 struct zonelist *zonelist, enum zone_type high_zoneidx,
2081 nodemask_t *nodemask, struct zone *preferred_zone,
2082 int migratetype)
2083{
2084 struct page *page;
2085
2086
2087 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2088 schedule_timeout_uninterruptible(1);
2089 return NULL;
2090 }
2091
2092
2093
2094
2095
2096
2097 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2098 order, zonelist, high_zoneidx,
2099 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2100 preferred_zone, migratetype);
2101 if (page)
2102 goto out;
2103
2104 if (!(gfp_mask & __GFP_NOFAIL)) {
2105
2106 if (order > PAGE_ALLOC_COSTLY_ORDER)
2107 goto out;
2108
2109 if (high_zoneidx < ZONE_NORMAL)
2110 goto out;
2111
2112
2113
2114
2115
2116
2117
2118 if (gfp_mask & __GFP_THISNODE)
2119 goto out;
2120 }
2121
2122 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2123
2124out:
2125 clear_zonelist_oom(zonelist, gfp_mask);
2126 return page;
2127}
2128
2129#ifdef CONFIG_COMPACTION
2130
2131static struct page *
2132__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2133 struct zonelist *zonelist, enum zone_type high_zoneidx,
2134 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2135 int migratetype, bool sync_migration,
2136 bool *contended_compaction, bool *deferred_compaction,
2137 unsigned long *did_some_progress)
2138{
2139 if (!order)
2140 return NULL;
2141
2142 if (compaction_deferred(preferred_zone, order)) {
2143 *deferred_compaction = true;
2144 return NULL;
2145 }
2146
2147 current->flags |= PF_MEMALLOC;
2148 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2149 nodemask, sync_migration,
2150 contended_compaction);
2151 current->flags &= ~PF_MEMALLOC;
2152
2153 if (*did_some_progress != COMPACT_SKIPPED) {
2154 struct page *page;
2155
2156
2157 drain_pages(get_cpu());
2158 put_cpu();
2159
2160 page = get_page_from_freelist(gfp_mask, nodemask,
2161 order, zonelist, high_zoneidx,
2162 alloc_flags & ~ALLOC_NO_WATERMARKS,
2163 preferred_zone, migratetype);
2164 if (page) {
2165 preferred_zone->compact_blockskip_flush = false;
2166 preferred_zone->compact_considered = 0;
2167 preferred_zone->compact_defer_shift = 0;
2168 if (order >= preferred_zone->compact_order_failed)
2169 preferred_zone->compact_order_failed = order + 1;
2170 count_vm_event(COMPACTSUCCESS);
2171 return page;
2172 }
2173
2174
2175
2176
2177
2178
2179 count_vm_event(COMPACTFAIL);
2180
2181
2182
2183
2184
2185 if (sync_migration)
2186 defer_compaction(preferred_zone, order);
2187
2188 cond_resched();
2189 }
2190
2191 return NULL;
2192}
2193#else
2194static inline struct page *
2195__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2196 struct zonelist *zonelist, enum zone_type high_zoneidx,
2197 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2198 int migratetype, bool sync_migration,
2199 bool *contended_compaction, bool *deferred_compaction,
2200 unsigned long *did_some_progress)
2201{
2202 return NULL;
2203}
2204#endif
2205
2206
2207static int
2208__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2209 nodemask_t *nodemask)
2210{
2211 struct reclaim_state reclaim_state;
2212 int progress;
2213
2214 cond_resched();
2215
2216
2217 cpuset_memory_pressure_bump();
2218 current->flags |= PF_MEMALLOC;
2219 lockdep_set_current_reclaim_state(gfp_mask);
2220 reclaim_state.reclaimed_slab = 0;
2221 current->reclaim_state = &reclaim_state;
2222
2223 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2224
2225 current->reclaim_state = NULL;
2226 lockdep_clear_current_reclaim_state();
2227 current->flags &= ~PF_MEMALLOC;
2228
2229 cond_resched();
2230
2231 return progress;
2232}
2233
2234
2235static inline struct page *
2236__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2237 struct zonelist *zonelist, enum zone_type high_zoneidx,
2238 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2239 int migratetype, unsigned long *did_some_progress)
2240{
2241 struct page *page = NULL;
2242 bool drained = false;
2243
2244 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2245 nodemask);
2246 if (unlikely(!(*did_some_progress)))
2247 return NULL;
2248
2249
2250 if (NUMA_BUILD)
2251 zlc_clear_zones_full(zonelist);
2252
2253retry:
2254 page = get_page_from_freelist(gfp_mask, nodemask, order,
2255 zonelist, high_zoneidx,
2256 alloc_flags & ~ALLOC_NO_WATERMARKS,
2257 preferred_zone, migratetype);
2258
2259
2260
2261
2262
2263 if (!page && !drained) {
2264 drain_all_pages();
2265 drained = true;
2266 goto retry;
2267 }
2268
2269 return page;
2270}
2271
2272
2273
2274
2275
2276static inline struct page *
2277__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2278 struct zonelist *zonelist, enum zone_type high_zoneidx,
2279 nodemask_t *nodemask, struct zone *preferred_zone,
2280 int migratetype)
2281{
2282 struct page *page;
2283
2284 do {
2285 page = get_page_from_freelist(gfp_mask, nodemask, order,
2286 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2287 preferred_zone, migratetype);
2288
2289 if (!page && gfp_mask & __GFP_NOFAIL)
2290 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2291 } while (!page && (gfp_mask & __GFP_NOFAIL));
2292
2293 return page;
2294}
2295
2296static inline
2297void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2298 enum zone_type high_zoneidx,
2299 enum zone_type classzone_idx)
2300{
2301 struct zoneref *z;
2302 struct zone *zone;
2303
2304 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2305 wakeup_kswapd(zone, order, classzone_idx);
2306}
2307
2308static inline int
2309gfp_to_alloc_flags(gfp_t gfp_mask)
2310{
2311 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2312 const gfp_t wait = gfp_mask & __GFP_WAIT;
2313
2314
2315 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2316
2317
2318
2319
2320
2321
2322
2323 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2324
2325 if (!wait) {
2326
2327
2328
2329
2330 if (!(gfp_mask & __GFP_NOMEMALLOC))
2331 alloc_flags |= ALLOC_HARDER;
2332
2333
2334
2335
2336 alloc_flags &= ~ALLOC_CPUSET;
2337 } else if (unlikely(rt_task(current)) && !in_interrupt())
2338 alloc_flags |= ALLOC_HARDER;
2339
2340 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2341 if (gfp_mask & __GFP_MEMALLOC)
2342 alloc_flags |= ALLOC_NO_WATERMARKS;
2343 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2344 alloc_flags |= ALLOC_NO_WATERMARKS;
2345 else if (!in_interrupt() &&
2346 ((current->flags & PF_MEMALLOC) ||
2347 unlikely(test_thread_flag(TIF_MEMDIE))))
2348 alloc_flags |= ALLOC_NO_WATERMARKS;
2349 }
2350#ifdef CONFIG_CMA
2351 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2352 alloc_flags |= ALLOC_CMA;
2353#endif
2354 return alloc_flags;
2355}
2356
2357bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2358{
2359 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2360}
2361
2362static inline struct page *
2363__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2364 struct zonelist *zonelist, enum zone_type high_zoneidx,
2365 nodemask_t *nodemask, struct zone *preferred_zone,
2366 int migratetype)
2367{
2368 const gfp_t wait = gfp_mask & __GFP_WAIT;
2369 struct page *page = NULL;
2370 int alloc_flags;
2371 unsigned long pages_reclaimed = 0;
2372 unsigned long did_some_progress;
2373 bool sync_migration = false;
2374 bool deferred_compaction = false;
2375 bool contended_compaction = false;
2376
2377
2378
2379
2380
2381
2382
2383 if (order >= MAX_ORDER) {
2384 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2385 return NULL;
2386 }
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2397 goto nopage;
2398
2399restart:
2400 if (!(gfp_mask & __GFP_NO_KSWAPD))
2401 wake_all_kswapd(order, zonelist, high_zoneidx,
2402 zone_idx(preferred_zone));
2403
2404
2405
2406
2407
2408
2409 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2410
2411
2412
2413
2414
2415 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2416 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2417 &preferred_zone);
2418
2419rebalance:
2420
2421 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2422 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2423 preferred_zone, migratetype);
2424 if (page)
2425 goto got_pg;
2426
2427
2428 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2429
2430
2431
2432
2433
2434 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2435
2436 page = __alloc_pages_high_priority(gfp_mask, order,
2437 zonelist, high_zoneidx, nodemask,
2438 preferred_zone, migratetype);
2439 if (page) {
2440 goto got_pg;
2441 }
2442 }
2443
2444
2445 if (!wait)
2446 goto nopage;
2447
2448
2449 if (current->flags & PF_MEMALLOC)
2450 goto nopage;
2451
2452
2453 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2454 goto nopage;
2455
2456
2457
2458
2459
2460 page = __alloc_pages_direct_compact(gfp_mask, order,
2461 zonelist, high_zoneidx,
2462 nodemask,
2463 alloc_flags, preferred_zone,
2464 migratetype, sync_migration,
2465 &contended_compaction,
2466 &deferred_compaction,
2467 &did_some_progress);
2468 if (page)
2469 goto got_pg;
2470 sync_migration = true;
2471
2472
2473
2474
2475
2476
2477
2478 if ((deferred_compaction || contended_compaction) &&
2479 (gfp_mask & __GFP_NO_KSWAPD))
2480 goto nopage;
2481
2482
2483 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2484 zonelist, high_zoneidx,
2485 nodemask,
2486 alloc_flags, preferred_zone,
2487 migratetype, &did_some_progress);
2488 if (page)
2489 goto got_pg;
2490
2491
2492
2493
2494
2495 if (!did_some_progress) {
2496 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2497 if (oom_killer_disabled)
2498 goto nopage;
2499
2500 if ((current->flags & PF_DUMPCORE) &&
2501 !(gfp_mask & __GFP_NOFAIL))
2502 goto nopage;
2503 page = __alloc_pages_may_oom(gfp_mask, order,
2504 zonelist, high_zoneidx,
2505 nodemask, preferred_zone,
2506 migratetype);
2507 if (page)
2508 goto got_pg;
2509
2510 if (!(gfp_mask & __GFP_NOFAIL)) {
2511
2512
2513
2514
2515
2516
2517 if (order > PAGE_ALLOC_COSTLY_ORDER)
2518 goto nopage;
2519
2520
2521
2522
2523
2524 if (high_zoneidx < ZONE_NORMAL)
2525 goto nopage;
2526 }
2527
2528 goto restart;
2529 }
2530 }
2531
2532
2533 pages_reclaimed += did_some_progress;
2534 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2535 pages_reclaimed)) {
2536
2537 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2538 goto rebalance;
2539 } else {
2540
2541
2542
2543
2544
2545 page = __alloc_pages_direct_compact(gfp_mask, order,
2546 zonelist, high_zoneidx,
2547 nodemask,
2548 alloc_flags, preferred_zone,
2549 migratetype, sync_migration,
2550 &contended_compaction,
2551 &deferred_compaction,
2552 &did_some_progress);
2553 if (page)
2554 goto got_pg;
2555 }
2556
2557nopage:
2558 warn_alloc_failed(gfp_mask, order, NULL);
2559 return page;
2560got_pg:
2561 if (kmemcheck_enabled)
2562 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2563
2564 return page;
2565}
2566
2567
2568
2569
2570struct page *
2571__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2572 struct zonelist *zonelist, nodemask_t *nodemask)
2573{
2574 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2575 struct zone *preferred_zone;
2576 struct page *page = NULL;
2577 int migratetype = allocflags_to_migratetype(gfp_mask);
2578 unsigned int cpuset_mems_cookie;
2579 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2580
2581 gfp_mask &= gfp_allowed_mask;
2582
2583 lockdep_trace_alloc(gfp_mask);
2584
2585 might_sleep_if(gfp_mask & __GFP_WAIT);
2586
2587 if (should_fail_alloc_page(gfp_mask, order))
2588 return NULL;
2589
2590
2591
2592
2593
2594
2595 if (unlikely(!zonelist->_zonerefs->zone))
2596 return NULL;
2597
2598retry_cpuset:
2599 cpuset_mems_cookie = get_mems_allowed();
2600
2601
2602 first_zones_zonelist(zonelist, high_zoneidx,
2603 nodemask ? : &cpuset_current_mems_allowed,
2604 &preferred_zone);
2605 if (!preferred_zone)
2606 goto out;
2607
2608#ifdef CONFIG_CMA
2609 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2610 alloc_flags |= ALLOC_CMA;
2611#endif
2612
2613 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2614 zonelist, high_zoneidx, alloc_flags,
2615 preferred_zone, migratetype);
2616 if (unlikely(!page))
2617 page = __alloc_pages_slowpath(gfp_mask, order,
2618 zonelist, high_zoneidx, nodemask,
2619 preferred_zone, migratetype);
2620
2621 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2622
2623out:
2624
2625
2626
2627
2628
2629
2630 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2631 goto retry_cpuset;
2632
2633 return page;
2634}
2635EXPORT_SYMBOL(__alloc_pages_nodemask);
2636
2637
2638
2639
2640unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2641{
2642 struct page *page;
2643
2644
2645
2646
2647
2648 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2649
2650 page = alloc_pages(gfp_mask, order);
2651 if (!page)
2652 return 0;
2653 return (unsigned long) page_address(page);
2654}
2655EXPORT_SYMBOL(__get_free_pages);
2656
2657unsigned long get_zeroed_page(gfp_t gfp_mask)
2658{
2659 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2660}
2661EXPORT_SYMBOL(get_zeroed_page);
2662
/*
 * Drop one reference to @page.  If put_page_testzero() reports that it was
 * the last one, free the 2^order block: single pages go through
 * free_hot_cold_page(), larger blocks through __free_pages_ok().
 */
void __free_pages(struct page *page, unsigned int order)
{
	if (!put_page_testzero(page))
		return;

	if (order != 0)
		__free_pages_ok(page, order);
	else
		free_hot_cold_page(page, 0);
}

EXPORT_SYMBOL(__free_pages);
2674
/*
 * Free the 2^order pages that start at kernel virtual address @addr.
 * An @addr of 0 is ignored.
 */
void free_pages(unsigned long addr, unsigned int order)
{
	if (addr == 0)
		return;

	VM_BUG_ON(!virt_addr_valid((void *)addr));
	__free_pages(virt_to_page((void *)addr), order);
}

EXPORT_SYMBOL(free_pages);
2684
2685static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2686{
2687 if (addr) {
2688 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2689 unsigned long used = addr + PAGE_ALIGN(size);
2690
2691 split_page(virt_to_page((void *)addr), order);
2692 while (used < alloc_end) {
2693 free_page(used);
2694 used += PAGE_SIZE;
2695 }
2696 }
2697 return (void *)addr;
2698}
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2714{
2715 unsigned int order = get_order(size);
2716 unsigned long addr;
2717
2718 addr = __get_free_pages(gfp_mask, order);
2719 return make_alloc_exact(addr, order, size);
2720}
2721EXPORT_SYMBOL(alloc_pages_exact);
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2736{
2737 unsigned order = get_order(size);
2738 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2739 if (!p)
2740 return NULL;
2741 return make_alloc_exact((unsigned long)page_address(p), order, size);
2742}
2743EXPORT_SYMBOL(alloc_pages_exact_nid);
2744
2745
2746
2747
2748
2749
2750
2751
2752void free_pages_exact(void *virt, size_t size)
2753{
2754 unsigned long addr = (unsigned long)virt;
2755 unsigned long end = addr + PAGE_ALIGN(size);
2756
2757 while (addr < end) {
2758 free_page(addr);
2759 addr += PAGE_SIZE;
2760 }
2761}
2762EXPORT_SYMBOL(free_pages_exact);
2763
2764static unsigned int nr_free_zone_pages(int offset)
2765{
2766 struct zoneref *z;
2767 struct zone *zone;
2768
2769
2770 unsigned int sum = 0;
2771
2772 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2773
2774 for_each_zone_zonelist(zone, z, zonelist, offset) {
2775 unsigned long size = zone->present_pages;
2776 unsigned long high = high_wmark_pages(zone);
2777 if (size > high)
2778 sum += size - high;
2779 }
2780
2781 return sum;
2782}
2783
2784
2785
2786
2787unsigned int nr_free_buffer_pages(void)
2788{
2789 return nr_free_zone_pages(gfp_zone(GFP_USER));
2790}
2791EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2792
2793
2794
2795
2796unsigned int nr_free_pagecache_pages(void)
2797{
2798 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2799}
2800
2801static inline void show_node(struct zone *zone)
2802{
2803 if (NUMA_BUILD)
2804 printk("Node %d ", zone_to_nid(zone));
2805}
2806
2807void si_meminfo(struct sysinfo *val)
2808{
2809 val->totalram = totalram_pages;
2810 val->sharedram = 0;
2811 val->freeram = global_page_state(NR_FREE_PAGES);
2812 val->bufferram = nr_blockdev_pages();
2813 val->totalhigh = totalhigh_pages;
2814 val->freehigh = nr_free_highpages();
2815 val->mem_unit = PAGE_SIZE;
2816}
2817
2818EXPORT_SYMBOL(si_meminfo);
2819
2820#ifdef CONFIG_NUMA
2821void si_meminfo_node(struct sysinfo *val, int nid)
2822{
2823 pg_data_t *pgdat = NODE_DATA(nid);
2824
2825 val->totalram = pgdat->node_present_pages;
2826 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2827#ifdef CONFIG_HIGHMEM
2828 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2829 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2830 NR_FREE_PAGES);
2831#else
2832 val->totalhigh = 0;
2833 val->freehigh = 0;
2834#endif
2835 val->mem_unit = PAGE_SIZE;
2836}
2837#endif
2838
2839
2840
2841
2842
2843bool skip_free_areas_node(unsigned int flags, int nid)
2844{
2845 bool ret = false;
2846 unsigned int cpuset_mems_cookie;
2847
2848 if (!(flags & SHOW_MEM_FILTER_NODES))
2849 goto out;
2850
2851 do {
2852 cpuset_mems_cookie = get_mems_allowed();
2853 ret = !node_isset(nid, cpuset_current_mems_allowed);
2854 } while (!put_mems_allowed(cpuset_mems_cookie));
2855out:
2856 return ret;
2857}
2858
2859#define K(x) ((x) << (PAGE_SHIFT-10))
2860
2861
2862
2863
2864
2865
2866
2867
2868void show_free_areas(unsigned int filter)
2869{
2870 int cpu;
2871 struct zone *zone;
2872
2873 for_each_populated_zone(zone) {
2874 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2875 continue;
2876 show_node(zone);
2877 printk("%s per-cpu:\n", zone->name);
2878
2879 for_each_online_cpu(cpu) {
2880 struct per_cpu_pageset *pageset;
2881
2882 pageset = per_cpu_ptr(zone->pageset, cpu);
2883
2884 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2885 cpu, pageset->pcp.high,
2886 pageset->pcp.batch, pageset->pcp.count);
2887 }
2888 }
2889
2890 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2891 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2892 " unevictable:%lu"
2893 " dirty:%lu writeback:%lu unstable:%lu\n"
2894 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2895 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2896 " free_cma:%lu\n",
2897 global_page_state(NR_ACTIVE_ANON),
2898 global_page_state(NR_INACTIVE_ANON),
2899 global_page_state(NR_ISOLATED_ANON),
2900 global_page_state(NR_ACTIVE_FILE),
2901 global_page_state(NR_INACTIVE_FILE),
2902 global_page_state(NR_ISOLATED_FILE),
2903 global_page_state(NR_UNEVICTABLE),
2904 global_page_state(NR_FILE_DIRTY),
2905 global_page_state(NR_WRITEBACK),
2906 global_page_state(NR_UNSTABLE_NFS),
2907 global_page_state(NR_FREE_PAGES),
2908 global_page_state(NR_SLAB_RECLAIMABLE),
2909 global_page_state(NR_SLAB_UNRECLAIMABLE),
2910 global_page_state(NR_FILE_MAPPED),
2911 global_page_state(NR_SHMEM),
2912 global_page_state(NR_PAGETABLE),
2913 global_page_state(NR_BOUNCE),
2914 global_page_state(NR_FREE_CMA_PAGES));
2915
2916 for_each_populated_zone(zone) {
2917 int i;
2918
2919 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2920 continue;
2921 show_node(zone);
2922 printk("%s"
2923 " free:%lukB"
2924 " min:%lukB"
2925 " low:%lukB"
2926 " high:%lukB"
2927 " active_anon:%lukB"
2928 " inactive_anon:%lukB"
2929 " active_file:%lukB"
2930 " inactive_file:%lukB"
2931 " unevictable:%lukB"
2932 " isolated(anon):%lukB"
2933 " isolated(file):%lukB"
2934 " present:%lukB"
2935 " mlocked:%lukB"
2936 " dirty:%lukB"
2937 " writeback:%lukB"
2938 " mapped:%lukB"
2939 " shmem:%lukB"
2940 " slab_reclaimable:%lukB"
2941 " slab_unreclaimable:%lukB"
2942 " kernel_stack:%lukB"
2943 " pagetables:%lukB"
2944 " unstable:%lukB"
2945 " bounce:%lukB"
2946 " free_cma:%lukB"
2947 " writeback_tmp:%lukB"
2948 " pages_scanned:%lu"
2949 " all_unreclaimable? %s"
2950 "\n",
2951 zone->name,
2952 K(zone_page_state(zone, NR_FREE_PAGES)),
2953 K(min_wmark_pages(zone)),
2954 K(low_wmark_pages(zone)),
2955 K(high_wmark_pages(zone)),
2956 K(zone_page_state(zone, NR_ACTIVE_ANON)),
2957 K(zone_page_state(zone, NR_INACTIVE_ANON)),
2958 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2959 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2960 K(zone_page_state(zone, NR_UNEVICTABLE)),
2961 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2962 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2963 K(zone->present_pages),
2964 K(zone_page_state(zone, NR_MLOCK)),
2965 K(zone_page_state(zone, NR_FILE_DIRTY)),
2966 K(zone_page_state(zone, NR_WRITEBACK)),
2967 K(zone_page_state(zone, NR_FILE_MAPPED)),
2968 K(zone_page_state(zone, NR_SHMEM)),
2969 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2970 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2971 zone_page_state(zone, NR_KERNEL_STACK) *
2972 THREAD_SIZE / 1024,
2973 K(zone_page_state(zone, NR_PAGETABLE)),
2974 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2975 K(zone_page_state(zone, NR_BOUNCE)),
2976 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
2977 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2978 zone->pages_scanned,
2979 (zone->all_unreclaimable ? "yes" : "no")
2980 );
2981 printk("lowmem_reserve[]:");
2982 for (i = 0; i < MAX_NR_ZONES; i++)
2983 printk(" %lu", zone->lowmem_reserve[i]);
2984 printk("\n");
2985 }
2986
2987 for_each_populated_zone(zone) {
2988 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2989
2990 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2991 continue;
2992 show_node(zone);
2993 printk("%s: ", zone->name);
2994
2995 spin_lock_irqsave(&zone->lock, flags);
2996 for (order = 0; order < MAX_ORDER; order++) {
2997 nr[order] = zone->free_area[order].nr_free;
2998 total += nr[order] << order;
2999 }
3000 spin_unlock_irqrestore(&zone->lock, flags);
3001 for (order = 0; order < MAX_ORDER; order++)
3002 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3003 printk("= %lukB\n", K(total));
3004 }
3005
3006 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3007
3008 show_swap_cache_info();
3009}
3010
3011static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3012{
3013 zoneref->zone = zone;
3014 zoneref->zone_idx = zone_idx(zone);
3015}
3016
3017
3018
3019
3020
3021
3022static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3023 int nr_zones, enum zone_type zone_type)
3024{
3025 struct zone *zone;
3026
3027 BUG_ON(zone_type >= MAX_NR_ZONES);
3028 zone_type++;
3029
3030 do {
3031 zone_type--;
3032 zone = pgdat->node_zones + zone_type;
3033 if (populated_zone(zone)) {
3034 zoneref_set_zone(zone,
3035 &zonelist->_zonerefs[nr_zones++]);
3036 check_highest_zone(zone_type);
3037 }
3038
3039 } while (zone_type);
3040 return nr_zones;
3041}
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
/* Identifiers of the supported zonelist orderings. */
#define ZONELIST_ORDER_DEFAULT	0
#define ZONELIST_ORDER_NODE	1
#define ZONELIST_ORDER_ZONE	2

/* Ordering currently in effect; initialised to ZONELIST_ORDER_DEFAULT. */
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;

/* Printable name of each ordering, indexed by its ZONELIST_ORDER_* value. */
static char zonelist_order_name[3][8] = {
	[ZONELIST_ORDER_DEFAULT]	= "Default",
	[ZONELIST_ORDER_NODE]		= "Node",
	[ZONELIST_ORDER_ZONE]		= "Zone",
};
3062
3063
3064#ifdef CONFIG_NUMA
3065
3066static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3067
3068#define NUMA_ZONELIST_ORDER_LEN 16
3069char numa_zonelist_order[16] = "default";
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079static int __parse_numa_zonelist_order(char *s)
3080{
3081 if (*s == 'd' || *s == 'D') {
3082 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3083 } else if (*s == 'n' || *s == 'N') {
3084 user_zonelist_order = ZONELIST_ORDER_NODE;
3085 } else if (*s == 'z' || *s == 'Z') {
3086 user_zonelist_order = ZONELIST_ORDER_ZONE;
3087 } else {
3088 printk(KERN_WARNING
3089 "Ignoring invalid numa_zonelist_order value: "
3090 "%s\n", s);
3091 return -EINVAL;
3092 }
3093 return 0;
3094}
3095
3096static __init int setup_numa_zonelist_order(char *s)
3097{
3098 int ret;
3099
3100 if (!s)
3101 return 0;
3102
3103 ret = __parse_numa_zonelist_order(s);
3104 if (ret == 0)
3105 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3106
3107 return ret;
3108}
3109early_param("numa_zonelist_order", setup_numa_zonelist_order);
3110
3111
3112
3113
3114int numa_zonelist_order_handler(ctl_table *table, int write,
3115 void __user *buffer, size_t *length,
3116 loff_t *ppos)
3117{
3118 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3119 int ret;
3120 static DEFINE_MUTEX(zl_order_mutex);
3121
3122 mutex_lock(&zl_order_mutex);
3123 if (write)
3124 strcpy(saved_string, (char*)table->data);
3125 ret = proc_dostring(table, write, buffer, length, ppos);
3126 if (ret)
3127 goto out;
3128 if (write) {
3129 int oldval = user_zonelist_order;
3130 if (__parse_numa_zonelist_order((char*)table->data)) {
3131
3132
3133
3134 strncpy((char*)table->data, saved_string,
3135 NUMA_ZONELIST_ORDER_LEN);
3136 user_zonelist_order = oldval;
3137 } else if (oldval != user_zonelist_order) {
3138 mutex_lock(&zonelists_mutex);
3139 build_all_zonelists(NULL, NULL);
3140 mutex_unlock(&zonelists_mutex);
3141 }
3142 }
3143out:
3144 mutex_unlock(&zl_order_mutex);
3145 return ret;
3146}
3147
3148
3149#define MAX_NODE_LOAD (nr_online_nodes)
3150static int node_load[MAX_NUMNODES];
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166static int find_next_best_node(int node, nodemask_t *used_node_mask)
3167{
3168 int n, val;
3169 int min_val = INT_MAX;
3170 int best_node = -1;
3171 const struct cpumask *tmp = cpumask_of_node(0);
3172
3173
3174 if (!node_isset(node, *used_node_mask)) {
3175 node_set(node, *used_node_mask);
3176 return node;
3177 }
3178
3179 for_each_node_state(n, N_HIGH_MEMORY) {
3180
3181
3182 if (node_isset(n, *used_node_mask))
3183 continue;
3184
3185
3186 val = node_distance(node, n);
3187
3188
3189 val += (n < node);
3190
3191
3192 tmp = cpumask_of_node(n);
3193 if (!cpumask_empty(tmp))
3194 val += PENALTY_FOR_NODE_WITH_CPUS;
3195
3196
3197 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3198 val += node_load[n];
3199
3200 if (val < min_val) {
3201 min_val = val;
3202 best_node = n;
3203 }
3204 }
3205
3206 if (best_node >= 0)
3207 node_set(best_node, *used_node_mask);
3208
3209 return best_node;
3210}
3211
3212
3213
3214
3215
3216
3217
3218static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3219{
3220 int j;
3221 struct zonelist *zonelist;
3222
3223 zonelist = &pgdat->node_zonelists[0];
3224 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3225 ;
3226 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3227 MAX_NR_ZONES - 1);
3228 zonelist->_zonerefs[j].zone = NULL;
3229 zonelist->_zonerefs[j].zone_idx = 0;
3230}
3231
3232
3233
3234
3235static void build_thisnode_zonelists(pg_data_t *pgdat)
3236{
3237 int j;
3238 struct zonelist *zonelist;
3239
3240 zonelist = &pgdat->node_zonelists[1];
3241 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3242 zonelist->_zonerefs[j].zone = NULL;
3243 zonelist->_zonerefs[j].zone_idx = 0;
3244}
3245
3246
3247
3248
3249
3250
3251
3252static int node_order[MAX_NUMNODES];
3253
3254static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3255{
3256 int pos, j, node;
3257 int zone_type;
3258 struct zone *z;
3259 struct zonelist *zonelist;
3260
3261 zonelist = &pgdat->node_zonelists[0];
3262 pos = 0;
3263 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3264 for (j = 0; j < nr_nodes; j++) {
3265 node = node_order[j];
3266 z = &NODE_DATA(node)->node_zones[zone_type];
3267 if (populated_zone(z)) {
3268 zoneref_set_zone(z,
3269 &zonelist->_zonerefs[pos++]);
3270 check_highest_zone(zone_type);
3271 }
3272 }
3273 }
3274 zonelist->_zonerefs[pos].zone = NULL;
3275 zonelist->_zonerefs[pos].zone_idx = 0;
3276}
3277
3278static int default_zonelist_order(void)
3279{
3280 int nid, zone_type;
3281 unsigned long low_kmem_size,total_size;
3282 struct zone *z;
3283 int average_size;
3284
3285
3286
3287
3288
3289
3290
3291 low_kmem_size = 0;
3292 total_size = 0;
3293 for_each_online_node(nid) {
3294 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3295 z = &NODE_DATA(nid)->node_zones[zone_type];
3296 if (populated_zone(z)) {
3297 if (zone_type < ZONE_NORMAL)
3298 low_kmem_size += z->present_pages;
3299 total_size += z->present_pages;
3300 } else if (zone_type == ZONE_NORMAL) {
3301
3302
3303
3304
3305
3306
3307
3308 return ZONELIST_ORDER_NODE;
3309 }
3310 }
3311 }
3312 if (!low_kmem_size ||
3313 low_kmem_size > total_size/2)
3314 return ZONELIST_ORDER_NODE;
3315
3316
3317
3318
3319
3320 average_size = total_size /
3321 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
3322 for_each_online_node(nid) {
3323 low_kmem_size = 0;
3324 total_size = 0;
3325 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3326 z = &NODE_DATA(nid)->node_zones[zone_type];
3327 if (populated_zone(z)) {
3328 if (zone_type < ZONE_NORMAL)
3329 low_kmem_size += z->present_pages;
3330 total_size += z->present_pages;
3331 }
3332 }
3333 if (low_kmem_size &&
3334 total_size > average_size &&
3335 low_kmem_size > total_size * 70/100)
3336 return ZONELIST_ORDER_NODE;
3337 }
3338 return ZONELIST_ORDER_ZONE;
3339}
3340
3341static void set_zonelist_order(void)
3342{
3343 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3344 current_zonelist_order = default_zonelist_order();
3345 else
3346 current_zonelist_order = user_zonelist_order;
3347}
3348
3349static void build_zonelists(pg_data_t *pgdat)
3350{
3351 int j, node, load;
3352 enum zone_type i;
3353 nodemask_t used_mask;
3354 int local_node, prev_node;
3355 struct zonelist *zonelist;
3356 int order = current_zonelist_order;
3357
3358
3359 for (i = 0; i < MAX_ZONELISTS; i++) {
3360 zonelist = pgdat->node_zonelists + i;
3361 zonelist->_zonerefs[0].zone = NULL;
3362 zonelist->_zonerefs[0].zone_idx = 0;
3363 }
3364
3365
3366 local_node = pgdat->node_id;
3367 load = nr_online_nodes;
3368 prev_node = local_node;
3369 nodes_clear(used_mask);
3370
3371 memset(node_order, 0, sizeof(node_order));
3372 j = 0;
3373
3374 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3375
3376
3377
3378
3379
3380 if (node_distance(local_node, node) !=
3381 node_distance(local_node, prev_node))
3382 node_load[node] = load;
3383
3384 prev_node = node;
3385 load--;
3386 if (order == ZONELIST_ORDER_NODE)
3387 build_zonelists_in_node_order(pgdat, node);
3388 else
3389 node_order[j++] = node;
3390 }
3391
3392 if (order == ZONELIST_ORDER_ZONE) {
3393
3394 build_zonelists_in_zone_order(pgdat, j);
3395 }
3396
3397 build_thisnode_zonelists(pgdat);
3398}
3399
3400
3401static void build_zonelist_cache(pg_data_t *pgdat)
3402{
3403 struct zonelist *zonelist;
3404 struct zonelist_cache *zlc;
3405 struct zoneref *z;
3406
3407 zonelist = &pgdat->node_zonelists[0];
3408 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3409 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3410 for (z = zonelist->_zonerefs; z->zone; z++)
3411 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3412}
3413
3414#ifdef CONFIG_HAVE_MEMORYLESS_NODES
3415
3416
3417
3418
3419
3420
3421int local_memory_node(int node)
3422{
3423 struct zone *zone;
3424
3425 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3426 gfp_zone(GFP_KERNEL),
3427 NULL,
3428 &zone);
3429 return zone->node;
3430}
3431#endif
3432
3433#else
3434
3435static void set_zonelist_order(void)
3436{
3437 current_zonelist_order = ZONELIST_ORDER_ZONE;
3438}
3439
3440static void build_zonelists(pg_data_t *pgdat)
3441{
3442 int node, local_node;
3443 enum zone_type j;
3444 struct zonelist *zonelist;
3445
3446 local_node = pgdat->node_id;
3447
3448 zonelist = &pgdat->node_zonelists[0];
3449 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3460 if (!node_online(node))
3461 continue;
3462 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3463 MAX_NR_ZONES - 1);
3464 }
3465 for (node = 0; node < local_node; node++) {
3466 if (!node_online(node))
3467 continue;
3468 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3469 MAX_NR_ZONES - 1);
3470 }
3471
3472 zonelist->_zonerefs[j].zone = NULL;
3473 zonelist->_zonerefs[j].zone_idx = 0;
3474}
3475
3476
3477static void build_zonelist_cache(pg_data_t *pgdat)
3478{
3479 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3480}
3481
3482#endif
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3500static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3501static void setup_zone_pageset(struct zone *zone);
3502
3503
3504
3505
3506
3507DEFINE_MUTEX(zonelists_mutex);
3508
3509
3510static int __build_all_zonelists(void *data)
3511{
3512 int nid;
3513 int cpu;
3514 pg_data_t *self = data;
3515
3516#ifdef CONFIG_NUMA
3517 memset(node_load, 0, sizeof(node_load));
3518#endif
3519
3520 if (self && !node_online(self->node_id)) {
3521 build_zonelists(self);
3522 build_zonelist_cache(self);
3523 }
3524
3525 for_each_online_node(nid) {
3526 pg_data_t *pgdat = NODE_DATA(nid);
3527
3528 build_zonelists(pgdat);
3529 build_zonelist_cache(pgdat);
3530 }
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545 for_each_possible_cpu(cpu) {
3546 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3547
3548#ifdef CONFIG_HAVE_MEMORYLESS_NODES
3549
3550
3551
3552
3553
3554
3555
3556
3557 if (cpu_online(cpu))
3558 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3559#endif
3560 }
3561
3562 return 0;
3563}
3564
3565
3566
3567
3568
3569void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3570{
3571 set_zonelist_order();
3572
3573 if (system_state == SYSTEM_BOOTING) {
3574 __build_all_zonelists(NULL);
3575 mminit_verify_zonelist();
3576 cpuset_init_current_mems_allowed();
3577 } else {
3578
3579
3580#ifdef CONFIG_MEMORY_HOTPLUG
3581 if (zone)
3582 setup_zone_pageset(zone);
3583#endif
3584 stop_machine(__build_all_zonelists, pgdat, NULL);
3585
3586 }
3587 vm_total_pages = nr_free_pagecache_pages();
3588
3589
3590
3591
3592
3593
3594
3595 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3596 page_group_by_mobility_disabled = 1;
3597 else
3598 page_group_by_mobility_disabled = 0;
3599
3600 printk("Built %i zonelists in %s order, mobility grouping %s. "
3601 "Total pages: %ld\n",
3602 nr_online_nodes,
3603 zonelist_order_name[current_zonelist_order],
3604 page_group_by_mobility_disabled ? "off" : "on",
3605 vm_total_pages);
3606#ifdef CONFIG_NUMA
3607 printk("Policy zone: %s\n", zone_names[policy_zone]);
3608#endif
3609}
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
/* Number of pages that share one wait queue when the table is sized. */
#define PAGES_PER_WAITQUEUE	256

#ifndef CONFIG_MEMORY_HOTPLUG
/*
 * Number of wait-table entries for a zone of @pages pages: the smallest
 * power of two that is at least @pages / PAGES_PER_WAITQUEUE, limited to
 * the range [4, 4096].
 */
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
	unsigned long size;

	for (size = 1; size < wanted; size <<= 1)
		;

	return clamp(size, 4UL, 4096UL);
}
#else
/*
 * With CONFIG_MEMORY_HOTPLUG the table always has 4096 entries,
 * independent of @pages.
 */
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
	return 4096UL;
}
#endif
3666
3667
3668
3669
3670
3671
/*
 * Return ffz() of the complement of @size, i.e. the position of the
 * lowest set bit of @size.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	unsigned long inverted = ~size;

	return ffz(inverted);
}
3676
/* Round @x up to the next multiple of sizeof(long). */
#define LONG_ALIGN(x) \
	(((x) + (sizeof(long)) - 1) & ~((sizeof(long)) - 1))
3678
3679
3680
3681
/*
 * Return 1 if any pfn in [@start_pfn, @end_pfn) fails pfn_valid_within()
 * or belongs to a page with PageReserved set, 0 otherwise.
 */
static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn = start_pfn;

	while (pfn < end_pfn) {
		if (!pfn_valid_within(pfn))
			return 1;
		if (PageReserved(pfn_to_page(pfn)))
			return 1;
		pfn++;
	}

	return 0;
}
3692
3693
3694
3695
3696
3697
3698
3699
3700static void setup_zone_migrate_reserve(struct zone *zone)
3701{
3702 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3703 struct page *page;
3704 unsigned long block_migratetype;
3705 int reserve;
3706
3707
3708
3709
3710
3711
3712
3713 start_pfn = zone->zone_start_pfn;
3714 end_pfn = start_pfn + zone->spanned_pages;
3715 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3716 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3717 pageblock_order;
3718
3719
3720
3721
3722
3723
3724
3725
3726 reserve = min(2, reserve);
3727
3728 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3729 if (!pfn_valid(pfn))
3730 continue;
3731 page = pfn_to_page(pfn);
3732
3733
3734 if (page_to_nid(page) != zone_to_nid(zone))
3735 continue;
3736
3737 block_migratetype = get_pageblock_migratetype(page);
3738
3739
3740 if (reserve > 0) {
3741
3742
3743
3744
3745 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3746 if (pageblock_is_reserved(pfn, block_end_pfn))
3747 continue;
3748
3749
3750 if (block_migratetype == MIGRATE_RESERVE) {
3751 reserve--;
3752 continue;
3753 }
3754
3755
3756 if (block_migratetype == MIGRATE_MOVABLE) {
3757 set_pageblock_migratetype(page,
3758 MIGRATE_RESERVE);
3759 move_freepages_block(zone, page,
3760 MIGRATE_RESERVE);
3761 reserve--;
3762 continue;
3763 }
3764 }
3765
3766
3767
3768
3769
3770 if (block_migratetype == MIGRATE_RESERVE) {
3771 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3772 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3773 }
3774 }
3775}
3776
3777
3778
3779
3780
3781
3782void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3783 unsigned long start_pfn, enum memmap_context context)
3784{
3785 struct page *page;
3786 unsigned long end_pfn = start_pfn + size;
3787 unsigned long pfn;
3788 struct zone *z;
3789
3790 if (highest_memmap_pfn < end_pfn - 1)
3791 highest_memmap_pfn = end_pfn - 1;
3792
3793 z = &NODE_DATA(nid)->node_zones[zone];
3794 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3795
3796
3797
3798
3799
3800 if (context == MEMMAP_EARLY) {
3801 if (!early_pfn_valid(pfn))
3802 continue;
3803 if (!early_pfn_in_nid(pfn, nid))
3804 continue;
3805 }
3806 page = pfn_to_page(pfn);
3807 set_page_links(page, zone, nid, pfn);
3808 mminit_verify_page_links(page, zone, nid, pfn);
3809 init_page_count(page);
3810 reset_page_mapcount(page);
3811 SetPageReserved(page);
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826 if ((z->zone_start_pfn <= pfn)
3827 && (pfn < z->zone_start_pfn + z->spanned_pages)
3828 && !(pfn & (pageblock_nr_pages - 1)))
3829 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3830
3831 INIT_LIST_HEAD(&page->lru);
3832#ifdef WANT_PAGE_VIRTUAL
3833
3834 if (!is_highmem_idx(zone))
3835 set_page_address(page, __va(pfn << PAGE_SHIFT));
3836#endif
3837 }
3838}
3839
3840static void __meminit zone_init_free_lists(struct zone *zone)
3841{
3842 int order, t;
3843 for_each_migratetype_order(order, t) {
3844 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3845 zone->free_area[order].nr_free = 0;
3846 }
3847}
3848
/*
 * Unless the architecture defines __HAVE_ARCH_MEMMAP_INIT, memmap_init()
 * is memmap_init_zone() in the MEMMAP_EARLY context.
 */
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn)				\
	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
3853
3854static int __meminit zone_batchsize(struct zone *zone)
3855{
3856#ifdef CONFIG_MMU
3857 int batch;
3858
3859
3860
3861
3862
3863
3864
3865 batch = zone->present_pages / 1024;
3866 if (batch * PAGE_SIZE > 512 * 1024)
3867 batch = (512 * 1024) / PAGE_SIZE;
3868 batch /= 4;
3869 if (batch < 1)
3870 batch = 1;
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882 batch = rounddown_pow_of_two(batch + batch/2) - 1;
3883
3884 return batch;
3885
3886#else
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900 return 0;
3901#endif
3902}
3903
3904static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3905{
3906 struct per_cpu_pages *pcp;
3907 int migratetype;
3908
3909 memset(p, 0, sizeof(*p));
3910
3911 pcp = &p->pcp;
3912 pcp->count = 0;
3913 pcp->high = 6 * batch;
3914 pcp->batch = max(1UL, 1 * batch);
3915 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3916 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3917}
3918
3919
3920
3921
3922
3923
3924static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3925 unsigned long high)
3926{
3927 struct per_cpu_pages *pcp;
3928
3929 pcp = &p->pcp;
3930 pcp->high = high;
3931 pcp->batch = max(1UL, high/4);
3932 if ((high/4) > (PAGE_SHIFT * 8))
3933 pcp->batch = PAGE_SHIFT * 8;
3934}
3935
3936static void __meminit setup_zone_pageset(struct zone *zone)
3937{
3938 int cpu;
3939
3940 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3941
3942 for_each_possible_cpu(cpu) {
3943 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3944
3945 setup_pageset(pcp, zone_batchsize(zone));
3946
3947 if (percpu_pagelist_fraction)
3948 setup_pagelist_highmark(pcp,
3949 (zone->present_pages /
3950 percpu_pagelist_fraction));
3951 }
3952}
3953
3954
3955
3956
3957
3958void __init setup_per_cpu_pageset(void)
3959{
3960 struct zone *zone;
3961
3962 for_each_populated_zone(zone)
3963 setup_zone_pageset(zone);
3964}
3965
3966static noinline __init_refok
3967int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3968{
3969 int i;
3970 struct pglist_data *pgdat = zone->zone_pgdat;
3971 size_t alloc_size;
3972
3973
3974
3975
3976
3977 zone->wait_table_hash_nr_entries =
3978 wait_table_hash_nr_entries(zone_size_pages);
3979 zone->wait_table_bits =
3980 wait_table_bits(zone->wait_table_hash_nr_entries);
3981 alloc_size = zone->wait_table_hash_nr_entries
3982 * sizeof(wait_queue_head_t);
3983
3984 if (!slab_is_available()) {
3985 zone->wait_table = (wait_queue_head_t *)
3986 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3987 } else {
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998 zone->wait_table = vmalloc(alloc_size);
3999 }
4000 if (!zone->wait_table)
4001 return -ENOMEM;
4002
4003 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4004 init_waitqueue_head(zone->wait_table + i);
4005
4006 return 0;
4007}
4008
4009static __meminit void zone_pcp_init(struct zone *zone)
4010{
4011
4012
4013
4014
4015
4016 zone->pageset = &boot_pageset;
4017
4018 if (zone->present_pages)
4019 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4020 zone->name, zone->present_pages,
4021 zone_batchsize(zone));
4022}
4023
4024int __meminit init_currently_empty_zone(struct zone *zone,
4025 unsigned long zone_start_pfn,
4026 unsigned long size,
4027 enum memmap_context context)
4028{
4029 struct pglist_data *pgdat = zone->zone_pgdat;
4030 int ret;
4031 ret = zone_wait_table_init(zone, size);
4032 if (ret)
4033 return ret;
4034 pgdat->nr_zones = zone_idx(zone) + 1;
4035
4036 zone->zone_start_pfn = zone_start_pfn;
4037
4038 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4039 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4040 pgdat->node_id,
4041 (unsigned long)zone_idx(zone),
4042 zone_start_pfn, (zone_start_pfn + size));
4043
4044 zone_init_free_lists(zone);
4045
4046 return 0;
4047}
4048
4049#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4050#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4051
4052
4053
4054
4055
4056
4057int __meminit __early_pfn_to_nid(unsigned long pfn)
4058{
4059 unsigned long start_pfn, end_pfn;
4060 int i, nid;
4061
4062 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4063 if (start_pfn <= pfn && pfn < end_pfn)
4064 return nid;
4065
4066 return -1;
4067}
4068#endif
4069
4070int __meminit early_pfn_to_nid(unsigned long pfn)
4071{
4072 int nid;
4073
4074 nid = __early_pfn_to_nid(pfn);
4075 if (nid >= 0)
4076 return nid;
4077
4078 return 0;
4079}
4080
4081#ifdef CONFIG_NODES_SPAN_OTHER_NODES
4082bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4083{
4084 int nid;
4085
4086 nid = __early_pfn_to_nid(pfn);
4087 if (nid >= 0 && nid != node)
4088 return false;
4089 return true;
4090}
4091#endif
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4103{
4104 unsigned long start_pfn, end_pfn;
4105 int i, this_nid;
4106
4107 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4108 start_pfn = min(start_pfn, max_low_pfn);
4109 end_pfn = min(end_pfn, max_low_pfn);
4110
4111 if (start_pfn < end_pfn)
4112 free_bootmem_node(NODE_DATA(this_nid),
4113 PFN_PHYS(start_pfn),
4114 (end_pfn - start_pfn) << PAGE_SHIFT);
4115 }
4116}
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126void __init sparse_memory_present_with_active_regions(int nid)
4127{
4128 unsigned long start_pfn, end_pfn;
4129 int i, this_nid;
4130
4131 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4132 memory_present(this_nid, start_pfn, end_pfn);
4133}
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146void __meminit get_pfn_range_for_nid(unsigned int nid,
4147 unsigned long *start_pfn, unsigned long *end_pfn)
4148{
4149 unsigned long this_start_pfn, this_end_pfn;
4150 int i;
4151
4152 *start_pfn = -1UL;
4153 *end_pfn = 0;
4154
4155 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4156 *start_pfn = min(*start_pfn, this_start_pfn);
4157 *end_pfn = max(*end_pfn, this_end_pfn);
4158 }
4159
4160 if (*start_pfn == -1UL)
4161 *start_pfn = 0;
4162}
4163
4164
4165
4166
4167
4168
4169static void __init find_usable_zone_for_movable(void)
4170{
4171 int zone_index;
4172 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4173 if (zone_index == ZONE_MOVABLE)
4174 continue;
4175
4176 if (arch_zone_highest_possible_pfn[zone_index] >
4177 arch_zone_lowest_possible_pfn[zone_index])
4178 break;
4179 }
4180
4181 VM_BUG_ON(zone_index == -1);
4182 movable_zone = zone_index;
4183}
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195static void __meminit adjust_zone_range_for_zone_movable(int nid,
4196 unsigned long zone_type,
4197 unsigned long node_start_pfn,
4198 unsigned long node_end_pfn,
4199 unsigned long *zone_start_pfn,
4200 unsigned long *zone_end_pfn)
4201{
4202
4203 if (zone_movable_pfn[nid]) {
4204
4205 if (zone_type == ZONE_MOVABLE) {
4206 *zone_start_pfn = zone_movable_pfn[nid];
4207 *zone_end_pfn = min(node_end_pfn,
4208 arch_zone_highest_possible_pfn[movable_zone]);
4209
4210
4211 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4212 *zone_end_pfn > zone_movable_pfn[nid]) {
4213 *zone_end_pfn = zone_movable_pfn[nid];
4214
4215
4216 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4217 *zone_start_pfn = *zone_end_pfn;
4218 }
4219}
4220
4221
4222
4223
4224
4225static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4226 unsigned long zone_type,
4227 unsigned long *ignored)
4228{
4229 unsigned long node_start_pfn, node_end_pfn;
4230 unsigned long zone_start_pfn, zone_end_pfn;
4231
4232
4233 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4234 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4235 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4236 adjust_zone_range_for_zone_movable(nid, zone_type,
4237 node_start_pfn, node_end_pfn,
4238 &zone_start_pfn, &zone_end_pfn);
4239
4240
4241 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4242 return 0;
4243
4244
4245 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4246 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4247
4248
4249 return zone_end_pfn - zone_start_pfn;
4250}
4251
4252
4253
4254
4255
4256unsigned long __meminit __absent_pages_in_range(int nid,
4257 unsigned long range_start_pfn,
4258 unsigned long range_end_pfn)
4259{
4260 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4261 unsigned long start_pfn, end_pfn;
4262 int i;
4263
4264 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4265 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4266 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4267 nr_absent -= end_pfn - start_pfn;
4268 }
4269 return nr_absent;
4270}
4271
4272
4273
4274
4275
4276
4277
4278
4279unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4280 unsigned long end_pfn)
4281{
4282 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4283}
4284
4285
4286static unsigned long __meminit zone_absent_pages_in_node(int nid,
4287 unsigned long zone_type,
4288 unsigned long *ignored)
4289{
4290 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4291 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4292 unsigned long node_start_pfn, node_end_pfn;
4293 unsigned long zone_start_pfn, zone_end_pfn;
4294
4295 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4296 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4297 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4298
4299 adjust_zone_range_for_zone_movable(nid, zone_type,
4300 node_start_pfn, node_end_pfn,
4301 &zone_start_pfn, &zone_end_pfn);
4302 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4303}
4304
4305#else
4306static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4307 unsigned long zone_type,
4308 unsigned long *zones_size)
4309{
4310 return zones_size[zone_type];
4311}
4312
4313static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4314 unsigned long zone_type,
4315 unsigned long *zholes_size)
4316{
4317 if (!zholes_size)
4318 return 0;
4319
4320 return zholes_size[zone_type];
4321}
4322
4323#endif
4324
4325static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4326 unsigned long *zones_size, unsigned long *zholes_size)
4327{
4328 unsigned long realtotalpages, totalpages = 0;
4329 enum zone_type i;
4330
4331 for (i = 0; i < MAX_NR_ZONES; i++)
4332 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4333 zones_size);
4334 pgdat->node_spanned_pages = totalpages;
4335
4336 realtotalpages = totalpages;
4337 for (i = 0; i < MAX_NR_ZONES; i++)
4338 realtotalpages -=
4339 zone_absent_pages_in_node(pgdat->node_id, i,
4340 zholes_size);
4341 pgdat->node_present_pages = realtotalpages;
4342 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4343 realtotalpages);
4344}
4345
4346#ifndef CONFIG_SPARSEMEM
4347
4348
4349
4350
4351
4352
4353
4354static unsigned long __init usemap_size(unsigned long zonesize)
4355{
4356 unsigned long usemapsize;
4357
4358 usemapsize = roundup(zonesize, pageblock_nr_pages);
4359 usemapsize = usemapsize >> pageblock_order;
4360 usemapsize *= NR_PAGEBLOCK_BITS;
4361 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4362
4363 return usemapsize / 8;
4364}
4365
4366static void __init setup_usemap(struct pglist_data *pgdat,
4367 struct zone *zone, unsigned long zonesize)
4368{
4369 unsigned long usemapsize = usemap_size(zonesize);
4370 zone->pageblock_flags = NULL;
4371 if (usemapsize)
4372 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4373 usemapsize);
4374}
4375#else
/* With CONFIG_SPARSEMEM there is nothing to set up here. */
static inline void setup_usemap(struct pglist_data *pgdat,
				struct zone *zone, unsigned long zonesize)
{
}
4378#endif
4379
4380#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4381
4382
4383void __init set_pageblock_order(void)
4384{
4385 unsigned int order;
4386
4387
4388 if (pageblock_order)
4389 return;
4390
4391 if (HPAGE_SHIFT > PAGE_SHIFT)
4392 order = HUGETLB_PAGE_ORDER;
4393 else
4394 order = MAX_ORDER - 1;
4395
4396
4397
4398
4399
4400
4401 pageblock_order = order;
4402}
4403#else
4404
4405
4406
4407
4408
4409
4410
4411void __init set_pageblock_order(void)
4412{
4413}
4414
4415#endif
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4426 unsigned long *zones_size, unsigned long *zholes_size)
4427{
4428 enum zone_type j;
4429 int nid = pgdat->node_id;
4430 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4431 int ret;
4432
4433 pgdat_resize_init(pgdat);
4434 init_waitqueue_head(&pgdat->kswapd_wait);
4435 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4436 pgdat_page_cgroup_init(pgdat);
4437
4438 for (j = 0; j < MAX_NR_ZONES; j++) {
4439 struct zone *zone = pgdat->node_zones + j;
4440 unsigned long size, realsize, memmap_pages;
4441
4442 size = zone_spanned_pages_in_node(nid, j, zones_size);
4443 realsize = size - zone_absent_pages_in_node(nid, j,
4444 zholes_size);
4445
4446
4447
4448
4449
4450
4451 memmap_pages =
4452 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
4453 if (realsize >= memmap_pages) {
4454 realsize -= memmap_pages;
4455 if (memmap_pages)
4456 printk(KERN_DEBUG
4457 " %s zone: %lu pages used for memmap\n",
4458 zone_names[j], memmap_pages);
4459 } else
4460 printk(KERN_WARNING
4461 " %s zone: %lu pages exceeds realsize %lu\n",
4462 zone_names[j], memmap_pages, realsize);
4463
4464
4465 if (j == 0 && realsize > dma_reserve) {
4466 realsize -= dma_reserve;
4467 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4468 zone_names[0], dma_reserve);
4469 }
4470
4471 if (!is_highmem_idx(j))
4472 nr_kernel_pages += realsize;
4473 nr_all_pages += realsize;
4474
4475 zone->spanned_pages = size;
4476 zone->present_pages = realsize;
4477#ifdef CONFIG_NUMA
4478 zone->node = nid;
4479 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
4480 / 100;
4481 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
4482#endif
4483 zone->name = zone_names[j];
4484 spin_lock_init(&zone->lock);
4485 spin_lock_init(&zone->lru_lock);
4486 zone_seqlock_init(zone);
4487 zone->zone_pgdat = pgdat;
4488
4489 zone_pcp_init(zone);
4490 lruvec_init(&zone->lruvec);
4491 if (!size)
4492 continue;
4493
4494 set_pageblock_order();
4495 setup_usemap(pgdat, zone, size);
4496 ret = init_currently_empty_zone(zone, zone_start_pfn,
4497 size, MEMMAP_EARLY);
4498 BUG_ON(ret);
4499 memmap_init(size, nid, j, zone_start_pfn);
4500 zone_start_pfn += size;
4501 }
4502}
4503
4504static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4505{
4506
4507 if (!pgdat->node_spanned_pages)
4508 return;
4509
4510#ifdef CONFIG_FLAT_NODE_MEM_MAP
4511
4512 if (!pgdat->node_mem_map) {
4513 unsigned long size, start, end;
4514 struct page *map;
4515
4516
4517
4518
4519
4520
4521 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4522 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
4523 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4524 size = (end - start) * sizeof(struct page);
4525 map = alloc_remap(pgdat->node_id, size);
4526 if (!map)
4527 map = alloc_bootmem_node_nopanic(pgdat, size);
4528 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4529 }
4530#ifndef CONFIG_NEED_MULTIPLE_NODES
4531
4532
4533
4534 if (pgdat == NODE_DATA(0)) {
4535 mem_map = NODE_DATA(0)->node_mem_map;
4536#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4537 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4538 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4539#endif
4540 }
4541#endif
4542#endif
4543}
4544
4545void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4546 unsigned long node_start_pfn, unsigned long *zholes_size)
4547{
4548 pg_data_t *pgdat = NODE_DATA(nid);
4549
4550
4551 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4552
4553 pgdat->node_id = nid;
4554 pgdat->node_start_pfn = node_start_pfn;
4555 init_zone_allows_reclaim(nid);
4556 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4557
4558 alloc_node_mem_map(pgdat);
4559#ifdef CONFIG_FLAT_NODE_MEM_MAP
4560 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4561 nid, (unsigned long)pgdat,
4562 (unsigned long)pgdat->node_mem_map);
4563#endif
4564
4565 free_area_init_core(pgdat, zones_size, zholes_size);
4566}
4567
4568#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4569
#if MAX_NUMNODES > 1
/*
 * Set nr_node_ids to one more than the last node id visited while
 * walking node_possible_map (1 when the map is empty).
 */
static void __init setup_nr_node_ids(void)
{
	unsigned int last_seen = 0;
	unsigned int nid;

	for_each_node_mask(nid, node_possible_map) {
		last_seen = nid;
	}

	nr_node_ids = last_seen + 1;
}
#else
/* Single-node build: nr_node_ids is left untouched. */
static inline void setup_nr_node_ids(void) { }
#endif
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608unsigned long __init node_map_pfn_alignment(void)
4609{
4610 unsigned long accl_mask = 0, last_end = 0;
4611 unsigned long start, end, mask;
4612 int last_nid = -1;
4613 int i, nid;
4614
4615 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4616 if (!start || last_nid < 0 || last_nid == nid) {
4617 last_nid = nid;
4618 last_end = end;
4619 continue;
4620 }
4621
4622
4623
4624
4625
4626
4627 mask = ~((1 << __ffs(start)) - 1);
4628 while (mask && last_end <= (start & (mask << 1)))
4629 mask <<= 1;
4630
4631
4632 accl_mask |= mask;
4633 }
4634
4635
4636 return ~accl_mask + 1;
4637}
4638
4639
4640static unsigned long __init find_min_pfn_for_node(int nid)
4641{
4642 unsigned long min_pfn = ULONG_MAX;
4643 unsigned long start_pfn;
4644 int i;
4645
4646 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4647 min_pfn = min(min_pfn, start_pfn);
4648
4649 if (min_pfn == ULONG_MAX) {
4650 printk(KERN_WARNING
4651 "Could not find start_pfn for node %d\n", nid);
4652 return 0;
4653 }
4654
4655 return min_pfn;
4656}
4657
4658
4659
4660
4661
4662
4663
4664unsigned long __init find_min_pfn_with_active_regions(void)
4665{
4666 return find_min_pfn_for_node(MAX_NUMNODES);
4667}
4668
4669
4670
4671
4672
4673
4674static unsigned long __init early_calculate_totalpages(void)
4675{
4676 unsigned long totalpages = 0;
4677 unsigned long start_pfn, end_pfn;
4678 int i, nid;
4679
4680 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4681 unsigned long pages = end_pfn - start_pfn;
4682
4683 totalpages += pages;
4684 if (pages)
4685 node_set_state(nid, N_HIGH_MEMORY);
4686 }
4687 return totalpages;
4688}
4689
4690
4691
4692
4693
4694
4695
4696static void __init find_zone_movable_pfns_for_nodes(void)
4697{
4698 int i, nid;
4699 unsigned long usable_startpfn;
4700 unsigned long kernelcore_node, kernelcore_remaining;
4701
4702 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4703 unsigned long totalpages = early_calculate_totalpages();
4704 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714 if (required_movablecore) {
4715 unsigned long corepages;
4716
4717
4718
4719
4720
4721 required_movablecore =
4722 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4723 corepages = totalpages - required_movablecore;
4724
4725 required_kernelcore = max(required_kernelcore, corepages);
4726 }
4727
4728
4729 if (!required_kernelcore)
4730 goto out;
4731
4732
4733 find_usable_zone_for_movable();
4734 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4735
4736restart:
4737
4738 kernelcore_node = required_kernelcore / usable_nodes;
4739 for_each_node_state(nid, N_HIGH_MEMORY) {
4740 unsigned long start_pfn, end_pfn;
4741
4742
4743
4744
4745
4746
4747 if (required_kernelcore < kernelcore_node)
4748 kernelcore_node = required_kernelcore / usable_nodes;
4749
4750
4751
4752
4753
4754
4755 kernelcore_remaining = kernelcore_node;
4756
4757
4758 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4759 unsigned long size_pages;
4760
4761 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4762 if (start_pfn >= end_pfn)
4763 continue;
4764
4765
4766 if (start_pfn < usable_startpfn) {
4767 unsigned long kernel_pages;
4768 kernel_pages = min(end_pfn, usable_startpfn)
4769 - start_pfn;
4770
4771 kernelcore_remaining -= min(kernel_pages,
4772 kernelcore_remaining);
4773 required_kernelcore -= min(kernel_pages,
4774 required_kernelcore);
4775
4776
4777 if (end_pfn <= usable_startpfn) {
4778
4779
4780
4781
4782
4783
4784
4785 zone_movable_pfn[nid] = end_pfn;
4786 continue;
4787 }
4788 start_pfn = usable_startpfn;
4789 }
4790
4791
4792
4793
4794
4795
4796 size_pages = end_pfn - start_pfn;
4797 if (size_pages > kernelcore_remaining)
4798 size_pages = kernelcore_remaining;
4799 zone_movable_pfn[nid] = start_pfn + size_pages;
4800
4801
4802
4803
4804
4805
4806 required_kernelcore -= min(required_kernelcore,
4807 size_pages);
4808 kernelcore_remaining -= size_pages;
4809 if (!kernelcore_remaining)
4810 break;
4811 }
4812 }
4813
4814
4815
4816
4817
4818
4819
4820 usable_nodes--;
4821 if (usable_nodes && required_kernelcore > usable_nodes)
4822 goto restart;
4823
4824
4825 for (nid = 0; nid < MAX_NUMNODES; nid++)
4826 zone_movable_pfn[nid] =
4827 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4828
4829out:
4830
4831 node_states[N_HIGH_MEMORY] = saved_node_state;
4832}
4833
4834
4835static void __init check_for_regular_memory(pg_data_t *pgdat)
4836{
4837#ifdef CONFIG_HIGHMEM
4838 enum zone_type zone_type;
4839
4840 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4841 struct zone *zone = &pgdat->node_zones[zone_type];
4842 if (zone->present_pages) {
4843 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4844 break;
4845 }
4846 }
4847#endif
4848}
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4864{
4865 unsigned long start_pfn, end_pfn;
4866 int i, nid;
4867
4868
4869 memset(arch_zone_lowest_possible_pfn, 0,
4870 sizeof(arch_zone_lowest_possible_pfn));
4871 memset(arch_zone_highest_possible_pfn, 0,
4872 sizeof(arch_zone_highest_possible_pfn));
4873 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4874 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4875 for (i = 1; i < MAX_NR_ZONES; i++) {
4876 if (i == ZONE_MOVABLE)
4877 continue;
4878 arch_zone_lowest_possible_pfn[i] =
4879 arch_zone_highest_possible_pfn[i-1];
4880 arch_zone_highest_possible_pfn[i] =
4881 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4882 }
4883 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4884 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4885
4886
4887 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4888 find_zone_movable_pfns_for_nodes();
4889
4890
4891 printk("Zone ranges:\n");
4892 for (i = 0; i < MAX_NR_ZONES; i++) {
4893 if (i == ZONE_MOVABLE)
4894 continue;
4895 printk(KERN_CONT " %-8s ", zone_names[i]);
4896 if (arch_zone_lowest_possible_pfn[i] ==
4897 arch_zone_highest_possible_pfn[i])
4898 printk(KERN_CONT "empty\n");
4899 else
4900 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4901 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4902 (arch_zone_highest_possible_pfn[i]
4903 << PAGE_SHIFT) - 1);
4904 }
4905
4906
4907 printk("Movable zone start for each node\n");
4908 for (i = 0; i < MAX_NUMNODES; i++) {
4909 if (zone_movable_pfn[i])
4910 printk(" Node %d: %#010lx\n", i,
4911 zone_movable_pfn[i] << PAGE_SHIFT);
4912 }
4913
4914
4915 printk("Early memory node ranges\n");
4916 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4917 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4918 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4919
4920
4921 mminit_verify_pageflags_layout();
4922 setup_nr_node_ids();
4923 for_each_online_node(nid) {
4924 pg_data_t *pgdat = NODE_DATA(nid);
4925 free_area_init_node(nid, NULL,
4926 find_min_pfn_for_node(nid), NULL);
4927
4928
4929 if (pgdat->node_present_pages)
4930 node_set_state(nid, N_HIGH_MEMORY);
4931 check_for_regular_memory(pgdat);
4932 }
4933}
4934
4935static int __init cmdline_parse_core(char *p, unsigned long *core)
4936{
4937 unsigned long long coremem;
4938 if (!p)
4939 return -EINVAL;
4940
4941 coremem = memparse(p, &p);
4942 *core = coremem >> PAGE_SHIFT;
4943
4944
4945 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
4946
4947 return 0;
4948}
4949
4950
4951
4952
4953
4954static int __init cmdline_parse_kernelcore(char *p)
4955{
4956 return cmdline_parse_core(p, &required_kernelcore);
4957}
4958
4959
4960
4961
4962
4963static int __init cmdline_parse_movablecore(char *p)
4964{
4965 return cmdline_parse_core(p, &required_movablecore);
4966}
4967
4968early_param("kernelcore", cmdline_parse_kernelcore);
4969early_param("movablecore", cmdline_parse_movablecore);
4970
4971#endif
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984void __init set_dma_reserve(unsigned long new_dma_reserve)
4985{
4986 dma_reserve = new_dma_reserve;
4987}
4988
4989void __init free_area_init(unsigned long *zones_size)
4990{
4991 free_area_init_node(0, zones_size,
4992 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4993}
4994
4995static int page_alloc_cpu_notify(struct notifier_block *self,
4996 unsigned long action, void *hcpu)
4997{
4998 int cpu = (unsigned long)hcpu;
4999
5000 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5001 lru_add_drain_cpu(cpu);
5002 drain_pages(cpu);
5003
5004
5005
5006
5007
5008
5009
5010 vm_events_fold_cpu(cpu);
5011
5012
5013
5014
5015
5016
5017
5018
5019 refresh_cpu_vm_stats(cpu);
5020 }
5021 return NOTIFY_OK;
5022}
5023
5024void __init page_alloc_init(void)
5025{
5026 hotcpu_notifier(page_alloc_cpu_notify, 0);
5027}
5028
5029
5030
5031
5032
5033static void calculate_totalreserve_pages(void)
5034{
5035 struct pglist_data *pgdat;
5036 unsigned long reserve_pages = 0;
5037 enum zone_type i, j;
5038
5039 for_each_online_pgdat(pgdat) {
5040 for (i = 0; i < MAX_NR_ZONES; i++) {
5041 struct zone *zone = pgdat->node_zones + i;
5042 unsigned long max = 0;
5043
5044
5045 for (j = i; j < MAX_NR_ZONES; j++) {
5046 if (zone->lowmem_reserve[j] > max)
5047 max = zone->lowmem_reserve[j];
5048 }
5049
5050
5051 max += high_wmark_pages(zone);
5052
5053 if (max > zone->present_pages)
5054 max = zone->present_pages;
5055 reserve_pages += max;
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065 zone->dirty_balance_reserve = max;
5066 }
5067 }
5068 dirty_balance_reserve = reserve_pages;
5069 totalreserve_pages = reserve_pages;
5070}
5071
5072
5073
5074
5075
5076
5077
5078static void setup_per_zone_lowmem_reserve(void)
5079{
5080 struct pglist_data *pgdat;
5081 enum zone_type j, idx;
5082
5083 for_each_online_pgdat(pgdat) {
5084 for (j = 0; j < MAX_NR_ZONES; j++) {
5085 struct zone *zone = pgdat->node_zones + j;
5086 unsigned long present_pages = zone->present_pages;
5087
5088 zone->lowmem_reserve[j] = 0;
5089
5090 idx = j;
5091 while (idx) {
5092 struct zone *lower_zone;
5093
5094 idx--;
5095
5096 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5097 sysctl_lowmem_reserve_ratio[idx] = 1;
5098
5099 lower_zone = pgdat->node_zones + idx;
5100 lower_zone->lowmem_reserve[j] = present_pages /
5101 sysctl_lowmem_reserve_ratio[idx];
5102 present_pages += lower_zone->present_pages;
5103 }
5104 }
5105 }
5106
5107
5108 calculate_totalreserve_pages();
5109}
5110
5111static void __setup_per_zone_wmarks(void)
5112{
5113 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5114 unsigned long lowmem_pages = 0;
5115 struct zone *zone;
5116 unsigned long flags;
5117
5118
5119 for_each_zone(zone) {
5120 if (!is_highmem(zone))
5121 lowmem_pages += zone->present_pages;
5122 }
5123
5124 for_each_zone(zone) {
5125 u64 tmp;
5126
5127 spin_lock_irqsave(&zone->lock, flags);
5128 tmp = (u64)pages_min * zone->present_pages;
5129 do_div(tmp, lowmem_pages);
5130 if (is_highmem(zone)) {
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140 int min_pages;
5141
5142 min_pages = zone->present_pages / 1024;
5143 if (min_pages < SWAP_CLUSTER_MAX)
5144 min_pages = SWAP_CLUSTER_MAX;
5145 if (min_pages > 128)
5146 min_pages = 128;
5147 zone->watermark[WMARK_MIN] = min_pages;
5148 } else {
5149
5150
5151
5152
5153 zone->watermark[WMARK_MIN] = tmp;
5154 }
5155
5156 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5157 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5158
5159 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5160 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5161 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5162
5163 setup_zone_migrate_reserve(zone);
5164 spin_unlock_irqrestore(&zone->lock, flags);
5165 }
5166
5167
5168 calculate_totalreserve_pages();
5169}
5170
5171
5172
5173
5174
5175
5176
5177
5178void setup_per_zone_wmarks(void)
5179{
5180 mutex_lock(&zonelists_mutex);
5181 __setup_per_zone_wmarks();
5182 mutex_unlock(&zonelists_mutex);
5183}
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5207{
5208 unsigned int gb, ratio;
5209
5210
5211 gb = zone->present_pages >> (30 - PAGE_SHIFT);
5212 if (gb)
5213 ratio = int_sqrt(10 * gb);
5214 else
5215 ratio = 1;
5216
5217 zone->inactive_ratio = ratio;
5218}
5219
5220static void __meminit setup_per_zone_inactive_ratio(void)
5221{
5222 struct zone *zone;
5223
5224 for_each_zone(zone)
5225 calculate_zone_inactive_ratio(zone);
5226}
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252int __meminit init_per_zone_wmark_min(void)
5253{
5254 unsigned long lowmem_kbytes;
5255
5256 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5257
5258 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5259 if (min_free_kbytes < 128)
5260 min_free_kbytes = 128;
5261 if (min_free_kbytes > 65536)
5262 min_free_kbytes = 65536;
5263 setup_per_zone_wmarks();
5264 refresh_zone_stat_thresholds();
5265 setup_per_zone_lowmem_reserve();
5266 setup_per_zone_inactive_ratio();
5267 return 0;
5268}
5269module_init(init_per_zone_wmark_min)
5270
5271
5272
5273
5274
5275
5276int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5277 void __user *buffer, size_t *length, loff_t *ppos)
5278{
5279 proc_dointvec(table, write, buffer, length, ppos);
5280 if (write)
5281 setup_per_zone_wmarks();
5282 return 0;
5283}
5284
5285#ifdef CONFIG_NUMA
5286int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5287 void __user *buffer, size_t *length, loff_t *ppos)
5288{
5289 struct zone *zone;
5290 int rc;
5291
5292 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5293 if (rc)
5294 return rc;
5295
5296 for_each_zone(zone)
5297 zone->min_unmapped_pages = (zone->present_pages *
5298 sysctl_min_unmapped_ratio) / 100;
5299 return 0;
5300}
5301
5302int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5303 void __user *buffer, size_t *length, loff_t *ppos)
5304{
5305 struct zone *zone;
5306 int rc;
5307
5308 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5309 if (rc)
5310 return rc;
5311
5312 for_each_zone(zone)
5313 zone->min_slab_pages = (zone->present_pages *
5314 sysctl_min_slab_ratio) / 100;
5315 return 0;
5316}
5317#endif
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5329 void __user *buffer, size_t *length, loff_t *ppos)
5330{
5331 proc_dointvec_minmax(table, write, buffer, length, ppos);
5332 setup_per_zone_lowmem_reserve();
5333 return 0;
5334}
5335
5336
5337
5338
5339
5340
5341
5342int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5343 void __user *buffer, size_t *length, loff_t *ppos)
5344{
5345 struct zone *zone;
5346 unsigned int cpu;
5347 int ret;
5348
5349 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5350 if (!write || (ret < 0))
5351 return ret;
5352 for_each_populated_zone(zone) {
5353 for_each_possible_cpu(cpu) {
5354 unsigned long high;
5355 high = zone->present_pages / percpu_pagelist_fraction;
5356 setup_pagelist_highmark(
5357 per_cpu_ptr(zone->pageset, cpu), high);
5358 }
5359 }
5360 return 0;
5361}
5362
5363int hashdist = HASHDIST_DEFAULT;
5364
5365#ifdef CONFIG_NUMA
5366static int __init set_hashdist(char *str)
5367{
5368 if (!str)
5369 return 0;
5370 hashdist = simple_strtoul(str, &str, 0);
5371 return 1;
5372}
5373__setup("hashdist=", set_hashdist);
5374#endif
5375
5376
5377
5378
5379
5380
5381
5382void *__init alloc_large_system_hash(const char *tablename,
5383 unsigned long bucketsize,
5384 unsigned long numentries,
5385 int scale,
5386 int flags,
5387 unsigned int *_hash_shift,
5388 unsigned int *_hash_mask,
5389 unsigned long low_limit,
5390 unsigned long high_limit)
5391{
5392 unsigned long long max = high_limit;
5393 unsigned long log2qty, size;
5394 void *table = NULL;
5395
5396
5397 if (!numentries) {
5398
5399 numentries = nr_kernel_pages;
5400 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5401 numentries >>= 20 - PAGE_SHIFT;
5402 numentries <<= 20 - PAGE_SHIFT;
5403
5404
5405 if (scale > PAGE_SHIFT)
5406 numentries >>= (scale - PAGE_SHIFT);
5407 else
5408 numentries <<= (PAGE_SHIFT - scale);
5409
5410
5411 if (unlikely(flags & HASH_SMALL)) {
5412
5413 WARN_ON(!(flags & HASH_EARLY));
5414 if (!(numentries >> *_hash_shift)) {
5415 numentries = 1UL << *_hash_shift;
5416 BUG_ON(!numentries);
5417 }
5418 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5419 numentries = PAGE_SIZE / bucketsize;
5420 }
5421 numentries = roundup_pow_of_two(numentries);
5422
5423
5424 if (max == 0) {
5425 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5426 do_div(max, bucketsize);
5427 }
5428 max = min(max, 0x80000000ULL);
5429
5430 if (numentries < low_limit)
5431 numentries = low_limit;
5432 if (numentries > max)
5433 numentries = max;
5434
5435 log2qty = ilog2(numentries);
5436
5437 do {
5438 size = bucketsize << log2qty;
5439 if (flags & HASH_EARLY)
5440 table = alloc_bootmem_nopanic(size);
5441 else if (hashdist)
5442 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5443 else {
5444
5445
5446
5447
5448
5449 if (get_order(size) < MAX_ORDER) {
5450 table = alloc_pages_exact(size, GFP_ATOMIC);
5451 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5452 }
5453 }
5454 } while (!table && size > PAGE_SIZE && --log2qty);
5455
5456 if (!table)
5457 panic("Failed to allocate %s hash table\n", tablename);
5458
5459 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5460 tablename,
5461 (1UL << log2qty),
5462 ilog2(size) - PAGE_SHIFT,
5463 size);
5464
5465 if (_hash_shift)
5466 *_hash_shift = log2qty;
5467 if (_hash_mask)
5468 *_hash_mask = (1 << log2qty) - 1;
5469
5470 return table;
5471}
5472
5473
5474static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5475 unsigned long pfn)
5476{
5477#ifdef CONFIG_SPARSEMEM
5478 return __pfn_to_section(pfn)->pageblock_flags;
5479#else
5480 return zone->pageblock_flags;
5481#endif
5482}
5483
5484static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5485{
5486#ifdef CONFIG_SPARSEMEM
5487 pfn &= (PAGES_PER_SECTION-1);
5488 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5489#else
5490 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5491 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5492#endif
5493}
5494
5495
5496
5497
5498
5499
5500
5501
/*
 * Read the pageblock flag bits @start_bitidx..@end_bitidx (inclusive) of
 * the pageblock containing @page.
 *
 * Returns the bits packed into an unsigned long, with the bit at
 * @start_bitidx in the least significant position.
 */
unsigned long get_pageblock_flags_group(struct page *page,
					int start_bitidx, int end_bitidx)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long *bitmap = get_pageblock_bitmap(zone, pfn);
	unsigned long base = pfn_to_bitidx(zone, pfn);
	unsigned long result = 0;
	unsigned long out_bit = 1;

	while (start_bitidx <= end_bitidx) {
		if (test_bit(base + start_bitidx, bitmap))
			result |= out_bit;
		start_bitidx++;
		out_bit <<= 1;
	}

	return result;
}
5522
5523
5524
5525
5526
5527
5528
5529
5530void set_pageblock_flags_group(struct page *page, unsigned long flags,
5531 int start_bitidx, int end_bitidx)
5532{
5533 struct zone *zone;
5534 unsigned long *bitmap;
5535 unsigned long pfn, bitidx;
5536 unsigned long value = 1;
5537
5538 zone = page_zone(page);
5539 pfn = page_to_pfn(page);
5540 bitmap = get_pageblock_bitmap(zone, pfn);
5541 bitidx = pfn_to_bitidx(zone, pfn);
5542 VM_BUG_ON(pfn < zone->zone_start_pfn);
5543 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5544
5545 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5546 if (flags & value)
5547 __set_bit(bitidx + start_bitidx, bitmap);
5548 else
5549 __clear_bit(bitidx + start_bitidx, bitmap);
5550}
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5561{
5562 unsigned long pfn, iter, found;
5563 int mt;
5564
5565
5566
5567
5568
5569 if (zone_idx(zone) == ZONE_MOVABLE)
5570 return false;
5571 mt = get_pageblock_migratetype(page);
5572 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5573 return false;
5574
5575 pfn = page_to_pfn(page);
5576 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5577 unsigned long check = pfn + iter;
5578
5579 if (!pfn_valid_within(check))
5580 continue;
5581
5582 page = pfn_to_page(check);
5583
5584
5585
5586
5587
5588
5589 if (!atomic_read(&page->_count)) {
5590 if (PageBuddy(page))
5591 iter += (1 << page_order(page)) - 1;
5592 continue;
5593 }
5594
5595 if (!PageLRU(page))
5596 found++;
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610 if (found > count)
5611 return true;
5612 }
5613 return false;
5614}
5615
5616bool is_pageblock_removable_nolock(struct page *page)
5617{
5618 struct zone *zone;
5619 unsigned long pfn;
5620
5621
5622
5623
5624
5625
5626
5627
5628 if (!node_online(page_to_nid(page)))
5629 return false;
5630
5631 zone = page_zone(page);
5632 pfn = page_to_pfn(page);
5633 if (zone->zone_start_pfn > pfn ||
5634 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5635 return false;
5636
5637 return !has_unmovable_pages(zone, page, 0);
5638}
5639
5640#ifdef CONFIG_CMA
5641
5642static unsigned long pfn_max_align_down(unsigned long pfn)
5643{
5644 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5645 pageblock_nr_pages) - 1);
5646}
5647
5648static unsigned long pfn_max_align_up(unsigned long pfn)
5649{
5650 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5651 pageblock_nr_pages));
5652}
5653
5654
5655static int __alloc_contig_migrate_range(struct compact_control *cc,
5656 unsigned long start, unsigned long end)
5657{
5658
5659 unsigned long nr_reclaimed;
5660 unsigned long pfn = start;
5661 unsigned int tries = 0;
5662 int ret = 0;
5663
5664 migrate_prep_local();
5665
5666 while (pfn < end || !list_empty(&cc->migratepages)) {
5667 if (fatal_signal_pending(current)) {
5668 ret = -EINTR;
5669 break;
5670 }
5671
5672 if (list_empty(&cc->migratepages)) {
5673 cc->nr_migratepages = 0;
5674 pfn = isolate_migratepages_range(cc->zone, cc,
5675 pfn, end, true);
5676 if (!pfn) {
5677 ret = -EINTR;
5678 break;
5679 }
5680 tries = 0;
5681 } else if (++tries == 5) {
5682 ret = ret < 0 ? ret : -EBUSY;
5683 break;
5684 }
5685
5686 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5687 &cc->migratepages);
5688 cc->nr_migratepages -= nr_reclaimed;
5689
5690 ret = migrate_pages(&cc->migratepages,
5691 alloc_migrate_target,
5692 0, false, MIGRATE_SYNC);
5693 }
5694
5695 putback_lru_pages(&cc->migratepages);
5696 return ret > 0 ? 0 : ret;
5697}
5698
5699
5700
5701
5702static inline void __update_cma_watermarks(struct zone *zone, int count)
5703{
5704 unsigned long flags;
5705 spin_lock_irqsave(&zone->lock, flags);
5706 zone->min_cma_pages += count;
5707 spin_unlock_irqrestore(&zone->lock, flags);
5708 setup_per_zone_wmarks();
5709}
5710
5711
5712
5713
5714
5715
5716static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5717{
5718 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5719 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5720 int did_some_progress = 0;
5721 int order = 1;
5722
5723
5724
5725
5726
5727 __update_cma_watermarks(zone, count);
5728
5729
5730 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5731 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5732
5733 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5734 NULL);
5735 if (!did_some_progress) {
5736
5737 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5738 }
5739 }
5740
5741
5742 __update_cma_watermarks(zone, -count);
5743
5744 return count;
5745}
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767int alloc_contig_range(unsigned long start, unsigned long end,
5768 unsigned migratetype)
5769{
5770 struct zone *zone = page_zone(pfn_to_page(start));
5771 unsigned long outer_start, outer_end;
5772 int ret = 0, order;
5773
5774 struct compact_control cc = {
5775 .nr_migratepages = 0,
5776 .order = -1,
5777 .zone = page_zone(pfn_to_page(start)),
5778 .sync = true,
5779 .ignore_skip_hint = true,
5780 };
5781 INIT_LIST_HEAD(&cc.migratepages);
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799