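/*
 * mm/page_alloc.c - the zoned buddy page allocator.
 *
 * Manages the per-zone buddy free lists and the per-CPU page (pcp)
 * caches that front them; kmalloc() and the slab caches live elsewhere.
 */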
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/* Serializes updaters of the per-cpu pageset ->high and ->batch fields. */
static DEFINE_MUTEX(pcp_batch_high_lock);

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., do not use this directly; the fallback node of a memoryless node
 * is only known at boot.  Use the numa_mem_id() and cpu_to_mem()
 * accessors instead.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
	[N_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

/* Protects totalram_pages and zone->managed_pages */
static DEFINE_SPINLOCK(managed_page_count_lock);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;

/*
 * Per-zone reserves that should not be counted as dirtyable memory when
 * calculating the number of globally allowed dirty pages.
 */
unsigned long dirty_balance_reserve __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to
 * temporarily change gfp_allowed_mask in order to avoid using I/O during
 * memory allocations while devices are suspended.  To avoid races with
 * the suspend/hibernate code, they should always be called with
 * pm_mutex held (gfp_allowed_mask also should only be modified with
 * pm_mutex held, unless the suspend/hibernate code is guaranteed not to
 * run in parallel with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~GFP_IOFS;
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * Each lower zone keeps 1/ratio of the higher zones' pages in reserve
 * against allocations that could have been satisfied from those higher
 * zones; see setup_per_zone_lowmem_reserve().  The defaults are 256 for
 * the DMA zones and 32 for the others.
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};

int min_free_kbytes = 1024;
int user_min_free_kbytes;
208
209static unsigned long __meminitdata nr_kernel_pages;
210static unsigned long __meminitdata nr_all_pages;
211static unsigned long __meminitdata dma_reserve;
212
213#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
214static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
215static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
216static unsigned long __initdata required_kernelcore;
217static unsigned long __initdata required_movablecore;
218static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
219
220
221int movable_zone;
222EXPORT_SYMBOL(movable_zone);
223#endif
224
225#if MAX_NUMNODES > 1
226int nr_node_ids __read_mostly = MAX_NUMNODES;
227int nr_online_nodes __read_mostly = 1;
228EXPORT_SYMBOL(nr_node_ids);
229EXPORT_SYMBOL(nr_online_nodes);
230#endif
231
232int page_group_by_mobility_disabled __read_mostly;
233
234void set_pageblock_migratetype(struct page *page, int migratetype)
235{
236
237 if (unlikely(page_group_by_mobility_disabled))
238 migratetype = MIGRATE_UNMOVABLE;
239
240 set_pageblock_flags_group(page, (unsigned long)migratetype,
241 PB_migrate, PB_migrate_end);
242}
243
244bool oom_killer_disabled __read_mostly;
245
246#ifdef CONFIG_DEBUG_VM
247static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
248{
249 int ret = 0;
250 unsigned seq;
251 unsigned long pfn = page_to_pfn(page);
252 unsigned long sp, start_pfn;
253
254 do {
255 seq = zone_span_seqbegin(zone);
256 start_pfn = zone->zone_start_pfn;
257 sp = zone->spanned_pages;
258 if (!zone_spans_pfn(zone, pfn))
259 ret = 1;
260 } while (zone_span_seqretry(zone, seq));
261
262 if (ret)
263 pr_err("page %lu outside zone [ %lu - %lu ]\n",
264 pfn, start_pfn, start_pfn + sp);
265
266 return ret;
267}
268
269static int page_is_consistent(struct zone *zone, struct page *page)
270{
271 if (!pfn_valid_within(page_to_pfn(page)))
272 return 0;
273 if (zone != page_zone(page))
274 return 0;
275
276 return 1;
277}
278
279
280
281static int bad_range(struct zone *zone, struct page *page)
282{
283 if (page_outside_zone_boundaries(zone, page))
284 return 1;
285 if (!page_is_consistent(zone, page))
286 return 1;
287
288 return 0;
289}
290#else
291static inline int bad_range(struct zone *zone, struct page *page)
292{
293 return 0;
294}
295#endif
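/*
 * Report a page whose state is corrupted.  Reports are rate-limited to a
 * burst of 60, after which they are suppressed for a minute; the page's
 * mapcount is reset and the kernel is tainted.
 */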
297static void bad_page(struct page *page)
298{
299 static unsigned long resume;
300 static unsigned long nr_shown;
301 static unsigned long nr_unshown;
302
303
304 if (PageHWPoison(page)) {
305 page_mapcount_reset(page);
306 return;
307 }
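	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */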
313 if (nr_shown == 60) {
314 if (time_before(jiffies, resume)) {
315 nr_unshown++;
316 goto out;
317 }
318 if (nr_unshown) {
319 printk(KERN_ALERT
320 "BUG: Bad page state: %lu messages suppressed\n",
321 nr_unshown);
322 nr_unshown = 0;
323 }
324 nr_shown = 0;
325 }
326 if (nr_shown++ == 0)
327 resume = jiffies + 60 * HZ;
328
329 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
330 current->comm, page_to_pfn(page));
331 dump_page(page);
332
333 print_modules();
334 dump_stack();
335out:
336
337 page_mapcount_reset(page);
338 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
339}
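/*
 * Higher-order pages are called "compound pages".  The first PAGE_SIZE
 * page is the "head page"; the remaining pages are "tail pages" whose
 * ->first_page points back at the head.  prep_compound_page() records
 * the allocation order and the destructor used to free the group.
 */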
356static void free_compound_page(struct page *page)
357{
358 __free_pages_ok(page, compound_order(page));
359}
360
361void prep_compound_page(struct page *page, unsigned long order)
362{
363 int i;
364 int nr_pages = 1 << order;
365
366 set_compound_page_dtor(page, free_compound_page);
367 set_compound_order(page, order);
368 __SetPageHead(page);
369 for (i = 1; i < nr_pages; i++) {
370 struct page *p = page + i;
371 __SetPageTail(p);
372 set_page_count(p, 0);
373 p->first_page = page;
374 }
375}
376
377
378static int destroy_compound_page(struct page *page, unsigned long order)
379{
380 int i;
381 int nr_pages = 1 << order;
382 int bad = 0;
383
384 if (unlikely(compound_order(page) != order)) {
385 bad_page(page);
386 bad++;
387 }
388
389 __ClearPageHead(page);
390
391 for (i = 1; i < nr_pages; i++) {
392 struct page *p = page + i;
393
394 if (unlikely(!PageTail(p) || (p->first_page != page))) {
395 bad_page(page);
396 bad++;
397 }
398 __ClearPageTail(p);
399 }
400
401 return bad;
402}
403
404static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
405{
406 int i;
407
408
409
410
411
412 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
413 for (i = 0; i < (1 << order); i++)
414 clear_highpage(page + i);
415}
416
417#ifdef CONFIG_DEBUG_PAGEALLOC
418unsigned int _debug_guardpage_minorder;
419
420static int __init debug_guardpage_minorder_setup(char *buf)
421{
422 unsigned long res;
423
424 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
425 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
426 return 0;
427 }
428 _debug_guardpage_minorder = res;
429 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
430 return 0;
431}
432__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
433
434static inline void set_page_guard_flag(struct page *page)
435{
436 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
437}
438
439static inline void clear_page_guard_flag(struct page *page)
440{
441 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
442}
443#else
444static inline void set_page_guard_flag(struct page *page) { }
445static inline void clear_page_guard_flag(struct page *page) { }
446#endif
447
448static inline void set_page_order(struct page *page, int order)
449{
450 set_page_private(page, order);
451 __SetPageBuddy(page);
452}
453
454static inline void rmv_page_order(struct page *page)
455{
456 __ClearPageBuddy(page);
457 set_page_private(page, 0);
458}
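/*
 * Locate the buddy of a block of size 1 << order.  Within the MAX_ORDER
 * aligned area, a block at index B has its order-O buddy at B ^ (1 << O),
 * and the combined parent block starts at B & ~(1 << O).
 */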
477static inline unsigned long
478__find_buddy_index(unsigned long page_idx, unsigned int order)
479{
480 return page_idx ^ (1 << order);
481}
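/*
 * A page can be merged with its buddy if the buddy is not in a memory
 * hole, is free (PageBuddy) or a debug guard page, has the same order,
 * and lives in the same zone.  For free pages the order is kept in
 * page_private().
 */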
496static inline int page_is_buddy(struct page *page, struct page *buddy,
497 int order)
498{
499 if (!pfn_valid_within(page_to_pfn(buddy)))
500 return 0;
501
502 if (page_zone_id(page) != page_zone_id(buddy))
503 return 0;
504
505 if (page_is_guard(buddy) && page_order(buddy) == order) {
506 VM_BUG_ON(page_count(buddy) != 0);
507 return 1;
508 }
509
510 if (PageBuddy(buddy) && page_order(buddy) == order) {
511 VM_BUG_ON(page_count(buddy) != 0);
512 return 1;
513 }
514 return 0;
515}
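/*
 * Freeing function for the buddy allocator.  The freed block is merged
 * with its buddy whenever that buddy is also free and of the same order,
 * repeatedly, until no further merging is possible or MAX_ORDER-1 is
 * reached; the resulting block is then placed on the matching free list.
 * The zone->lock must already be held.
 */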
541static inline void __free_one_page(struct page *page,
542 struct zone *zone, unsigned int order,
543 int migratetype)
544{
545 unsigned long page_idx;
546 unsigned long combined_idx;
547 unsigned long uninitialized_var(buddy_idx);
548 struct page *buddy;
549
550 VM_BUG_ON(!zone_is_initialized(zone));
551
552 if (unlikely(PageCompound(page)))
553 if (unlikely(destroy_compound_page(page, order)))
554 return;
555
556 VM_BUG_ON(migratetype == -1);
557
558 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
559
560 VM_BUG_ON(page_idx & ((1 << order) - 1));
561 VM_BUG_ON(bad_range(zone, page));
562
563 while (order < MAX_ORDER-1) {
564 buddy_idx = __find_buddy_index(page_idx, order);
565 buddy = page + (buddy_idx - page_idx);
566 if (!page_is_buddy(page, buddy, order))
567 break;
568
569
570
571
572 if (page_is_guard(buddy)) {
573 clear_page_guard_flag(buddy);
574 set_page_private(page, 0);
575 __mod_zone_freepage_state(zone, 1 << order,
576 migratetype);
577 } else {
578 list_del(&buddy->lru);
579 zone->free_area[order].nr_free--;
580 rmv_page_order(buddy);
581 }
582 combined_idx = buddy_idx & page_idx;
583 page = page + (combined_idx - page_idx);
584 page_idx = combined_idx;
585 order++;
586 }
587 set_page_order(page, order);
588
589
590
591
592
593
594
595
596
597 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
598 struct page *higher_page, *higher_buddy;
599 combined_idx = buddy_idx & page_idx;
600 higher_page = page + (combined_idx - page_idx);
601 buddy_idx = __find_buddy_index(combined_idx, order + 1);
602 higher_buddy = higher_page + (buddy_idx - combined_idx);
603 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
604 list_add_tail(&page->lru,
605 &zone->free_area[order].free_list[migratetype]);
606 goto out;
607 }
608 }
609
610 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
611out:
612 zone->free_area[order].nr_free++;
613}
614
615static inline int free_pages_check(struct page *page)
616{
617 if (unlikely(page_mapcount(page) |
618 (page->mapping != NULL) |
619 (atomic_read(&page->_count) != 0) |
620 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
621 (mem_cgroup_bad_page_check(page)))) {
622 bad_page(page);
623 return 1;
624 }
625 page_nid_reset_last(page);
626 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
627 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
628 return 0;
629}
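/*
 * Free a number of 0-order pages from the per-cpu lists back to the
 * buddy lists.  Pages are taken round-robin from the pcp migratetype
 * lists so that all lists are drained fairly.  count is the number of
 * pages to free; all pages are assumed to be in the given zone.
 */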
642static void free_pcppages_bulk(struct zone *zone, int count,
643 struct per_cpu_pages *pcp)
644{
645 int migratetype = 0;
646 int batch_free = 0;
647 int to_free = count;
648
649 spin_lock(&zone->lock);
650 zone->all_unreclaimable = 0;
651 zone->pages_scanned = 0;
652
653 while (to_free) {
654 struct page *page;
655 struct list_head *list;
656
657
658
659
660
661
662
663
664 do {
665 batch_free++;
666 if (++migratetype == MIGRATE_PCPTYPES)
667 migratetype = 0;
668 list = &pcp->lists[migratetype];
669 } while (list_empty(list));
670
671
672 if (batch_free == MIGRATE_PCPTYPES)
673 batch_free = to_free;
674
675 do {
676 int mt;
677
678 page = list_entry(list->prev, struct page, lru);
679
680 list_del(&page->lru);
681 mt = get_freepage_migratetype(page);
682
683 __free_one_page(page, zone, 0, mt);
684 trace_mm_page_pcpu_drain(page, 0, mt);
685 if (likely(!is_migrate_isolate_page(page))) {
686 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
687 if (is_migrate_cma(mt))
688 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
689 }
690 } while (--to_free && --batch_free && !list_empty(list));
691 }
692 spin_unlock(&zone->lock);
693}
694
695static void free_one_page(struct zone *zone, struct page *page, int order,
696 int migratetype)
697{
698 spin_lock(&zone->lock);
699 zone->all_unreclaimable = 0;
700 zone->pages_scanned = 0;
701
702 __free_one_page(page, zone, order, migratetype);
703 if (unlikely(!is_migrate_isolate(migratetype)))
704 __mod_zone_freepage_state(zone, 1 << order, migratetype);
705 spin_unlock(&zone->lock);
706}
707
708static bool free_pages_prepare(struct page *page, unsigned int order)
709{
710 int i;
711 int bad = 0;
712
713 trace_mm_page_free(page, order);
714 kmemcheck_free_shadow(page, order);
715
716 if (PageAnon(page))
717 page->mapping = NULL;
718 for (i = 0; i < (1 << order); i++)
719 bad += free_pages_check(page + i);
720 if (bad)
721 return false;
722
723 if (!PageHighMem(page)) {
724 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
725 debug_check_no_obj_freed(page_address(page),
726 PAGE_SIZE << order);
727 }
728 arch_free_page(page, order);
729 kernel_map_pages(page, 1 << order, 0);
730
731 return true;
732}
733
734static void __free_pages_ok(struct page *page, unsigned int order)
735{
736 unsigned long flags;
737 int migratetype;
738
739 if (!free_pages_prepare(page, order))
740 return;
741
742 local_irq_save(flags);
743 __count_vm_events(PGFREE, 1 << order);
744 migratetype = get_pageblock_migratetype(page);
745 set_freepage_migratetype(page, migratetype);
746 free_one_page(page_zone(page), page, order, migratetype);
747 local_irq_restore(flags);
748}
749
750void __init __free_pages_bootmem(struct page *page, unsigned int order)
751{
752 unsigned int nr_pages = 1 << order;
753 unsigned int loop;
754
755 prefetchw(page);
756 for (loop = 0; loop < nr_pages; loop++) {
757 struct page *p = &page[loop];
758
759 if (loop + 1 < nr_pages)
760 prefetchw(p + 1);
761 __ClearPageReserved(p);
762 set_page_count(p, 0);
763 }
764
765 page_zone(page)->managed_pages += 1 << order;
766 set_page_refcounted(page);
767 __free_pages(page, order);
768}
769
770#ifdef CONFIG_CMA
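/* Free whole pageblock and set its migration type to MIGRATE_CMA. */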
772void __init init_cma_reserved_pageblock(struct page *page)
773{
774 unsigned i = pageblock_nr_pages;
775 struct page *p = page;
776
777 do {
778 __ClearPageReserved(p);
779 set_page_count(p, 0);
780 } while (++p, --i);
781
782 set_page_refcounted(page);
783 set_pageblock_migratetype(page, MIGRATE_CMA);
784 __free_pages(page, pageblock_order);
785 adjust_managed_page_count(page, pageblock_nr_pages);
786}
787#endif
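/*
 * Subdivide a block of order 'high' down to order 'low', returning the
 * unused buddy halves to the free lists as the split proceeds.  The
 * order of subdivision influences the order in which pages are later
 * handed to the I/O subsystem, so it should not be changed lightly.
 */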
803static inline void expand(struct zone *zone, struct page *page,
804 int low, int high, struct free_area *area,
805 int migratetype)
806{
807 unsigned long size = 1 << high;
808
809 while (high > low) {
810 area--;
811 high--;
812 size >>= 1;
813 VM_BUG_ON(bad_range(zone, &page[size]));
814
815#ifdef CONFIG_DEBUG_PAGEALLOC
816 if (high < debug_guardpage_minorder()) {
817
818
819
820
821
822
823 INIT_LIST_HEAD(&page[size].lru);
824 set_page_guard_flag(&page[size]);
825 set_page_private(&page[size], high);
826
827 __mod_zone_freepage_state(zone, -(1 << high),
828 migratetype);
829 continue;
830 }
831#endif
832 list_add(&page[size].lru, &area->free_list[migratetype]);
833 area->nr_free++;
834 set_page_order(&page[size], high);
835 }
836}
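/*
 * Sanity checks on a page that is about to be handed out by the
 * allocator: it must be unmapped, unreferenced and free of the flags
 * that only make sense on an in-use page.
 */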
841static inline int check_new_page(struct page *page)
842{
843 if (unlikely(page_mapcount(page) |
844 (page->mapping != NULL) |
845 (atomic_read(&page->_count) != 0) |
846 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
847 (mem_cgroup_bad_page_check(page)))) {
848 bad_page(page);
849 return 1;
850 }
851 return 0;
852}
853
854static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
855{
856 int i;
857
858 for (i = 0; i < (1 << order); i++) {
859 struct page *p = page + i;
860 if (unlikely(check_new_page(p)))
861 return 1;
862 }
863
864 set_page_private(page, 0);
865 set_page_refcounted(page);
866
867 arch_alloc_page(page, order);
868 kernel_map_pages(page, 1 << order, 1);
869
870 if (gfp_flags & __GFP_ZERO)
871 prep_zero_page(page, order, gfp_flags);
872
873 if (order && (gfp_flags & __GFP_COMP))
874 prep_compound_page(page, order);
875
876 return 0;
877}
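/*
 * Go through the free lists for the given migratetype and remove the
 * smallest available page from the free lists.
 */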
883static inline
884struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
885 int migratetype)
886{
887 unsigned int current_order;
888 struct free_area * area;
889 struct page *page;
890
891
892 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
893 area = &(zone->free_area[current_order]);
894 if (list_empty(&area->free_list[migratetype]))
895 continue;
896
897 page = list_entry(area->free_list[migratetype].next,
898 struct page, lru);
899 list_del(&page->lru);
900 rmv_page_order(page);
901 area->nr_free--;
902 expand(zone, page, order, current_order, area, migratetype);
903 return page;
904 }
905
906 return NULL;
907}
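/*
 * This array describes the order in which the free lists of other
 * migratetypes are fallen back on when the lists for the requested
 * migratetype are depleted.
 */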
914static int fallbacks[MIGRATE_TYPES][4] = {
915 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
916 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
917#ifdef CONFIG_CMA
918 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
919 [MIGRATE_CMA] = { MIGRATE_RESERVE },
920#else
921 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
922#endif
923 [MIGRATE_RESERVE] = { MIGRATE_RESERVE },
924#ifdef CONFIG_MEMORY_ISOLATION
925 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE },
926#endif
927};
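/*
 * Move the free pages in a range to the free lists of the requested
 * migratetype.  Note that start_page and end_page are not required to be
 * pageblock-aligned; use move_freepages_block() if alignment is needed.
 */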
934int move_freepages(struct zone *zone,
935 struct page *start_page, struct page *end_page,
936 int migratetype)
937{
938 struct page *page;
939 unsigned long order;
940 int pages_moved = 0;
941
942#ifndef CONFIG_HOLES_IN_ZONE
943
944
945
946
947
948
949
950 BUG_ON(page_zone(start_page) != page_zone(end_page));
951#endif
952
953 for (page = start_page; page <= end_page;) {
954
955 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
956
957 if (!pfn_valid_within(page_to_pfn(page))) {
958 page++;
959 continue;
960 }
961
962 if (!PageBuddy(page)) {
963 page++;
964 continue;
965 }
966
967 order = page_order(page);
968 list_move(&page->lru,
969 &zone->free_area[order].free_list[migratetype]);
970 set_freepage_migratetype(page, migratetype);
971 page += 1 << order;
972 pages_moved += 1 << order;
973 }
974
975 return pages_moved;
976}
977
978int move_freepages_block(struct zone *zone, struct page *page,
979 int migratetype)
980{
981 unsigned long start_pfn, end_pfn;
982 struct page *start_page, *end_page;
983
984 start_pfn = page_to_pfn(page);
985 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
986 start_page = pfn_to_page(start_pfn);
987 end_page = start_page + pageblock_nr_pages - 1;
988 end_pfn = start_pfn + pageblock_nr_pages - 1;
989
990
991 if (!zone_spans_pfn(zone, start_pfn))
992 start_page = page;
993 if (!zone_spans_pfn(zone, end_pfn))
994 return 0;
995
996 return move_freepages(zone, start_page, end_page, migratetype);
997}
998
999static void change_pageblock_range(struct page *pageblock_page,
1000 int start_order, int migratetype)
1001{
1002 int nr_pageblocks = 1 << (start_order - pageblock_order);
1003
1004 while (nr_pageblocks--) {
1005 set_pageblock_migratetype(pageblock_page, migratetype);
1006 pageblock_page += pageblock_nr_pages;
1007 }
1008}
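/* Remove an element from the buddy allocator from the fallback list */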
1011static inline struct page *
1012__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1013{
1014 struct free_area * area;
1015 int current_order;
1016 struct page *page;
1017 int migratetype, i;
1018
1019
1020 for (current_order = MAX_ORDER-1; current_order >= order;
1021 --current_order) {
1022 for (i = 0;; i++) {
1023 migratetype = fallbacks[start_migratetype][i];
1024
1025
1026 if (migratetype == MIGRATE_RESERVE)
1027 break;
1028
1029 area = &(zone->free_area[current_order]);
1030 if (list_empty(&area->free_list[migratetype]))
1031 continue;
1032
1033 page = list_entry(area->free_list[migratetype].next,
1034 struct page, lru);
1035 area->nr_free--;
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049 if (!is_migrate_cma(migratetype) &&
1050 (current_order >= pageblock_order / 2 ||
1051 start_migratetype == MIGRATE_RECLAIMABLE ||
1052 page_group_by_mobility_disabled)) {
1053 int pages;
1054 pages = move_freepages_block(zone, page,
1055 start_migratetype);
1056
1057
1058 if (pages >= (1 << (pageblock_order-1)) ||
1059 page_group_by_mobility_disabled)
1060 set_pageblock_migratetype(page,
1061 start_migratetype);
1062
1063 migratetype = start_migratetype;
1064 }
1065
1066
1067 list_del(&page->lru);
1068 rmv_page_order(page);
1069
1070
1071 if (current_order >= pageblock_order &&
1072 !is_migrate_cma(migratetype))
1073 change_pageblock_range(page, current_order,
1074 start_migratetype);
1075
1076 expand(zone, page, order, current_order, area,
1077 is_migrate_cma(migratetype)
1078 ? migratetype : start_migratetype);
1079
1080 trace_mm_page_alloc_extfrag(page, order, current_order,
1081 start_migratetype, migratetype);
1082
1083 return page;
1084 }
1085 }
1086
1087 return NULL;
1088}
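/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */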
1094static struct page *__rmqueue(struct zone *zone, unsigned int order,
1095 int migratetype)
1096{
1097 struct page *page;
1098
1099retry_reserve:
1100 page = __rmqueue_smallest(zone, order, migratetype);
1101
1102 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1103 page = __rmqueue_fallback(zone, order, migratetype);
1104
1105
1106
1107
1108
1109
1110 if (!page) {
1111 migratetype = MIGRATE_RESERVE;
1112 goto retry_reserve;
1113 }
1114 }
1115
1116 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1117 return page;
1118}
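/*
 * Obtain a specified number of elements from the buddy allocator, all
 * under a single hold of the lock, for efficiency.  Add them to the
 * supplied list.  Returns the number of new pages which were placed at
 * *list.
 */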
1125static int rmqueue_bulk(struct zone *zone, unsigned int order,
1126 unsigned long count, struct list_head *list,
1127 int migratetype, int cold)
1128{
1129 int mt = migratetype, i;
1130
1131 spin_lock(&zone->lock);
1132 for (i = 0; i < count; ++i) {
1133 struct page *page = __rmqueue(zone, order, migratetype);
1134 if (unlikely(page == NULL))
1135 break;
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146 if (likely(cold == 0))
1147 list_add(&page->lru, list);
1148 else
1149 list_add_tail(&page->lru, list);
1150 if (IS_ENABLED(CONFIG_CMA)) {
1151 mt = get_pageblock_migratetype(page);
1152 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1153 mt = migratetype;
1154 }
1155 set_freepage_migratetype(page, mt);
1156 list = &page->lru;
1157 if (is_migrate_cma(mt))
1158 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1159 -(1 << order));
1160 }
1161 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1162 spin_unlock(&zone->lock);
1163 return i;
1164}
1165
1166#ifdef CONFIG_NUMA
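/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.  Note that this function must be called with the thread
 * pinned to a single processor.
 */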
1175void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1176{
1177 unsigned long flags;
1178 int to_drain;
1179 unsigned long batch;
1180
1181 local_irq_save(flags);
1182 batch = ACCESS_ONCE(pcp->batch);
1183 if (pcp->count >= batch)
1184 to_drain = batch;
1185 else
1186 to_drain = pcp->count;
1187 if (to_drain > 0) {
1188 free_pcppages_bulk(zone, to_drain, pcp);
1189 pcp->count -= to_drain;
1190 }
1191 local_irq_restore(flags);
1192}
1193#endif
1194
1195
1196
1197
1198
1199
1200
1201
1202static void drain_pages(unsigned int cpu)
1203{
1204 unsigned long flags;
1205 struct zone *zone;
1206
1207 for_each_populated_zone(zone) {
1208 struct per_cpu_pageset *pset;
1209 struct per_cpu_pages *pcp;
1210
1211 local_irq_save(flags);
1212 pset = per_cpu_ptr(zone->pageset, cpu);
1213
1214 pcp = &pset->pcp;
1215 if (pcp->count) {
1216 free_pcppages_bulk(zone, pcp->count, pcp);
1217 pcp->count = 0;
1218 }
1219 local_irq_restore(flags);
1220 }
1221}
1222
1223
1224
1225
1226void drain_local_pages(void *arg)
1227{
1228 drain_pages(smp_processor_id());
1229}
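/*
 * Spill all the per-cpu pages from all CPUs back into the buddy
 * allocator.  Only CPUs whose pagesets actually hold pages are asked to
 * drain, to limit the number of IPIs on large machines.
 */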
1240void drain_all_pages(void)
1241{
1242 int cpu;
1243 struct per_cpu_pageset *pcp;
1244 struct zone *zone;
1245
1246
1247
1248
1249
1250 static cpumask_t cpus_with_pcps;
1251
1252
1253
1254
1255
1256
1257
1258 for_each_online_cpu(cpu) {
1259 bool has_pcps = false;
1260 for_each_populated_zone(zone) {
1261 pcp = per_cpu_ptr(zone->pageset, cpu);
1262 if (pcp->pcp.count) {
1263 has_pcps = true;
1264 break;
1265 }
1266 }
1267 if (has_pcps)
1268 cpumask_set_cpu(cpu, &cpus_with_pcps);
1269 else
1270 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1271 }
1272 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1273}
1274
1275#ifdef CONFIG_HIBERNATION
1276
1277void mark_free_pages(struct zone *zone)
1278{
1279 unsigned long pfn, max_zone_pfn;
1280 unsigned long flags;
1281 int order, t;
1282 struct list_head *curr;
1283
1284 if (!zone->spanned_pages)
1285 return;
1286
1287 spin_lock_irqsave(&zone->lock, flags);
1288
1289 max_zone_pfn = zone_end_pfn(zone);
1290 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1291 if (pfn_valid(pfn)) {
1292 struct page *page = pfn_to_page(pfn);
1293
1294 if (!swsusp_page_is_forbidden(page))
1295 swsusp_unset_page_free(page);
1296 }
1297
1298 for_each_migratetype_order(order, t) {
1299 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1300 unsigned long i;
1301
1302 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1303 for (i = 0; i < (1UL << order); i++)
1304 swsusp_set_page_free(pfn_to_page(pfn + i));
1305 }
1306 }
1307 spin_unlock_irqrestore(&zone->lock, flags);
1308}
1309#endif
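/*
 * Free a 0-order page.
 * cold == 1 ? free a cold page : free a hot page
 */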
1315void free_hot_cold_page(struct page *page, int cold)
1316{
1317 struct zone *zone = page_zone(page);
1318 struct per_cpu_pages *pcp;
1319 unsigned long flags;
1320 int migratetype;
1321
1322 if (!free_pages_prepare(page, 0))
1323 return;
1324
1325 migratetype = get_pageblock_migratetype(page);
1326 set_freepage_migratetype(page, migratetype);
1327 local_irq_save(flags);
1328 __count_vm_event(PGFREE);
1329
1330
1331
1332
1333
1334
1335
1336
1337 if (migratetype >= MIGRATE_PCPTYPES) {
1338 if (unlikely(is_migrate_isolate(migratetype))) {
1339 free_one_page(zone, page, 0, migratetype);
1340 goto out;
1341 }
1342 migratetype = MIGRATE_MOVABLE;
1343 }
1344
1345 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1346 if (cold)
1347 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1348 else
1349 list_add(&page->lru, &pcp->lists[migratetype]);
1350 pcp->count++;
1351 if (pcp->count >= pcp->high) {
1352 unsigned long batch = ACCESS_ONCE(pcp->batch);
1353 free_pcppages_bulk(zone, batch, pcp);
1354 pcp->count -= batch;
1355 }
1356
1357out:
1358 local_irq_restore(flags);
1359}
1360
1361
1362
1363
1364void free_hot_cold_page_list(struct list_head *list, int cold)
1365{
1366 struct page *page, *next;
1367
1368 list_for_each_entry_safe(page, next, list, lru) {
1369 trace_mm_page_free_batched(page, cold);
1370 free_hot_cold_page(page, cold);
1371 }
1372}
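/*
 * split_page takes a non-compound higher-order page, and splits it into
 * n (1 << order) sub-pages: page[0..n].  Each sub-page must be freed
 * individually; only the first sub-page carries a reference on entry, so
 * the others are given one here.
 */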
1382void split_page(struct page *page, unsigned int order)
1383{
1384 int i;
1385
1386 VM_BUG_ON(PageCompound(page));
1387 VM_BUG_ON(!page_count(page));
1388
1389#ifdef CONFIG_KMEMCHECK
1390
1391
1392
1393
1394 if (kmemcheck_page_is_tracked(page))
1395 split_page(virt_to_page(page[0].shadow), order);
1396#endif
1397
1398 for (i = 1; i < (1 << order); i++)
1399 set_page_refcounted(page + i);
1400}
1401EXPORT_SYMBOL_GPL(split_page);
1402
1403static int __isolate_free_page(struct page *page, unsigned int order)
1404{
1405 unsigned long watermark;
1406 struct zone *zone;
1407 int mt;
1408
1409 BUG_ON(!PageBuddy(page));
1410
1411 zone = page_zone(page);
1412 mt = get_pageblock_migratetype(page);
1413
1414 if (!is_migrate_isolate(mt)) {
1415
1416 watermark = low_wmark_pages(zone) + (1 << order);
1417 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1418 return 0;
1419
1420 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1421 }
1422
1423
1424 list_del(&page->lru);
1425 zone->free_area[order].nr_free--;
1426 rmv_page_order(page);
1427
1428
1429 if (order >= pageblock_order - 1) {
1430 struct page *endpage = page + (1 << order) - 1;
1431 for (; page < endpage; page += pageblock_nr_pages) {
1432 int mt = get_pageblock_migratetype(page);
1433 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1434 set_pageblock_migratetype(page,
1435 MIGRATE_MOVABLE);
1436 }
1437 }
1438
1439 return 1UL << order;
1440}
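/*
 * Similar to split_page() except the page is already free.  The page is
 * first isolated from the buddy lists (a watermark check may refuse
 * this) and then split into individually refcounted 0-order pages.
 * Returns the number of pages isolated, or 0 on failure.
 */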
1452int split_free_page(struct page *page)
1453{
1454 unsigned int order;
1455 int nr_pages;
1456
1457 order = page_order(page);
1458
1459 nr_pages = __isolate_free_page(page, order);
1460 if (!nr_pages)
1461 return 0;
1462
1463
1464 set_page_refcounted(page);
1465 split_page(page, order);
1466 return nr_pages;
1467}
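/*
 * Allocate a page from the given zone.  Order-0 requests are served from
 * the per-cpu page lists; larger orders go straight to the buddy lists
 * under zone->lock.
 */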
1474static inline
1475struct page *buffered_rmqueue(struct zone *preferred_zone,
1476 struct zone *zone, int order, gfp_t gfp_flags,
1477 int migratetype)
1478{
1479 unsigned long flags;
1480 struct page *page;
1481 int cold = !!(gfp_flags & __GFP_COLD);
1482
1483again:
1484 if (likely(order == 0)) {
1485 struct per_cpu_pages *pcp;
1486 struct list_head *list;
1487
1488 local_irq_save(flags);
1489 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1490 list = &pcp->lists[migratetype];
1491 if (list_empty(list)) {
1492 pcp->count += rmqueue_bulk(zone, 0,
1493 pcp->batch, list,
1494 migratetype, cold);
1495 if (unlikely(list_empty(list)))
1496 goto failed;
1497 }
1498
1499 if (cold)
1500 page = list_entry(list->prev, struct page, lru);
1501 else
1502 page = list_entry(list->next, struct page, lru);
1503
1504 list_del(&page->lru);
1505 pcp->count--;
1506 } else {
1507 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518 WARN_ON_ONCE(order > 1);
1519 }
1520 spin_lock_irqsave(&zone->lock, flags);
1521 page = __rmqueue(zone, order, migratetype);
1522 spin_unlock(&zone->lock);
1523 if (!page)
1524 goto failed;
1525 __mod_zone_freepage_state(zone, -(1 << order),
1526 get_pageblock_migratetype(page));
1527 }
1528
1529 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1530 zone_statistics(preferred_zone, zone, gfp_flags);
1531 local_irq_restore(flags);
1532
1533 VM_BUG_ON(bad_range(zone, page));
1534 if (prep_new_page(page, order, gfp_flags))
1535 goto again;
1536 return page;
1537
1538failed:
1539 local_irq_restore(flags);
1540 return NULL;
1541}
1542
1543#ifdef CONFIG_FAIL_PAGE_ALLOC
1544
1545static struct {
1546 struct fault_attr attr;
1547
1548 u32 ignore_gfp_highmem;
1549 u32 ignore_gfp_wait;
1550 u32 min_order;
1551} fail_page_alloc = {
1552 .attr = FAULT_ATTR_INITIALIZER,
1553 .ignore_gfp_wait = 1,
1554 .ignore_gfp_highmem = 1,
1555 .min_order = 1,
1556};
1557
1558static int __init setup_fail_page_alloc(char *str)
1559{
1560 return setup_fault_attr(&fail_page_alloc.attr, str);
1561}
1562__setup("fail_page_alloc=", setup_fail_page_alloc);
1563
1564static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1565{
1566 if (order < fail_page_alloc.min_order)
1567 return false;
1568 if (gfp_mask & __GFP_NOFAIL)
1569 return false;
1570 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1571 return false;
1572 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1573 return false;
1574
1575 return should_fail(&fail_page_alloc.attr, 1 << order);
1576}
1577
1578#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1579
1580static int __init fail_page_alloc_debugfs(void)
1581{
1582 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1583 struct dentry *dir;
1584
1585 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1586 &fail_page_alloc.attr);
1587 if (IS_ERR(dir))
1588 return PTR_ERR(dir);
1589
1590 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1591 &fail_page_alloc.ignore_gfp_wait))
1592 goto fail;
1593 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1594 &fail_page_alloc.ignore_gfp_highmem))
1595 goto fail;
1596 if (!debugfs_create_u32("min-order", mode, dir,
1597 &fail_page_alloc.min_order))
1598 goto fail;
1599
1600 return 0;
1601fail:
1602 debugfs_remove_recursive(dir);
1603
1604 return -ENOMEM;
1605}
1606
1607late_initcall(fail_page_alloc_debugfs);
1608
1609#endif
1610
1611#else
1612
1613static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1614{
1615 return false;
1616}
1617
1618#endif
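/*
 * Return true if free pages are above 'mark'.  This takes into account
 * the order of the allocation, the lowmem reserve for classzone_idx and,
 * for high-order requests, whether enough free pages remain once blocks
 * smaller than the requested order are ignored.
 */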
1624static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1625 int classzone_idx, int alloc_flags, long free_pages)
1626{
1627
1628 long min = mark;
1629 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1630 int o;
1631 long free_cma = 0;
1632
1633 free_pages -= (1 << order) - 1;
1634 if (alloc_flags & ALLOC_HIGH)
1635 min -= min / 2;
1636 if (alloc_flags & ALLOC_HARDER)
1637 min -= min / 4;
1638#ifdef CONFIG_CMA
1639
1640 if (!(alloc_flags & ALLOC_CMA))
1641 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1642#endif
1643
1644 if (free_pages - free_cma <= min + lowmem_reserve)
1645 return false;
1646 for (o = 0; o < order; o++) {
1647
1648 free_pages -= z->free_area[o].nr_free << o;
1649
1650
1651 min >>= 1;
1652
1653 if (free_pages <= min)
1654 return false;
1655 }
1656 return true;
1657}
1658
1659bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1660 int classzone_idx, int alloc_flags)
1661{
1662 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1663 zone_page_state(z, NR_FREE_PAGES));
1664}
1665
1666bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1667 int classzone_idx, int alloc_flags)
1668{
1669 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1670
1671 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1672 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1673
1674 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1675 free_pages);
1676}
1677
1678#ifdef CONFIG_NUMA
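/*
 * zlc_setup - Set up the "zonelist cache" for this allocation attempt.
 * The zonelist cache remembers which zones were recently found to be
 * full so that get_page_from_freelist() can skip them cheaply; the
 * fullzones bitmap is zapped about once a second.  Returns the nodemask
 * of nodes this task is allowed to allocate from, or NULL if this
 * zonelist has no zonelist cache.
 */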
1701static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1702{
1703 struct zonelist_cache *zlc;
1704 nodemask_t *allowednodes;
1705
1706 zlc = zonelist->zlcache_ptr;
1707 if (!zlc)
1708 return NULL;
1709
1710 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1711 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1712 zlc->last_full_zap = jiffies;
1713 }
1714
1715 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1716 &cpuset_current_mems_allowed :
1717 &node_states[N_MEMORY];
1718 return allowednodes;
1719}
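/*
 * Given 'z' scanning a zonelist, return 1 if the zone is worth trying:
 * its node is in the allowed nodemask and the zone is not marked full.
 */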
1743static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1744 nodemask_t *allowednodes)
1745{
1746 struct zonelist_cache *zlc;
1747 int i;
1748 int n;
1749
1750 zlc = zonelist->zlcache_ptr;
1751 if (!zlc)
1752 return 1;
1753
1754 i = z - zonelist->_zonerefs;
1755 n = zlc->z_to_n[i];
1756
1757
1758 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1759}
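/*
 * Given 'z' scanning a zonelist, mark the corresponding zone as full so
 * that subsequent allocation attempts skip it.
 */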
1766static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1767{
1768 struct zonelist_cache *zlc;
1769 int i;
1770
1771 zlc = zonelist->zlcache_ptr;
1772 if (!zlc)
1773 return;
1774
1775 i = z - zonelist->_zonerefs;
1776
1777 set_bit(i, zlc->fullzones);
1778}
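/* Clear all full-zone markers, e.g. after direct reclaim makes progress. */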
1784static void zlc_clear_zones_full(struct zonelist *zonelist)
1785{
1786 struct zonelist_cache *zlc;
1787
1788 zlc = zonelist->zlcache_ptr;
1789 if (!zlc)
1790 return;
1791
1792 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1793}
1794
1795static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1796{
1797 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1798}
1799
1800static void __paginginit init_zone_allows_reclaim(int nid)
1801{
1802 int i;
1803
1804 for_each_online_node(i)
1805 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1806 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1807 else
1808 zone_reclaim_mode = 1;
1809}
1810
1811#else
1812
1813static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1814{
1815 return NULL;
1816}
1817
1818static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1819 nodemask_t *allowednodes)
1820{
1821 return 1;
1822}
1823
1824static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1825{
1826}
1827
1828static void zlc_clear_zones_full(struct zonelist *zonelist)
1829{
1830}
1831
1832static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1833{
1834 return true;
1835}
1836
1837static inline void init_zone_allows_reclaim(int nid)
1838{
1839}
1840#endif
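/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */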
1846static struct page *
1847get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1848 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1849 struct zone *preferred_zone, int migratetype)
1850{
1851 struct zoneref *z;
1852 struct page *page = NULL;
1853 int classzone_idx;
1854 struct zone *zone;
1855 nodemask_t *allowednodes = NULL;
1856 int zlc_active = 0;
1857 int did_zlc_setup = 0;
1858
1859 classzone_idx = zone_idx(preferred_zone);
1860zonelist_scan:
1861
1862
1863
1864
1865 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1866 high_zoneidx, nodemask) {
1867 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1868 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1869 continue;
1870 if ((alloc_flags & ALLOC_CPUSET) &&
1871 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1872 continue;
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1900 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1901 goto this_zone_full;
1902
1903 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1904 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1905 unsigned long mark;
1906 int ret;
1907
1908 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1909 if (zone_watermark_ok(zone, order, mark,
1910 classzone_idx, alloc_flags))
1911 goto try_this_zone;
1912
1913 if (IS_ENABLED(CONFIG_NUMA) &&
1914 !did_zlc_setup && nr_online_nodes > 1) {
1915
1916
1917
1918
1919
1920 allowednodes = zlc_setup(zonelist, alloc_flags);
1921 zlc_active = 1;
1922 did_zlc_setup = 1;
1923 }
1924
1925 if (zone_reclaim_mode == 0 ||
1926 !zone_allows_reclaim(preferred_zone, zone))
1927 goto this_zone_full;
1928
1929
1930
1931
1932
1933 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1934 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1935 continue;
1936
1937 ret = zone_reclaim(zone, gfp_mask, order);
1938 switch (ret) {
1939 case ZONE_RECLAIM_NOSCAN:
1940
1941 continue;
1942 case ZONE_RECLAIM_FULL:
1943
1944 continue;
1945 default:
1946
1947 if (zone_watermark_ok(zone, order, mark,
1948 classzone_idx, alloc_flags))
1949 goto try_this_zone;
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
1961 ret == ZONE_RECLAIM_SOME)
1962 goto this_zone_full;
1963
1964 continue;
1965 }
1966 }
1967
1968try_this_zone:
1969 page = buffered_rmqueue(preferred_zone, zone, order,
1970 gfp_mask, migratetype);
1971 if (page)
1972 break;
1973this_zone_full:
1974 if (IS_ENABLED(CONFIG_NUMA))
1975 zlc_mark_zone_full(zonelist, z);
1976 }
1977
1978 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1979
1980 zlc_active = 0;
1981 goto zonelist_scan;
1982 }
1983
1984 if (page)
1985
1986
1987
1988
1989
1990
1991
1992 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1993
1994 return page;
1995}
1996
1997
1998
1999
2000
2001static inline bool should_suppress_show_mem(void)
2002{
2003 bool ret = false;
2004
2005#if NODES_SHIFT > 8
2006 ret = in_interrupt();
2007#endif
2008 return ret;
2009}
2010
2011static DEFINE_RATELIMIT_STATE(nopage_rs,
2012 DEFAULT_RATELIMIT_INTERVAL,
2013 DEFAULT_RATELIMIT_BURST);
2014
2015void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2016{
2017 unsigned int filter = SHOW_MEM_FILTER_NODES;
2018
2019 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2020 debug_guardpage_minorder() > 0)
2021 return;
2022
2023
2024
2025
2026
2027 if (!(gfp_mask & __GFP_WAIT))
2028 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2029
2030
2031
2032
2033
2034
2035 if (!(gfp_mask & __GFP_NOMEMALLOC))
2036 if (test_thread_flag(TIF_MEMDIE) ||
2037 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2038 filter &= ~SHOW_MEM_FILTER_NODES;
2039 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2040 filter &= ~SHOW_MEM_FILTER_NODES;
2041
2042 if (fmt) {
2043 struct va_format vaf;
2044 va_list args;
2045
2046 va_start(args, fmt);
2047
2048 vaf.fmt = fmt;
2049 vaf.va = &args;
2050
2051 pr_warn("%pV", &vaf);
2052
2053 va_end(args);
2054 }
2055
2056 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2057 current->comm, order, gfp_mask);
2058
2059 dump_stack();
2060 if (!should_suppress_show_mem())
2061 show_mem(filter);
2062}
2063
2064static inline int
2065should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2066 unsigned long did_some_progress,
2067 unsigned long pages_reclaimed)
2068{
2069
2070 if (gfp_mask & __GFP_NORETRY)
2071 return 0;
2072
2073
2074 if (gfp_mask & __GFP_NOFAIL)
2075 return 1;
2076
2077
2078
2079
2080
2081
2082 if (!did_some_progress && pm_suspended_storage())
2083 return 0;
2084
2085
2086
2087
2088
2089
2090 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2091 return 1;
2092
2093
2094
2095
2096
2097
2098
2099
2100 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2101 return 1;
2102
2103 return 0;
2104}
2105
2106static inline struct page *
2107__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2108 struct zonelist *zonelist, enum zone_type high_zoneidx,
2109 nodemask_t *nodemask, struct zone *preferred_zone,
2110 int migratetype)
2111{
2112 struct page *page;
2113
2114
2115 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2116 schedule_timeout_uninterruptible(1);
2117 return NULL;
2118 }
2119
2120
2121
2122
2123
2124
2125 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2126 order, zonelist, high_zoneidx,
2127 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2128 preferred_zone, migratetype);
2129 if (page)
2130 goto out;
2131
2132 if (!(gfp_mask & __GFP_NOFAIL)) {
2133
2134 if (order > PAGE_ALLOC_COSTLY_ORDER)
2135 goto out;
2136
2137 if (high_zoneidx < ZONE_NORMAL)
2138 goto out;
2139
2140
2141
2142
2143
2144
2145
2146 if (gfp_mask & __GFP_THISNODE)
2147 goto out;
2148 }
2149
2150 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2151
2152out:
2153 clear_zonelist_oom(zonelist, gfp_mask);
2154 return page;
2155}
2156
2157#ifdef CONFIG_COMPACTION
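/* Try memory compaction for high-order allocations before reclaim */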
2159static struct page *
2160__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2161 struct zonelist *zonelist, enum zone_type high_zoneidx,
2162 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2163 int migratetype, bool sync_migration,
2164 bool *contended_compaction, bool *deferred_compaction,
2165 unsigned long *did_some_progress)
2166{
2167 if (!order)
2168 return NULL;
2169
2170 if (compaction_deferred(preferred_zone, order)) {
2171 *deferred_compaction = true;
2172 return NULL;
2173 }
2174
2175 current->flags |= PF_MEMALLOC;
2176 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2177 nodemask, sync_migration,
2178 contended_compaction);
2179 current->flags &= ~PF_MEMALLOC;
2180
2181 if (*did_some_progress != COMPACT_SKIPPED) {
2182 struct page *page;
2183
2184
2185 drain_pages(get_cpu());
2186 put_cpu();
2187
2188 page = get_page_from_freelist(gfp_mask, nodemask,
2189 order, zonelist, high_zoneidx,
2190 alloc_flags & ~ALLOC_NO_WATERMARKS,
2191 preferred_zone, migratetype);
2192 if (page) {
2193 preferred_zone->compact_blockskip_flush = false;
2194 preferred_zone->compact_considered = 0;
2195 preferred_zone->compact_defer_shift = 0;
2196 if (order >= preferred_zone->compact_order_failed)
2197 preferred_zone->compact_order_failed = order + 1;
2198 count_vm_event(COMPACTSUCCESS);
2199 return page;
2200 }
2201
2202
2203
2204
2205
2206
2207 count_vm_event(COMPACTFAIL);
2208
2209
2210
2211
2212
2213 if (sync_migration)
2214 defer_compaction(preferred_zone, order);
2215
2216 cond_resched();
2217 }
2218
2219 return NULL;
2220}
2221#else
2222static inline struct page *
2223__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2224 struct zonelist *zonelist, enum zone_type high_zoneidx,
2225 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2226 int migratetype, bool sync_migration,
2227 bool *contended_compaction, bool *deferred_compaction,
2228 unsigned long *did_some_progress)
2229{
2230 return NULL;
2231}
2232#endif
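/* Perform direct synchronous page reclaim */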
2235static int
2236__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2237 nodemask_t *nodemask)
2238{
2239 struct reclaim_state reclaim_state;
2240 int progress;
2241
2242 cond_resched();
2243
2244
2245 cpuset_memory_pressure_bump();
2246 current->flags |= PF_MEMALLOC;
2247 lockdep_set_current_reclaim_state(gfp_mask);
2248 reclaim_state.reclaimed_slab = 0;
2249 current->reclaim_state = &reclaim_state;
2250
2251 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2252
2253 current->reclaim_state = NULL;
2254 lockdep_clear_current_reclaim_state();
2255 current->flags &= ~PF_MEMALLOC;
2256
2257 cond_resched();
2258
2259 return progress;
2260}
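/* The really slow allocator path where we enter direct reclaim */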
2263static inline struct page *
2264__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2265 struct zonelist *zonelist, enum zone_type high_zoneidx,
2266 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2267 int migratetype, unsigned long *did_some_progress)
2268{
2269 struct page *page = NULL;
2270 bool drained = false;
2271
2272 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2273 nodemask);
2274 if (unlikely(!(*did_some_progress)))
2275 return NULL;
2276
2277
2278 if (IS_ENABLED(CONFIG_NUMA))
2279 zlc_clear_zones_full(zonelist);
2280
2281retry:
2282 page = get_page_from_freelist(gfp_mask, nodemask, order,
2283 zonelist, high_zoneidx,
2284 alloc_flags & ~ALLOC_NO_WATERMARKS,
2285 preferred_zone, migratetype);
2286
2287
2288
2289
2290
2291 if (!page && !drained) {
2292 drain_all_pages();
2293 drained = true;
2294 goto retry;
2295 }
2296
2297 return page;
2298}
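/*
 * This is called in the allocator slow-path if the allocation request is
 * of sufficient urgency to ignore watermarks and take other desperate
 * measures.
 */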
2304static inline struct page *
2305__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2306 struct zonelist *zonelist, enum zone_type high_zoneidx,
2307 nodemask_t *nodemask, struct zone *preferred_zone,
2308 int migratetype)
2309{
2310 struct page *page;
2311
2312 do {
2313 page = get_page_from_freelist(gfp_mask, nodemask, order,
2314 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2315 preferred_zone, migratetype);
2316
2317 if (!page && gfp_mask & __GFP_NOFAIL)
2318 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2319 } while (!page && (gfp_mask & __GFP_NOFAIL));
2320
2321 return page;
2322}
2323
2324static inline
2325void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2326 enum zone_type high_zoneidx,
2327 enum zone_type classzone_idx)
2328{
2329 struct zoneref *z;
2330 struct zone *zone;
2331
2332 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2333 wakeup_kswapd(zone, order, classzone_idx);
2334}
2335
2336static inline int
2337gfp_to_alloc_flags(gfp_t gfp_mask)
2338{
2339 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2340 const gfp_t wait = gfp_mask & __GFP_WAIT;
2341
2342
2343 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2344
2345
2346
2347
2348
2349
2350
2351 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2352
2353 if (!wait) {
2354
2355
2356
2357
2358 if (!(gfp_mask & __GFP_NOMEMALLOC))
2359 alloc_flags |= ALLOC_HARDER;
2360
2361
2362
2363
2364 alloc_flags &= ~ALLOC_CPUSET;
2365 } else if (unlikely(rt_task(current)) && !in_interrupt())
2366 alloc_flags |= ALLOC_HARDER;
2367
2368 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2369 if (gfp_mask & __GFP_MEMALLOC)
2370 alloc_flags |= ALLOC_NO_WATERMARKS;
2371 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2372 alloc_flags |= ALLOC_NO_WATERMARKS;
2373 else if (!in_interrupt() &&
2374 ((current->flags & PF_MEMALLOC) ||
2375 unlikely(test_thread_flag(TIF_MEMDIE))))
2376 alloc_flags |= ALLOC_NO_WATERMARKS;
2377 }
2378#ifdef CONFIG_CMA
2379 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2380 alloc_flags |= ALLOC_CMA;
2381#endif
2382 return alloc_flags;
2383}
2384
2385bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2386{
2387 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2388}
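/*
 * The slow path of the allocator: wake kswapd, retry with adjusted
 * flags, then fall back to compaction, direct reclaim and, finally, the
 * OOM killer, retrying as allowed by the gfp flags.
 */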
2390static inline struct page *
2391__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2392 struct zonelist *zonelist, enum zone_type high_zoneidx,
2393 nodemask_t *nodemask, struct zone *preferred_zone,
2394 int migratetype)
2395{
2396 const gfp_t wait = gfp_mask & __GFP_WAIT;
2397 struct page *page = NULL;
2398 int alloc_flags;
2399 unsigned long pages_reclaimed = 0;
2400 unsigned long did_some_progress;
2401 bool sync_migration = false;
2402 bool deferred_compaction = false;
2403 bool contended_compaction = false;
2404
2405
2406
2407
2408
2409
2410
2411 if (order >= MAX_ORDER) {
2412 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2413 return NULL;
2414 }
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424 if (IS_ENABLED(CONFIG_NUMA) &&
2425 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2426 goto nopage;
2427
2428restart:
2429 if (!(gfp_mask & __GFP_NO_KSWAPD))
2430 wake_all_kswapd(order, zonelist, high_zoneidx,
2431 zone_idx(preferred_zone));
2432
2433
2434
2435
2436
2437
2438 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2439
2440
2441
2442
2443
2444 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2445 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2446 &preferred_zone);
2447
2448rebalance:
2449
2450 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2451 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2452 preferred_zone, migratetype);
2453 if (page)
2454 goto got_pg;
2455
2456
2457 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2458
2459
2460
2461
2462
2463 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2464
2465 page = __alloc_pages_high_priority(gfp_mask, order,
2466 zonelist, high_zoneidx, nodemask,
2467 preferred_zone, migratetype);
2468 if (page) {
2469 goto got_pg;
2470 }
2471 }
2472
2473
2474 if (!wait)
2475 goto nopage;
2476
2477
2478 if (current->flags & PF_MEMALLOC)
2479 goto nopage;
2480
2481
2482 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2483 goto nopage;
2484
2485
2486
2487
2488
2489 page = __alloc_pages_direct_compact(gfp_mask, order,
2490 zonelist, high_zoneidx,
2491 nodemask,
2492 alloc_flags, preferred_zone,
2493 migratetype, sync_migration,
2494 &contended_compaction,
2495 &deferred_compaction,
2496 &did_some_progress);
2497 if (page)
2498 goto got_pg;
2499 sync_migration = true;
2500
2501
2502
2503
2504
2505
2506
2507 if ((deferred_compaction || contended_compaction) &&
2508 (gfp_mask & __GFP_NO_KSWAPD))
2509 goto nopage;
2510
2511
2512 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2513 zonelist, high_zoneidx,
2514 nodemask,
2515 alloc_flags, preferred_zone,
2516 migratetype, &did_some_progress);
2517 if (page)
2518 goto got_pg;
2519
2520
2521
2522
2523
2524 if (!did_some_progress) {
2525 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2526 if (oom_killer_disabled)
2527 goto nopage;
2528
2529 if ((current->flags & PF_DUMPCORE) &&
2530 !(gfp_mask & __GFP_NOFAIL))
2531 goto nopage;
2532 page = __alloc_pages_may_oom(gfp_mask, order,
2533 zonelist, high_zoneidx,
2534 nodemask, preferred_zone,
2535 migratetype);
2536 if (page)
2537 goto got_pg;
2538
2539 if (!(gfp_mask & __GFP_NOFAIL)) {
2540
2541
2542
2543
2544
2545
2546 if (order > PAGE_ALLOC_COSTLY_ORDER)
2547 goto nopage;
2548
2549
2550
2551
2552
2553 if (high_zoneidx < ZONE_NORMAL)
2554 goto nopage;
2555 }
2556
2557 goto restart;
2558 }
2559 }
2560
2561
2562 pages_reclaimed += did_some_progress;
2563 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2564 pages_reclaimed)) {
2565
2566 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2567 goto rebalance;
2568 } else {
2569
2570
2571
2572
2573
2574 page = __alloc_pages_direct_compact(gfp_mask, order,
2575 zonelist, high_zoneidx,
2576 nodemask,
2577 alloc_flags, preferred_zone,
2578 migratetype, sync_migration,
2579 &contended_compaction,
2580 &deferred_compaction,
2581 &did_some_progress);
2582 if (page)
2583 goto got_pg;
2584 }
2585
2586nopage:
2587 warn_alloc_failed(gfp_mask, order, NULL);
2588 return page;
2589got_pg:
2590 if (kmemcheck_enabled)
2591 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2592
2593 return page;
2594}
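/*
 * This is the 'heart' of the zoned buddy allocator.
 */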
2599struct page *
2600__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2601 struct zonelist *zonelist, nodemask_t *nodemask)
2602{
2603 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2604 struct zone *preferred_zone;
2605 struct page *page = NULL;
2606 int migratetype = allocflags_to_migratetype(gfp_mask);
2607 unsigned int cpuset_mems_cookie;
2608 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2609 struct mem_cgroup *memcg = NULL;
2610
2611 gfp_mask &= gfp_allowed_mask;
2612
2613 lockdep_trace_alloc(gfp_mask);
2614
2615 might_sleep_if(gfp_mask & __GFP_WAIT);
2616
2617 if (should_fail_alloc_page(gfp_mask, order))
2618 return NULL;
2619
2620
2621
2622
2623
2624
2625 if (unlikely(!zonelist->_zonerefs->zone))
2626 return NULL;
2627
2628
2629
2630
2631
2632 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2633 return NULL;
2634
2635retry_cpuset:
2636 cpuset_mems_cookie = get_mems_allowed();
2637
2638
2639 first_zones_zonelist(zonelist, high_zoneidx,
2640 nodemask ? : &cpuset_current_mems_allowed,
2641 &preferred_zone);
2642 if (!preferred_zone)
2643 goto out;
2644
2645#ifdef CONFIG_CMA
2646 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2647 alloc_flags |= ALLOC_CMA;
2648#endif
2649
2650 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2651 zonelist, high_zoneidx, alloc_flags,
2652 preferred_zone, migratetype);
2653 if (unlikely(!page)) {
2654
2655
2656
2657
2658
2659 gfp_mask = memalloc_noio_flags(gfp_mask);
2660 page = __alloc_pages_slowpath(gfp_mask, order,
2661 zonelist, high_zoneidx, nodemask,
2662 preferred_zone, migratetype);
2663 }
2664
2665 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2666
2667out:
2668
2669
2670
2671
2672
2673
2674 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2675 goto retry_cpuset;
2676
2677 memcg_kmem_commit_charge(page, memcg, order);
2678
2679 return page;
2680}
2681EXPORT_SYMBOL(__alloc_pages_nodemask);
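/*
 * Common helper functions.  __get_free_pages() returns the kernel
 * virtual address of the allocated pages, so it must not be used with
 * __GFP_HIGHMEM.
 */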
2686unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2687{
2688 struct page *page;
2689
	/*
	 * __get_free_pages() returns a kernel virtual address, which cannot
	 * represent a highmem page, so __GFP_HIGHMEM is not allowed here.
	 */
2694 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2695
2696 page = alloc_pages(gfp_mask, order);
2697 if (!page)
2698 return 0;
2699 return (unsigned long) page_address(page);
2700}
2701EXPORT_SYMBOL(__get_free_pages);
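
/*
 * Example (illustrative only): allocate two pages of kernel memory addressed
 * by virtual address, then release them:
 *
 *	unsigned long addr = __get_free_pages(GFP_KERNEL, 1);
 *
 *	if (addr) {
 *		... use the 2 * PAGE_SIZE bytes at addr ...
 *		free_pages(addr, 1);
 *	}
 */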
2702
2703unsigned long get_zeroed_page(gfp_t gfp_mask)
2704{
2705 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2706}
2707EXPORT_SYMBOL(get_zeroed_page);
2708
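/*
 * Drop a reference on @page and, when it was the last reference, return the
 * whole order-@order block to the allocator: order-0 pages go through the
 * per-cpu hot/cold lists, higher orders are freed directly to the buddy
 * lists.
 */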
2709void __free_pages(struct page *page, unsigned int order)
2710{
2711 if (put_page_testzero(page)) {
2712 if (order == 0)
2713 free_hot_cold_page(page, 0);
2714 else
2715 __free_pages_ok(page, order);
2716 }
2717}
2718
2719EXPORT_SYMBOL(__free_pages);
2720
2721void free_pages(unsigned long addr, unsigned int order)
2722{
2723 if (addr != 0) {
2724 VM_BUG_ON(!virt_addr_valid((void *)addr));
2725 __free_pages(virt_to_page((void *)addr), order);
2726 }
2727}
2728
2729EXPORT_SYMBOL(free_pages);
2730
/*
 * __free_memcg_kmem_pages() and free_memcg_kmem_pages() free pages that were
 * allocated with __GFP_KMEMCG and therefore charged to a memory cgroup: the
 * kmem charge is dropped before the pages are handed back to the allocator.
 */
2742void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2743{
2744 memcg_kmem_uncharge_pages(page, order);
2745 __free_pages(page, order);
2746}
2747
2748void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2749{
2750 if (addr != 0) {
2751 VM_BUG_ON(!virt_addr_valid((void *)addr));
2752 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2753 }
2754}
2755
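/*
 * Split a higher-order allocation starting at @addr into single pages and
 * free the tail pages beyond PAGE_ALIGN(size), leaving exactly enough pages
 * to back @size bytes.
 */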
2756static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2757{
2758 if (addr) {
2759 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2760 unsigned long used = addr + PAGE_ALIGN(size);
2761
2762 split_page(virt_to_page((void *)addr), order);
2763 while (used < alloc_end) {
2764 free_page(used);
2765 used += PAGE_SIZE;
2766 }
2767 }
2768 return (void *)addr;
2769}
2770
/**
 * alloc_pages_exact - allocate an exact number of physically-contiguous pages
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * Similar to alloc_pages(), except that the unused tail of the power-of-two
 * allocation is split off and freed, so only enough pages to cover @size
 * remain allocated.
 *
 * Memory allocated by this function must be released by free_pages_exact().
 */
2784void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2785{
2786 unsigned int order = get_order(size);
2787 unsigned long addr;
2788
2789 addr = __get_free_pages(gfp_mask, order);
2790 return make_alloc_exact(addr, order, size);
2791}
2792EXPORT_SYMBOL(alloc_pages_exact);
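
/*
 * Example (illustrative only): obtain a physically contiguous buffer of an
 * exact byte length without keeping a full power-of-two number of pages:
 *
 *	void *buf = alloc_pages_exact(20000, GFP_KERNEL);
 *
 *	if (buf) {
 *		... use buf ...
 *		free_pages_exact(buf, 20000);
 *	}
 */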
2793
/**
 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
 *			   pages on a node
 * @nid: the preferred node ID on which to allocate
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * Like alloc_pages_exact(), but tries to satisfy the allocation from the
 * given node first.  The memory must be released by free_pages_exact().
 */
2806void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2807{
2808 unsigned order = get_order(size);
2809 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2810 if (!p)
2811 return NULL;
2812 return make_alloc_exact((unsigned long)page_address(p), order, size);
2813}
2814EXPORT_SYMBOL(alloc_pages_exact_nid);
2815
/**
 * free_pages_exact - release memory allocated via alloc_pages_exact()
 * @virt: the value returned by alloc_pages_exact()
 * @size: size of the allocation, same value as passed to alloc_pages_exact()
 *
 * Releases the memory allocated by a previous call to alloc_pages_exact().
 */
2823void free_pages_exact(void *virt, size_t size)
2824{
2825 unsigned long addr = (unsigned long)virt;
2826 unsigned long end = addr + PAGE_ALIGN(size);
2827
2828 while (addr < end) {
2829 free_page(addr);
2830 addr += PAGE_SIZE;
2831 }
2832}
2833EXPORT_SYMBOL(free_pages_exact);
2834
/**
 * nr_free_zone_pages - count pages beyond the high watermark
 * @offset: the zone index of the highest zone to consider
 *
 * Sums, over every zone of the local node's zonelist at or below @offset,
 * the number of managed pages above the zone's high watermark.
 */
2844static unsigned long nr_free_zone_pages(int offset)
2845{
2846 struct zoneref *z;
2847 struct zone *zone;
2848
2849
2850 unsigned long sum = 0;
2851
2852 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2853
2854 for_each_zone_zonelist(zone, z, zonelist, offset) {
2855 unsigned long size = zone->managed_pages;
2856 unsigned long high = high_wmark_pages(zone);
2857 if (size > high)
2858 sum += size - high;
2859 }
2860
2861 return sum;
2862}
2863
2864
2865
2866
2867
2868
2869
2870unsigned long nr_free_buffer_pages(void)
2871{
2872 return nr_free_zone_pages(gfp_zone(GFP_USER));
2873}
2874EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2875
2876
2877
2878
2879
2880
2881
2882unsigned long nr_free_pagecache_pages(void)
2883{
2884 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2885}
2886
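/* Prefix a zone report with the owning node when NUMA is enabled. */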
2887static inline void show_node(struct zone *zone)
2888{
2889 if (IS_ENABLED(CONFIG_NUMA))
2890 printk("Node %d ", zone_to_nid(zone));
2891}
2892
2893void si_meminfo(struct sysinfo *val)
2894{
2895 val->totalram = totalram_pages;
2896 val->sharedram = 0;
2897 val->freeram = global_page_state(NR_FREE_PAGES);
2898 val->bufferram = nr_blockdev_pages();
2899 val->totalhigh = totalhigh_pages;
2900 val->freehigh = nr_free_highpages();
2901 val->mem_unit = PAGE_SIZE;
2902}
2903
2904EXPORT_SYMBOL(si_meminfo);
2905
2906#ifdef CONFIG_NUMA
2907void si_meminfo_node(struct sysinfo *val, int nid)
2908{
2909 int zone_type;
2910 unsigned long managed_pages = 0;
2911 pg_data_t *pgdat = NODE_DATA(nid);
2912
2913 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
2914 managed_pages += pgdat->node_zones[zone_type].managed_pages;
2915 val->totalram = managed_pages;
2916 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2917#ifdef CONFIG_HIGHMEM
2918 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
2919 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2920 NR_FREE_PAGES);
2921#else
2922 val->totalhigh = 0;
2923 val->freehigh = 0;
2924#endif
2925 val->mem_unit = PAGE_SIZE;
2926}
2927#endif
2928
/*
 * Determine whether the node should be skipped when reporting free areas,
 * depending on whether SHOW_MEM_FILTER_NODES was passed to show_free_areas().
 */
2933bool skip_free_areas_node(unsigned int flags, int nid)
2934{
2935 bool ret = false;
2936 unsigned int cpuset_mems_cookie;
2937
2938 if (!(flags & SHOW_MEM_FILTER_NODES))
2939 goto out;
2940
2941 do {
2942 cpuset_mems_cookie = get_mems_allowed();
2943 ret = !node_isset(nid, cpuset_current_mems_allowed);
2944 } while (!put_mems_allowed(cpuset_mems_cookie));
2945out:
2946 return ret;
2947}
2948
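/* Convert a page count into kilobytes for the reports below. */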
2949#define K(x) ((x) << (PAGE_SHIFT-10))
2950
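/*
 * Print a one-letter code for each migrate type set in @type:
 * U=unmovable, E=reclaimable, M=movable, R=reserve, C=CMA, I=isolate.
 */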
2951static void show_migration_types(unsigned char type)
2952{
2953 static const char types[MIGRATE_TYPES] = {
2954 [MIGRATE_UNMOVABLE] = 'U',
2955 [MIGRATE_RECLAIMABLE] = 'E',
2956 [MIGRATE_MOVABLE] = 'M',
2957 [MIGRATE_RESERVE] = 'R',
2958#ifdef CONFIG_CMA
2959 [MIGRATE_CMA] = 'C',
2960#endif
2961#ifdef CONFIG_MEMORY_ISOLATION
2962 [MIGRATE_ISOLATE] = 'I',
2963#endif
2964 };
2965 char tmp[MIGRATE_TYPES + 1];
2966 char *p = tmp;
2967 int i;
2968
2969 for (i = 0; i < MIGRATE_TYPES; i++) {
2970 if (type & (1 << i))
2971 *p++ = types[i];
2972 }
2973
2974 *p = '\0';
2975 printk("(%s) ", tmp);
2976}
2977
/*
 * Show the free area list and per-zone statistics.
 *
 * Nodes that are not allowed by current's cpuset are suppressed if
 * SHOW_MEM_FILTER_NODES is set in @filter.
 */
2985void show_free_areas(unsigned int filter)
2986{
2987 int cpu;
2988 struct zone *zone;
2989
2990 for_each_populated_zone(zone) {
2991 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2992 continue;
2993 show_node(zone);
2994 printk("%s per-cpu:\n", zone->name);
2995
2996 for_each_online_cpu(cpu) {
2997 struct per_cpu_pageset *pageset;
2998
2999 pageset = per_cpu_ptr(zone->pageset, cpu);
3000
3001 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3002 cpu, pageset->pcp.high,
3003 pageset->pcp.batch, pageset->pcp.count);
3004 }
3005 }
3006
3007 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3008 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3009 " unevictable:%lu"
3010 " dirty:%lu writeback:%lu unstable:%lu\n"
3011 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3012 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3013 " free_cma:%lu\n",
3014 global_page_state(NR_ACTIVE_ANON),
3015 global_page_state(NR_INACTIVE_ANON),
3016 global_page_state(NR_ISOLATED_ANON),
3017 global_page_state(NR_ACTIVE_FILE),
3018 global_page_state(NR_INACTIVE_FILE),
3019 global_page_state(NR_ISOLATED_FILE),
3020 global_page_state(NR_UNEVICTABLE),
3021 global_page_state(NR_FILE_DIRTY),
3022 global_page_state(NR_WRITEBACK),
3023 global_page_state(NR_UNSTABLE_NFS),
3024 global_page_state(NR_FREE_PAGES),
3025 global_page_state(NR_SLAB_RECLAIMABLE),
3026 global_page_state(NR_SLAB_UNRECLAIMABLE),
3027 global_page_state(NR_FILE_MAPPED),
3028 global_page_state(NR_SHMEM),
3029 global_page_state(NR_PAGETABLE),
3030 global_page_state(NR_BOUNCE),
3031 global_page_state(NR_FREE_CMA_PAGES));
3032
3033 for_each_populated_zone(zone) {
3034 int i;
3035
3036 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3037 continue;
3038 show_node(zone);
3039 printk("%s"
3040 " free:%lukB"
3041 " min:%lukB"
3042 " low:%lukB"
3043 " high:%lukB"
3044 " active_anon:%lukB"
3045 " inactive_anon:%lukB"
3046 " active_file:%lukB"
3047 " inactive_file:%lukB"
3048 " unevictable:%lukB"
3049 " isolated(anon):%lukB"
3050 " isolated(file):%lukB"
3051 " present:%lukB"
3052 " managed:%lukB"
3053 " mlocked:%lukB"
3054 " dirty:%lukB"
3055 " writeback:%lukB"
3056 " mapped:%lukB"
3057 " shmem:%lukB"
3058 " slab_reclaimable:%lukB"
3059 " slab_unreclaimable:%lukB"
3060 " kernel_stack:%lukB"
3061 " pagetables:%lukB"
3062 " unstable:%lukB"
3063 " bounce:%lukB"
3064 " free_cma:%lukB"
3065 " writeback_tmp:%lukB"
3066 " pages_scanned:%lu"
3067 " all_unreclaimable? %s"
3068 "\n",
3069 zone->name,
3070 K(zone_page_state(zone, NR_FREE_PAGES)),
3071 K(min_wmark_pages(zone)),
3072 K(low_wmark_pages(zone)),
3073 K(high_wmark_pages(zone)),
3074 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3075 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3076 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3077 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3078 K(zone_page_state(zone, NR_UNEVICTABLE)),
3079 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3080 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3081 K(zone->present_pages),
3082 K(zone->managed_pages),
3083 K(zone_page_state(zone, NR_MLOCK)),
3084 K(zone_page_state(zone, NR_FILE_DIRTY)),
3085 K(zone_page_state(zone, NR_WRITEBACK)),
3086 K(zone_page_state(zone, NR_FILE_MAPPED)),
3087 K(zone_page_state(zone, NR_SHMEM)),
3088 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3089 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3090 zone_page_state(zone, NR_KERNEL_STACK) *
3091 THREAD_SIZE / 1024,
3092 K(zone_page_state(zone, NR_PAGETABLE)),
3093 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3094 K(zone_page_state(zone, NR_BOUNCE)),
3095 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3096 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3097 zone->pages_scanned,
3098 (zone->all_unreclaimable ? "yes" : "no")
3099 );
3100 printk("lowmem_reserve[]:");
3101 for (i = 0; i < MAX_NR_ZONES; i++)
3102 printk(" %lu", zone->lowmem_reserve[i]);
3103 printk("\n");
3104 }
3105
3106 for_each_populated_zone(zone) {
3107 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3108 unsigned char types[MAX_ORDER];
3109
3110 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3111 continue;
3112 show_node(zone);
3113 printk("%s: ", zone->name);
3114
3115 spin_lock_irqsave(&zone->lock, flags);
3116 for (order = 0; order < MAX_ORDER; order++) {
3117 struct free_area *area = &zone->free_area[order];
3118 int type;
3119
3120 nr[order] = area->nr_free;
3121 total += nr[order] << order;
3122
3123 types[order] = 0;
3124 for (type = 0; type < MIGRATE_TYPES; type++) {
3125 if (!list_empty(&area->free_list[type]))
3126 types[order] |= 1 << type;
3127 }
3128 }
3129 spin_unlock_irqrestore(&zone->lock, flags);
3130 for (order = 0; order < MAX_ORDER; order++) {
3131 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3132 if (nr[order])
3133 show_migration_types(types[order]);
3134 }
3135 printk("= %lukB\n", K(total));
3136 }
3137
3138 hugetlb_show_meminfo();
3139
3140 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3141
3142 show_swap_cache_info();
3143}
3144
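/* Record a zone and its index in one zonelist entry. */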
3145static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3146{
3147 zoneref->zone = zone;
3148 zoneref->zone_idx = zone_idx(zone);
3149}
3150
/*
 * Build allocation fallback zone lists.
 *
 * Adds all populated zones of a node to @zonelist, from the highest zone
 * type downwards.
 */
3156static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3157 int nr_zones)
3158{
3159 struct zone *zone;
3160 enum zone_type zone_type = MAX_NR_ZONES;
3161
3162 do {
3163 zone_type--;
3164 zone = pgdat->node_zones + zone_type;
3165 if (populated_zone(zone)) {
3166 zoneref_set_zone(zone,
3167 &zonelist->_zonerefs[nr_zones++]);
3168 check_highest_zone(zone_type);
3169 }
3170 } while (zone_type);
3171
3172 return nr_zones;
3173}
3174

/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  Outside of NUMA, both orderings produce the same zonelist, so only NUMA
 *  configurations can usefully set this parameter.
 */
3185#define ZONELIST_ORDER_DEFAULT 0
3186#define ZONELIST_ORDER_NODE 1
3187#define ZONELIST_ORDER_ZONE 2
3188
3189
3190
3191
3192static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3193static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3194
3195
3196#ifdef CONFIG_NUMA
3197
3198static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3199
3200#define NUMA_ZONELIST_ORDER_LEN 16
3201char numa_zonelist_order[16] = "default";
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211static int __parse_numa_zonelist_order(char *s)
3212{
3213 if (*s == 'd' || *s == 'D') {
3214 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3215 } else if (*s == 'n' || *s == 'N') {
3216 user_zonelist_order = ZONELIST_ORDER_NODE;
3217 } else if (*s == 'z' || *s == 'Z') {
3218 user_zonelist_order = ZONELIST_ORDER_ZONE;
3219 } else {
3220 printk(KERN_WARNING
3221 "Ignoring invalid numa_zonelist_order value: "
3222 "%s\n", s);
3223 return -EINVAL;
3224 }
3225 return 0;
3226}
3227
3228static __init int setup_numa_zonelist_order(char *s)
3229{
3230 int ret;
3231
3232 if (!s)
3233 return 0;
3234
3235 ret = __parse_numa_zonelist_order(s);
3236 if (ret == 0)
3237 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3238
3239 return ret;
3240}
3241early_param("numa_zonelist_order", setup_numa_zonelist_order);
3242
3243
3244
3245
3246int numa_zonelist_order_handler(ctl_table *table, int write,
3247 void __user *buffer, size_t *length,
3248 loff_t *ppos)
3249{
3250 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3251 int ret;
3252 static DEFINE_MUTEX(zl_order_mutex);
3253
3254 mutex_lock(&zl_order_mutex);
3255 if (write) {
3256 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3257 ret = -EINVAL;
3258 goto out;
3259 }
3260 strcpy(saved_string, (char *)table->data);
3261 }
3262 ret = proc_dostring(table, write, buffer, length, ppos);
3263 if (ret)
3264 goto out;
3265 if (write) {
3266 int oldval = user_zonelist_order;
3267
3268 ret = __parse_numa_zonelist_order((char *)table->data);
3269 if (ret) {
3270
3271
3272
3273 strncpy((char *)table->data, saved_string,
3274 NUMA_ZONELIST_ORDER_LEN);
3275 user_zonelist_order = oldval;
3276 } else if (oldval != user_zonelist_order) {
3277 mutex_lock(&zonelists_mutex);
3278 build_all_zonelists(NULL, NULL);
3279 mutex_unlock(&zonelists_mutex);
3280 }
3281 }
3282out:
3283 mutex_unlock(&zl_order_mutex);
3284 return ret;
3285}
3286
3287
3288#define MAX_NODE_LOAD (nr_online_nodes)
3289static int node_load[MAX_NUMNODES];
3290
/**
 * find_next_best_node - find the next node that should appear in a given
 *			 node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask of nodes already used
 *
 * The next node should not already be in the list, it should be the closest
 * node according to the distance array, and nodes without CPUs are preferred
 * since they presumably see little allocation pressure of their own.
 *
 * Returns NUMA_NO_NODE if no further node is found.
 */
3305static int find_next_best_node(int node, nodemask_t *used_node_mask)
3306{
3307 int n, val;
3308 int min_val = INT_MAX;
3309 int best_node = NUMA_NO_NODE;
3310 const struct cpumask *tmp = cpumask_of_node(0);
3311
3312
3313 if (!node_isset(node, *used_node_mask)) {
3314 node_set(node, *used_node_mask);
3315 return node;
3316 }
3317
3318 for_each_node_state(n, N_MEMORY) {
3319
3320
3321 if (node_isset(n, *used_node_mask))
3322 continue;
3323
3324
3325 val = node_distance(node, n);
3326
3327
3328 val += (n < node);
3329
3330
3331 tmp = cpumask_of_node(n);
3332 if (!cpumask_empty(tmp))
3333 val += PENALTY_FOR_NODE_WITH_CPUS;
3334
3335
3336 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3337 val += node_load[n];
3338
3339 if (val < min_val) {
3340 min_val = val;
3341 best_node = n;
3342 }
3343 }
3344
3345 if (best_node >= 0)
3346 node_set(best_node, *used_node_mask);
3347
3348 return best_node;
3349}
3350
/*
 * Build zonelists ordered by node and by zones within each node.  This gives
 * maximum locality--the normal zone overflows into the local DMA zone, if
 * any--but risks exhausting the DMA zone.
 */
3357static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3358{
3359 int j;
3360 struct zonelist *zonelist;
3361
3362 zonelist = &pgdat->node_zonelists[0];
3363 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3364 ;
3365 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3366 zonelist->_zonerefs[j].zone = NULL;
3367 zonelist->_zonerefs[j].zone_idx = 0;
3368}
3369
3370
3371
3372
3373static void build_thisnode_zonelists(pg_data_t *pgdat)
3374{
3375 int j;
3376 struct zonelist *zonelist;
3377
3378 zonelist = &pgdat->node_zonelists[1];
3379 j = build_zonelists_node(pgdat, zonelist, 0);
3380 zonelist->_zonerefs[j].zone = NULL;
3381 zonelist->_zonerefs[j].zone_idx = 0;
3382}
3383
/*
 * Build zonelists ordered by zone and by nodes within each zone.  This
 * conserves the DMA zone[s] until all Normal memory is exhausted, but may
 * overflow to a remote node while memory still exists in the local DMA zone.
 */
3390static int node_order[MAX_NUMNODES];
3391
3392static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3393{
3394 int pos, j, node;
3395 int zone_type;
3396 struct zone *z;
3397 struct zonelist *zonelist;
3398
3399 zonelist = &pgdat->node_zonelists[0];
3400 pos = 0;
3401 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3402 for (j = 0; j < nr_nodes; j++) {
3403 node = node_order[j];
3404 z = &NODE_DATA(node)->node_zones[zone_type];
3405 if (populated_zone(z)) {
3406 zoneref_set_zone(z,
3407 &zonelist->_zonerefs[pos++]);
3408 check_highest_zone(zone_type);
3409 }
3410 }
3411 }
3412 zonelist->_zonerefs[pos].zone = NULL;
3413 zonelist->_zonerefs[pos].zone_idx = 0;
3414}
3415
3416static int default_zonelist_order(void)
3417{
3418 int nid, zone_type;
	unsigned long low_kmem_size, total_size;
3420 struct zone *z;
3421 int average_size;
3422
3423
3424
3425
3426
3427
3428
3429 low_kmem_size = 0;
3430 total_size = 0;
3431 for_each_online_node(nid) {
3432 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3433 z = &NODE_DATA(nid)->node_zones[zone_type];
3434 if (populated_zone(z)) {
3435 if (zone_type < ZONE_NORMAL)
3436 low_kmem_size += z->managed_pages;
3437 total_size += z->managed_pages;
3438 } else if (zone_type == ZONE_NORMAL) {
				/*
				 * If any node has only lowmem, node order is
				 * preferred so that kernel allocations can be
				 * served locally; otherwise they can easily
				 * infringe on other nodes' lowmem.
				 */
3446 return ZONELIST_ORDER_NODE;
3447 }
3448 }
3449 }
3450 if (!low_kmem_size ||
3451 low_kmem_size > total_size/2)
3452 return ZONELIST_ORDER_NODE;
3453
3454
3455
3456
3457
3458 average_size = total_size /
3459 (nodes_weight(node_states[N_MEMORY]) + 1);
3460 for_each_online_node(nid) {
3461 low_kmem_size = 0;
3462 total_size = 0;
3463 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3464 z = &NODE_DATA(nid)->node_zones[zone_type];
3465 if (populated_zone(z)) {
3466 if (zone_type < ZONE_NORMAL)
3467 low_kmem_size += z->present_pages;
3468 total_size += z->present_pages;
3469 }
3470 }
3471 if (low_kmem_size &&
3472 total_size > average_size &&
3473 low_kmem_size > total_size * 70/100)
3474 return ZONELIST_ORDER_NODE;
3475 }
3476 return ZONELIST_ORDER_ZONE;
3477}
3478
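/*
 * Pick node- or zone-ordered fallback lists, honouring any user override of
 * numa_zonelist_order.
 */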
3479static void set_zonelist_order(void)
3480{
3481 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3482 current_zonelist_order = default_zonelist_order();
3483 else
3484 current_zonelist_order = user_zonelist_order;
3485}
3486
3487static void build_zonelists(pg_data_t *pgdat)
3488{
3489 int j, node, load;
3490 enum zone_type i;
3491 nodemask_t used_mask;
3492 int local_node, prev_node;
3493 struct zonelist *zonelist;
3494 int order = current_zonelist_order;
3495
3496
3497 for (i = 0; i < MAX_ZONELISTS; i++) {
3498 zonelist = pgdat->node_zonelists + i;
3499 zonelist->_zonerefs[0].zone = NULL;
3500 zonelist->_zonerefs[0].zone_idx = 0;
3501 }
3502
3503
3504 local_node = pgdat->node_id;
3505 load = nr_online_nodes;
3506 prev_node = local_node;
3507 nodes_clear(used_mask);
3508
3509 memset(node_order, 0, sizeof(node_order));
3510 j = 0;
3511
3512 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3513
3514
3515
3516
3517
3518 if (node_distance(local_node, node) !=
3519 node_distance(local_node, prev_node))
3520 node_load[node] = load;
3521
3522 prev_node = node;
3523 load--;
3524 if (order == ZONELIST_ORDER_NODE)
3525 build_zonelists_in_node_order(pgdat, node);
3526 else
3527 node_order[j++] = node;
3528 }
3529
3530 if (order == ZONELIST_ORDER_ZONE) {
3531
3532 build_zonelists_in_zone_order(pgdat, j);
3533 }
3534
3535 build_thisnode_zonelists(pgdat);
3536}
3537
3538
3539static void build_zonelist_cache(pg_data_t *pgdat)
3540{
3541 struct zonelist *zonelist;
3542 struct zonelist_cache *zlc;
3543 struct zoneref *z;
3544
3545 zonelist = &pgdat->node_zonelists[0];
3546 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3547 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3548 for (z = zonelist->_zonerefs; z->zone; z++)
3549 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3550}
3551
3552#ifdef CONFIG_HAVE_MEMORYLESS_NODES
3553
3554
3555
3556
3557
3558
3559int local_memory_node(int node)
3560{
3561 struct zone *zone;
3562
3563 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3564 gfp_zone(GFP_KERNEL),
3565 NULL,
3566 &zone);
3567 return zone->node;
3568}
3569#endif
3570
3571#else
3572
3573static void set_zonelist_order(void)
3574{
3575 current_zonelist_order = ZONELIST_ORDER_ZONE;
3576}
3577
3578static void build_zonelists(pg_data_t *pgdat)
3579{
3580 int node, local_node;
3581 enum zone_type j;
3582 struct zonelist *zonelist;
3583
3584 local_node = pgdat->node_id;
3585
3586 zonelist = &pgdat->node_zonelists[0];
3587 j = build_zonelists_node(pgdat, zonelist, 0);
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3598 if (!node_online(node))
3599 continue;
3600 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3601 }
3602 for (node = 0; node < local_node; node++) {
3603 if (!node_online(node))
3604 continue;
3605 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3606 }
3607
3608 zonelist->_zonerefs[j].zone = NULL;
3609 zonelist->_zonerefs[j].zone_idx = 0;
3610}
3611
3612
3613static void build_zonelist_cache(pg_data_t *pgdat)
3614{
3615 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3616}
3617
3618#endif
3619
3620
/*
 * The boot_pageset must be kept even after bootup is complete, for unused
 * CPUs and zones: statistics code may follow a zone's pageset pointer without
 * checking whether the CPU or zone is online, and the boot pagesets are also
 * used to bootstrap per-cpu pagesets for hot-added CPUs.
 */
3635static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3636static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3637static void setup_zone_pageset(struct zone *zone);
3638
3639
3640
3641
3642
3643DEFINE_MUTEX(zonelists_mutex);
3644
3645
3646static int __build_all_zonelists(void *data)
3647{
3648 int nid;
3649 int cpu;
3650 pg_data_t *self = data;
3651
3652#ifdef CONFIG_NUMA
3653 memset(node_load, 0, sizeof(node_load));
3654#endif
3655
3656 if (self && !node_online(self->node_id)) {
3657 build_zonelists(self);
3658 build_zonelist_cache(self);
3659 }
3660
3661 for_each_online_node(nid) {
3662 pg_data_t *pgdat = NODE_DATA(nid);
3663
3664 build_zonelists(pgdat);
3665 build_zonelist_cache(pgdat);
3666 }
3667
	/*
	 * Initialize the boot_pagesets that are used for bootstrapping
	 * processors.  The real pagesets of each zone are allocated later,
	 * once the per-cpu allocator is available; the boot pagesets are
	 * also needed when bringing up CPUs that were offline at boot.
	 */
3681 for_each_possible_cpu(cpu) {
3682 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3683
3684#ifdef CONFIG_HAVE_MEMORYLESS_NODES
3685
3686
3687
3688
3689
3690
3691
3692
3693 if (cpu_online(cpu))
3694 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3695#endif
3696 }
3697
3698 return 0;
3699}
3700
/*
 * Called with zonelists_mutex held always, unless
 * system_state == SYSTEM_BOOTING.
 */
3705void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3706{
3707 set_zonelist_order();
3708
3709 if (system_state == SYSTEM_BOOTING) {
3710 __build_all_zonelists(NULL);
3711 mminit_verify_zonelist();
3712 cpuset_init_current_mems_allowed();
3713 } else {
3714#ifdef CONFIG_MEMORY_HOTPLUG
3715 if (zone)
3716 setup_zone_pageset(zone);
3717#endif
		/*
		 * Stop all CPUs to guarantee there is no concurrent user of
		 * the zonelists while they are rebuilt.
		 */
3720 stop_machine(__build_all_zonelists, pgdat, NULL);
3721
3722 }
3723 vm_total_pages = nr_free_pagecache_pages();
3724
	/*
	 * Disable grouping by mobility if the number of pages in the system
	 * is too low for the mechanism to work.  The check is repeated on
	 * memory hot-add, so a system can start with mobility grouping
	 * disabled and enable it later.
	 */
3731 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3732 page_group_by_mobility_disabled = 1;
3733 else
3734 page_group_by_mobility_disabled = 0;
3735
3736 printk("Built %i zonelists in %s order, mobility grouping %s. "
3737 "Total pages: %ld\n",
3738 nr_online_nodes,
3739 zonelist_order_name[current_zonelist_order],
3740 page_group_by_mobility_disabled ? "off" : "on",
3741 vm_total_pages);
3742#ifdef CONFIG_NUMA
3743 printk("Policy zone: %s\n", zone_names[policy_zone]);
3744#endif
3745}
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758#define PAGES_PER_WAITQUEUE 256
3759
3760#ifndef CONFIG_MEMORY_HOTPLUG
3761static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3762{
3763 unsigned long size = 1;
3764
3765 pages /= PAGES_PER_WAITQUEUE;
3766
3767 while (size < pages)
3768 size <<= 1;
3769
3770
3771
3772
3773
3774
3775 size = min(size, 4096UL);
3776
3777 return max(size, 4UL);
3778}
3779#else
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3798{
3799 return 4096UL;
3800}
3801#endif
3802
3803
3804
3805
3806
3807
3808static inline unsigned long wait_table_bits(unsigned long size)
3809{
3810 return ffz(~size);
3811}
3812
3813#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3814
3815
3816
3817
3818static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3819{
3820 unsigned long pfn;
3821
3822 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3823 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3824 return 1;
3825 }
3826 return 0;
3827}
3828
/*
 * Mark a number of pageblocks as MIGRATE_RESERVE.  The number of blocks
 * reserved is based on min_wmark_pages(zone); the reserve tends to keep
 * contiguous free pages available.  Changing min_free_kbytes changes
 * min_wmark_pages(zone) and therefore the size of the reserve.
 */
3836static void setup_zone_migrate_reserve(struct zone *zone)
3837{
3838 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3839 struct page *page;
3840 unsigned long block_migratetype;
3841 int reserve;
3842
3843
3844
3845
3846
3847
3848
3849 start_pfn = zone->zone_start_pfn;
3850 end_pfn = zone_end_pfn(zone);
3851 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3852 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3853 pageblock_order;
3854
3855
3856
3857
3858
3859
3860
3861
3862 reserve = min(2, reserve);
3863
3864 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3865 if (!pfn_valid(pfn))
3866 continue;
3867 page = pfn_to_page(pfn);
3868
3869
3870 if (page_to_nid(page) != zone_to_nid(zone))
3871 continue;
3872
3873 block_migratetype = get_pageblock_migratetype(page);
3874
3875
3876 if (reserve > 0) {
3877
3878
3879
3880
3881 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3882 if (pageblock_is_reserved(pfn, block_end_pfn))
3883 continue;
3884
3885
3886 if (block_migratetype == MIGRATE_RESERVE) {
3887 reserve--;
3888 continue;
3889 }
3890
3891
3892 if (block_migratetype == MIGRATE_MOVABLE) {
3893 set_pageblock_migratetype(page,
3894 MIGRATE_RESERVE);
3895 move_freepages_block(zone, page,
3896 MIGRATE_RESERVE);
3897 reserve--;
3898 continue;
3899 }
3900 }
3901
3902
3903
3904
3905
3906 if (block_migratetype == MIGRATE_RESERVE) {
3907 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3908 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3909 }
3910 }
3911}
3912
/*
 * Initially all pages are reserved; free pages are handed to the buddy
 * allocator by free_all_bootmem() once early boot is done.
 * Non-atomic initialisation, single-pass.
 */
3918void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3919 unsigned long start_pfn, enum memmap_context context)
3920{
3921 struct page *page;
3922 unsigned long end_pfn = start_pfn + size;
3923 unsigned long pfn;
3924 struct zone *z;
3925
3926 if (highest_memmap_pfn < end_pfn - 1)
3927 highest_memmap_pfn = end_pfn - 1;
3928
3929 z = &NODE_DATA(nid)->node_zones[zone];
3930 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3931
3932
3933
3934
3935
3936 if (context == MEMMAP_EARLY) {
3937 if (!early_pfn_valid(pfn))
3938 continue;
3939 if (!early_pfn_in_nid(pfn, nid))
3940 continue;
3941 }
3942 page = pfn_to_page(pfn);
3943 set_page_links(page, zone, nid, pfn);
3944 mminit_verify_page_links(page, zone, nid, pfn);
3945 init_page_count(page);
3946 page_mapcount_reset(page);
3947 page_nid_reset_last(page);
3948 SetPageReserved(page);
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963 if ((z->zone_start_pfn <= pfn)
3964 && (pfn < zone_end_pfn(z))
3965 && !(pfn & (pageblock_nr_pages - 1)))
3966 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3967
3968 INIT_LIST_HEAD(&page->lru);
3969#ifdef WANT_PAGE_VIRTUAL
3970
3971 if (!is_highmem_idx(zone))
3972 set_page_address(page, __va(pfn << PAGE_SHIFT));
3973#endif
3974 }
3975}
3976
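/* Initialise every (order, migratetype) free list of @zone to empty. */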
3977static void __meminit zone_init_free_lists(struct zone *zone)
3978{
3979 int order, t;
3980 for_each_migratetype_order(order, t) {
3981 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3982 zone->free_area[order].nr_free = 0;
3983 }
3984}
3985
3986#ifndef __HAVE_ARCH_MEMMAP_INIT
3987#define memmap_init(size, nid, zone, start_pfn) \
3988 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3989#endif
3990
3991static int __meminit zone_batchsize(struct zone *zone)
3992{
3993#ifdef CONFIG_MMU
3994 int batch;
3995
3996
3997
3998
3999
4000
4001
4002 batch = zone->managed_pages / 1024;
4003 if (batch * PAGE_SIZE > 512 * 1024)
4004 batch = (512 * 1024) / PAGE_SIZE;
4005 batch /= 4;
4006 if (batch < 1)
4007 batch = 1;
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4020
4021 return batch;
4022
4023#else
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037 return 0;
4038#endif
4039}
4040

/*
 * pcp->high and pcp->batch are related: ->batch must never be higher than
 * ->high.  pageset_update() changes them safely without read-side locking,
 * so readers must tolerate the fields changing asynchronously.  Callers
 * outside of boot must hold pcp_batch_high_lock.
 */
4054static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4055 unsigned long batch)
4056{
4057
4058 pcp->batch = 1;
4059 smp_wmb();
4060
4061
4062 pcp->high = high;
4063 smp_wmb();
4064
4065 pcp->batch = batch;
4066}
4067
4068
4069static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4070{
4071 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4072}
4073
4074static void pageset_init(struct per_cpu_pageset *p)
4075{
4076 struct per_cpu_pages *pcp;
4077 int migratetype;
4078
4079 memset(p, 0, sizeof(*p));
4080
4081 pcp = &p->pcp;
4082 pcp->count = 0;
4083 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4084 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4085}
4086
4087static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4088{
4089 pageset_init(p);
4090 pageset_set_batch(p, batch);
4091}
4092
4093
4094
4095
4096
4097static void pageset_set_high(struct per_cpu_pageset *p,
4098 unsigned long high)
4099{
4100 unsigned long batch = max(1UL, high / 4);
4101 if ((high / 4) > (PAGE_SHIFT * 8))
4102 batch = PAGE_SHIFT * 8;
4103
4104 pageset_update(&p->pcp, high, batch);
4105}
4106
4107static void __meminit pageset_set_high_and_batch(struct zone *zone,
4108 struct per_cpu_pageset *pcp)
4109{
4110 if (percpu_pagelist_fraction)
4111 pageset_set_high(pcp,
4112 (zone->managed_pages /
4113 percpu_pagelist_fraction));
4114 else
4115 pageset_set_batch(pcp, zone_batchsize(zone));
4116}
4117
4118static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4119{
4120 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4121
4122 pageset_init(pcp);
4123 pageset_set_high_and_batch(zone, pcp);
4124}
4125
4126static void __meminit setup_zone_pageset(struct zone *zone)
4127{
4128 int cpu;
4129 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4130 for_each_possible_cpu(cpu)
4131 zone_pageset_init(zone, cpu);
4132}
4133
4134
4135
4136
4137
4138void __init setup_per_cpu_pageset(void)
4139{
4140 struct zone *zone;
4141
4142 for_each_populated_zone(zone)
4143 setup_zone_pageset(zone);
4144}
4145
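/*
 * Allocate and initialise the hash table of wait queues on which tasks sleep
 * while waiting for a page in this zone (page lock, writeback completion).
 */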
4146static noinline __init_refok
4147int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4148{
4149 int i;
4150 struct pglist_data *pgdat = zone->zone_pgdat;
4151 size_t alloc_size;
4152
4153
4154
4155
4156
4157 zone->wait_table_hash_nr_entries =
4158 wait_table_hash_nr_entries(zone_size_pages);
4159 zone->wait_table_bits =
4160 wait_table_bits(zone->wait_table_hash_nr_entries);
4161 alloc_size = zone->wait_table_hash_nr_entries
4162 * sizeof(wait_queue_head_t);
4163
4164 if (!slab_is_available()) {
4165 zone->wait_table = (wait_queue_head_t *)
4166 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4167 } else {
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178 zone->wait_table = vmalloc(alloc_size);
4179 }
4180 if (!zone->wait_table)
4181 return -ENOMEM;
4182
4183 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4184 init_waitqueue_head(zone->wait_table + i);
4185
4186 return 0;
4187}
4188
4189static __meminit void zone_pcp_init(struct zone *zone)
4190{
4191
4192
4193
4194
4195
4196 zone->pageset = &boot_pageset;
4197
4198 if (zone->present_pages)
4199 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4200 zone->name, zone->present_pages,
4201 zone_batchsize(zone));
4202}
4203
4204int __meminit init_currently_empty_zone(struct zone *zone,
4205 unsigned long zone_start_pfn,
4206 unsigned long size,
4207 enum memmap_context context)
4208{
4209 struct pglist_data *pgdat = zone->zone_pgdat;
4210 int ret;
4211 ret = zone_wait_table_init(zone, size);
4212 if (ret)
4213 return ret;
4214 pgdat->nr_zones = zone_idx(zone) + 1;
4215
4216 zone->zone_start_pfn = zone_start_pfn;
4217
4218 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4219 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4220 pgdat->node_id,
4221 (unsigned long)zone_idx(zone),
4222 zone_start_pfn, (zone_start_pfn + size));
4223
4224 zone_init_free_lists(zone);
4225
4226 return 0;
4227}
4228
4229#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4230#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4231
4232
4233
4234
4235
4236
4237int __meminit __early_pfn_to_nid(unsigned long pfn)
4238{
4239 unsigned long start_pfn, end_pfn;
4240 int i, nid;
4241
4242
4243
4244
4245 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4246 static int __meminitdata last_nid;
4247
4248 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4249 return last_nid;
4250
4251 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4252 if (start_pfn <= pfn && pfn < end_pfn) {
4253 last_start_pfn = start_pfn;
4254 last_end_pfn = end_pfn;
4255 last_nid = nid;
4256 return nid;
4257 }
4258
4259 return -1;
4260}
4261#endif
4262
4263int __meminit early_pfn_to_nid(unsigned long pfn)
4264{
4265 int nid;
4266
4267 nid = __early_pfn_to_nid(pfn);
4268 if (nid >= 0)
4269 return nid;
4270
4271 return 0;
4272}
4273
4274#ifdef CONFIG_NODES_SPAN_OTHER_NODES
4275bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4276{
4277 int nid;
4278
4279 nid = __early_pfn_to_nid(pfn);
4280 if (nid >= 0 && nid != node)
4281 return false;
4282 return true;
4283}
4284#endif
4285
/**
 * free_bootmem_with_active_regions - free bootmem for each active range
 * @nid: the node to free memory on; if MAX_NUMNODES, all nodes are freed
 * @max_low_pfn: the highest PFN that may be freed
 *
 * Walks the memblock ranges registered for @nid and returns each usable
 * range below @max_low_pfn to the bootmem allocator.
 */
4295void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4296{
4297 unsigned long start_pfn, end_pfn;
4298 int i, this_nid;
4299
4300 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4301 start_pfn = min(start_pfn, max_low_pfn);
4302 end_pfn = min(end_pfn, max_low_pfn);
4303
4304 if (start_pfn < end_pfn)
4305 free_bootmem_node(NODE_DATA(this_nid),
4306 PFN_PHYS(start_pfn),
4307 (end_pfn - start_pfn) << PAGE_SHIFT);
4308 }
4309}
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319void __init sparse_memory_present_with_active_regions(int nid)
4320{
4321 unsigned long start_pfn, end_pfn;
4322 int i, this_nid;
4323
4324 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4325 memory_present(this_nid, start_pfn, end_pfn);
4326}
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339void __meminit get_pfn_range_for_nid(unsigned int nid,
4340 unsigned long *start_pfn, unsigned long *end_pfn)
4341{
4342 unsigned long this_start_pfn, this_end_pfn;
4343 int i;
4344
4345 *start_pfn = -1UL;
4346 *end_pfn = 0;
4347
4348 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4349 *start_pfn = min(*start_pfn, this_start_pfn);
4350 *end_pfn = max(*end_pfn, this_end_pfn);
4351 }
4352
4353 if (*start_pfn == -1UL)
4354 *start_pfn = 0;
4355}
4356
4357
4358
4359
4360
4361
4362static void __init find_usable_zone_for_movable(void)
4363{
4364 int zone_index;
4365 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4366 if (zone_index == ZONE_MOVABLE)
4367 continue;
4368
4369 if (arch_zone_highest_possible_pfn[zone_index] >
4370 arch_zone_lowest_possible_pfn[zone_index])
4371 break;
4372 }
4373
4374 VM_BUG_ON(zone_index == -1);
4375 movable_zone = zone_index;
4376}
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388static void __meminit adjust_zone_range_for_zone_movable(int nid,
4389 unsigned long zone_type,
4390 unsigned long node_start_pfn,
4391 unsigned long node_end_pfn,
4392 unsigned long *zone_start_pfn,
4393 unsigned long *zone_end_pfn)
4394{
4395
4396 if (zone_movable_pfn[nid]) {
4397
4398 if (zone_type == ZONE_MOVABLE) {
4399 *zone_start_pfn = zone_movable_pfn[nid];
4400 *zone_end_pfn = min(node_end_pfn,
4401 arch_zone_highest_possible_pfn[movable_zone]);
4402
4403
4404 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4405 *zone_end_pfn > zone_movable_pfn[nid]) {
4406 *zone_end_pfn = zone_movable_pfn[nid];
4407
4408
4409 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4410 *zone_start_pfn = *zone_end_pfn;
4411 }
4412}
4413
4414
4415
4416
4417
4418static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4419 unsigned long zone_type,
4420 unsigned long node_start_pfn,
4421 unsigned long node_end_pfn,
4422 unsigned long *ignored)
4423{
4424 unsigned long zone_start_pfn, zone_end_pfn;
4425
4426
4427 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4428 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4429 adjust_zone_range_for_zone_movable(nid, zone_type,
4430 node_start_pfn, node_end_pfn,
4431 &zone_start_pfn, &zone_end_pfn);
4432
4433
4434 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4435 return 0;
4436
4437
4438 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4439 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4440
4441
4442 return zone_end_pfn - zone_start_pfn;
4443}
4444
4445
4446
4447
4448
4449unsigned long __meminit __absent_pages_in_range(int nid,
4450 unsigned long range_start_pfn,
4451 unsigned long range_end_pfn)
4452{
4453 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4454 unsigned long start_pfn, end_pfn;
4455 int i;
4456
4457 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4458 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4459 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4460 nr_absent -= end_pfn - start_pfn;
4461 }
4462 return nr_absent;
4463}
4464
4465
4466
4467
4468
4469
4470
4471
4472unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4473 unsigned long end_pfn)
4474{
4475 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4476}
4477
4478
4479static unsigned long __meminit zone_absent_pages_in_node(int nid,
4480 unsigned long zone_type,
4481 unsigned long node_start_pfn,
4482 unsigned long node_end_pfn,
4483 unsigned long *ignored)
4484{
4485 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4486 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4487 unsigned long zone_start_pfn, zone_end_pfn;
4488
4489 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4490 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4491
4492 adjust_zone_range_for_zone_movable(nid, zone_type,
4493 node_start_pfn, node_end_pfn,
4494 &zone_start_pfn, &zone_end_pfn);
4495 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4496}
4497
4498#else
4499static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4500 unsigned long zone_type,
4501 unsigned long node_start_pfn,
4502 unsigned long node_end_pfn,
4503 unsigned long *zones_size)
4504{
4505 return zones_size[zone_type];
4506}
4507
4508static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4509 unsigned long zone_type,
4510 unsigned long node_start_pfn,
4511 unsigned long node_end_pfn,
4512 unsigned long *zholes_size)
4513{
4514 if (!zholes_size)
4515 return 0;
4516
4517 return zholes_size[zone_type];
4518}
4519
4520#endif
4521
4522static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4523 unsigned long node_start_pfn,
4524 unsigned long node_end_pfn,
4525 unsigned long *zones_size,
4526 unsigned long *zholes_size)
4527{
4528 unsigned long realtotalpages, totalpages = 0;
4529 enum zone_type i;
4530
4531 for (i = 0; i < MAX_NR_ZONES; i++)
4532 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4533 node_start_pfn,
4534 node_end_pfn,
4535 zones_size);
4536 pgdat->node_spanned_pages = totalpages;
4537
4538 realtotalpages = totalpages;
4539 for (i = 0; i < MAX_NR_ZONES; i++)
4540 realtotalpages -=
4541 zone_absent_pages_in_node(pgdat->node_id, i,
4542 node_start_pfn, node_end_pfn,
4543 zholes_size);
4544 pgdat->node_present_pages = realtotalpages;
4545 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4546 realtotalpages);
4547}
4548
4549#ifndef CONFIG_SPARSEMEM
4550
4551
4552
4553
4554
4555
4556
4557static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4558{
4559 unsigned long usemapsize;
4560
4561 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4562 usemapsize = roundup(zonesize, pageblock_nr_pages);
4563 usemapsize = usemapsize >> pageblock_order;
4564 usemapsize *= NR_PAGEBLOCK_BITS;
4565 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4566
4567 return usemapsize / 8;
4568}
4569
4570static void __init setup_usemap(struct pglist_data *pgdat,
4571 struct zone *zone,
4572 unsigned long zone_start_pfn,
4573 unsigned long zonesize)
4574{
4575 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4576 zone->pageblock_flags = NULL;
4577 if (usemapsize)
4578 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4579 usemapsize);
4580}
4581#else
4582static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4583 unsigned long zone_start_pfn, unsigned long zonesize) {}
4584#endif
4585
4586#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4587
4588
4589void __init set_pageblock_order(void)
4590{
4591 unsigned int order;
4592
4593
4594 if (pageblock_order)
4595 return;
4596
4597 if (HPAGE_SHIFT > PAGE_SHIFT)
4598 order = HUGETLB_PAGE_ORDER;
4599 else
4600 order = MAX_ORDER - 1;
4601
4602
4603
4604
4605
4606
4607 pageblock_order = order;
4608}
4609#else
4610
4611
4612
4613
4614
4615
4616
4617void __init set_pageblock_order(void)
4618{
4619}
4620
4621#endif
4622
4623static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4624 unsigned long present_pages)
4625{
4626 unsigned long pages = spanned_pages;
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636 if (spanned_pages > present_pages + (present_pages >> 4) &&
4637 IS_ENABLED(CONFIG_SPARSEMEM))
4638 pages = present_pages;
4639
4640 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4641}
4642
/*
 * Set up the zone data structures for a node:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * NOTE: the pgdat should already be zeroed by the caller.
 */
4651static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4652 unsigned long node_start_pfn, unsigned long node_end_pfn,
4653 unsigned long *zones_size, unsigned long *zholes_size)
4654{
4655 enum zone_type j;
4656 int nid = pgdat->node_id;
4657 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4658 int ret;
4659
4660 pgdat_resize_init(pgdat);
4661#ifdef CONFIG_NUMA_BALANCING
4662 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4663 pgdat->numabalancing_migrate_nr_pages = 0;
4664 pgdat->numabalancing_migrate_next_window = jiffies;
4665#endif
4666 init_waitqueue_head(&pgdat->kswapd_wait);
4667 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4668 pgdat_page_cgroup_init(pgdat);
4669
4670 for (j = 0; j < MAX_NR_ZONES; j++) {
4671 struct zone *zone = pgdat->node_zones + j;
4672 unsigned long size, realsize, freesize, memmap_pages;
4673
4674 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4675 node_end_pfn, zones_size);
4676 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4677 node_start_pfn,
4678 node_end_pfn,
4679 zholes_size);
4680
4681
4682
4683
4684
4685
4686 memmap_pages = calc_memmap_size(size, realsize);
4687 if (freesize >= memmap_pages) {
4688 freesize -= memmap_pages;
4689 if (memmap_pages)
4690 printk(KERN_DEBUG
4691 " %s zone: %lu pages used for memmap\n",
4692 zone_names[j], memmap_pages);
4693 } else
4694 printk(KERN_WARNING
4695 " %s zone: %lu pages exceeds freesize %lu\n",
4696 zone_names[j], memmap_pages, freesize);
4697
4698
4699 if (j == 0 && freesize > dma_reserve) {
4700 freesize -= dma_reserve;
4701 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4702 zone_names[0], dma_reserve);
4703 }
4704
4705 if (!is_highmem_idx(j))
4706 nr_kernel_pages += freesize;
4707
4708 else if (nr_kernel_pages > memmap_pages * 2)
4709 nr_kernel_pages -= memmap_pages;
4710 nr_all_pages += freesize;
4711
4712 zone->spanned_pages = size;
4713 zone->present_pages = realsize;
4714
4715
4716
4717
4718
4719 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4720#ifdef CONFIG_NUMA
4721 zone->node = nid;
4722 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4723 / 100;
4724 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4725#endif
4726 zone->name = zone_names[j];
4727 spin_lock_init(&zone->lock);
4728 spin_lock_init(&zone->lru_lock);
4729 zone_seqlock_init(zone);
4730 zone->zone_pgdat = pgdat;
4731
4732 zone_pcp_init(zone);
4733 lruvec_init(&zone->lruvec);
4734 if (!size)
4735 continue;
4736
4737 set_pageblock_order();
4738 setup_usemap(pgdat, zone, zone_start_pfn, size);
4739 ret = init_currently_empty_zone(zone, zone_start_pfn,
4740 size, MEMMAP_EARLY);
4741 BUG_ON(ret);
4742 memmap_init(size, nid, j, zone_start_pfn);
4743 zone_start_pfn += size;
4744 }
4745}
4746
4747static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4748{
4749
4750 if (!pgdat->node_spanned_pages)
4751 return;
4752
4753#ifdef CONFIG_FLAT_NODE_MEM_MAP
4754
4755 if (!pgdat->node_mem_map) {
4756 unsigned long size, start, end;
4757 struct page *map;
4758
4759
4760
4761
4762
4763
4764 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4765 end = pgdat_end_pfn(pgdat);
4766 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4767 size = (end - start) * sizeof(struct page);
4768 map = alloc_remap(pgdat->node_id, size);
4769 if (!map)
4770 map = alloc_bootmem_node_nopanic(pgdat, size);
4771 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4772 }
4773#ifndef CONFIG_NEED_MULTIPLE_NODES
4774
4775
4776
4777 if (pgdat == NODE_DATA(0)) {
4778 mem_map = NODE_DATA(0)->node_mem_map;
4779#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4780 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4781 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4782#endif
4783 }
4784#endif
4785#endif
4786}
4787
4788void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4789 unsigned long node_start_pfn, unsigned long *zholes_size)
4790{
4791 pg_data_t *pgdat = NODE_DATA(nid);
4792 unsigned long start_pfn = 0;
4793 unsigned long end_pfn = 0;
4794
4795
4796 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4797
4798 pgdat->node_id = nid;
4799 pgdat->node_start_pfn = node_start_pfn;
4800 init_zone_allows_reclaim(nid);
4801#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4802 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4803#endif
4804 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4805 zones_size, zholes_size);
4806
4807 alloc_node_mem_map(pgdat);
4808#ifdef CONFIG_FLAT_NODE_MEM_MAP
4809 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4810 nid, (unsigned long)pgdat,
4811 (unsigned long)pgdat->node_mem_map);
4812#endif
4813
4814 free_area_init_core(pgdat, start_pfn, end_pfn,
4815 zones_size, zholes_size);
4816}
4817
4818#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4819
4820#if MAX_NUMNODES > 1
4821
4822
4823
4824void __init setup_nr_node_ids(void)
4825{
4826 unsigned int node;
4827 unsigned int highest = 0;
4828
4829 for_each_node_mask(node, node_possible_map)
4830 highest = node;
4831 nr_node_ids = highest + 1;
4832}
4833#endif
4834
4835
4836
4837
4838
4839
4840