1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/gfp.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/vmstat.h>
23#include <linux/file.h>
24#include <linux/writeback.h>
25#include <linux/blkdev.h>
26#include <linux/buffer_head.h>
27
28#include <linux/mm_inline.h>
29#include <linux/backing-dev.h>
30#include <linux/rmap.h>
31#include <linux/topology.h>
32#include <linux/cpu.h>
33#include <linux/cpuset.h>
34#include <linux/compaction.h>
35#include <linux/notifier.h>
36#include <linux/rwsem.h>
37#include <linux/delay.h>
38#include <linux/kthread.h>
39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
43#include <linux/oom.h>
44#include <linux/prefetch.h>
45
46#include <asm/tlbflush.h>
47#include <asm/div64.h>
48
49#include <linux/swapops.h>
50
51#include "internal.h"
52
53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h>
55
56struct scan_control {
57
58 unsigned long nr_scanned;
59
60
61 unsigned long nr_reclaimed;
62
63
64 unsigned long nr_to_reclaim;
65
66 unsigned long hibernation_mode;
67
68
69 gfp_t gfp_mask;
70
71 int may_writepage;
72
73
74 int may_unmap;
75
76
77 int may_swap;
78
79 int order;
80
81
82 int priority;
83
84
85
86
87
88 struct mem_cgroup *target_mem_cgroup;
89
90
91
92
93
94 nodemask_t *nodemask;
95};
96
/* Page linked just before @_head, i.e. the entry at the tail of the list. */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * prefetch_prev_lru_page / prefetchw_prev_lru_page:
 * if @_page is not the last entry before list head @_base, prefetch (for
 * read or for write respectively) member @_field of the page that precedes
 * it.  Both compile to nothing when the architecture lacks the primitive.
 *
 * Every macro argument is parenthesized so that arbitrary expressions can be
 * passed for @_page and @_base without precedence surprises.
 */
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base)) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&((_page)->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base)) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&((_page)->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
126
127
128
129
/*
 * Global swappiness; vmscan_swappiness() returns it for global reclaim and
 * get_scan_count() uses that value as the anon scan weight (file gets
 * 200 minus it).
 */
int vm_swappiness = 60;

/* NOTE(review): presumably a system-wide page total; not read in this section */
long vm_total_pages;

/* Registered shrinkers; walked and modified under shrinker_rwsem. */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
135
136#ifdef CONFIG_MEMCG
137static bool global_reclaim(struct scan_control *sc)
138{
139 return !sc->target_mem_cgroup;
140}
141#else
142static bool global_reclaim(struct scan_control *sc)
143{
144 return true;
145}
146#endif
147
148static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
149{
150 if (!mem_cgroup_disabled())
151 return mem_cgroup_get_lru_size(lruvec, lru);
152
153 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
154}
155
156
157
158
159void register_shrinker(struct shrinker *shrinker)
160{
161 atomic_long_set(&shrinker->nr_in_batch, 0);
162 down_write(&shrinker_rwsem);
163 list_add_tail(&shrinker->list, &shrinker_list);
164 up_write(&shrinker_rwsem);
165}
166EXPORT_SYMBOL(register_shrinker);
167
168
169
170
171void unregister_shrinker(struct shrinker *shrinker)
172{
173 down_write(&shrinker_rwsem);
174 list_del(&shrinker->list);
175 up_write(&shrinker_rwsem);
176}
177EXPORT_SYMBOL(unregister_shrinker);
178
179static inline int do_shrinker_shrink(struct shrinker *shrinker,
180 struct shrink_control *sc,
181 unsigned long nr_to_scan)
182{
183 sc->nr_to_scan = nr_to_scan;
184 return (*shrinker->shrink)(shrinker, sc);
185}
186
187#define SHRINK_BATCH 128
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207unsigned long shrink_slab(struct shrink_control *shrink,
208 unsigned long nr_pages_scanned,
209 unsigned long lru_pages)
210{
211 struct shrinker *shrinker;
212 unsigned long ret = 0;
213
214 if (nr_pages_scanned == 0)
215 nr_pages_scanned = SWAP_CLUSTER_MAX;
216
217 if (!down_read_trylock(&shrinker_rwsem)) {
218
219 ret = 1;
220 goto out;
221 }
222
223 list_for_each_entry(shrinker, &shrinker_list, list) {
224 unsigned long long delta;
225 long total_scan;
226 long max_pass;
227 int shrink_ret = 0;
228 long nr;
229 long new_nr;
230 long batch_size = shrinker->batch ? shrinker->batch
231 : SHRINK_BATCH;
232
233 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
234 if (max_pass <= 0)
235 continue;
236
237
238
239
240
241
242 nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
243
244 total_scan = nr;
245 delta = (4 * nr_pages_scanned) / shrinker->seeks;
246 delta *= max_pass;
247 do_div(delta, lru_pages + 1);
248 total_scan += delta;
249 if (total_scan < 0) {
250 printk(KERN_ERR "shrink_slab: %pF negative objects to "
251 "delete nr=%ld\n",
252 shrinker->shrink, total_scan);
253 total_scan = max_pass;
254 }
255
256
257
258
259
260
261
262
263
264
265
266
267
268 if (delta < max_pass / 4)
269 total_scan = min(total_scan, max_pass / 2);
270
271
272
273
274
275
276 if (total_scan > max_pass * 2)
277 total_scan = max_pass * 2;
278
279 trace_mm_shrink_slab_start(shrinker, shrink, nr,
280 nr_pages_scanned, lru_pages,
281 max_pass, delta, total_scan);
282
283 while (total_scan >= batch_size) {
284 int nr_before;
285
286 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
287 shrink_ret = do_shrinker_shrink(shrinker, shrink,
288 batch_size);
289 if (shrink_ret == -1)
290 break;
291 if (shrink_ret < nr_before)
292 ret += nr_before - shrink_ret;
293 count_vm_events(SLABS_SCANNED, batch_size);
294 total_scan -= batch_size;
295
296 cond_resched();
297 }
298
299
300
301
302
303
304 if (total_scan > 0)
305 new_nr = atomic_long_add_return(total_scan,
306 &shrinker->nr_in_batch);
307 else
308 new_nr = atomic_long_read(&shrinker->nr_in_batch);
309
310 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
311 }
312 up_read(&shrinker_rwsem);
313out:
314 cond_resched();
315 return ret;
316}
317
/*
 * Non-zero when the page has exactly two references once the one accounted
 * to page->private (if any) is discounted.
 * NOTE(review): presumably those two are the isolating caller and the page
 * cache itself — confirm.
 */
static inline int is_page_cache_freeable(struct page *page)
{
	int refs = page_count(page) - page_has_private(page);

	return refs == 2;
}
327
328static int may_write_to_queue(struct backing_dev_info *bdi,
329 struct scan_control *sc)
330{
331 if (current->flags & PF_SWAPWRITE)
332 return 1;
333 if (!bdi_write_congested(bdi))
334 return 1;
335 if (bdi == current->backing_dev_info)
336 return 1;
337 return 0;
338}
339
340
341
342
343
344
345
346
347
348
349
350
351
/*
 * Record a writepage failure on @mapping.  The page is locked while its
 * mapping is re-checked, so the error is only recorded if the page still
 * belongs to @mapping.
 */
static void handle_write_error(struct address_space *mapping,
			       struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}
360
361
/* Outcome of pageout(), as acted on by shrink_page_list() */
typedef enum {
	/* Write could not be started now; leave the page where it is */
	PAGE_KEEP,
	/* Page should be moved to the active list */
	PAGE_ACTIVATE,
	/* ->writepage was called on the page */
	PAGE_SUCCESS,
	/* Page turned out to be clean; the caller may try to free it */
	PAGE_CLEAN,
} pageout_t;
372
373
374
375
376
377static pageout_t pageout(struct page *page, struct address_space *mapping,
378 struct scan_control *sc)
379{
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396 if (!is_page_cache_freeable(page))
397 return PAGE_KEEP;
398 if (!mapping) {
399
400
401
402
403 if (page_has_private(page)) {
404 if (try_to_free_buffers(page)) {
405 ClearPageDirty(page);
406 printk("%s: orphaned page\n", __func__);
407 return PAGE_CLEAN;
408 }
409 }
410 return PAGE_KEEP;
411 }
412 if (mapping->a_ops->writepage == NULL)
413 return PAGE_ACTIVATE;
414 if (!may_write_to_queue(mapping->backing_dev_info, sc))
415 return PAGE_KEEP;
416
417 if (clear_page_dirty_for_io(page)) {
418 int res;
419 struct writeback_control wbc = {
420 .sync_mode = WB_SYNC_NONE,
421 .nr_to_write = SWAP_CLUSTER_MAX,
422 .range_start = 0,
423 .range_end = LLONG_MAX,
424 .for_reclaim = 1,
425 };
426
427 SetPageReclaim(page);
428 res = mapping->a_ops->writepage(page, &wbc);
429 if (res < 0)
430 handle_write_error(mapping, page, res);
431 if (res == AOP_WRITEPAGE_ACTIVATE) {
432 ClearPageReclaim(page);
433 return PAGE_ACTIVATE;
434 }
435
436 if (!PageWriteback(page)) {
437
438 ClearPageReclaim(page);
439 }
440 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
442 return PAGE_SUCCESS;
443 }
444
445 return PAGE_CLEAN;
446}
447
448
449
450
451
452static int __remove_mapping(struct address_space *mapping, struct page *page)
453{
454 BUG_ON(!PageLocked(page));
455 BUG_ON(mapping != page_mapping(page));
456
457 spin_lock_irq(&mapping->tree_lock);
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483 if (!page_freeze_refs(page, 2))
484 goto cannot_free;
485
486 if (unlikely(PageDirty(page))) {
487 page_unfreeze_refs(page, 2);
488 goto cannot_free;
489 }
490
491 if (PageSwapCache(page)) {
492 swp_entry_t swap = { .val = page_private(page) };
493 __delete_from_swap_cache(page);
494 spin_unlock_irq(&mapping->tree_lock);
495 swapcache_free(swap, page);
496 } else {
497 void (*freepage)(struct page *);
498
499 freepage = mapping->a_ops->freepage;
500
501 __delete_from_page_cache(page);
502 spin_unlock_irq(&mapping->tree_lock);
503 mem_cgroup_uncharge_cache_page(page);
504
505 if (freepage != NULL)
506 freepage(page);
507 }
508
509 return 1;
510
511cannot_free:
512 spin_unlock_irq(&mapping->tree_lock);
513 return 0;
514}
515
516
517
518
519
520
521
/*
 * Try to detach @page from @mapping.  On success the page's frozen
 * reference count is set back to 1 and 1 is returned; otherwise returns 0
 * and the page is left untouched.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (!__remove_mapping(mapping, page))
		return 0;

	/* __remove_mapping() left the count frozen; restore one reference */
	page_unfreeze_refs(page, 1);
	return 1;
}
535
536
537
538
539
540
541
542
543
544
545void putback_lru_page(struct page *page)
546{
547 int lru;
548 int active = !!TestClearPageActive(page);
549 int was_unevictable = PageUnevictable(page);
550
551 VM_BUG_ON(PageLRU(page));
552
553redo:
554 ClearPageUnevictable(page);
555
556 if (page_evictable(page)) {
557
558
559
560
561
562
563 lru = active + page_lru_base_type(page);
564 lru_cache_add_lru(page, lru);
565 } else {
566
567
568
569
570 lru = LRU_UNEVICTABLE;
571 add_page_to_unevictable_list(page);
572
573
574
575
576
577
578
579
580
581
582 smp_mb();
583 }
584
585
586
587
588
589
590 if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
591 if (!isolate_lru_page(page)) {
592 put_page(page);
593 goto redo;
594 }
595
596
597
598
599 }
600
601 if (was_unevictable && lru != LRU_UNEVICTABLE)
602 count_vm_event(UNEVICTABLE_PGRESCUED);
603 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
604 count_vm_event(UNEVICTABLE_PGCULLED);
605
606 put_page(page);
607}
608
/* Verdict of page_check_references() for a page on the inactive list */
enum page_references {
	PAGEREF_RECLAIM,	/* go ahead and reclaim */
	PAGEREF_RECLAIM_CLEAN,	/* reclaim only if no writeout is needed */
	PAGEREF_KEEP,		/* leave on the inactive list */
	PAGEREF_ACTIVATE,	/* move to the active list */
};
615
616static enum page_references page_check_references(struct page *page,
617 struct scan_control *sc)
618{
619 int referenced_ptes, referenced_page;
620 unsigned long vm_flags;
621
622 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
623 &vm_flags);
624 referenced_page = TestClearPageReferenced(page);
625
626
627
628
629
630 if (vm_flags & VM_LOCKED)
631 return PAGEREF_RECLAIM;
632
633 if (referenced_ptes) {
634 if (PageSwapBacked(page))
635 return PAGEREF_ACTIVATE;
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650 SetPageReferenced(page);
651
652 if (referenced_page || referenced_ptes > 1)
653 return PAGEREF_ACTIVATE;
654
655
656
657
658 if (vm_flags & VM_EXEC)
659 return PAGEREF_ACTIVATE;
660
661 return PAGEREF_KEEP;
662 }
663
664
665 if (referenced_page && !PageSwapBacked(page))
666 return PAGEREF_RECLAIM_CLEAN;
667
668 return PAGEREF_RECLAIM;
669}
670
671
672
673
674static unsigned long shrink_page_list(struct list_head *page_list,
675 struct zone *zone,
676 struct scan_control *sc,
677 enum ttu_flags ttu_flags,
678 unsigned long *ret_nr_dirty,
679 unsigned long *ret_nr_writeback,
680 bool force_reclaim)
681{
682 LIST_HEAD(ret_pages);
683 LIST_HEAD(free_pages);
684 int pgactivate = 0;
685 unsigned long nr_dirty = 0;
686 unsigned long nr_congested = 0;
687 unsigned long nr_reclaimed = 0;
688 unsigned long nr_writeback = 0;
689
690 cond_resched();
691
692 mem_cgroup_uncharge_start();
693 while (!list_empty(page_list)) {
694 struct address_space *mapping;
695 struct page *page;
696 int may_enter_fs;
697 enum page_references references = PAGEREF_RECLAIM_CLEAN;
698
699 cond_resched();
700
701 page = lru_to_page(page_list);
702 list_del(&page->lru);
703
704 if (!trylock_page(page))
705 goto keep;
706
707 VM_BUG_ON(PageActive(page));
708 VM_BUG_ON(page_zone(page) != zone);
709
710 sc->nr_scanned++;
711
712 if (unlikely(!page_evictable(page)))
713 goto cull_mlocked;
714
715 if (!sc->may_unmap && page_mapped(page))
716 goto keep_locked;
717
718
719 if (page_mapped(page) || PageSwapCache(page))
720 sc->nr_scanned++;
721
722 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
723 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
724
725 if (PageWriteback(page)) {
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743 if (global_reclaim(sc) ||
744 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
745
746
747
748
749
750
751
752
753
754
755
756 SetPageReclaim(page);
757 nr_writeback++;
758 goto keep_locked;
759 }
760 wait_on_page_writeback(page);
761 }
762
763 if (!force_reclaim)
764 references = page_check_references(page, sc);
765
766 switch (references) {
767 case PAGEREF_ACTIVATE:
768 goto activate_locked;
769 case PAGEREF_KEEP:
770 goto keep_locked;
771 case PAGEREF_RECLAIM:
772 case PAGEREF_RECLAIM_CLEAN:
773 ;
774 }
775
776
777
778
779
780 if (PageAnon(page) && !PageSwapCache(page)) {
781 if (!(sc->gfp_mask & __GFP_IO))
782 goto keep_locked;
783 if (!add_to_swap(page))
784 goto activate_locked;
785 may_enter_fs = 1;
786 }
787
788 mapping = page_mapping(page);
789
790
791
792
793
794 if (page_mapped(page) && mapping) {
795 switch (try_to_unmap(page, ttu_flags)) {
796 case SWAP_FAIL:
797 goto activate_locked;
798 case SWAP_AGAIN:
799 goto keep_locked;
800 case SWAP_MLOCK:
801 goto cull_mlocked;
802 case SWAP_SUCCESS:
803 ;
804 }
805 }
806
807 if (PageDirty(page)) {
808 nr_dirty++;
809
810
811
812
813
814
815 if (page_is_file_cache(page) &&
816 (!current_is_kswapd() ||
817 sc->priority >= DEF_PRIORITY - 2)) {
818
819
820
821
822
823
824 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
825 SetPageReclaim(page);
826
827 goto keep_locked;
828 }
829
830 if (references == PAGEREF_RECLAIM_CLEAN)
831 goto keep_locked;
832 if (!may_enter_fs)
833 goto keep_locked;
834 if (!sc->may_writepage)
835 goto keep_locked;
836
837
838 switch (pageout(page, mapping, sc)) {
839 case PAGE_KEEP:
840 nr_congested++;
841 goto keep_locked;
842 case PAGE_ACTIVATE:
843 goto activate_locked;
844 case PAGE_SUCCESS:
845 if (PageWriteback(page))
846 goto keep;
847 if (PageDirty(page))
848 goto keep;
849
850
851
852
853
854 if (!trylock_page(page))
855 goto keep;
856 if (PageDirty(page) || PageWriteback(page))
857 goto keep_locked;
858 mapping = page_mapping(page);
859 case PAGE_CLEAN:
860 ;
861 }
862 }
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885 if (page_has_private(page)) {
886 if (!try_to_release_page(page, sc->gfp_mask))
887 goto activate_locked;
888 if (!mapping && page_count(page) == 1) {
889 unlock_page(page);
890 if (put_page_testzero(page))
891 goto free_it;
892 else {
893
894
895
896
897
898
899
900 nr_reclaimed++;
901 continue;
902 }
903 }
904 }
905
906 if (!mapping || !__remove_mapping(mapping, page))
907 goto keep_locked;
908
909
910
911
912
913
914
915
916 __clear_page_locked(page);
917free_it:
918 nr_reclaimed++;
919
920
921
922
923
924 list_add(&page->lru, &free_pages);
925 continue;
926
927cull_mlocked:
928 if (PageSwapCache(page))
929 try_to_free_swap(page);
930 unlock_page(page);
931 putback_lru_page(page);
932 continue;
933
934activate_locked:
935
936 if (PageSwapCache(page) && vm_swap_full())
937 try_to_free_swap(page);
938 VM_BUG_ON(PageActive(page));
939 SetPageActive(page);
940 pgactivate++;
941keep_locked:
942 unlock_page(page);
943keep:
944 list_add(&page->lru, &ret_pages);
945 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
946 }
947
948
949
950
951
952
953
954 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
955 zone_set_flag(zone, ZONE_CONGESTED);
956
957 free_hot_cold_page_list(&free_pages, 1);
958
959 list_splice(&ret_pages, page_list);
960 count_vm_events(PGACTIVATE, pgactivate);
961 mem_cgroup_uncharge_end();
962 *ret_nr_dirty += nr_dirty;
963 *ret_nr_writeback += nr_writeback;
964 return nr_reclaimed;
965}
966
967unsigned long reclaim_clean_pages_from_list(struct zone *zone,
968 struct list_head *page_list)
969{
970 struct scan_control sc = {
971 .gfp_mask = GFP_KERNEL,
972 .priority = DEF_PRIORITY,
973 .may_unmap = 1,
974 };
975 unsigned long ret, dummy1, dummy2;
976 struct page *page, *next;
977 LIST_HEAD(clean_pages);
978
979 list_for_each_entry_safe(page, next, page_list, lru) {
980 if (page_is_file_cache(page) && !PageDirty(page)) {
981 ClearPageActive(page);
982 list_move(&page->lru, &clean_pages);
983 }
984 }
985
986 ret = shrink_page_list(&clean_pages, zone, &sc,
987 TTU_UNMAP|TTU_IGNORE_ACCESS,
988 &dummy1, &dummy2, true);
989 list_splice(&clean_pages, page_list);
990 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
991 return ret;
992}
993
994
995
996
997
998
999
1000
1001
1002
1003
1004int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1005{
1006 int ret = -EINVAL;
1007
1008
1009 if (!PageLRU(page))
1010 return ret;
1011
1012
1013 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1014 return ret;
1015
1016 ret = -EBUSY;
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1030
1031 if (PageWriteback(page))
1032 return ret;
1033
1034 if (PageDirty(page)) {
1035 struct address_space *mapping;
1036
1037
1038 if (mode & ISOLATE_CLEAN)
1039 return ret;
1040
1041
1042
1043
1044
1045
1046 mapping = page_mapping(page);
1047 if (mapping && !mapping->a_ops->migratepage)
1048 return ret;
1049 }
1050 }
1051
1052 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1053 return ret;
1054
1055 if (likely(get_page_unless_zero(page))) {
1056
1057
1058
1059
1060
1061 ClearPageLRU(page);
1062 ret = 0;
1063 }
1064
1065 return ret;
1066}
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1089 struct lruvec *lruvec, struct list_head *dst,
1090 unsigned long *nr_scanned, struct scan_control *sc,
1091 isolate_mode_t mode, enum lru_list lru)
1092{
1093 struct list_head *src = &lruvec->lists[lru];
1094 unsigned long nr_taken = 0;
1095 unsigned long scan;
1096
1097 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1098 struct page *page;
1099 int nr_pages;
1100
1101 page = lru_to_page(src);
1102 prefetchw_prev_lru_page(page, src, flags);
1103
1104 VM_BUG_ON(!PageLRU(page));
1105
1106 switch (__isolate_lru_page(page, mode)) {
1107 case 0:
1108 nr_pages = hpage_nr_pages(page);
1109 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1110 list_move(&page->lru, dst);
1111 nr_taken += nr_pages;
1112 break;
1113
1114 case -EBUSY:
1115
1116 list_move(&page->lru, src);
1117 continue;
1118
1119 default:
1120 BUG();
1121 }
1122 }
1123
1124 *nr_scanned = scan;
1125 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1126 nr_taken, mode, is_file_lru(lru));
1127 return nr_taken;
1128}
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155int isolate_lru_page(struct page *page)
1156{
1157 int ret = -EBUSY;
1158
1159 VM_BUG_ON(!page_count(page));
1160
1161 if (PageLRU(page)) {
1162 struct zone *zone = page_zone(page);
1163 struct lruvec *lruvec;
1164
1165 spin_lock_irq(&zone->lru_lock);
1166 lruvec = mem_cgroup_page_lruvec(page, zone);
1167 if (PageLRU(page)) {
1168 int lru = page_lru(page);
1169 get_page(page);
1170 ClearPageLRU(page);
1171 del_page_from_lru_list(page, lruvec, lru);
1172 ret = 0;
1173 }
1174 spin_unlock_irq(&zone->lru_lock);
1175 }
1176 return ret;
1177}
1178
1179
1180
1181
1182static int too_many_isolated(struct zone *zone, int file,
1183 struct scan_control *sc)
1184{
1185 unsigned long inactive, isolated;
1186
1187 if (current_is_kswapd())
1188 return 0;
1189
1190 if (!global_reclaim(sc))
1191 return 0;
1192
1193 if (file) {
1194 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1195 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1196 } else {
1197 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1198 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1199 }
1200
1201 return isolated > inactive;
1202}
1203
1204static noinline_for_stack void
1205putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1206{
1207 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1208 struct zone *zone = lruvec_zone(lruvec);
1209 LIST_HEAD(pages_to_free);
1210
1211
1212
1213
1214 while (!list_empty(page_list)) {
1215 struct page *page = lru_to_page(page_list);
1216 int lru;
1217
1218 VM_BUG_ON(PageLRU(page));
1219 list_del(&page->lru);
1220 if (unlikely(!page_evictable(page))) {
1221 spin_unlock_irq(&zone->lru_lock);
1222 putback_lru_page(page);
1223 spin_lock_irq(&zone->lru_lock);
1224 continue;
1225 }
1226
1227 lruvec = mem_cgroup_page_lruvec(page, zone);
1228
1229 SetPageLRU(page);
1230 lru = page_lru(page);
1231 add_page_to_lru_list(page, lruvec, lru);
1232
1233 if (is_active_lru(lru)) {
1234 int file = is_file_lru(lru);
1235 int numpages = hpage_nr_pages(page);
1236 reclaim_stat->recent_rotated[file] += numpages;
1237 }
1238 if (put_page_testzero(page)) {
1239 __ClearPageLRU(page);
1240 __ClearPageActive(page);
1241 del_page_from_lru_list(page, lruvec, lru);
1242
1243 if (unlikely(PageCompound(page))) {
1244 spin_unlock_irq(&zone->lru_lock);
1245 (*get_compound_page_dtor(page))(page);
1246 spin_lock_irq(&zone->lru_lock);
1247 } else
1248 list_add(&page->lru, &pages_to_free);
1249 }
1250 }
1251
1252
1253
1254
1255 list_splice(&pages_to_free, page_list);
1256}
1257
1258
1259
1260
1261
1262static noinline_for_stack unsigned long
1263shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1264 struct scan_control *sc, enum lru_list lru)
1265{
1266 LIST_HEAD(page_list);
1267 unsigned long nr_scanned;
1268 unsigned long nr_reclaimed = 0;
1269 unsigned long nr_taken;
1270 unsigned long nr_dirty = 0;
1271 unsigned long nr_writeback = 0;
1272 isolate_mode_t isolate_mode = 0;
1273 int file = is_file_lru(lru);
1274 struct zone *zone = lruvec_zone(lruvec);
1275 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1276
1277 while (unlikely(too_many_isolated(zone, file, sc))) {
1278 congestion_wait(BLK_RW_ASYNC, HZ/10);
1279
1280
1281 if (fatal_signal_pending(current))
1282 return SWAP_CLUSTER_MAX;
1283 }
1284
1285 lru_add_drain();
1286
1287 if (!sc->may_unmap)
1288 isolate_mode |= ISOLATE_UNMAPPED;
1289 if (!sc->may_writepage)
1290 isolate_mode |= ISOLATE_CLEAN;
1291
1292 spin_lock_irq(&zone->lru_lock);
1293
1294 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1295 &nr_scanned, sc, isolate_mode, lru);
1296
1297 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1298 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1299
1300 if (global_reclaim(sc)) {
1301 zone->pages_scanned += nr_scanned;
1302 if (current_is_kswapd())
1303 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1304 else
1305 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1306 }
1307 spin_unlock_irq(&zone->lru_lock);
1308
1309 if (nr_taken == 0)
1310 return 0;
1311
1312 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1313 &nr_dirty, &nr_writeback, false);
1314
1315 spin_lock_irq(&zone->lru_lock);
1316
1317 reclaim_stat->recent_scanned[file] += nr_taken;
1318
1319 if (global_reclaim(sc)) {
1320 if (current_is_kswapd())
1321 __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1322 nr_reclaimed);
1323 else
1324 __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1325 nr_reclaimed);
1326 }
1327
1328 putback_inactive_pages(lruvec, &page_list);
1329
1330 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1331
1332 spin_unlock_irq(&zone->lru_lock);
1333
1334 free_hot_cold_page_list(&page_list, 1);
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359 if (nr_writeback && nr_writeback >=
1360 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1361 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1362
1363 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1364 zone_idx(zone),
1365 nr_scanned, nr_reclaimed,
1366 sc->priority,
1367 trace_shrink_flags(file));
1368 return nr_reclaimed;
1369}
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389static void move_active_pages_to_lru(struct lruvec *lruvec,
1390 struct list_head *list,
1391 struct list_head *pages_to_free,
1392 enum lru_list lru)
1393{
1394 struct zone *zone = lruvec_zone(lruvec);
1395 unsigned long pgmoved = 0;
1396 struct page *page;
1397 int nr_pages;
1398
1399 while (!list_empty(list)) {
1400 page = lru_to_page(list);
1401 lruvec = mem_cgroup_page_lruvec(page, zone);
1402
1403 VM_BUG_ON(PageLRU(page));
1404 SetPageLRU(page);
1405
1406 nr_pages = hpage_nr_pages(page);
1407 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1408 list_move(&page->lru, &lruvec->lists[lru]);
1409 pgmoved += nr_pages;
1410
1411 if (put_page_testzero(page)) {
1412 __ClearPageLRU(page);
1413 __ClearPageActive(page);
1414 del_page_from_lru_list(page, lruvec, lru);
1415
1416 if (unlikely(PageCompound(page))) {
1417 spin_unlock_irq(&zone->lru_lock);
1418 (*get_compound_page_dtor(page))(page);
1419 spin_lock_irq(&zone->lru_lock);
1420 } else
1421 list_add(&page->lru, pages_to_free);
1422 }
1423 }
1424 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1425 if (!is_active_lru(lru))
1426 __count_vm_events(PGDEACTIVATE, pgmoved);
1427}
1428
1429static void shrink_active_list(unsigned long nr_to_scan,
1430 struct lruvec *lruvec,
1431 struct scan_control *sc,
1432 enum lru_list lru)
1433{
1434 unsigned long nr_taken;
1435 unsigned long nr_scanned;
1436 unsigned long vm_flags;
1437 LIST_HEAD(l_hold);
1438 LIST_HEAD(l_active);
1439 LIST_HEAD(l_inactive);
1440 struct page *page;
1441 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1442 unsigned long nr_rotated = 0;
1443 isolate_mode_t isolate_mode = 0;
1444 int file = is_file_lru(lru);
1445 struct zone *zone = lruvec_zone(lruvec);
1446
1447 lru_add_drain();
1448
1449 if (!sc->may_unmap)
1450 isolate_mode |= ISOLATE_UNMAPPED;
1451 if (!sc->may_writepage)
1452 isolate_mode |= ISOLATE_CLEAN;
1453
1454 spin_lock_irq(&zone->lru_lock);
1455
1456 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1457 &nr_scanned, sc, isolate_mode, lru);
1458 if (global_reclaim(sc))
1459 zone->pages_scanned += nr_scanned;
1460
1461 reclaim_stat->recent_scanned[file] += nr_taken;
1462
1463 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1464 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1465 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1466 spin_unlock_irq(&zone->lru_lock);
1467
1468 while (!list_empty(&l_hold)) {
1469 cond_resched();
1470 page = lru_to_page(&l_hold);
1471 list_del(&page->lru);
1472
1473 if (unlikely(!page_evictable(page))) {
1474 putback_lru_page(page);
1475 continue;
1476 }
1477
1478 if (unlikely(buffer_heads_over_limit)) {
1479 if (page_has_private(page) && trylock_page(page)) {
1480 if (page_has_private(page))
1481 try_to_release_page(page, 0);
1482 unlock_page(page);
1483 }
1484 }
1485
1486 if (page_referenced(page, 0, sc->target_mem_cgroup,
1487 &vm_flags)) {
1488 nr_rotated += hpage_nr_pages(page);
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1499 list_add(&page->lru, &l_active);
1500 continue;
1501 }
1502 }
1503
1504 ClearPageActive(page);
1505 list_add(&page->lru, &l_inactive);
1506 }
1507
1508
1509
1510
1511 spin_lock_irq(&zone->lru_lock);
1512
1513
1514
1515
1516
1517
1518 reclaim_stat->recent_rotated[file] += nr_rotated;
1519
1520 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1521 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1522 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1523 spin_unlock_irq(&zone->lru_lock);
1524
1525 free_hot_cold_page_list(&l_hold, 1);
1526}
1527
#ifdef CONFIG_SWAP
/*
 * Zone-wide check: 1 when the inactive anon list, scaled by the zone's
 * inactive_ratio, is smaller than the active anon list.
 */
static int inactive_anon_is_low_global(struct zone *zone)
{
	unsigned long active = zone_page_state(zone, NR_ACTIVE_ANON);
	unsigned long inactive = zone_page_state(zone, NR_INACTIVE_ANON);

	return inactive * zone->inactive_ratio < active;
}

/*
 * Non-zero when the inactive anon list of @lruvec is too small relative to
 * the active one.  Always 0 when there is no swap space at all.
 */
static int inactive_anon_is_low(struct lruvec *lruvec)
{
	/* No swap configured: the anon lists are not checked */
	if (!total_swap_pages)
		return 0;

	if (mem_cgroup_disabled())
		return inactive_anon_is_low_global(lruvec_zone(lruvec));

	return mem_cgroup_inactive_anon_is_low(lruvec);
}
#else
/* Without CONFIG_SWAP the inactive anon list is never reported as low. */
static inline int inactive_anon_is_low(struct lruvec *lruvec)
{
	return 0;
}
#endif
1569
1570static int inactive_file_is_low_global(struct zone *zone)
1571{
1572 unsigned long active, inactive;
1573
1574 active = zone_page_state(zone, NR_ACTIVE_FILE);
1575 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1576
1577 return (active > inactive);
1578}
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
/*
 * Non-zero when the inactive file list of @lruvec is smaller than the
 * active one, using memcg accounting when memory cgroups are enabled and
 * the zone counters otherwise.
 */
static int inactive_file_is_low(struct lruvec *lruvec)
{
	if (mem_cgroup_disabled())
		return inactive_file_is_low_global(lruvec_zone(lruvec));

	return mem_cgroup_inactive_file_is_low(lruvec);
}
1601
1602static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1603{
1604 if (is_file_lru(lru))
1605 return inactive_file_is_low(lruvec);
1606 else
1607 return inactive_anon_is_low(lruvec);
1608}
1609
1610static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1611 struct lruvec *lruvec, struct scan_control *sc)
1612{
1613 if (is_active_lru(lru)) {
1614 if (inactive_list_is_low(lruvec, lru))
1615 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1616 return 0;
1617 }
1618
1619 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1620}
1621
1622static int vmscan_swappiness(struct scan_control *sc)
1623{
1624 if (global_reclaim(sc))
1625 return vm_swappiness;
1626 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1627}
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
/*
 * Work out how many pages of each evictable LRU list of @lruvec should be
 * scanned in this pass.  The result is written to nr[], indexed by
 * enum lru_list.
 *
 * fraction[0] / denominator is the share applied to the anon lists and
 * fraction[1] / denominator is the share applied to the file lists.
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			   unsigned long *nr)
{
	unsigned long anon, file, free;
	unsigned long anon_prio, file_prio;
	unsigned long ap, fp;
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	u64 fraction[2], denominator;
	enum lru_list lru;
	int noswap = 0;
	bool force_scan = false;
	struct zone *zone = lruvec_zone(lruvec);

	/*
	 * force_scan makes the loop at "out" substitute SWAP_CLUSTER_MAX when
	 * a list's size shifted by the priority comes out as zero.  It is
	 * enabled for kswapd working on an all_unreclaimable zone and for any
	 * reclaim that is not global.
	 */
	if (current_is_kswapd() && zone->all_unreclaimable)
		force_scan = true;
	if (!global_reclaim(sc))
		force_scan = true;

	/* Swapping not permitted or no swap pages left: file lists only. */
	if (!sc->may_swap || (nr_swap_pages <= 0)) {
		noswap = 1;
		fraction[0] = 0;
		fraction[1] = 1;
		denominator = 1;
		goto out;
	}

	anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
		get_lru_size(lruvec, LRU_INACTIVE_ANON);
	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
		get_lru_size(lruvec, LRU_INACTIVE_FILE);

	if (global_reclaim(sc)) {
		free = zone_page_state(zone, NR_FREE_PAGES);

		/*
		 * File pages plus free pages do not exceed the zone's high
		 * watermark: scan anon lists only.
		 */
		if (unlikely(file + free <= high_wmark_pages(zone))) {
			fraction[0] = 1;
			fraction[1] = 0;
			denominator = 1;
			goto out;
		}
	}

	/*
	 * The two priorities always add up to 200; the swappiness value is
	 * the anon side.
	 */
	anon_prio = vmscan_swappiness(sc);
	file_prio = 200 - anon_prio;

	/*
	 * The reclaim statistics are read and decayed under the zone's
	 * lru_lock.  Once recent_scanned exceeds a quarter of the list size
	 * both recent_scanned and recent_rotated are halved.
	 */
	spin_lock_irq(&zone->lru_lock);
	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
		reclaim_stat->recent_scanned[0] /= 2;
		reclaim_stat->recent_rotated[0] /= 2;
	}

	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
		reclaim_stat->recent_scanned[1] /= 2;
		reclaim_stat->recent_rotated[1] /= 2;
	}

	/*
	 * Weight each priority by scanned / rotated.  The "+ 1" on both
	 * counters keeps the divisor non-zero.
	 */
	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
	ap /= reclaim_stat->recent_rotated[0] + 1;

	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
	fp /= reclaim_stat->recent_rotated[1] + 1;
	spin_unlock_irq(&zone->lru_lock);

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp + 1;
out:
	for_each_evictable_lru(lru) {
		/* Shadows the outer "file" page count; used as fraction[] index. */
		int file = is_file_lru(lru);
		unsigned long scan;

		scan = get_lru_size(lruvec, lru);
		/*
		 * Only at priority 0 with swap usable and a non-zero
		 * swappiness is the whole list size used unscaled.  In every
		 * other case the size is shifted by the priority and then
		 * scaled by fraction / denominator (the force_scan fallback
		 * value is scaled as well).
		 */
		if (sc->priority || noswap || !vmscan_swappiness(sc)) {
			scan >>= sc->priority;
			if (!scan && force_scan)
				scan = SWAP_CLUSTER_MAX;
			scan = div64_u64(scan * fraction[file], denominator);
		}
		nr[lru] = scan;
	}
}
1751
1752
1753static bool in_reclaim_compaction(struct scan_control *sc)
1754{
1755 if (COMPACTION_BUILD && sc->order &&
1756 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1757 sc->priority < DEF_PRIORITY - 2))
1758 return true;
1759
1760 return false;
1761}
1762
1763
1764
1765
1766
1767
1768
1769
1770static inline bool should_continue_reclaim(struct lruvec *lruvec,
1771 unsigned long nr_reclaimed,
1772 unsigned long nr_scanned,
1773 struct scan_control *sc)
1774{
1775 unsigned long pages_for_compaction;
1776 unsigned long inactive_lru_pages;
1777
1778
1779 if (!in_reclaim_compaction(sc))
1780 return false;
1781
1782
1783 if (sc->gfp_mask & __GFP_REPEAT) {
1784
1785
1786
1787
1788
1789
1790 if (!nr_reclaimed && !nr_scanned)
1791 return false;
1792 } else {
1793
1794
1795
1796
1797
1798
1799
1800
1801 if (!nr_reclaimed)
1802 return false;
1803 }
1804
1805
1806
1807
1808
1809 pages_for_compaction = (2UL << sc->order);
1810 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1811 if (nr_swap_pages > 0)
1812 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
1813 if (sc->nr_reclaimed < pages_for_compaction &&
1814 inactive_lru_pages > pages_for_compaction)
1815 return true;
1816
1817
1818 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
1819 case COMPACT_PARTIAL:
1820 case COMPACT_CONTINUE:
1821 return false;
1822 default:
1823 return true;
1824 }
1825}
1826
1827
1828
1829
/*
 * Reclaim pages from the LRU lists of @lruvec.
 *
 * The per-list scan targets come from get_scan_count(); the lists are then
 * worked through in batches of at most SWAP_CLUSTER_MAX pages.  Pages
 * reclaimed are added to sc->nr_reclaimed.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed, nr_scanned;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	struct blk_plug plug;

restart:
	nr_reclaimed = 0;
	/* Snapshot, so the pages scanned by this round can be derived below. */
	nr_scanned = sc->nr_scanned;
	get_scan_count(lruvec, sc, nr);

	blk_start_plug(&plug);
	/*
	 * Note that a remaining nr[LRU_ACTIVE_ANON] on its own does not keep
	 * this loop running.
	 */
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		for_each_evictable_lru(lru) {
			if (nr[lru]) {
				nr_to_scan = min_t(unsigned long,
						   nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;

				nr_reclaimed += shrink_list(lru, nr_to_scan,
							    lruvec, sc);
			}
		}

		/*
		 * Stop early once the target is met, but never at
		 * DEF_PRIORITY: at that priority the computed scan counts are
		 * always worked off in full.
		 */
		if (nr_reclaimed >= nr_to_reclaim &&
		    sc->priority < DEF_PRIORITY)
			break;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;

	/* Move some active anon pages along if the inactive anon list is low. */
	if (inactive_anon_is_low(lruvec))
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);

	/* Second argument is the number of pages scanned by this round only. */
	if (should_continue_reclaim(lruvec, nr_reclaimed,
				    sc->nr_scanned - nr_scanned, sc))
		goto restart;

	throttle_vm_writeout(sc->gfp_mask);
}
1887
1888static void shrink_zone(struct zone *zone, struct scan_control *sc)
1889{
1890 struct mem_cgroup *root = sc->target_mem_cgroup;
1891 struct mem_cgroup_reclaim_cookie reclaim = {
1892 .zone = zone,
1893 .priority = sc->priority,
1894 };
1895 struct mem_cgroup *memcg;
1896
1897 memcg = mem_cgroup_iter(root, NULL, &reclaim);
1898 do {
1899 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1900
1901 shrink_lruvec(lruvec, sc);
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913 if (!global_reclaim(sc)) {
1914 mem_cgroup_iter_break(root, memcg);
1915 break;
1916 }
1917 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1918 } while (memcg);
1919}
1920
1921
1922static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
1923{
1924 unsigned long balance_gap, watermark;
1925 bool watermark_ok;
1926
1927
1928 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
1929 return false;
1930
1931
1932
1933
1934
1935
1936
1937 balance_gap = min(low_wmark_pages(zone),
1938 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
1939 KSWAPD_ZONE_BALANCE_GAP_RATIO);
1940 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
1941 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
1942
1943
1944
1945
1946
1947 if (compaction_deferred(zone, sc->order))
1948 return watermark_ok;
1949
1950
1951 if (!compaction_suitable(zone, sc->order))
1952 return false;
1953
1954 return watermark_ok;
1955}
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
1979{
1980 struct zoneref *z;
1981 struct zone *zone;
1982 unsigned long nr_soft_reclaimed;
1983 unsigned long nr_soft_scanned;
1984 bool aborted_reclaim = false;
1985
1986
1987
1988
1989
1990
1991 if (buffer_heads_over_limit)
1992 sc->gfp_mask |= __GFP_HIGHMEM;
1993
1994 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1995 gfp_zone(sc->gfp_mask), sc->nodemask) {
1996 if (!populated_zone(zone))
1997 continue;
1998
1999
2000
2001
2002 if (global_reclaim(sc)) {
2003 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2004 continue;
2005 if (zone->all_unreclaimable &&
2006 sc->priority != DEF_PRIORITY)
2007 continue;
2008 if (COMPACTION_BUILD) {
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018 if (compaction_ready(zone, sc)) {
2019 aborted_reclaim = true;
2020 continue;
2021 }
2022 }
2023
2024
2025
2026
2027
2028
2029 nr_soft_scanned = 0;
2030 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2031 sc->order, sc->gfp_mask,
2032 &nr_soft_scanned);
2033 sc->nr_reclaimed += nr_soft_reclaimed;
2034 sc->nr_scanned += nr_soft_scanned;
2035
2036 }
2037
2038 shrink_zone(zone, sc);
2039 }
2040
2041 return aborted_reclaim;
2042}
2043
2044static bool zone_reclaimable(struct zone *zone)
2045{
2046 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2047}
2048
2049
2050static bool all_unreclaimable(struct zonelist *zonelist,
2051 struct scan_control *sc)
2052{
2053 struct zoneref *z;
2054 struct zone *zone;
2055
2056 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2057 gfp_zone(sc->gfp_mask), sc->nodemask) {
2058 if (!populated_zone(zone))
2059 continue;
2060 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2061 continue;
2062 if (!zone->all_unreclaimable)
2063 return false;
2064 }
2065
2066 return true;
2067}
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
/*
 * Main reclaim loop shared by the direct, memcg and hibernation entry points
 * in this file.
 *
 * Starting at sc->priority, shrink the zones of @zonelist and decrement the
 * priority after every round until sc->nr_to_reclaim pages were reclaimed or
 * the priority drops below zero.
 *
 * Returns sc->nr_reclaimed if it is non-zero.  Otherwise returns 0 when the
 * OOM killer is disabled, 1 when reclaim was aborted in favour of compaction
 * or when global reclaim still has a zone that is not all_unreclaimable, and
 * 0 in all remaining cases.
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc,
					struct shrink_control *shrink)
{
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;

	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);

	do {
		sc->nr_scanned = 0;
		aborted_reclaim = shrink_zones(zonelist, sc);

		/*
		 * Slab caches are shrunk for global reclaim only.  lru_pages
		 * is the number of reclaimable pages in the cpuset-allowed
		 * zones of the zonelist.
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;
			for_each_zone_zonelist(zone, z, zonelist,
					gfp_zone(sc->gfp_mask)) {
				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
					continue;

				lru_pages += zone_reclaimable_pages(zone);
			}

			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			/* Fold the slab pages freed into the reclaim total. */
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
		total_scanned += sc->nr_scanned;
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;

		/*
		 * Once more than 1.5 times the reclaim target has been
		 * scanned, wake the flusher threads and allow this pass to
		 * write pages itself from now on.
		 */
		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
		if (total_scanned > writeback_threshold) {
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
		}

		/*
		 * Outside hibernation, after a round that scanned something
		 * at a priority below DEF_PRIORITY - 2, wait up to HZ/10 on
		 * the first allowed zone if it is congested.
		 */
		if (!sc->hibernation_mode && sc->nr_scanned &&
		    sc->priority < DEF_PRIORITY - 2) {
			struct zone *preferred_zone;

			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
						&cpuset_current_mems_allowed,
						&preferred_zone);
			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
		}
	} while (--sc->priority >= 0);

out:
	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/* Nothing reclaimed and the OOM killer is disabled: report failure. */
	if (oom_killer_disabled)
		return 0;

	/* Reclaim was cut short so that compaction can run: report progress. */
	if (aborted_reclaim)
		return 1;

	/* Global reclaim with at least one zone still reclaimable. */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}
2179
2180static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2181{
2182 struct zone *zone;
2183 unsigned long pfmemalloc_reserve = 0;
2184 unsigned long free_pages = 0;
2185 int i;
2186 bool wmark_ok;
2187
2188 for (i = 0; i <= ZONE_NORMAL; i++) {
2189 zone = &pgdat->node_zones[i];
2190 pfmemalloc_reserve += min_wmark_pages(zone);
2191 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2192 }
2193
2194 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2195
2196
2197 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2198 pgdat->classzone_idx = min(pgdat->classzone_idx,
2199 (enum zone_type)ZONE_NORMAL);
2200 wake_up_interruptible(&pgdat->kswapd_wait);
2201 }
2202
2203 return wmark_ok;
2204}
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2216 nodemask_t *nodemask)
2217{
2218 struct zone *zone;
2219 int high_zoneidx = gfp_zone(gfp_mask);
2220 pg_data_t *pgdat;
2221
2222
2223
2224
2225
2226
2227
2228
2229 if (current->flags & PF_KTHREAD)
2230 goto out;
2231
2232
2233
2234
2235
2236 if (fatal_signal_pending(current))
2237 goto out;
2238
2239
2240 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2241 pgdat = zone->zone_pgdat;
2242 if (pfmemalloc_watermark_ok(pgdat))
2243 goto out;
2244
2245
2246 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256 if (!(gfp_mask & __GFP_FS)) {
2257 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2258 pfmemalloc_watermark_ok(pgdat), HZ);
2259
2260 goto check_pending;
2261 }
2262
2263
2264 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2265 pfmemalloc_watermark_ok(pgdat));
2266
2267check_pending:
2268 if (fatal_signal_pending(current))
2269 return true;
2270
2271out:
2272 return false;
2273}
2274
2275unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2276 gfp_t gfp_mask, nodemask_t *nodemask)
2277{
2278 unsigned long nr_reclaimed;
2279 struct scan_control sc = {
2280 .gfp_mask = gfp_mask,
2281 .may_writepage = !laptop_mode,
2282 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2283 .may_unmap = 1,
2284 .may_swap = 1,
2285 .order = order,
2286 .priority = DEF_PRIORITY,
2287 .target_mem_cgroup = NULL,
2288 .nodemask = nodemask,
2289 };
2290 struct shrink_control shrink = {
2291 .gfp_mask = sc.gfp_mask,
2292 };
2293
2294
2295
2296
2297
2298
2299 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2300 return 1;
2301
2302 trace_mm_vmscan_direct_reclaim_begin(order,
2303 sc.may_writepage,
2304 gfp_mask);
2305
2306 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2307
2308 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2309
2310 return nr_reclaimed;
2311}
2312
2313#ifdef CONFIG_MEMCG
2314
2315unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2316 gfp_t gfp_mask, bool noswap,
2317 struct zone *zone,
2318 unsigned long *nr_scanned)
2319{
2320 struct scan_control sc = {
2321 .nr_scanned = 0,
2322 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2323 .may_writepage = !laptop_mode,
2324 .may_unmap = 1,
2325 .may_swap = !noswap,
2326 .order = 0,
2327 .priority = 0,
2328 .target_mem_cgroup = memcg,
2329 };
2330 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2331
2332 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2333 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2334
2335 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2336 sc.may_writepage,
2337 sc.gfp_mask);
2338
2339
2340
2341
2342
2343
2344
2345
2346 shrink_lruvec(lruvec, &sc);
2347
2348 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2349
2350 *nr_scanned = sc.nr_scanned;
2351 return sc.nr_reclaimed;
2352}
2353
2354unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2355 gfp_t gfp_mask,
2356 bool noswap)
2357{
2358 struct zonelist *zonelist;
2359 unsigned long nr_reclaimed;
2360 int nid;
2361 struct scan_control sc = {
2362 .may_writepage = !laptop_mode,
2363 .may_unmap = 1,
2364 .may_swap = !noswap,
2365 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2366 .order = 0,
2367 .priority = DEF_PRIORITY,
2368 .target_mem_cgroup = memcg,
2369 .nodemask = NULL,
2370 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2371 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2372 };
2373 struct shrink_control shrink = {
2374 .gfp_mask = sc.gfp_mask,
2375 };
2376
2377
2378
2379
2380
2381
2382 nid = mem_cgroup_select_victim_node(memcg);
2383
2384 zonelist = NODE_DATA(nid)->node_zonelists;
2385
2386 trace_mm_vmscan_memcg_reclaim_begin(0,
2387 sc.may_writepage,
2388 sc.gfp_mask);
2389
2390 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2391
2392 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2393
2394 return nr_reclaimed;
2395}
2396#endif
2397
2398static void age_active_anon(struct zone *zone, struct scan_control *sc)
2399{
2400 struct mem_cgroup *memcg;
2401
2402 if (!total_swap_pages)
2403 return;
2404
2405 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2406 do {
2407 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2408
2409 if (inactive_anon_is_low(lruvec))
2410 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2411 sc, LRU_ACTIVE_ANON);
2412
2413 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2414 } while (memcg);
2415}
2416
2417static bool zone_balanced(struct zone *zone, int order,
2418 unsigned long balance_gap, int classzone_idx)
2419{
2420 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2421 balance_gap, classzone_idx, 0))
2422 return false;
2423
2424 if (COMPACTION_BUILD && order && !compaction_suitable(zone, order))
2425 return false;
2426
2427 return true;
2428}
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2447 int classzone_idx)
2448{
2449 unsigned long present_pages = 0;
2450 int i;
2451
2452 for (i = 0; i <= classzone_idx; i++)
2453 present_pages += pgdat->node_zones[i].present_pages;
2454
2455
2456 return balanced_pages >= (present_pages >> 2);
2457}
2458
2459
2460
2461
2462
2463
2464
2465static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2466 int classzone_idx)
2467{
2468 int i;
2469 unsigned long balanced = 0;
2470 bool all_zones_ok = true;
2471
2472
2473 if (remaining)
2474 return false;
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2486 wake_up(&pgdat->pfmemalloc_wait);
2487 return false;
2488 }
2489
2490
2491 for (i = 0; i <= classzone_idx; i++) {
2492 struct zone *zone = pgdat->node_zones + i;
2493
2494 if (!populated_zone(zone))
2495 continue;
2496
2497
2498
2499
2500
2501
2502
2503 if (zone->all_unreclaimable) {
2504 balanced += zone->present_pages;
2505 continue;
2506 }
2507
2508 if (!zone_balanced(zone, order, 0, i))
2509 all_zones_ok = false;
2510 else
2511 balanced += zone->present_pages;
2512 }
2513
2514
2515
2516
2517
2518
2519 if (order)
2520 return pgdat_balanced(pgdat, balanced, classzone_idx);
2521 else
2522 return all_zones_ok;
2523}
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
/*
 * Reclaim from the zones of @pgdat until they pass the balance checks below.
 *
 * @order:         allocation order to balance for; may be reset to 0 inside.
 * @classzone_idx: in/out.  Read for the node-wide balance check and the
 *                 final compaction check; overwritten with end_zone on return.
 *
 * Returns the order the node was finally balanced for.
 *
 * NOTE(review): current->reclaim_state is dereferenced without a NULL check;
 * the only caller in this file, kswapd(), installs one first.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
	int all_zones_ok;
	unsigned long balanced;
	int i;
	int end_zone = 0;
	unsigned long total_scanned;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_unmap = 1,
		.may_swap = 1,
		/* No per-call target; the loops below decide when to stop. */
		.nr_to_reclaim = ULONG_MAX,
		.order = order,
		.target_mem_cgroup = NULL,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
loop_again:
	/* Every full pass starts over at DEF_PRIORITY with fresh counters. */
	total_scanned = 0;
	sc.priority = DEF_PRIORITY;
	sc.nr_reclaimed = 0;
	sc.may_writepage = !laptop_mode;
	count_vm_event(PAGEOUTRUN);

	do {
		unsigned long lru_pages = 0;
		int has_under_min_watermark_zone = 0;

		all_zones_ok = 1;
		balanced = 0;

		/*
		 * Walk from the highest zone downwards to find end_zone, the
		 * highest zone that needs work.
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			/* Unreclaimable zones are only tried at DEF_PRIORITY. */
			if (zone->all_unreclaimable &&
			    sc.priority != DEF_PRIORITY)
				continue;

			/* Age the active anon list of every zone visited here. */
			age_active_anon(zone, &sc);

			/*
			 * With too many buffer heads a highmem zone always
			 * needs work, whatever its watermarks say.
			 */
			if (buffer_heads_over_limit && is_highmem_idx(i)) {
				end_zone = i;
				break;
			}

			if (!zone_balanced(zone, order, 0, 0)) {
				end_zone = i;
				break;
			} else {
				/* Zone is balanced: drop its congested flag. */
				zone_clear_flag(zone, ZONE_CONGESTED);
			}
		}
		/* The walk fell off the bottom: no zone needs work. */
		if (i < 0)
			goto out;

		/* Reclaimable pages in zones 0..end_zone, passed to shrink_slab(). */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			lru_pages += zone_reclaimable_pages(zone);
		}

		/*
		 * Main pass: work on zones 0..end_zone, from the lowest zone
		 * upwards.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;
			int nr_slab, testorder;
			unsigned long balance_gap;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable &&
			    sc.priority != DEF_PRIORITY)
				continue;

			sc.nr_scanned = 0;

			nr_soft_scanned = 0;
			/* Soft limit reclaim runs first and is accounted below. */
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
							order, sc.gfp_mask,
							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;
			total_scanned += nr_soft_scanned;

			/*
			 * Extra margin above the high watermark for the
			 * "needs shrinking" test: min(low watermark,
			 * present_pages / KSWAPD_ZONE_BALANCE_GAP_RATIO
			 * rounded up).
			 */
			balance_gap = min(low_wmark_pages(zone),
				(zone->present_pages +
					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
				KSWAPD_ZONE_BALANCE_GAP_RATIO);

			/*
			 * For a non-zero order on a compaction-enabled build,
			 * test the watermarks at order 0 unless
			 * compaction_suitable() returns COMPACT_SKIPPED.
			 */
			testorder = order;
			if (COMPACTION_BUILD && order &&
					compaction_suitable(zone, order) !=
						COMPACT_SKIPPED)
				testorder = 0;

			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
			    !zone_balanced(zone, testorder,
					   balance_gap, end_zone)) {
				shrink_zone(zone, &sc);

				reclaim_state->reclaimed_slab = 0;
				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
				total_scanned += sc.nr_scanned;

				/*
				 * The slab shrinker returned 0 and the zone has
				 * been scanned past its limit: mark it
				 * unreclaimable.
				 */
				if (nr_slab == 0 && !zone_reclaimable(zone))
					zone->all_unreclaimable = 1;
			}

			/*
			 * Allow writing pages from reclaim once more than
			 * 2 * SWAP_CLUSTER_MAX pages were scanned and the scan
			 * count exceeds 1.5 times the pages reclaimed.
			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
				sc.may_writepage = 1;

			/*
			 * An unreclaimable zone is not checked for balance.  If
			 * it is the top zone of this pass, lower end_zone (but
			 * never below 0).
			 */
			if (zone->all_unreclaimable) {
				if (end_zone && end_zone == i)
					end_zone--;
				continue;
			}

			if (!zone_balanced(zone, testorder, 0, end_zone)) {
				all_zones_ok = 0;

				/*
				 * Remember whether any zone is even below its
				 * min watermark; this suppresses the congestion
				 * wait further down.
				 */
				if (!zone_watermark_ok_safe(zone, order,
					    min_wmark_pages(zone), end_zone, 0))
					has_under_min_watermark_zone = 1;
			} else {
				/*
				 * Balanced: clear the congested flag and count
				 * the zone's pages towards the node-wide check
				 * if it lies at or below the class zone.
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
				if (i <= *classzone_idx)
					balanced += zone->present_pages;
			}

		}

		/*
		 * Wake tasks waiting on pfmemalloc_wait once the pfmemalloc
		 * watermark check passes again.
		 */
		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
				pfmemalloc_watermark_ok(pgdat))
			wake_up(&pgdat->pfmemalloc_wait);

		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
			break;

		/*
		 * After scanning at a priority below DEF_PRIORITY - 2, wait
		 * for congestion to ease unless some zone is under its min
		 * watermark.
		 */
		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
			if (has_under_min_watermark_zone)
				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
			else
				congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

		/*
		 * Enough was reclaimed in this pass; leave the priority loop
		 * and let the check at "out" decide whether to start over.
		 */
		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	} while (--sc.priority >= 0);
out:

	/* Not balanced yet: reschedule, honour the freezer and start over. */
	if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
		cond_resched();

		try_to_freeze();

		/*
		 * If this pass reclaimed fewer than SWAP_CLUSTER_MAX pages,
		 * give up on the high order and continue balancing for
		 * order 0.
		 */
		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
			order = sc.order = 0;

		goto loop_again;
	}

	/*
	 * For a non-zero order, compact the node unless at least one
	 * populated zone in 0..end_zone already passes its low watermark at
	 * that order.
	 */
	if (order) {
		int zones_need_compaction = 1;

		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			/* One zone with enough free pages at @order is sufficient. */
			if (zone_watermark_ok(zone, order,
				    low_wmark_pages(zone), *classzone_idx, 0))
				zones_need_compaction = 0;
		}

		if (zones_need_compaction)
			compact_pgdat(pgdat, order);
	}

	/* Report the highest zone that was worked on and the final order. */
	*classzone_idx = end_zone;
	return order;
}
2845
/*
 * Put the calling kswapd thread to sleep on pgdat->kswapd_wait if the node
 * is balanced for @order and @classzone_idx.
 *
 * The thread first sleeps for at most HZ/10.  Only if that short sleep ran
 * to completion and the node still passes prepare_kswapd_sleep() does it
 * sleep without a timeout.
 */
static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);

	if (freezing(current) || kthread_should_stop())
		return;

	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

	/* Short sleep first; "remaining" is what was left of it on wake-up. */
	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
		remaining = schedule_timeout(HZ/10);
		finish_wait(&pgdat->kswapd_wait, &wait);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}

	/*
	 * Check again.  This fails if the short sleep was cut short
	 * (remaining != 0) or the node is no longer balanced.
	 */
	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

		/*
		 * Use the normal per-cpu counter thresholds while asleep;
		 * the pressure thresholds are restored after waking up.
		 */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

		/* Reset compaction's isolation-suitable state for the node before sleeping. */
		reset_isolation_suitable(pgdat);

		if (!kthread_should_stop())
			schedule();

		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else {
		/* No long sleep: count which of the two checks prevented it. */
		if (remaining)
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait);
}
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
/*
 * Main function of the per-node kswapd kernel thread.  @p is the node's
 * pg_data_t.
 *
 * The thread loops until kthread_should_stop(): it picks up the request
 * published in pgdat->kswapd_max_order and pgdat->classzone_idx, balances the
 * node with balance_pgdat(), and sleeps when there is nothing harder to do.
 * Always returns 0.
 */
static int kswapd(void *p)
{
	unsigned long order, new_order;
	unsigned balanced_order;
	int classzone_idx, new_classzone_idx;
	int balanced_classzone_idx;
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;

	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	lockdep_set_current_reclaim_state(GFP_KERNEL);

	/* Restrict the thread to the CPUs of its node, if the node has any. */
	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);
	current->reclaim_state = &reclaim_state;

	/*
	 * Task flags set for the lifetime of the thread.
	 * NOTE(review): presumably PF_MEMALLOC grants access to memory
	 * reserves and PF_SWAPWRITE allows writing to swap; confirm against
	 * the flag definitions.
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	order = new_order = 0;
	balanced_order = 0;
	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
	balanced_classzone_idx = classzone_idx;
	for ( ; ; ) {
		int ret;

		/*
		 * Only pick up (and reset) a new request if the last
		 * balance_pgdat() run achieved what was asked of it: the same
		 * order and a class zone at least as high.
		 */
		if (balanced_classzone_idx >= new_classzone_idx &&
					balanced_order == new_order) {
			new_order = pgdat->kswapd_max_order;
			new_classzone_idx = pgdat->classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

		if (order < new_order || classzone_idx > new_classzone_idx) {
			/*
			 * The new request is harder (higher order or lower
			 * class zone): take it on without sleeping.
			 */
			order = new_order;
			classzone_idx = new_classzone_idx;
		} else {
			/*
			 * Try to sleep, then read and reset whatever request
			 * was published in the meantime.
			 */
			kswapd_try_to_sleep(pgdat, balanced_order,
						balanced_classzone_idx);
			order = pgdat->kswapd_max_order;
			classzone_idx = pgdat->classzone_idx;
			new_order = order;
			new_classzone_idx = classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

		ret = try_to_freeze();
		if (kthread_should_stop())
			break;

		/*
		 * Balancing is skipped for this iteration when
		 * try_to_freeze() returned non-zero.
		 */
		if (!ret) {
			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
			balanced_classzone_idx = classzone_idx;
			balanced_order = balance_pgdat(pgdat, order,
						&balanced_classzone_idx);
		}
	}

	current->reclaim_state = NULL;
	return 0;
}
3006
3007
3008
3009
3010void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3011{
3012 pg_data_t *pgdat;
3013
3014 if (!populated_zone(zone))
3015 return;
3016
3017 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3018 return;
3019 pgdat = zone->zone_pgdat;
3020 if (pgdat->kswapd_max_order < order) {
3021 pgdat->kswapd_max_order = order;
3022 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3023 }
3024 if (!waitqueue_active(&pgdat->kswapd_wait))
3025 return;
3026 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
3027 return;
3028
3029 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3030 wake_up_interruptible(&pgdat->kswapd_wait);
3031}
3032
3033
3034
3035
3036
3037
3038
3039
3040unsigned long global_reclaimable_pages(void)
3041{
3042 int nr;
3043
3044 nr = global_page_state(NR_ACTIVE_FILE) +
3045 global_page_state(NR_INACTIVE_FILE);
3046
3047 if (nr_swap_pages > 0)
3048 nr += global_page_state(NR_ACTIVE_ANON) +
3049 global_page_state(NR_INACTIVE_ANON);
3050
3051 return nr;
3052}
3053
3054unsigned long zone_reclaimable_pages(struct zone *zone)
3055{
3056 int nr;
3057
3058 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3059 zone_page_state(zone, NR_INACTIVE_FILE);
3060
3061 if (nr_swap_pages > 0)
3062 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3063 zone_page_state(zone, NR_INACTIVE_ANON);
3064
3065 return nr;
3066}
3067
3068#ifdef CONFIG_HIBERNATION
3069
3070
3071
3072
3073
3074
3075
3076
3077unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3078{
3079 struct reclaim_state reclaim_state;
3080 struct scan_control sc = {
3081 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3082 .may_swap = 1,
3083 .may_unmap = 1,
3084 .may_writepage = 1,
3085 .nr_to_reclaim = nr_to_reclaim,
3086 .hibernation_mode = 1,
3087 .order = 0,
3088 .priority = DEF_PRIORITY,
3089 };
3090 struct shrink_control shrink = {
3091 .gfp_mask = sc.gfp_mask,
3092 };
3093 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3094 struct task_struct *p = current;
3095 unsigned long nr_reclaimed;
3096
3097 p->flags |= PF_MEMALLOC;
3098 lockdep_set_current_reclaim_state(sc.gfp_mask);
3099 reclaim_state.reclaimed_slab = 0;
3100 p->reclaim_state = &reclaim_state;
3101
3102 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3103
3104 p->reclaim_state = NULL;
3105 lockdep_clear_current_reclaim_state();
3106 p->flags &= ~PF_MEMALLOC;
3107
3108 return nr_reclaimed;
3109}
3110#endif
3111
3112
3113
3114
3115
3116static int __devinit cpu_callback(struct notifier_block *nfb,
3117 unsigned long action, void *hcpu)
3118{
3119 int nid;
3120
3121 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3122 for_each_node_state(nid, N_HIGH_MEMORY) {
3123 pg_data_t *pgdat = NODE_DATA(nid);
3124 const struct cpumask *mask;
3125
3126 mask = cpumask_of_node(pgdat->node_id);
3127
3128 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3129
3130 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3131 }
3132 }
3133 return NOTIFY_OK;
3134}
3135
3136
3137
3138
3139
3140int kswapd_run(int nid)
3141{
3142 pg_data_t *pgdat = NODE_DATA(nid);
3143 int ret = 0;
3144
3145 if (pgdat->kswapd)
3146 return 0;
3147
3148 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3149 if (IS_ERR(pgdat->kswapd)) {
3150
3151 BUG_ON(system_state == SYSTEM_BOOTING);
3152 pgdat->kswapd = NULL;
3153 pr_err("Failed to start kswapd on node %d\n", nid);
3154 ret = PTR_ERR(pgdat->kswapd);
3155 }
3156 return ret;
3157}
3158
3159
3160
3161
3162
3163void kswapd_stop(int nid)
3164{
3165 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3166
3167 if (kswapd) {
3168 kthread_stop(kswapd);
3169 NODE_DATA(nid)->kswapd = NULL;
3170 }
3171}
3172
3173static int __init kswapd_init(void)
3174{
3175 int nid;
3176
3177 swap_setup();
3178 for_each_node_state(nid, N_HIGH_MEMORY)
3179 kswapd_run(nid);
3180 hotcpu_notifier(cpu_callback, 0);
3181 return 0;
3182}
3183
3184module_init(kswapd_init)
3185
3186#ifdef CONFIG_NUMA
3187
3188
3189
3190
3191
3192
/*
 * Bit mask selecting the zone reclaim behaviour; built from the RECLAIM_*
 * bits below.  In this part of the file the RECLAIM_WRITE and RECLAIM_SWAP
 * bits are tested by zone_pagecache_reclaimable() and __zone_reclaim().
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* NOTE(review): not tested in this part of the file */
#define RECLAIM_WRITE (1<<1)	/* __zone_reclaim() sets may_writepage from this bit */
#define RECLAIM_SWAP (1<<2)	/* __zone_reclaim() sets may_unmap from this bit */

/*
 * Priority at which __zone_reclaim() starts scanning; it is decremented on
 * every further shrink_zone() round.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * NOTE(review): presumably the percentage from which zone->min_unmapped_pages
 * is derived; the derivation is not in this part of the file -- confirm.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * NOTE(review): presumably the percentage from which zone->min_slab_pages is
 * derived; the derivation is not in this part of the file -- confirm.
 */
int sysctl_min_slab_ratio = 5;
3218
3219static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3220{
3221 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3222 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3223 zone_page_state(zone, NR_ACTIVE_FILE);
3224
3225
3226
3227
3228
3229
3230 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3231}
3232
3233
3234static long zone_pagecache_reclaimable(struct zone *zone)
3235{
3236 long nr_pagecache_reclaimable;
3237 long delta = 0;
3238
3239
3240
3241
3242
3243
3244
3245 if (zone_reclaim_mode & RECLAIM_SWAP)
3246 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3247 else
3248 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3249
3250
3251 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3252 delta += zone_page_state(zone, NR_FILE_DIRTY);
3253
3254
3255 if (unlikely(delta > nr_pagecache_reclaimable))
3256 delta = nr_pagecache_reclaimable;
3257
3258 return nr_pagecache_reclaimable - delta;
3259}
3260
3261
3262
3263
/*
 * Try to free at least 1 << @order pages from @zone, first from the page
 * cache and then from reclaimable slab.
 *
 * Returns non-zero if at least that many pages were reclaimed.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum number of pages that must be freed for success. */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		.may_swap = 1,
		.nr_to_reclaim = max_t(unsigned long, nr_pages,
				       SWAP_CLUSTER_MAX),
		.gfp_mask = gfp_mask,
		.order = order,
		.priority = ZONE_RECLAIM_PRIORITY,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
	unsigned long nr_slab_pages0, nr_slab_pages1;

	cond_resched();

	/*
	 * Set up the reclaim context of the calling task; it is torn down
	 * again before returning.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	lockdep_set_current_reclaim_state(gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
		/*
		 * Shrink the zone at decreasing priority until enough pages
		 * were reclaimed or priority 0 has been tried.
		 */
		do {
			shrink_zone(zone, &sc);
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
	if (nr_slab_pages0 > zone->min_slab_pages) {
		/*
		 * Shrink slab caches until the shrinkers report nothing left
		 * to do or the zone's reclaimable slab has dropped by at
		 * least nr_pages.
		 */
		for (;;) {
			unsigned long lru_pages = zone_reclaimable_pages(zone);

			/* A return of 0 ends the loop: nothing more to shrink. */
			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
				break;

			/* Goal reached? */
			nr_slab_pages1 = zone_page_state(zone,
							NR_SLAB_RECLAIMABLE);
			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
				break;
		}

		/*
		 * Credit the drop in reclaimable slab to nr_reclaimed; an
		 * increase is ignored.
		 */
		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
		if (nr_slab_pages1 < nr_slab_pages0)
			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	lockdep_clear_current_reclaim_state();
	return sc.nr_reclaimed >= nr_pages;
}
3346
3347int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3348{
3349 int node_id;
3350 int ret;
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3363 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3364 return ZONE_RECLAIM_FULL;
3365
3366 if (zone->all_unreclaimable)
3367 return ZONE_RECLAIM_FULL;
3368
3369
3370
3371
3372 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3373 return ZONE_RECLAIM_NOSCAN;
3374
3375
3376
3377
3378
3379
3380
3381 node_id = zone_to_nid(zone);
3382 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3383 return ZONE_RECLAIM_NOSCAN;
3384
3385 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3386 return ZONE_RECLAIM_NOSCAN;
3387
3388 ret = __zone_reclaim(zone, gfp_mask, order);
3389 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3390
3391 if (!ret)
3392 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3393
3394 return ret;
3395}
3396#endif
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
/*
 * page_evictable - test whether a page may be placed on an evictable list
 * @page: the page to test
 *
 * Returns 0 when the page's mapping is marked unevictable or the page is
 * mlocked, and 1 otherwise.
 */
int page_evictable(struct page *page)
{
	if (mapping_unevictable(page_mapping(page)))
		return 0;

	return !PageMlocked(page);
}
3414
3415#ifdef CONFIG_SHMEM
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425void check_move_unevictable_pages(struct page **pages, int nr_pages)
3426{
3427 struct lruvec *lruvec;
3428 struct zone *zone = NULL;
3429 int pgscanned = 0;
3430 int pgrescued = 0;
3431 int i;
3432
3433 for (i = 0; i < nr_pages; i++) {
3434 struct page *page = pages[i];
3435 struct zone *pagezone;
3436
3437 pgscanned++;
3438 pagezone = page_zone(page);
3439 if (pagezone != zone) {
3440 if (zone)
3441 spin_unlock_irq(&zone->lru_lock);
3442 zone = pagezone;
3443 spin_lock_irq(&zone->lru_lock);
3444 }
3445 lruvec = mem_cgroup_page_lruvec(page, zone);
3446
3447 if (!PageLRU(page) || !PageUnevictable(page))
3448 continue;
3449
3450 if (page_evictable(page)) {
3451 enum lru_list lru = page_lru_base_type(page);
3452
3453 VM_BUG_ON(PageActive(page));
3454 ClearPageUnevictable(page);
3455 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3456 add_page_to_lru_list(page, lruvec, lru);
3457 pgrescued++;
3458 }
3459 }
3460
3461 if (zone) {
3462 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3463 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3464 spin_unlock_irq(&zone->lru_lock);
3465 }
3466}
3467#endif
3468
/*
 * Print a warning, at most once, that the scan_unevictable_pages
 * sysctl/node interface has been disabled.  The message is prefixed with
 * the name (comm) of the task that touched the interface.
 */
static void warn_scan_unevictable_pages(void)
{
	printk_once(KERN_WARNING
		    "%s: The scan_unevictable_pages sysctl/node-interface has been "
		    "disabled for lack of a legitimate use case. If you have "
		    "one, please send an email to linux-mm@kvack.org.\n",
		    current->comm);
}
3477
3478
3479
3480
3481
3482unsigned long scan_unevictable_pages;
3483
3484int scan_unevictable_handler(struct ctl_table *table, int write,
3485 void __user *buffer,
3486 size_t *length, loff_t *ppos)
3487{
3488 warn_scan_unevictable_pages();
3489 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3490 scan_unevictable_pages = 0;
3491 return 0;
3492}
3493
3494#ifdef CONFIG_NUMA
3495
3496
3497
3498
3499
3500static ssize_t read_scan_unevictable_node(struct device *dev,
3501 struct device_attribute *attr,
3502 char *buf)
3503{
3504 warn_scan_unevictable_pages();
3505 return sprintf(buf, "0\n");
3506}
3507
3508static ssize_t write_scan_unevictable_node(struct device *dev,
3509 struct device_attribute *attr,
3510 const char *buf, size_t count)
3511{
3512 warn_scan_unevictable_pages();
3513 return 1;
3514}
3515
3516
/*
 * Device attribute "scan_unevictable_pages" (mode S_IRUGO | S_IWUSR),
 * wired to the read/write handlers above.  It is attached to and removed
 * from a node's device by scan_unevictable_register_node() and
 * scan_unevictable_unregister_node().
 */
static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
			read_scan_unevictable_node,
			write_scan_unevictable_node);
3520
3521int scan_unevictable_register_node(struct node *node)
3522{
3523 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3524}
3525
3526void scan_unevictable_unregister_node(struct node *node)
3527{
3528 device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3529}
3530#endif
3531