/*
 *  linux/mm/vmscan.c
 *
 *  The page reclaim ("vmscan") code: ages and shrinks the LRU lists,
 *  writes back or discards pages, and drives both kswapd and direct
 *  reclaim, including the slab shrinker callbacks.
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(), buffer_heads_over_limit */

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	unsigned long hibernation_mode;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can mapped pages be reclaimed? */
	int may_unmap;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	int order;

	/* Scan (total_size >> priority) pages at once */
	int priority;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;
};

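/* The page at the tail of the given LRU list */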
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);		 	\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
unsigned long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}
#endif

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	if (!mem_cgroup_disabled())
		return mem_cgroup_get_lru_size(lruvec, lru);

	return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
	atomic_long_set(&shrinker->nr_in_batch, 0);
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one shrinker from the list
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);

static inline int do_shrinker_shrink(struct shrinker *shrinker,
				     struct shrink_control *sc,
				     unsigned long nr_to_scan)
{
	sc->nr_to_scan = nr_to_scan;
	return (*shrinker->shrink)(shrinker, sc);
}

#define SHRINK_BATCH 128

/*
 * Call the shrink functions to age shrinkable caches.
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure
 * on slab to avoid swapping.
 *
 * We do weird things to avoid having to scan the dcache and icache, but in
 * reality the number of entries that we scan does not really matter.
 *
 * Returns the number of slab objects which we shrunk.
 */
208unsigned long shrink_slab(struct shrink_control *shrink,
209 unsigned long nr_pages_scanned,
210 unsigned long lru_pages)
211{
212 struct shrinker *shrinker;
213 unsigned long ret = 0;
214
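	/* Callers pass 0 to mean "apply a default minimum amount of pressure" */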
215 if (nr_pages_scanned == 0)
216 nr_pages_scanned = SWAP_CLUSTER_MAX;
217
218 if (!down_read_trylock(&shrinker_rwsem)) {
		/* Assume we'll be able to shrink next time */
220 ret = 1;
221 goto out;
222 }
223
224 list_for_each_entry(shrinker, &shrinker_list, list) {
225 unsigned long long delta;
226 long total_scan;
227 long max_pass;
228 int shrink_ret = 0;
229 long nr;
230 long new_nr;
231 long batch_size = shrinker->batch ? shrinker->batch
232 : SHRINK_BATCH;
233
234 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
235 if (max_pass <= 0)
236 continue;
237
		/*
		 * copy the current shrinker scan count into a local variable
		 * and zero it so that other concurrent shrinker invocations
		 * don't also do this scanning work.
		 */
243 nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
244
245 total_scan = nr;
246 delta = (4 * nr_pages_scanned) / shrinker->seeks;
247 delta *= max_pass;
248 do_div(delta, lru_pages + 1);
249 total_scan += delta;
250 if (total_scan < 0) {
251 printk(KERN_ERR "shrink_slab: %pF negative objects to "
252 "delete nr=%ld\n",
253 shrinker->shrink, total_scan);
254 total_scan = max_pass;
255 }

		/*
		 * We need to avoid excessive windup on filesystem shrinkers
		 * due to large numbers of GFP_NOFS allocations causing the
		 * shrinkers to return -1 all the time. This results in a large
		 * nr being built up so when a shrink that can do some work
		 * comes along it empties the entire cache due to nr >>>
		 * max_pass.  This is bad for sustaining a working set in
		 * memory.
		 *
		 * Hence only allow the shrinker to scan the entire cache when
		 * a large delta change is calculated directly.
		 */
269 if (delta < max_pass / 4)
270 total_scan = min(total_scan, max_pass / 2);

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
277 if (total_scan > max_pass * 2)
278 total_scan = max_pass * 2;
279
280 trace_mm_shrink_slab_start(shrinker, shrink, nr,
281 nr_pages_scanned, lru_pages,
282 max_pass, delta, total_scan);
283
284 while (total_scan >= batch_size) {
285 int nr_before;
286
287 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
288 shrink_ret = do_shrinker_shrink(shrinker, shrink,
289 batch_size);
290 if (shrink_ret == -1)
291 break;
292 if (shrink_ret < nr_before)
293 ret += nr_before - shrink_ret;
294 count_vm_events(SLABS_SCANNED, batch_size);
295 total_scan -= batch_size;
296
297 cond_resched();
298 }
299
		/*
		 * move the unused scan count back into the shrinker in a
		 * manner that handles concurrent updates. If we exhausted the
		 * scan, there is no need to do an update.
		 */
305 if (total_scan > 0)
306 new_nr = atomic_long_add_return(total_scan,
307 &shrinker->nr_in_batch);
308 else
309 new_nr = atomic_long_read(&shrinker->nr_in_batch);
310
311 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
312 }
313 up_read(&shrinker_rwsem);
314out:
315 cond_resched();
316 return ret;
317}
318
319static inline int is_page_cache_freeable(struct page *page)
320{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
326 return page_count(page) - page_has_private(page) == 2;
327}
328
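/*
 * May we write dirty pages from reclaim context to this backing device?
 */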
329static int may_write_to_queue(struct backing_dev_info *bdi,
330 struct scan_control *sc)
331{
332 if (current->flags & PF_SWAPWRITE)
333 return 1;
334 if (!bdi_write_congested(bdi))
335 return 1;
336 if (bdi == current->backing_dev_info)
337 return 1;
338 return 0;
339}
340
/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller
 * has __GFP_FS.
 */
353static void handle_write_error(struct address_space *mapping,
354 struct page *page, int error)
355{
356 lock_page(page);
357 if (page_mapping(page) == mapping)
358 mapping_set_error(mapping, error);
359 unlock_page(page);
360}
361
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
373
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
378static pageout_t pageout(struct page *page, struct address_space *mapping,
379 struct scan_control *sc)
380{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * would be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_aio_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.  This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't report writeback of
	 * swapcache pages.
	 */
397 if (!is_page_cache_freeable(page))
398 return PAGE_KEEP;
399 if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
404 if (page_has_private(page)) {
405 if (try_to_free_buffers(page)) {
406 ClearPageDirty(page);
407 printk("%s: orphaned page\n", __func__);
408 return PAGE_CLEAN;
409 }
410 }
411 return PAGE_KEEP;
412 }
413 if (mapping->a_ops->writepage == NULL)
414 return PAGE_ACTIVATE;
415 if (!may_write_to_queue(mapping->backing_dev_info, sc))
416 return PAGE_KEEP;
417
418 if (clear_page_dirty_for_io(page)) {
419 int res;
420 struct writeback_control wbc = {
421 .sync_mode = WB_SYNC_NONE,
422 .nr_to_write = SWAP_CLUSTER_MAX,
423 .range_start = 0,
424 .range_end = LLONG_MAX,
425 .for_reclaim = 1,
426 };
427
428 SetPageReclaim(page);
429 res = mapping->a_ops->writepage(page, &wbc);
430 if (res < 0)
431 handle_write_error(mapping, page, res);
432 if (res == AOP_WRITEPAGE_ACTIVATE) {
433 ClearPageReclaim(page);
434 return PAGE_ACTIVATE;
435 }
436
437 if (!PageWriteback(page)) {
438
439 ClearPageReclaim(page);
440 }
441 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
442 inc_zone_page_state(page, NR_VMSCAN_WRITE);
443 return PAGE_SUCCESS;
444 }
445
446 return PAGE_CLEAN;
447}
448

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
453static int __remove_mapping(struct address_space *mapping, struct page *page)
454{
455 BUG_ON(!PageLocked(page));
456 BUG_ON(mapping != page_mapping(page));
457
458 spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * arise and allows removal of the tests on a refcount after freezing.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
484 if (!page_freeze_refs(page, 2))
485 goto cannot_free;
486
487 if (unlikely(PageDirty(page))) {
488 page_unfreeze_refs(page, 2);
489 goto cannot_free;
490 }
491
492 if (PageSwapCache(page)) {
493 swp_entry_t swap = { .val = page_private(page) };
494 __delete_from_swap_cache(page);
495 spin_unlock_irq(&mapping->tree_lock);
496 swapcache_free(swap, page);
497 } else {
498 void (*freepage)(struct page *);
499
500 freepage = mapping->a_ops->freepage;
501
502 __delete_from_page_cache(page);
503 spin_unlock_irq(&mapping->tree_lock);
504 mem_cgroup_uncharge_cache_page(page);
505
506 if (freepage != NULL)
507 freepage(page);
508 }
509
510 return 1;
511
512cannot_free:
513 spin_unlock_irq(&mapping->tree_lock);
514 return 0;
515}
516

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
523int remove_mapping(struct address_space *mapping, struct page *page)
524{
525 if (__remove_mapping(mapping, page)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
531 page_unfreeze_refs(page, 1);
532 return 1;
533 }
534 return 0;
535}
536
/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
546void putback_lru_page(struct page *page)
547{
548 int lru;
549 int was_unevictable = PageUnevictable(page);
550
551 VM_BUG_ON(PageLRU(page));
552
553redo:
554 ClearPageUnevictable(page);
555
556 if (page_evictable(page)) {
557
558
559
560
561
562
563 lru = page_lru_base_type(page);
564 lru_cache_add(page);
565 } else {
566
567
568
569
570 lru = LRU_UNEVICTABLE;
571 add_page_to_unevictable_list(page);
572
573
574
575
576
577
578
579
580
581
582 smp_mb();
583 }
584
	/*
	 * The page's status can change while we move it among the LRU lists.
	 * If an evictable page ends up on the unevictable list it would never
	 * be freed, so re-check after adding it to the list.
	 */
590 if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
591 if (!isolate_lru_page(page)) {
592 put_page(page);
593 goto redo;
594 }
595
596
597
598
599 }
600
601 if (was_unevictable && lru != LRU_UNEVICTABLE)
602 count_vm_event(UNEVICTABLE_PGRESCUED);
603 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
604 count_vm_event(UNEVICTABLE_PGCULLED);
605
606 put_page(page);
607}
608
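/* Decision made by page_check_references() for a scanned page */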
609enum page_references {
610 PAGEREF_RECLAIM,
611 PAGEREF_RECLAIM_CLEAN,
612 PAGEREF_KEEP,
613 PAGEREF_ACTIVATE,
614};
615
616static enum page_references page_check_references(struct page *page,
617 struct scan_control *sc)
618{
619 int referenced_ptes, referenced_page;
620 unsigned long vm_flags;
621
622 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
623 &vm_flags);
624 referenced_page = TestClearPageReferenced(page);
625
626
627
628
629
630 if (vm_flags & VM_LOCKED)
631 return PAGEREF_RECLAIM;
632
633 if (referenced_ptes) {
634 if (PageSwapBacked(page))
635 return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
650 SetPageReferenced(page);
651
652 if (referenced_page || referenced_ptes > 1)
653 return PAGEREF_ACTIVATE;
654
		/*
		 * Activate file-backed executable pages after first usage.
		 */
658 if (vm_flags & VM_EXEC)
659 return PAGEREF_ACTIVATE;
660
661 return PAGEREF_KEEP;
662 }
663
664
665 if (referenced_page && !PageSwapBacked(page))
666 return PAGEREF_RECLAIM_CLEAN;
667
668 return PAGEREF_RECLAIM;
669}
670
/* Check if a page is dirty or under writeback */
672static void page_check_dirty_writeback(struct page *page,
673 bool *dirty, bool *writeback)
674{
675 struct address_space *mapping;
676
677
678
679
680
681 if (!page_is_file_cache(page)) {
682 *dirty = false;
683 *writeback = false;
684 return;
685 }
686
687
688 *dirty = PageDirty(page);
689 *writeback = PageWriteback(page);
690
691
692 if (!page_has_private(page))
693 return;
694
695 mapping = page_mapping(page);
696 if (mapping && mapping->a_ops->is_dirty_writeback)
697 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
698}
699
/*
 * shrink_page_list() returns the number of reclaimed pages
 */
703static unsigned long shrink_page_list(struct list_head *page_list,
704 struct zone *zone,
705 struct scan_control *sc,
706 enum ttu_flags ttu_flags,
707 unsigned long *ret_nr_dirty,
708 unsigned long *ret_nr_unqueued_dirty,
709 unsigned long *ret_nr_congested,
710 unsigned long *ret_nr_writeback,
711 unsigned long *ret_nr_immediate,
712 bool force_reclaim)
713{
714 LIST_HEAD(ret_pages);
715 LIST_HEAD(free_pages);
716 int pgactivate = 0;
717 unsigned long nr_unqueued_dirty = 0;
718 unsigned long nr_dirty = 0;
719 unsigned long nr_congested = 0;
720 unsigned long nr_reclaimed = 0;
721 unsigned long nr_writeback = 0;
722 unsigned long nr_immediate = 0;
723
724 cond_resched();
725
726 mem_cgroup_uncharge_start();
727 while (!list_empty(page_list)) {
728 struct address_space *mapping;
729 struct page *page;
730 int may_enter_fs;
731 enum page_references references = PAGEREF_RECLAIM_CLEAN;
732 bool dirty, writeback;
733
734 cond_resched();
735
736 page = lru_to_page(page_list);
737 list_del(&page->lru);
738
739 if (!trylock_page(page))
740 goto keep;
741
742 VM_BUG_ON(PageActive(page));
743 VM_BUG_ON(page_zone(page) != zone);
744
745 sc->nr_scanned++;
746
747 if (unlikely(!page_evictable(page)))
748 goto cull_mlocked;
749
750 if (!sc->may_unmap && page_mapped(page))
751 goto keep_locked;
752
		/* Double the slab pressure for mapped and swapcache pages */
754 if (page_mapped(page) || PageSwapCache(page))
755 sc->nr_scanned++;
756
757 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
758 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
759
760
761
762
763
764
765
766 page_check_dirty_writeback(page, &dirty, &writeback);
767 if (dirty || writeback)
768 nr_dirty++;
769
770 if (dirty && !writeback)
771 nr_unqueued_dirty++;

		/*
		 * Treat this page as congested if the underlying BDI is or if
		 * pages are cycling through the LRU so quickly that the
		 * pages marked for immediate reclaim are making it to the
		 * end of the LRU a second time.
		 */
779 mapping = page_mapping(page);
780 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
781 (writeback && PageReclaim(page)))
782 nr_congested++;
783
		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) kswapd finds the page both under writeback and marked
		 *    PageReclaim while the zone is flagged for writeback
		 *    throttling.  This indicates that pages are being queued
		 *    for IO but are recycled through the LRU before the IO
		 *    can complete; count it in nr_immediate so kswapd can
		 *    stall, and keep the page.
		 *
		 * 2) Global reclaim encounters the page, or memcg reclaim
		 *    encounters a page that is not yet marked for immediate
		 *    reclaim, or the caller does not have __GFP_IO.  Mark the
		 *    page for immediate reclaim, count it in nr_writeback and
		 *    keep scanning.
		 *
		 * 3) memcg reclaim encounters a page already marked
		 *    PageReclaim and may use __GFP_IO.  memcg has no dirty
		 *    page throttling of its own, so to avoid a needless OOM
		 *    simply wait for the writeback to complete.
		 */
821 if (PageWriteback(page)) {
822
823 if (current_is_kswapd() &&
824 PageReclaim(page) &&
825 zone_is_reclaim_writeback(zone)) {
826 nr_immediate++;
827 goto keep_locked;
828
829
830 } else if (global_reclaim(sc) ||
831 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
832
833
834
835
836
837
838
839
840
841
842
843 SetPageReclaim(page);
844 nr_writeback++;
845
846 goto keep_locked;
847
848
849 } else {
850 wait_on_page_writeback(page);
851 }
852 }
853
854 if (!force_reclaim)
855 references = page_check_references(page, sc);
856
857 switch (references) {
858 case PAGEREF_ACTIVATE:
859 goto activate_locked;
860 case PAGEREF_KEEP:
861 goto keep_locked;
862 case PAGEREF_RECLAIM:
863 case PAGEREF_RECLAIM_CLEAN:
864 ;
865 }
866
		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
871 if (PageAnon(page) && !PageSwapCache(page)) {
872 if (!(sc->gfp_mask & __GFP_IO))
873 goto keep_locked;
874 if (!add_to_swap(page, page_list))
875 goto activate_locked;
876 may_enter_fs = 1;
877
878
879 mapping = page_mapping(page);
880 }
881
		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
886 if (page_mapped(page) && mapping) {
887 switch (try_to_unmap(page, ttu_flags)) {
888 case SWAP_FAIL:
889 goto activate_locked;
890 case SWAP_AGAIN:
891 goto keep_locked;
892 case SWAP_MLOCK:
893 goto cull_mlocked;
894 case SWAP_SUCCESS:
895 ;
896 }
897 }
898
899 if (PageDirty(page)) {
900
901
902
903
904
905 if (page_is_file_cache(page) &&
906 (!current_is_kswapd() ||
907 !zone_is_reclaim_dirty(zone))) {
908
909
910
911
912
913
914 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
915 SetPageReclaim(page);
916
917 goto keep_locked;
918 }
919
920 if (references == PAGEREF_RECLAIM_CLEAN)
921 goto keep_locked;
922 if (!may_enter_fs)
923 goto keep_locked;
924 if (!sc->may_writepage)
925 goto keep_locked;

			/* Page is dirty, try to write it out here */
928 switch (pageout(page, mapping, sc)) {
929 case PAGE_KEEP:
930 goto keep_locked;
931 case PAGE_ACTIVATE:
932 goto activate_locked;
933 case PAGE_SUCCESS:
934 if (PageWriteback(page))
935 goto keep;
936 if (PageDirty(page))
937 goto keep;
938
939
940
941
942
943 if (!trylock_page(page))
944 goto keep;
945 if (PageDirty(page) || PageWriteback(page))
946 goto keep_locked;
947 mapping = page_mapping(page);
948 case PAGE_CLEAN:
949 ;
950 }
951 }
952
		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is
		 * actually clean (all its buffers are clean).  This happens
		 * if the buffers were written out directly, with submit_bh().
		 * ext3 will do this, as will the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers
		 * here and, if that worked and the page is no longer mapped
		 * into process address space (page_count == 1), it can be
		 * freed.  Otherwise, leave the page on the LRU so it is
		 * swappable.
		 */
974 if (page_has_private(page)) {
975 if (!try_to_release_page(page, sc->gfp_mask))
976 goto activate_locked;
977 if (!mapping && page_count(page) == 1) {
978 unlock_page(page);
979 if (put_page_testzero(page))
980 goto free_it;
981 else {
982
983
984
985
986
987
988
989 nr_reclaimed++;
990 continue;
991 }
992 }
993 }
994
995 if (!mapping || !__remove_mapping(mapping, page))
996 goto keep_locked;
997
998
999
1000
1001
1002
1003
1004
1005 __clear_page_locked(page);
1006free_it:
1007 nr_reclaimed++;
1008
1009
1010
1011
1012
1013 list_add(&page->lru, &free_pages);
1014 continue;
1015
1016cull_mlocked:
1017 if (PageSwapCache(page))
1018 try_to_free_swap(page);
1019 unlock_page(page);
1020 putback_lru_page(page);
1021 continue;
1022
1023activate_locked:
1024
1025 if (PageSwapCache(page) && vm_swap_full())
1026 try_to_free_swap(page);
1027 VM_BUG_ON(PageActive(page));
1028 SetPageActive(page);
1029 pgactivate++;
1030keep_locked:
1031 unlock_page(page);
1032keep:
1033 list_add(&page->lru, &ret_pages);
1034 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1035 }
1036
1037 free_hot_cold_page_list(&free_pages, 1);
1038
1039 list_splice(&ret_pages, page_list);
1040 count_vm_events(PGACTIVATE, pgactivate);
1041 mem_cgroup_uncharge_end();
1042 *ret_nr_dirty += nr_dirty;
1043 *ret_nr_congested += nr_congested;
1044 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
1045 *ret_nr_writeback += nr_writeback;
1046 *ret_nr_immediate += nr_immediate;
1047 return nr_reclaimed;
1048}
1049
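/*
 * Reclaim clean, unmapped file pages from @page_list without triggering
 * writeback or swap; used on pages that were isolated for migration.
 */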
1050unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1051 struct list_head *page_list)
1052{
1053 struct scan_control sc = {
1054 .gfp_mask = GFP_KERNEL,
1055 .priority = DEF_PRIORITY,
1056 .may_unmap = 1,
1057 };
1058 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
1059 struct page *page, *next;
1060 LIST_HEAD(clean_pages);
1061
1062 list_for_each_entry_safe(page, next, page_list, lru) {
1063 if (page_is_file_cache(page) && !PageDirty(page)) {
1064 ClearPageActive(page);
1065 list_move(&page->lru, &clean_pages);
1066 }
1067 }
1068
1069 ret = shrink_page_list(&clean_pages, zone, &sc,
1070 TTU_UNMAP|TTU_IGNORE_ACCESS,
1071 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
1072 list_splice(&clean_pages, page_list);
1073 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
1074 return ret;
1075}
1076

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes
 *
 * returns 0 on success, -ve errno on failure.
 */
1087int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1088{
1089 int ret = -EINVAL;
1090
1091
1092 if (!PageLRU(page))
1093 return ret;
1094
1095
1096 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1097 return ret;
1098
1099 ret = -EBUSY;
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1113
1114 if (PageWriteback(page))
1115 return ret;
1116
1117 if (PageDirty(page)) {
1118 struct address_space *mapping;
1119
1120
1121 if (mode & ISOLATE_CLEAN)
1122 return ret;
1123
1124
1125
1126
1127
1128
1129 mapping = page_mapping(page);
1130 if (mapping && !mapping->a_ops->migratepage)
1131 return ret;
1132 }
1133 }
1134
1135 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1136 return ret;
1137
1138 if (likely(get_page_unless_zero(page))) {
1139
1140
1141
1142
1143
1144 ClearPageLRU(page);
1145 ret = 0;
1146 }
1147
1148 return ret;
1149}
1150
/**
 * isolate_lru_pages - isolate some pages from an LRU list
 * @nr_to_scan:	The number of pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * returns how many pages were moved onto *@dst.
 */
1171static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1172 struct lruvec *lruvec, struct list_head *dst,
1173 unsigned long *nr_scanned, struct scan_control *sc,
1174 isolate_mode_t mode, enum lru_list lru)
1175{
1176 struct list_head *src = &lruvec->lists[lru];
1177 unsigned long nr_taken = 0;
1178 unsigned long scan;
1179
1180 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1181 struct page *page;
1182 int nr_pages;
1183
1184 page = lru_to_page(src);
1185 prefetchw_prev_lru_page(page, src, flags);
1186
1187 VM_BUG_ON(!PageLRU(page));
1188
1189 switch (__isolate_lru_page(page, mode)) {
1190 case 0:
1191 nr_pages = hpage_nr_pages(page);
1192 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1193 list_move(&page->lru, dst);
1194 nr_taken += nr_pages;
1195 break;
1196
1197 case -EBUSY:
1198
1199 list_move(&page->lru, src);
1200 continue;
1201
1202 default:
1203 BUG();
1204 }
1205 }
1206
1207 *nr_scanned = scan;
1208 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1209 nr_taken, mode, is_file_lru(lru));
1210 return nr_taken;
1211}
1212
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set; that
 * flag may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page.
 * (2) The lru_lock must not be held.
 * (3) Interrupts must be enabled.
 */
1238int isolate_lru_page(struct page *page)
1239{
1240 int ret = -EBUSY;
1241
1242 VM_BUG_ON(!page_count(page));
1243
1244 if (PageLRU(page)) {
1245 struct zone *zone = page_zone(page);
1246 struct lruvec *lruvec;
1247
1248 spin_lock_irq(&zone->lru_lock);
1249 lruvec = mem_cgroup_page_lruvec(page, zone);
1250 if (PageLRU(page)) {
1251 int lru = page_lru(page);
1252 get_page(page);
1253 ClearPageLRU(page);
1254 del_page_from_lru_list(page, lruvec, lru);
1255 ret = 0;
1256 }
1257 spin_unlock_irq(&zone->lru_lock);
1258 }
1259 return ret;
1260}
1261
/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
1269static int too_many_isolated(struct zone *zone, int file,
1270 struct scan_control *sc)
1271{
1272 unsigned long inactive, isolated;
1273
1274 if (current_is_kswapd())
1275 return 0;
1276
1277 if (!global_reclaim(sc))
1278 return 0;
1279
1280 if (file) {
1281 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1282 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1283 } else {
1284 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1285 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1286 }
1287
1288
1289
1290
1291
1292
1293 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1294 inactive >>= 3;
1295
1296 return isolated > inactive;
1297}
1298
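/* Put isolated pages that were not freed back onto their LRU lists */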
1299static noinline_for_stack void
1300putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1301{
1302 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1303 struct zone *zone = lruvec_zone(lruvec);
1304 LIST_HEAD(pages_to_free);
1305
1306
1307
1308
1309 while (!list_empty(page_list)) {
1310 struct page *page = lru_to_page(page_list);
1311 int lru;
1312
1313 VM_BUG_ON(PageLRU(page));
1314 list_del(&page->lru);
1315 if (unlikely(!page_evictable(page))) {
1316 spin_unlock_irq(&zone->lru_lock);
1317 putback_lru_page(page);
1318 spin_lock_irq(&zone->lru_lock);
1319 continue;
1320 }
1321
1322 lruvec = mem_cgroup_page_lruvec(page, zone);
1323
1324 SetPageLRU(page);
1325 lru = page_lru(page);
1326 add_page_to_lru_list(page, lruvec, lru);
1327
1328 if (is_active_lru(lru)) {
1329 int file = is_file_lru(lru);
1330 int numpages = hpage_nr_pages(page);
1331 reclaim_stat->recent_rotated[file] += numpages;
1332 }
1333 if (put_page_testzero(page)) {
1334 __ClearPageLRU(page);
1335 __ClearPageActive(page);
1336 del_page_from_lru_list(page, lruvec, lru);
1337
1338 if (unlikely(PageCompound(page))) {
1339 spin_unlock_irq(&zone->lru_lock);
1340 (*get_compound_page_dtor(page))(page);
1341 spin_lock_irq(&zone->lru_lock);
1342 } else
1343 list_add(&page->lru, &pages_to_free);
1344 }
1345 }
1346
1347
1348
1349
1350 list_splice(&pages_to_free, page_list);
1351}
1352
/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages.
 */
1357static noinline_for_stack unsigned long
1358shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1359 struct scan_control *sc, enum lru_list lru)
1360{
1361 LIST_HEAD(page_list);
1362 unsigned long nr_scanned;
1363 unsigned long nr_reclaimed = 0;
1364 unsigned long nr_taken;
1365 unsigned long nr_dirty = 0;
1366 unsigned long nr_congested = 0;
1367 unsigned long nr_unqueued_dirty = 0;
1368 unsigned long nr_writeback = 0;
1369 unsigned long nr_immediate = 0;
1370 isolate_mode_t isolate_mode = 0;
1371 int file = is_file_lru(lru);
1372 struct zone *zone = lruvec_zone(lruvec);
1373 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1374
1375 while (unlikely(too_many_isolated(zone, file, sc))) {
1376 congestion_wait(BLK_RW_ASYNC, HZ/10);
1377
1378
1379 if (fatal_signal_pending(current))
1380 return SWAP_CLUSTER_MAX;
1381 }
1382
1383 lru_add_drain();
1384
1385 if (!sc->may_unmap)
1386 isolate_mode |= ISOLATE_UNMAPPED;
1387 if (!sc->may_writepage)
1388 isolate_mode |= ISOLATE_CLEAN;
1389
1390 spin_lock_irq(&zone->lru_lock);
1391
1392 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1393 &nr_scanned, sc, isolate_mode, lru);
1394
1395 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1396 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1397
1398 if (global_reclaim(sc)) {
1399 zone->pages_scanned += nr_scanned;
1400 if (current_is_kswapd())
1401 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1402 else
1403 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1404 }
1405 spin_unlock_irq(&zone->lru_lock);
1406
1407 if (nr_taken == 0)
1408 return 0;
1409
1410 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1411 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1412 &nr_writeback, &nr_immediate,
1413 false);
1414
1415 spin_lock_irq(&zone->lru_lock);
1416
1417 reclaim_stat->recent_scanned[file] += nr_taken;
1418
1419 if (global_reclaim(sc)) {
1420 if (current_is_kswapd())
1421 __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1422 nr_reclaimed);
1423 else
1424 __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1425 nr_reclaimed);
1426 }
1427
1428 putback_inactive_pages(lruvec, &page_list);
1429
1430 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1431
1432 spin_unlock_irq(&zone->lru_lock);
1433
1434 free_hot_cold_page_list(&page_list, 1);
1435
	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
	 * that the long-lived page allocation rate is exceeding the page
	 * laundering rate. Either the global limits are not being effective
	 * at throttling processes due to the page distribution throughout
	 * zones or there is heavy usage of a slow backing device. The
	 * only option is to throttle from reclaim context which is not ideal
	 * as there is no guarantee the dirtying process is throttled in the
	 * same way balance_dirty_pages() manages.
	 *
	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
	 * of pages flagged for immediate reclaim and stall if any are
	 * encountered in the nr_immediate check below.
	 */
1450 if (nr_writeback && nr_writeback == nr_taken)
1451 zone_set_flag(zone, ZONE_WRITEBACK);
1452
1453
1454
1455
1456
1457 if (global_reclaim(sc)) {
1458
1459
1460
1461
1462 if (nr_dirty && nr_dirty == nr_congested)
1463 zone_set_flag(zone, ZONE_CONGESTED);
1464
1465
1466
1467
1468
1469
1470
1471
1472 if (nr_unqueued_dirty == nr_taken)
1473 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1474
1475
1476
1477
1478
1479
1480
1481 if (nr_unqueued_dirty == nr_taken || nr_immediate)
1482 congestion_wait(BLK_RW_ASYNC, HZ/10);
1483 }
1484
1485
1486
1487
1488
1489
1490 if (!sc->hibernation_mode && !current_is_kswapd())
1491 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1492
1493 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1494 zone_idx(zone),
1495 nr_scanned, nr_reclaimed,
1496 sc->priority,
1497 trace_shrink_flags(file));
1498 return nr_reclaimed;
1499}
1500
/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()), so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
1519static void move_active_pages_to_lru(struct lruvec *lruvec,
1520 struct list_head *list,
1521 struct list_head *pages_to_free,
1522 enum lru_list lru)
1523{
1524 struct zone *zone = lruvec_zone(lruvec);
1525 unsigned long pgmoved = 0;
1526 struct page *page;
1527 int nr_pages;
1528
1529 while (!list_empty(list)) {
1530 page = lru_to_page(list);
1531 lruvec = mem_cgroup_page_lruvec(page, zone);
1532
1533 VM_BUG_ON(PageLRU(page));
1534 SetPageLRU(page);
1535
1536 nr_pages = hpage_nr_pages(page);
1537 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1538 list_move(&page->lru, &lruvec->lists[lru]);
1539 pgmoved += nr_pages;
1540
1541 if (put_page_testzero(page)) {
1542 __ClearPageLRU(page);
1543 __ClearPageActive(page);
1544 del_page_from_lru_list(page, lruvec, lru);
1545
1546 if (unlikely(PageCompound(page))) {
1547 spin_unlock_irq(&zone->lru_lock);
1548 (*get_compound_page_dtor(page))(page);
1549 spin_lock_irq(&zone->lru_lock);
1550 } else
1551 list_add(&page->lru, pages_to_free);
1552 }
1553 }
1554 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1555 if (!is_active_lru(lru))
1556 __count_vm_events(PGDEACTIVATE, pgmoved);
1557}
1558
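/*
 * Age the active list: referenced executable file pages get another trip
 * around the active list, everything else is moved to the inactive list.
 */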
1559static void shrink_active_list(unsigned long nr_to_scan,
1560 struct lruvec *lruvec,
1561 struct scan_control *sc,
1562 enum lru_list lru)
1563{
1564 unsigned long nr_taken;
1565 unsigned long nr_scanned;
1566 unsigned long vm_flags;
1567 LIST_HEAD(l_hold);
1568 LIST_HEAD(l_active);
1569 LIST_HEAD(l_inactive);
1570 struct page *page;
1571 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1572 unsigned long nr_rotated = 0;
1573 isolate_mode_t isolate_mode = 0;
1574 int file = is_file_lru(lru);
1575 struct zone *zone = lruvec_zone(lruvec);
1576
1577 lru_add_drain();
1578
1579 if (!sc->may_unmap)
1580 isolate_mode |= ISOLATE_UNMAPPED;
1581 if (!sc->may_writepage)
1582 isolate_mode |= ISOLATE_CLEAN;
1583
1584 spin_lock_irq(&zone->lru_lock);
1585
1586 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1587 &nr_scanned, sc, isolate_mode, lru);
1588 if (global_reclaim(sc))
1589 zone->pages_scanned += nr_scanned;
1590
1591 reclaim_stat->recent_scanned[file] += nr_taken;
1592
1593 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1594 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1595 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1596 spin_unlock_irq(&zone->lru_lock);
1597
1598 while (!list_empty(&l_hold)) {
1599 cond_resched();
1600 page = lru_to_page(&l_hold);
1601 list_del(&page->lru);
1602
1603 if (unlikely(!page_evictable(page))) {
1604 putback_lru_page(page);
1605 continue;
1606 }
1607
1608 if (unlikely(buffer_heads_over_limit)) {
1609 if (page_has_private(page) && trylock_page(page)) {
1610 if (page_has_private(page))
1611 try_to_release_page(page, 0);
1612 unlock_page(page);
1613 }
1614 }
1615
1616 if (page_referenced(page, 0, sc->target_mem_cgroup,
1617 &vm_flags)) {
1618 nr_rotated += hpage_nr_pages(page);
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1629 list_add(&page->lru, &l_active);
1630 continue;
1631 }
1632 }
1633
1634 ClearPageActive(page);
1635 list_add(&page->lru, &l_inactive);
1636 }
1637
1638
1639
1640
1641 spin_lock_irq(&zone->lru_lock);
1642
1643
1644
1645
1646
1647
1648 reclaim_stat->recent_rotated[file] += nr_rotated;
1649
1650 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1651 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1652 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1653 spin_unlock_irq(&zone->lru_lock);
1654
1655 free_hot_cold_page_list(&l_hold, 1);
1656}
1657
1658#ifdef CONFIG_SWAP
1659static int inactive_anon_is_low_global(struct zone *zone)
1660{
1661 unsigned long active, inactive;
1662
1663 active = zone_page_state(zone, NR_ACTIVE_ANON);
1664 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1665
1666 if (inactive * zone->inactive_ratio < active)
1667 return 1;
1668
1669 return 0;
1670}
1671
/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
1679static int inactive_anon_is_low(struct lruvec *lruvec)
1680{
1681
1682
1683
1684
1685 if (!total_swap_pages)
1686 return 0;
1687
1688 if (!mem_cgroup_disabled())
1689 return mem_cgroup_inactive_anon_is_low(lruvec);
1690
1691 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1692}
1693#else
1694static inline int inactive_anon_is_low(struct lruvec *lruvec)
1695{
1696 return 0;
1697}
1698#endif
1699
/**
 * inactive_file_is_low - check if file pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * When the system is doing streaming IO, memory pressure here
 * ensures that active file pages get deactivated, until more
 * than half of the file pages are on the inactive list.
 *
 * Once we get to that situation, protect the system's working
 * set from being evicted.
 *
 * This uses a different ratio than the anonymous pages, because
 * the page cache uses a use-once replacement algorithm.
 */
1714static int inactive_file_is_low(struct lruvec *lruvec)
1715{
1716 unsigned long inactive;
1717 unsigned long active;
1718
1719 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1720 active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1721
1722 return active > inactive;
1723}
1724
1725static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1726{
1727 if (is_file_lru(lru))
1728 return inactive_file_is_low(lruvec);
1729 else
1730 return inactive_anon_is_low(lruvec);
1731}
1732
1733static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1734 struct lruvec *lruvec, struct scan_control *sc)
1735{
1736 if (is_active_lru(lru)) {
1737 if (inactive_list_is_low(lruvec, lru))
1738 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1739 return 0;
1740 }
1741
1742 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1743}
1744
1745static int vmscan_swappiness(struct scan_control *sc)
1746{
1747 if (global_reclaim(sc))
1748 return vm_swappiness;
1749 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1750}
1751
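/* How get_scan_count() balances the anon and file LRUs for this reclaim pass */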
1752enum scan_balance {
1753 SCAN_EQUAL,
1754 SCAN_FRACT,
1755 SCAN_ANON,
1756 SCAN_FILE,
1757};
1758
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned that we did rotate back
 * onto the active list instead of evict.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
1768static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1769 unsigned long *nr)
1770{
1771 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1772 u64 fraction[2];
1773 u64 denominator = 0;
1774 struct zone *zone = lruvec_zone(lruvec);
1775 unsigned long anon_prio, file_prio;
1776 enum scan_balance scan_balance;
1777 unsigned long anon, file, free;
1778 bool force_scan = false;
1779 unsigned long ap, fp;
1780 enum lru_list lru;
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792 if (current_is_kswapd() && zone->all_unreclaimable)
1793 force_scan = true;
1794 if (!global_reclaim(sc))
1795 force_scan = true;
1796
1797
1798 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1799 scan_balance = SCAN_FILE;
1800 goto out;
1801 }
1802
1803
1804
1805
1806
1807
1808
1809
1810 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1811 scan_balance = SCAN_FILE;
1812 goto out;
1813 }
1814
1815
1816
1817
1818
1819
1820 if (!sc->priority && vmscan_swappiness(sc)) {
1821 scan_balance = SCAN_EQUAL;
1822 goto out;
1823 }
1824
1825 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1826 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1827 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1828 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1829
1830
1831
1832
1833
1834
1835
1836 if (global_reclaim(sc)) {
1837 free = zone_page_state(zone, NR_FREE_PAGES);
1838 if (unlikely(file + free <= high_wmark_pages(zone))) {
1839 scan_balance = SCAN_ANON;
1840 goto out;
1841 }
1842 }
1843
	/*
	 * There is enough inactive page cache, do not reclaim
	 * anything from the anonymous working set right now.
	 */
1848 if (!inactive_file_is_low(lruvec)) {
1849 scan_balance = SCAN_FILE;
1850 goto out;
1851 }
1852
1853 scan_balance = SCAN_FRACT;
1854
1855
1856
1857
1858
1859 anon_prio = vmscan_swappiness(sc);
1860 file_prio = 200 - anon_prio;
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873 spin_lock_irq(&zone->lru_lock);
1874 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1875 reclaim_stat->recent_scanned[0] /= 2;
1876 reclaim_stat->recent_rotated[0] /= 2;
1877 }
1878
1879 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1880 reclaim_stat->recent_scanned[1] /= 2;
1881 reclaim_stat->recent_rotated[1] /= 2;
1882 }
1883
1884
1885
1886
1887
1888
1889 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1890 ap /= reclaim_stat->recent_rotated[0] + 1;
1891
1892 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1893 fp /= reclaim_stat->recent_rotated[1] + 1;
1894 spin_unlock_irq(&zone->lru_lock);
1895
1896 fraction[0] = ap;
1897 fraction[1] = fp;
1898 denominator = ap + fp + 1;
1899out:
1900 for_each_evictable_lru(lru) {
1901 int file = is_file_lru(lru);
1902 unsigned long size;
1903 unsigned long scan;
1904
1905 size = get_lru_size(lruvec, lru);
1906 scan = size >> sc->priority;
1907
1908 if (!scan && force_scan)
1909 scan = min(size, SWAP_CLUSTER_MAX);
1910
1911 switch (scan_balance) {
1912 case SCAN_EQUAL:
1913
1914 break;
1915 case SCAN_FRACT:
1916
1917
1918
1919
1920 scan = div64_u64(scan * fraction[file], denominator);
1921 break;
1922 case SCAN_FILE:
1923 case SCAN_ANON:
1924
1925 if ((scan_balance == SCAN_FILE) != file)
1926 scan = 0;
1927 break;
1928 default:
1929
1930 BUG();
1931 }
1932 nr[lru] = scan;
1933 }
1934}
1935
/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
1939static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1940{
1941 unsigned long nr[NR_LRU_LISTS];
1942 unsigned long targets[NR_LRU_LISTS];
1943 unsigned long nr_to_scan;
1944 enum lru_list lru;
1945 unsigned long nr_reclaimed = 0;
1946 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1947 struct blk_plug plug;
1948 bool scan_adjusted = false;
1949
1950 get_scan_count(lruvec, sc, nr);
1951
1952
1953 memcpy(targets, nr, sizeof(nr));
1954
1955 blk_start_plug(&plug);
1956 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1957 nr[LRU_INACTIVE_FILE]) {
1958 unsigned long nr_anon, nr_file, percentage;
1959 unsigned long nr_scanned;
1960
1961 for_each_evictable_lru(lru) {
1962 if (nr[lru]) {
1963 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
1964 nr[lru] -= nr_to_scan;
1965
1966 nr_reclaimed += shrink_list(lru, nr_to_scan,
1967 lruvec, sc);
1968 }
1969 }
1970
1971 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
1972 continue;
1973
1974
1975
1976
1977
1978
1979
1980 if (global_reclaim(sc) && !current_is_kswapd())
1981 break;
1982
1983
1984
1985
1986
1987
1988
1989
1990 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
1991 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
1992
1993 if (nr_file > nr_anon) {
1994 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
1995 targets[LRU_ACTIVE_ANON] + 1;
1996 lru = LRU_BASE;
1997 percentage = nr_anon * 100 / scan_target;
1998 } else {
1999 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2000 targets[LRU_ACTIVE_FILE] + 1;
2001 lru = LRU_FILE;
2002 percentage = nr_file * 100 / scan_target;
2003 }
2004
2005
2006 nr[lru] = 0;
2007 nr[lru + LRU_ACTIVE] = 0;
2008
2009
2010
2011
2012
2013 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2014 nr_scanned = targets[lru] - nr[lru];
2015 nr[lru] = targets[lru] * (100 - percentage) / 100;
2016 nr[lru] -= min(nr[lru], nr_scanned);
2017
2018 lru += LRU_ACTIVE;
2019 nr_scanned = targets[lru] - nr[lru];
2020 nr[lru] = targets[lru] * (100 - percentage) / 100;
2021 nr[lru] -= min(nr[lru], nr_scanned);
2022
2023 scan_adjusted = true;
2024 }
2025 blk_finish_plug(&plug);
2026 sc->nr_reclaimed += nr_reclaimed;
2027
2028
2029
2030
2031
2032 if (inactive_anon_is_low(lruvec))
2033 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2034 sc, LRU_ACTIVE_ANON);
2035
2036 throttle_vm_writeout(sc->gfp_mask);
2037}
2038
2039
2040static bool in_reclaim_compaction(struct scan_control *sc)
2041{
2042 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2043 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2044 sc->priority < DEF_PRIORITY - 2))
2045 return true;
2046
2047 return false;
2048}
2049
/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_zone() it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
2057static inline bool should_continue_reclaim(struct zone *zone,
2058 unsigned long nr_reclaimed,
2059 unsigned long nr_scanned,
2060 struct scan_control *sc)
2061{
2062 unsigned long pages_for_compaction;
2063 unsigned long inactive_lru_pages;
2064
2065
2066 if (!in_reclaim_compaction(sc))
2067 return false;
2068
2069
2070 if (sc->gfp_mask & __GFP_REPEAT) {
2071
2072
2073
2074
2075
2076
2077 if (!nr_reclaimed && !nr_scanned)
2078 return false;
2079 } else {
2080
2081
2082
2083
2084
2085
2086
2087
2088 if (!nr_reclaimed)
2089 return false;
2090 }
2091
2092
2093
2094
2095
2096 pages_for_compaction = (2UL << sc->order);
2097 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
2098 if (get_nr_swap_pages() > 0)
2099 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
2100 if (sc->nr_reclaimed < pages_for_compaction &&
2101 inactive_lru_pages > pages_for_compaction)
2102 return true;
2103
2104
2105 switch (compaction_suitable(zone, sc->order)) {
2106 case COMPACT_PARTIAL:
2107 case COMPACT_CONTINUE:
2108 return false;
2109 default:
2110 return true;
2111 }
2112}
2113
2114static void shrink_zone(struct zone *zone, struct scan_control *sc)
2115{
2116 unsigned long nr_reclaimed, nr_scanned;
2117
2118 do {
2119 struct mem_cgroup *root = sc->target_mem_cgroup;
2120 struct mem_cgroup_reclaim_cookie reclaim = {
2121 .zone = zone,
2122 .priority = sc->priority,
2123 };
2124 struct mem_cgroup *memcg;
2125
2126 nr_reclaimed = sc->nr_reclaimed;
2127 nr_scanned = sc->nr_scanned;
2128
2129 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2130 do {
2131 struct lruvec *lruvec;
2132
2133 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2134
2135 shrink_lruvec(lruvec, sc);
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147 if (!global_reclaim(sc) &&
2148 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2149 mem_cgroup_iter_break(root, memcg);
2150 break;
2151 }
2152 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2153 } while (memcg);
2154
2155 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2156 sc->nr_scanned - nr_scanned,
2157 sc->nr_reclaimed - nr_reclaimed);
2158
2159 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2160 sc->nr_scanned - nr_scanned, sc));
2161}
2162
2163
2164static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2165{
2166 unsigned long balance_gap, watermark;
2167 bool watermark_ok;
2168
2169
2170 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2171 return false;
2172
2173
2174
2175
2176
2177
2178
2179 balance_gap = min(low_wmark_pages(zone),
2180 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2181 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2182 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2183 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2184
2185
2186
2187
2188
2189 if (compaction_deferred(zone, sc->order))
2190 return watermark_ok;
2191
2192
2193 if (!compaction_suitable(zone, sc->order))
2194 return false;
2195
2196 return watermark_ok;
2197}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin. This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
2220static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2221{
2222 struct zoneref *z;
2223 struct zone *zone;
2224 unsigned long nr_soft_reclaimed;
2225 unsigned long nr_soft_scanned;
2226 bool aborted_reclaim = false;
2227
2228
2229
2230
2231
2232
2233 if (buffer_heads_over_limit)
2234 sc->gfp_mask |= __GFP_HIGHMEM;
2235
2236 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2237 gfp_zone(sc->gfp_mask), sc->nodemask) {
2238 if (!populated_zone(zone))
2239 continue;
2240
2241
2242
2243
2244 if (global_reclaim(sc)) {
2245 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2246 continue;
2247 if (zone->all_unreclaimable &&
2248 sc->priority != DEF_PRIORITY)
2249 continue;
2250 if (IS_ENABLED(CONFIG_COMPACTION)) {
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260 if (compaction_ready(zone, sc)) {
2261 aborted_reclaim = true;
2262 continue;
2263 }
2264 }
2265
2266
2267
2268
2269
2270
2271 nr_soft_scanned = 0;
2272 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2273 sc->order, sc->gfp_mask,
2274 &nr_soft_scanned);
2275 sc->nr_reclaimed += nr_soft_reclaimed;
2276 sc->nr_scanned += nr_soft_scanned;
2277
2278 }
2279
2280 shrink_zone(zone, sc);
2281 }
2282
2283 return aborted_reclaim;
2284}
2285
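/* A zone counts as reclaimable until it has been scanned six times over */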
2286static bool zone_reclaimable(struct zone *zone)
2287{
2288 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2289}
2290
2291
2292static bool all_unreclaimable(struct zonelist *zonelist,
2293 struct scan_control *sc)
2294{
2295 struct zoneref *z;
2296 struct zone *zone;
2297
2298 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2299 gfp_zone(sc->gfp_mask), sc->nodemask) {
2300 if (!populated_zone(zone))
2301 continue;
2302 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2303 continue;
2304 if (!zone->all_unreclaimable)
2305 return false;
2306 }
2307
2308 return true;
2309}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 * 		else, the number of pages reclaimed
 */
2327static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2328 struct scan_control *sc,
2329 struct shrink_control *shrink)
2330{
2331 unsigned long total_scanned = 0;
2332 struct reclaim_state *reclaim_state = current->reclaim_state;
2333 struct zoneref *z;
2334 struct zone *zone;
2335 unsigned long writeback_threshold;
2336 bool aborted_reclaim;
2337
2338 delayacct_freepages_start();
2339
2340 if (global_reclaim(sc))
2341 count_vm_event(ALLOCSTALL);
2342
2343 do {
2344 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2345 sc->priority);
2346 sc->nr_scanned = 0;
2347 aborted_reclaim = shrink_zones(zonelist, sc);
2348
2349
2350
2351
2352
2353
2354
2355 if (global_reclaim(sc)) {
2356 unsigned long lru_pages = 0;
2357 for_each_zone_zonelist(zone, z, zonelist,
2358 gfp_zone(sc->gfp_mask)) {
2359 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2360 continue;
2361
2362 lru_pages += zone_reclaimable_pages(zone);
2363 }
2364
2365 shrink_slab(shrink, sc->nr_scanned, lru_pages);
2366 if (reclaim_state) {
2367 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2368 reclaim_state->reclaimed_slab = 0;
2369 }
2370 }
2371 total_scanned += sc->nr_scanned;
2372 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2373 goto out;
2374
2375
2376
2377
2378
2379 if (sc->priority < DEF_PRIORITY - 2)
2380 sc->may_writepage = 1;
2381
2382
2383
2384
2385
2386
2387
2388
2389 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2390 if (total_scanned > writeback_threshold) {
2391 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2392 WB_REASON_TRY_TO_FREE_PAGES);
2393 sc->may_writepage = 1;
2394 }
2395 } while (--sc->priority >= 0 && !aborted_reclaim);
2396
2397out:
2398 delayacct_freepages_end();
2399
2400 if (sc->nr_reclaimed)
2401 return sc->nr_reclaimed;
2402
2403
2404
2405
2406
2407
2408 if (oom_killer_disabled)
2409 return 0;
2410
2411
2412 if (aborted_reclaim)
2413 return 1;
2414
2415
2416 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2417 return 1;
2418
2419 return 0;
2420}
2421
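/*
 * Returns true if the lowmem zones of this node hold more free pages than
 * half of their combined min watermarks; otherwise direct reclaimers get
 * throttled and kswapd is woken to rebalance the node.
 */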
2422static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2423{
2424 struct zone *zone;
2425 unsigned long pfmemalloc_reserve = 0;
2426 unsigned long free_pages = 0;
2427 int i;
2428 bool wmark_ok;
2429
2430 for (i = 0; i <= ZONE_NORMAL; i++) {
2431 zone = &pgdat->node_zones[i];
2432 pfmemalloc_reserve += min_wmark_pages(zone);
2433 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2434 }
2435
2436 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2437
2438
2439 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2440 pgdat->classzone_idx = min(pgdat->classzone_idx,
2441 (enum zone_type)ZONE_NORMAL);
2442 wake_up_interruptible(&pgdat->kswapd_wait);
2443 }
2444
2445 return wmark_ok;
2446}

/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
2457static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2458 nodemask_t *nodemask)
2459{
2460 struct zone *zone;
2461 int high_zoneidx = gfp_zone(gfp_mask);
2462 pg_data_t *pgdat;
2463
2464
2465
2466
2467
2468
2469
2470
2471 if (current->flags & PF_KTHREAD)
2472 goto out;
2473
2474
2475
2476
2477
2478 if (fatal_signal_pending(current))
2479 goto out;
2480
2481
2482 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2483 pgdat = zone->zone_pgdat;
2484 if (pfmemalloc_watermark_ok(pgdat))
2485 goto out;
2486
2487
2488 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498 if (!(gfp_mask & __GFP_FS)) {
2499 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2500 pfmemalloc_watermark_ok(pgdat), HZ);
2501
2502 goto check_pending;
2503 }
2504
2505
2506 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2507 pfmemalloc_watermark_ok(pgdat));
2508
2509check_pending:
2510 if (fatal_signal_pending(current))
2511 return true;
2512
2513out:
2514 return false;
2515}
2516
2517unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2518 gfp_t gfp_mask, nodemask_t *nodemask)
2519{
2520 unsigned long nr_reclaimed;
2521 struct scan_control sc = {
2522 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2523 .may_writepage = !laptop_mode,
2524 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2525 .may_unmap = 1,
2526 .may_swap = 1,
2527 .order = order,
2528 .priority = DEF_PRIORITY,
2529 .target_mem_cgroup = NULL,
2530 .nodemask = nodemask,
2531 };
2532 struct shrink_control shrink = {
2533 .gfp_mask = sc.gfp_mask,
2534 };
2535
2536
2537
2538
2539
2540
2541 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2542 return 1;
2543
2544 trace_mm_vmscan_direct_reclaim_begin(order,
2545 sc.may_writepage,
2546 gfp_mask);
2547
2548 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2549
2550 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2551
2552 return nr_reclaimed;
2553}
2554
2555#ifdef CONFIG_MEMCG
2556
2557unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2558 gfp_t gfp_mask, bool noswap,
2559 struct zone *zone,
2560 unsigned long *nr_scanned)
2561{
2562 struct scan_control sc = {
2563 .nr_scanned = 0,
2564 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2565 .may_writepage = !laptop_mode,
2566 .may_unmap = 1,
2567 .may_swap = !noswap,
2568 .order = 0,
2569 .priority = 0,
2570 .target_mem_cgroup = memcg,
2571 };
2572 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2573
2574 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2575 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2576
2577 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2578 sc.may_writepage,
2579 sc.gfp_mask);
2580
2581
2582
2583
2584
2585
2586
2587
2588 shrink_lruvec(lruvec, &sc);
2589
2590 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2591
2592 *nr_scanned = sc.nr_scanned;
2593 return sc.nr_reclaimed;
2594}
2595
2596unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2597 gfp_t gfp_mask,
2598 bool noswap)
2599{
2600 struct zonelist *zonelist;
2601 unsigned long nr_reclaimed;
2602 int nid;
2603 struct scan_control sc = {
2604 .may_writepage = !laptop_mode,
2605 .may_unmap = 1,
2606 .may_swap = !noswap,
2607 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2608 .order = 0,
2609 .priority = DEF_PRIORITY,
2610 .target_mem_cgroup = memcg,
2611 .nodemask = NULL,
2612 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2613 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2614 };
2615 struct shrink_control shrink = {
2616 .gfp_mask = sc.gfp_mask,
2617 };
2618
2619
2620
2621
2622
2623
2624 nid = mem_cgroup_select_victim_node(memcg);
2625
2626 zonelist = NODE_DATA(nid)->node_zonelists;
2627
2628 trace_mm_vmscan_memcg_reclaim_begin(0,
2629 sc.may_writepage,
2630 sc.gfp_mask);
2631
2632 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2633
2634 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2635
2636 return nr_reclaimed;
2637}
2638#endif
2639
2640static void age_active_anon(struct zone *zone, struct scan_control *sc)
2641{
2642 struct mem_cgroup *memcg;
2643
2644 if (!total_swap_pages)
2645 return;
2646
2647 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2648 do {
2649 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2650
2651 if (inactive_anon_is_low(lruvec))
2652 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2653 sc, LRU_ACTIVE_ANON);
2654
2655 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2656 } while (memcg);
2657}
2658
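/*
 * A zone is "balanced" when it is above its high watermark (plus the given
 * gap) for the requested classzone and, for high-order requests, when
 * compaction would be suitable.
 */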
2659static bool zone_balanced(struct zone *zone, int order,
2660 unsigned long balance_gap, int classzone_idx)
2661{
2662 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2663 balance_gap, classzone_idx, 0))
2664 return false;
2665
2666 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2667 !compaction_suitable(zone, order))
2668 return false;
2669
2670 return true;
2671}

/*
 * pgdat_balanced() is used when checking if a node is balanced.
 *
 * For order-0, all zones must be balanced!
 *
 * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the caller's classzone_idx are added to balanced_pages.
 * The total of balanced pages must be at least 25% of the zones allowed by
 * classzone_idx for the node to be considered balanced. Forcing all zones
 * to be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 */
2693static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2694{
2695 unsigned long managed_pages = 0;
2696 unsigned long balanced_pages = 0;
2697 int i;
2698
2699
2700 for (i = 0; i <= classzone_idx; i++) {
2701 struct zone *zone = pgdat->node_zones + i;
2702
2703 if (!populated_zone(zone))
2704 continue;
2705
2706 managed_pages += zone->managed_pages;
2707
2708
2709
2710
2711
2712
2713
2714
2715 if (zone->all_unreclaimable) {
2716 balanced_pages += zone->managed_pages;
2717 continue;
2718 }
2719
2720 if (zone_balanced(zone, order, 0, i))
2721 balanced_pages += zone->managed_pages;
2722 else if (!order)
2723 return false;
2724 }
2725
2726 if (order)
2727 return balanced_pages >= (managed_pages >> 2);
2728 else
2729 return true;
2730}

/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
2738static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2739 int classzone_idx)
2740{
2741
2742 if (remaining)
2743 return false;
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2755 wake_up(&pgdat->pfmemalloc_wait);
2756 return false;
2757 }
2758
2759 return pgdat_balanced(pgdat, order, classzone_idx);
2760}

/*
 * kswapd shrinks the zone by the number of pages required to reach
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
2770static bool kswapd_shrink_zone(struct zone *zone,
2771 int classzone_idx,
2772 struct scan_control *sc,
2773 unsigned long lru_pages,
2774 unsigned long *nr_attempted)
2775{
2776 unsigned long nr_slab;
2777 int testorder = sc->order;
2778 unsigned long balance_gap;
2779 struct reclaim_state *reclaim_state = current->reclaim_state;
2780 struct shrink_control shrink = {
2781 .gfp_mask = sc->gfp_mask,
2782 };
2783 bool lowmem_pressure;
2784
2785
2786 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2787
2788
2789
2790
2791
2792
2793
2794 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2795 compaction_suitable(zone, sc->order) !=
2796 COMPACT_SKIPPED)
2797 testorder = 0;
2798
2799
2800
2801
2802
2803
2804
2805 balance_gap = min(low_wmark_pages(zone),
2806 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2807 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2808
2809
2810
2811
2812
2813 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2814 if (!lowmem_pressure && zone_balanced(zone, testorder,
2815 balance_gap, classzone_idx))
2816 return true;
2817
2818 shrink_zone(zone, sc);
2819
2820 reclaim_state->reclaimed_slab = 0;
2821 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2822 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2823
2824
2825 *nr_attempted += sc->nr_to_reclaim;
2826
2827 if (nr_slab == 0 && !zone_reclaimable(zone))
2828 zone->all_unreclaimable = 1;
2829
2830 zone_clear_flag(zone, ZONE_WRITEBACK);
2831
2832
2833
2834
2835
2836
2837
2838 if (!zone->all_unreclaimable &&
2839 zone_balanced(zone, testorder, 0, classzone_idx)) {
2840 zone_clear_flag(zone, ZONE_CONGESTED);
2841 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2842 }
2843
2844 return sc->nr_scanned >= sc->nr_to_reclaim;
2845}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), that zone and the
 * lower zones are reclaimed regardless of the number of free pages.
 */
2868static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2869 int *classzone_idx)
2870{
2871 int i;
2872 int end_zone = 0;
2873 unsigned long nr_soft_reclaimed;
2874 unsigned long nr_soft_scanned;
2875 struct scan_control sc = {
2876 .gfp_mask = GFP_KERNEL,
2877 .priority = DEF_PRIORITY,
2878 .may_unmap = 1,
2879 .may_swap = 1,
2880 .may_writepage = !laptop_mode,
2881 .order = order,
2882 .target_mem_cgroup = NULL,
2883 };
2884 count_vm_event(PAGEOUTRUN);
2885
2886 do {
2887 unsigned long lru_pages = 0;
2888 unsigned long nr_attempted = 0;
2889 bool raise_priority = true;
2890 bool pgdat_needs_compaction = (order > 0);
2891
2892 sc.nr_reclaimed = 0;
2893
2894
2895
2896
2897
2898 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
2899 struct zone *zone = pgdat->node_zones + i;
2900
2901 if (!populated_zone(zone))
2902 continue;
2903
2904 if (zone->all_unreclaimable &&
2905 sc.priority != DEF_PRIORITY)
2906 continue;
2907
2908
2909
2910
2911
2912 age_active_anon(zone, &sc);
2913
2914
2915
2916
2917
2918
2919
2920 if (buffer_heads_over_limit && is_highmem_idx(i)) {
2921 end_zone = i;
2922 break;
2923 }
2924
2925 if (!zone_balanced(zone, order, 0, 0)) {
2926 end_zone = i;
2927 break;
2928 } else {
2929
2930
2931
2932
2933 zone_clear_flag(zone, ZONE_CONGESTED);
2934 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2935 }
2936 }
2937
2938 if (i < 0)
2939 goto out;
2940
2941 for (i = 0; i <= end_zone; i++) {
2942 struct zone *zone = pgdat->node_zones + i;
2943
2944 if (!populated_zone(zone))
2945 continue;
2946
2947 lru_pages += zone_reclaimable_pages(zone);
2948
2949
2950
2951
2952
2953
2954 if (pgdat_needs_compaction &&
2955 zone_watermark_ok(zone, order,
2956 low_wmark_pages(zone),
2957 *classzone_idx, 0))
2958 pgdat_needs_compaction = false;
2959 }
2960
2961
2962
2963
2964
2965 if (sc.priority < DEF_PRIORITY - 2)
2966 sc.may_writepage = 1;
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977 for (i = 0; i <= end_zone; i++) {
2978 struct zone *zone = pgdat->node_zones + i;
2979
2980 if (!populated_zone(zone))
2981 continue;
2982
2983 if (zone->all_unreclaimable &&
2984 sc.priority != DEF_PRIORITY)
2985 continue;
2986
2987 sc.nr_scanned = 0;
2988
2989 nr_soft_scanned = 0;
2990
2991
2992
2993 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2994 order, sc.gfp_mask,
2995 &nr_soft_scanned);
2996 sc.nr_reclaimed += nr_soft_reclaimed;
2997
2998
2999
3000
3001
3002
3003
3004 if (kswapd_shrink_zone(zone, end_zone, &sc,
3005 lru_pages, &nr_attempted))
3006 raise_priority = false;
3007 }
3013
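		/*
		 * If the pfmemalloc watermark is now met, processes throttled
		 * on pfmemalloc_wait can safely make forward progress again,
		 * so wake them.
		 */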
3014 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3015 pfmemalloc_watermark_ok(pgdat))
3016 wake_up(&pgdat->pfmemalloc_wait);
3025
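		/*
		 * Fragmentation may mean that the system cannot be rebalanced
		 * for high-order allocations in all zones.  If twice the
		 * allocation size has been reclaimed, drop back to order-0 so
		 * kswapd does not keep reclaiming excessively on behalf of a
		 * single high-order request.
		 */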
3026 if (order && sc.nr_reclaimed >= 2UL << order)
3027 order = sc.order = 0;
3028
3029
3030 if (try_to_freeze() || kthread_should_stop())
3031 break;
3032
3033
3034
3035
3036
3037 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
3038 compact_pgdat(pgdat, order);
3039
3040
3041
3042
3043
3044 if (raise_priority || !sc.nr_reclaimed)
3045 sc.priority--;
3046 } while (sc.priority >= 1 &&
3047 !pgdat_balanced(pgdat, order, *classzone_idx));
3048
3049out:
3055
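	/*
	 * Report the highest zone index that was scanned and return the
	 * order we were reclaiming at so the caller (and
	 * prepare_kswapd_sleep()) can base decisions on it.
	 */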
3056 *classzone_idx = end_zone;
3057 return order;
3058}
3059
3060static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3061{
3062 long remaining = 0;
3063 DEFINE_WAIT(wait);
3064
3065 if (freezing(current) || kthread_should_stop())
3066 return;
3067
3068 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3069
3070
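	/* Try to sleep for a short interval first */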
3071 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3072 remaining = schedule_timeout(HZ/10);
3073 finish_wait(&pgdat->kswapd_wait, &wait);
3074 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3075 }
3076
3077
3078
3079
3080
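	/*
	 * After a short sleep, check if it was a premature sleep.  If not,
	 * then go fully to sleep until explicitly woken up.
	 */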
3081 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3082 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3091
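		/*
		 * vmstat counters are not perfectly accurate and the
		 * estimated value for counters such as NR_FREE_PAGES can
		 * deviate from the true value by nr_online_cpus * threshold.
		 * To avoid breaching zone watermarks while under pressure,
		 * kswapd runs with a reduced per-cpu threshold; restore the
		 * normal threshold before sleeping and tighten it again on
		 * wakeup below.
		 */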
3092 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3099
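		/*
		 * Compaction records which pageblocks it recently failed to
		 * isolate pages from.  Since kswapd is about to sleep, assume
		 * conditions may have improved and reset that cache.
		 */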
3100 reset_isolation_suitable(pgdat);
3101
3102 if (!kthread_should_stop())
3103 schedule();
3104
3105 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3106 } else {
3107 if (remaining)
3108 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3109 else
3110 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3111 }
3112 finish_wait(&pgdat->kswapd_wait, &wait);
3113}
3127
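/*
 * The background pageout daemon, started as a kernel thread from the init
 * process.
 *
 * This basically trickles out pages so that we have _some_ free memory
 * available even if there is no other activity that frees anything up.
 * This is needed for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators (most normal
 * use), this basically shouldn't matter.
 */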
3128static int kswapd(void *p)
3129{
3130 unsigned long order, new_order;
3131 unsigned balanced_order;
3132 int classzone_idx, new_classzone_idx;
3133 int balanced_classzone_idx;
3134 pg_data_t *pgdat = (pg_data_t*)p;
3135 struct task_struct *tsk = current;
3136
3137 struct reclaim_state reclaim_state = {
3138 .reclaimed_slab = 0,
3139 };
3140 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3141
3142 lockdep_set_current_reclaim_state(GFP_KERNEL);
3143
3144 if (!cpumask_empty(cpumask))
3145 set_cpus_allowed_ptr(tsk, cpumask);
3146 current->reclaim_state = &reclaim_state;
3159
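	/*
	 * Tell the memory management code that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()").  "kswapd" should never get
	 * caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes you
	 * need a small amount of memory in order to be able to page out
	 * something else, and this flag essentially protects us from
	 * recursively trying to free more memory as we're trying to free
	 * the first piece of memory in the first place.)
	 */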
3160 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3161 set_freezable();
3162
3163 order = new_order = 0;
3164 balanced_order = 0;
3165 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
3166 balanced_classzone_idx = classzone_idx;
3167 for ( ; ; ) {
3168 bool ret;
3169
3170
3171
3172
3173
3174
3175 if (balanced_classzone_idx >= new_classzone_idx &&
3176 balanced_order == new_order) {
3177 new_order = pgdat->kswapd_max_order;
3178 new_classzone_idx = pgdat->classzone_idx;
3179 pgdat->kswapd_max_order = 0;
3180 pgdat->classzone_idx = pgdat->nr_zones - 1;
3181 }
3182
3183 if (order < new_order || classzone_idx > new_classzone_idx) {
3184
3185
3186
3187
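			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation or has tighter zone constraints.
			 */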
3188 order = new_order;
3189 classzone_idx = new_classzone_idx;
3190 } else {
3191 kswapd_try_to_sleep(pgdat, balanced_order,
3192 balanced_classzone_idx);
3193 order = pgdat->kswapd_max_order;
3194 classzone_idx = pgdat->classzone_idx;
3195 new_order = order;
3196 new_classzone_idx = classzone_idx;
3197 pgdat->kswapd_max_order = 0;
3198 pgdat->classzone_idx = pgdat->nr_zones - 1;
3199 }
3200
3201 ret = try_to_freeze();
3202 if (kthread_should_stop())
3203 break;
3204
3205
3206
3207
3208
3209 if (!ret) {
3210 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
3211 balanced_classzone_idx = classzone_idx;
3212 balanced_order = balance_pgdat(pgdat, order,
3213 &balanced_classzone_idx);
3214 }
3215 }
3216
3217 current->reclaim_state = NULL;
3218 return 0;
3219}
3220
3221
3222
3223
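/*
 * A zone is low on free memory: wake its kswapd task to service it.
 */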
3224void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3225{
3226 pg_data_t *pgdat;
3227
3228 if (!populated_zone(zone))
3229 return;
3230
3231 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3232 return;
3233 pgdat = zone->zone_pgdat;
3234 if (pgdat->kswapd_max_order < order) {
3235 pgdat->kswapd_max_order = order;
3236 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3237 }
3238 if (!waitqueue_active(&pgdat->kswapd_wait))
3239 return;
3240 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
3241 return;
3242
3243 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3244 wake_up_interruptible(&pgdat->kswapd_wait);
3245}
3253
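/*
 * The reclaimable count is mostly accurate.  The pages that are harder to
 * reclaim include
 * - mlocked pages, which will be moved to the unevictable list when
 *   encountered
 * - mapped pages, which may require several passes to be reclaimed
 * - dirty pages, which are not "instantly" reclaimable
 */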
3254unsigned long global_reclaimable_pages(void)
3255{
3256 int nr;
3257
3258 nr = global_page_state(NR_ACTIVE_FILE) +
3259 global_page_state(NR_INACTIVE_FILE);
3260
3261 if (get_nr_swap_pages() > 0)
3262 nr += global_page_state(NR_ACTIVE_ANON) +
3263 global_page_state(NR_INACTIVE_ANON);
3264
3265 return nr;
3266}
3267
3268unsigned long zone_reclaimable_pages(struct zone *zone)
3269{
3270 int nr;
3271
3272 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3273 zone_page_state(zone, NR_INACTIVE_FILE);
3274
3275 if (get_nr_swap_pages() > 0)
3276 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3277 zone_page_state(zone, NR_INACTIVE_ANON);
3278
3279 return nr;
3280}
3281
3282#ifdef CONFIG_HIBERNATION
3290
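/*
 * Try to free `nr_to_reclaim' pages of memory, system-wide, and return the
 * number of reclaimed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall LRU
 * order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */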
3291unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3292{
3293 struct reclaim_state reclaim_state;
3294 struct scan_control sc = {
3295 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3296 .may_swap = 1,
3297 .may_unmap = 1,
3298 .may_writepage = 1,
3299 .nr_to_reclaim = nr_to_reclaim,
3300 .hibernation_mode = 1,
3301 .order = 0,
3302 .priority = DEF_PRIORITY,
3303 };
3304 struct shrink_control shrink = {
3305 .gfp_mask = sc.gfp_mask,
3306 };
3307 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3308 struct task_struct *p = current;
3309 unsigned long nr_reclaimed;
3310
3311 p->flags |= PF_MEMALLOC;
3312 lockdep_set_current_reclaim_state(sc.gfp_mask);
3313 reclaim_state.reclaimed_slab = 0;
3314 p->reclaim_state = &reclaim_state;
3315
3316 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3317
3318 p->reclaim_state = NULL;
3319 lockdep_clear_current_reclaim_state();
3320 p->flags &= ~PF_MEMALLOC;
3321
3322 return nr_reclaimed;
3323}
3324#endif
3329
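/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but not
 * required for correctness.  So if the last cpu in a node goes away, we
 * get changed to run anywhere: as the first one comes back, restore their
 * cpu bindings.
 */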
3330static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3331 void *hcpu)
3332{
3333 int nid;
3334
3335 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3336 for_each_node_state(nid, N_MEMORY) {
3337 pg_data_t *pgdat = NODE_DATA(nid);
3338 const struct cpumask *mask;
3339
3340 mask = cpumask_of_node(pgdat->node_id);
3341
3342 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
				/* One of our CPUs online: restore mask */
3344 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3345 }
3346 }
3347 return NOTIFY_OK;
3348}
3353
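/*
 * This kswapd start function will be called by init and by node hot-add.
 * With node hot-add, kswapd is moved to the proper cpus once they come
 * online (see cpu_callback() above).
 */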
3354int kswapd_run(int nid)
3355{
3356 pg_data_t *pgdat = NODE_DATA(nid);
3357 int ret = 0;
3358
3359 if (pgdat->kswapd)
3360 return 0;
3361
3362 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3363 if (IS_ERR(pgdat->kswapd)) {
		/* Failure at boot is fatal */
3365 BUG_ON(system_state == SYSTEM_BOOTING);
3366 pr_err("Failed to start kswapd on node %d\n", nid);
3367 ret = PTR_ERR(pgdat->kswapd);
3368 pgdat->kswapd = NULL;
3369 }
3370 return ret;
3371}
3376
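/*
 * Called by memory hotplug when all memory in a node is offlined.
 */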
3377void kswapd_stop(int nid)
3378{
3379 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3380
3381 if (kswapd) {
3382 kthread_stop(kswapd);
3383 NODE_DATA(nid)->kswapd = NULL;
3384 }
3385}
3386
3387static int __init kswapd_init(void)
3388{
3389 int nid;
3390
3391 swap_setup();
3392 for_each_node_state(nid, N_MEMORY)
3393 kswapd_run(nid);
3394 hotcpu_notifier(cpu_callback, 0);
3395 return 0;
3396}
3397
3398module_init(kswapd_init)
3399
3400#ifdef CONFIG_NUMA
3406
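/*
 * Zone reclaim mode.
 *
 * If non-zero, zone_reclaim() is called when the number of free pages falls
 * below the watermarks; the RECLAIM_* bits below select how aggressive that
 * reclaim may be.
 */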
3407int zone_reclaim_mode __read_mostly;
3408
3409#define RECLAIM_OFF 0
3410#define RECLAIM_ZONE (1<<0)
3411#define RECLAIM_WRITE (1<<1)
3412#define RECLAIM_SWAP (1<<2)
3413
3414
3415
3416
3417
3418
3419#define ZONE_RECLAIM_PRIORITY 4
3420
3421
3422
3423
3424
3425int sysctl_min_unmapped_ratio = 1;
3426
3427
3428
3429
3430
3431int sysctl_min_slab_ratio = 5;
3432
3433static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3434{
3435 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3436 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3437 zone_page_state(zone, NR_ACTIVE_FILE);
3438
3439
3440
3441
3442
3443
3444 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3445}
3446
3447
3448static long zone_pagecache_reclaimable(struct zone *zone)
3449{
3450 long nr_pagecache_reclaimable;
3451 long delta = 0;
3452
3453
3454
3455
3456
3457
3458
3459 if (zone_reclaim_mode & RECLAIM_SWAP)
3460 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3461 else
3462 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3463
3464
3465 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3466 delta += zone_page_state(zone, NR_FILE_DIRTY);
3467
3468
3469 if (unlikely(delta > nr_pagecache_reclaimable))
3470 delta = nr_pagecache_reclaimable;
3471
3472 return nr_pagecache_reclaimable - delta;
3473}
3474
3475
3476
3477
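/*
 * Try to free up some pages from this zone through reclaim.
 */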
3478static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3479{
	/* Minimum pages needed in order to stay on node */
3481 const unsigned long nr_pages = 1 << order;
3482 struct task_struct *p = current;
3483 struct reclaim_state reclaim_state;
3484 struct scan_control sc = {
3485 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3486 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3487 .may_swap = 1,
3488 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3489 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3490 .order = order,
3491 .priority = ZONE_RECLAIM_PRIORITY,
3492 };
3493 struct shrink_control shrink = {
3494 .gfp_mask = sc.gfp_mask,
3495 };
3496 unsigned long nr_slab_pages0, nr_slab_pages1;
3497
3498 cond_resched();
3499
3500
3501
3502
3503
3504 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3505 lockdep_set_current_reclaim_state(gfp_mask);
3506 reclaim_state.reclaimed_slab = 0;
3507 p->reclaim_state = &reclaim_state;
3508
3509 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3510
3511
3512
3513
3514 do {
3515 shrink_zone(zone, &sc);
3516 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3517 }
3518
3519 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3520 if (nr_slab_pages0 > zone->min_slab_pages) {
3530
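		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone, so take the current
		 * number of slab pages and shake the slab until it is reduced
		 * by the same nr_pages that we used for reclaiming unmapped
		 * pages.
		 *
		 * Note that shrink_slab will free memory on all zones and may
		 * take a long time.
		 */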
3531 for (;;) {
3532 unsigned long lru_pages = zone_reclaimable_pages(zone);
3533
3534
3535 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3536 break;
3537
3538
3539 nr_slab_pages1 = zone_page_state(zone,
3540 NR_SLAB_RECLAIMABLE);
3541 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3542 break;
3543 }
3544
3545
3546
3547
3548
3549 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3550 if (nr_slab_pages1 < nr_slab_pages0)
3551 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3552 }
3553
3554 p->reclaim_state = NULL;
3555 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3556 lockdep_clear_current_reclaim_state();
3557 return sc.nr_reclaimed >= nr_pages;
3558}
3559
3560int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3561{
3562 int node_id;
3563 int ret;
3574
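	/*
	 * Zone reclaim reclaims unmapped file backed pages and slab pages
	 * if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for file
	 * I/O, otherwise pages read by file I/O will be immediately thrown
	 * out if the zone is overallocated.  So we do not reclaim if less
	 * than a specified percentage of the zone is used by unmapped file
	 * backed pages.
	 */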
3575 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3576 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3577 return ZONE_RECLAIM_FULL;
3578
3579 if (zone->all_unreclaimable)
3580 return ZONE_RECLAIM_FULL;
3581
3582
3583
3584
3585 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3586 return ZONE_RECLAIM_NOSCAN;
3593
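	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors.  This favors the local processor over
	 * remote processors and spreads off-node memory allocations as wide
	 * as possible.
	 */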
3594 node_id = zone_to_nid(zone);
3595 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3596 return ZONE_RECLAIM_NOSCAN;
3597
3598 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3599 return ZONE_RECLAIM_NOSCAN;
3600
3601 ret = __zone_reclaim(zone, gfp_mask, order);
3602 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3603
3604 if (!ret)
3605 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3606
3607 return ret;
3608}
3609#endif
3622
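/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable, i.e. should be placed on the
 * active/inactive lists rather than the unevictable list.
 *
 * Reasons a page might not be evictable:
 * (1) the page's mapping is marked unevictable
 * (2) the page is part of an mlocked VMA
 */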
3623int page_evictable(struct page *page)
3624{
3625 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3626}
3627
3628#ifdef CONFIG_SHMEM
3637
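/**
 * check_move_unevictable_pages - move evictable pages off the unevictable list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks each page for evictability and, if a page has become evictable,
 * moves it from the unevictable LRU to the appropriate LRU list.
 */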
3638void check_move_unevictable_pages(struct page **pages, int nr_pages)
3639{
3640 struct lruvec *lruvec;
3641 struct zone *zone = NULL;
3642 int pgscanned = 0;
3643 int pgrescued = 0;
3644 int i;
3645
3646 for (i = 0; i < nr_pages; i++) {
3647 struct page *page = pages[i];
3648 struct zone *pagezone;
3649
3650 pgscanned++;
3651 pagezone = page_zone(page);
3652 if (pagezone != zone) {
3653 if (zone)
3654 spin_unlock_irq(&zone->lru_lock);
3655 zone = pagezone;
3656 spin_lock_irq(&zone->lru_lock);
3657 }
3658 lruvec = mem_cgroup_page_lruvec(page, zone);
3659
3660 if (!PageLRU(page) || !PageUnevictable(page))
3661 continue;
3662
3663 if (page_evictable(page)) {
3664 enum lru_list lru = page_lru_base_type(page);
3665
3666 VM_BUG_ON(PageActive(page));
3667 ClearPageUnevictable(page);
3668 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3669 add_page_to_lru_list(page, lruvec, lru);
3670 pgrescued++;
3671 }
3672 }
3673
3674 if (zone) {
3675 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3676 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3677 spin_unlock_irq(&zone->lru_lock);
3678 }
3679}
3680#endif
3681
3682static void warn_scan_unevictable_pages(void)
3683{
3684 printk_once(KERN_WARNING
3685 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3686 "disabled for lack of a legitimate use case. If you have "
3687 "one, please send an email to linux-mm@kvack.org.\n",
3688 current->comm);
3689}
3690
3691
3692
3693
3694
3695unsigned long scan_unevictable_pages;
3696
3697int scan_unevictable_handler(struct ctl_table *table, int write,
3698 void __user *buffer,
3699 size_t *length, loff_t *ppos)
3700{
3701 warn_scan_unevictable_pages();
3702 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3703 scan_unevictable_pages = 0;
3704 return 0;
3705}
3706
3707#ifdef CONFIG_NUMA
3708
3709
3710
3711
3712
3713static ssize_t read_scan_unevictable_node(struct device *dev,
3714 struct device_attribute *attr,
3715 char *buf)
3716{
3717 warn_scan_unevictable_pages();
3718 return sprintf(buf, "0\n");
3719}
3720
3721static ssize_t write_scan_unevictable_node(struct device *dev,
3722 struct device_attribute *attr,
3723 const char *buf, size_t count)
3724{
3725 warn_scan_unevictable_pages();
3726 return 1;
3727}
3728
3729
3730static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3731 read_scan_unevictable_node,
3732 write_scan_unevictable_node);
3733
3734int scan_unevictable_register_node(struct node *node)
3735{
3736 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3737}
3738
3739void scan_unevictable_unregister_node(struct node *node)
3740{
3741 device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3742}
3743#endif
3744