1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/gfp.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/vmstat.h>
23#include <linux/file.h>
24#include <linux/writeback.h>
25#include <linux/blkdev.h>
26#include <linux/buffer_head.h>
27
28#include <linux/mm_inline.h>
29#include <linux/backing-dev.h>
30#include <linux/rmap.h>
31#include <linux/topology.h>
32#include <linux/cpu.h>
33#include <linux/cpuset.h>
34#include <linux/compaction.h>
35#include <linux/notifier.h>
36#include <linux/rwsem.h>
37#include <linux/delay.h>
38#include <linux/kthread.h>
39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
43#include <linux/oom.h>
44#include <linux/prefetch.h>
45
46#include <asm/tlbflush.h>
47#include <asm/div64.h>
48
49#include <linux/swapops.h>
50
51#include "internal.h"
52
53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h>
55
56
57
58
59
60
61
62
63
64
65
66
/*
 * reclaim_mode_t - bit flags kept in scan_control.reclaim_mode.
 *
 * set_reclaim_mode() selects either SINGLE|ASYNC, or one of
 * LUMPYRECLAIM/COMPACTION combined with one of ASYNC/SYNC.
 *
 * RECLAIM_MODE_SINGLE:       default mode; should_reclaim_stall() never
 *                            stalls while it is set.
 * RECLAIM_MODE_ASYNC:        do not wait on pages under writeback.
 * RECLAIM_MODE_SYNC:         shrink_page_list() may wait for writeback to
 *                            finish (only when it is allowed to enter the fs).
 * RECLAIM_MODE_LUMPYRECLAIM: page_check_references() ignores references and
 *                            shrink_inactive_list() also isolates active pages.
 * RECLAIM_MODE_COMPACTION:   chosen instead of LUMPYRECLAIM when
 *                            COMPACTION_BUILD is set.
 */
typedef unsigned __bitwise__ reclaim_mode_t;
#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
/*
 * struct scan_control - parameters and running counters shared by the
 * reclaim functions in this file for one reclaim invocation.
 */
struct scan_control {
	/* Pages examined so far; incremented by shrink_page_list(). */
	unsigned long nr_scanned;

	/*
	 * NOTE(review): presumably the running total of pages freed; it is
	 * not written in this part of the file - confirm against callers.
	 */
	unsigned long nr_reclaimed;

	/*
	 * NOTE(review): presumably the reclaim target for this invocation;
	 * not read in this part of the file - confirm against callers.
	 */
	unsigned long nr_to_reclaim;

	/* NOTE(review): not used in this part of the file - confirm meaning. */
	unsigned long hibernation_mode;

	/*
	 * Allocation context of the caller.  shrink_page_list() tests
	 * __GFP_FS and __GFP_IO to decide whether it may enter the
	 * filesystem or start swap I/O.
	 */
	gfp_t gfp_mask;

	/*
	 * When zero, dirty pages are kept rather than written out
	 * (shrink_page_list()) and isolation adds ISOLATE_CLEAN.
	 */
	int may_writepage;

	/*
	 * When zero, mapped pages are kept rather than unmapped
	 * (shrink_page_list()) and isolation adds ISOLATE_UNMAPPED.
	 */
	int may_unmap;

	/* NOTE(review): not read in this part of the file - confirm meaning. */
	int may_swap;

	/*
	 * Allocation order that triggered reclaim.  A non-zero value enables
	 * the pfn-neighbourhood isolation in isolate_lru_pages() and feeds
	 * the decisions in set_reclaim_mode()/should_reclaim_stall().
	 */
	int order;

	/*
	 * Current RECLAIM_MODE_* flag set.  Written by set_reclaim_mode()
	 * and reset_reclaim_mode().
	 */
	reclaim_mode_t reclaim_mode;

	/*
	 * Memory cgroup being reclaimed from.  NULL means global reclaim
	 * (see global_reclaim()).
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * NOTE(review): presumably restricts which nodes may be scanned;
	 * not read in this part of the file - confirm against callers.
	 */
	nodemask_t *nodemask;
};
117
/*
 * struct mem_cgroup_zone - the (memory cgroup, zone) pair whose LRU lists
 * are being scanned.  A NULL @mem_cgroup means the global LRU of @zone
 * (see scanning_global_lru()).
 */
struct mem_cgroup_zone {
	struct mem_cgroup *mem_cgroup;
	struct zone *zone;
};
122
/*
 * lru_to_page - return the struct page whose ->lru member is the list entry
 * immediately before @_head.  When @_head is a list head this is the page
 * at the tail of that list.
 */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
124
/*
 * prefetch_prev_lru_page - prefetch (for read) field @_field of the page that
 * precedes @_page on its LRU list, unless that predecessor is the list head
 * @_base.  Expands to nothing when the architecture does not define
 * ARCH_HAS_PREFETCH.
 *
 * @_page and @_base are fully parenthesized in the expansion so that
 * arbitrary pointer expressions may be passed safely.
 */
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base)) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&((_page)->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
138
/*
 * prefetchw_prev_lru_page - prefetch (for write) field @_field of the page
 * that precedes @_page on its LRU list, unless that predecessor is the list
 * head @_base.  Expands to nothing when the architecture does not define
 * ARCH_HAS_PREFETCHW.
 *
 * @_page and @_base are fully parenthesized in the expansion so that
 * arbitrary pointer expressions may be passed safely.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != (_base)) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&((_page)->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
152
153
154
155
/*
 * Tunable, default 60.  NOTE(review): not read in this part of the file;
 * presumably biases reclaim between anonymous and file pages - confirm.
 */
int vm_swappiness = 60;
/* NOTE(review): not used in this part of the file - confirm meaning. */
long vm_total_pages;

/*
 * All registered shrinkers.  shrinker_rwsem is taken for write by
 * register_shrinker()/unregister_shrinker() and for read (trylock) by
 * shrink_slab().
 */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
161
162#ifdef CONFIG_CGROUP_MEM_RES_CTLR
163static bool global_reclaim(struct scan_control *sc)
164{
165 return !sc->target_mem_cgroup;
166}
167
168static bool scanning_global_lru(struct mem_cgroup_zone *mz)
169{
170 return !mz->mem_cgroup;
171}
172#else
173static bool global_reclaim(struct scan_control *sc)
174{
175 return true;
176}
177
178static bool scanning_global_lru(struct mem_cgroup_zone *mz)
179{
180 return true;
181}
182#endif
183
184static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
185{
186 if (!scanning_global_lru(mz))
187 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
188
189 return &mz->zone->reclaim_stat;
190}
191
192static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
193 enum lru_list lru)
194{
195 if (!scanning_global_lru(mz))
196 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
197 zone_to_nid(mz->zone),
198 zone_idx(mz->zone),
199 BIT(lru));
200
201 return zone_page_state(mz->zone, NR_LRU_BASE + lru);
202}
203
204
205
206
207
/*
 * register_shrinker - add @shrinker to the list walked by shrink_slab().
 *
 * The shrinker's deferred-work counter is reset to zero and the shrinker is
 * appended to shrinker_list under the write side of shrinker_rwsem.
 */
void register_shrinker(struct shrinker *shrinker)
{
	atomic_long_set(&shrinker->nr_in_batch, 0);
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);
216
217
218
219
/*
 * unregister_shrinker - remove @shrinker from shrinker_list.
 *
 * Taking shrinker_rwsem for write excludes shrink_slab(), which walks the
 * list with the read side held.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);
227
/*
 * do_shrinker_shrink - invoke a shrinker's callback.
 * @shrinker: shrinker to call
 * @sc: shrink control passed to the callback; ->nr_to_scan is overwritten
 * @nr_to_scan: number of objects the callback is asked to scan
 *
 * shrink_slab() passes @nr_to_scan == 0 to obtain the shrinker's current
 * object count without asking it to free anything.
 *
 * Returns whatever the callback returns.
 */
static inline int do_shrinker_shrink(struct shrinker *shrinker,
				     struct shrink_control *sc,
				     unsigned long nr_to_scan)
{
	sc->nr_to_scan = nr_to_scan;
	return (*shrinker->shrink)(shrinker, sc);
}
235
/* Objects per shrinker callback when the shrinker sets no ->batch of its own. */
#define SHRINK_BATCH 128
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/*
 * shrink_slab - ask every registered shrinker to trim its cache.
 * @shrink: control structure handed to each shrinker callback
 * @nr_pages_scanned: number of LRU pages the caller scanned; 0 is treated
 *                    as SWAP_CLUSTER_MAX
 * @lru_pages: number of LRU pages the scan was drawn from; used (plus one)
 *             as the divisor when scaling the work
 *
 * For each shrinker the number of objects to scan is
 *
 *	deferred + (4 * nr_pages_scanned / seeks) * max_pass / (lru_pages + 1)
 *
 * where max_pass is the object count the shrinker reports and "deferred" is
 * work carried over in ->nr_in_batch from earlier calls.  The work is issued
 * in batch_size chunks; any remainder is stored back in ->nr_in_batch.
 *
 * Returns the sum of the decreases in object count reported by the
 * shrinkers, or 1 if shrinker_rwsem could not be taken for reading.
 */
unsigned long shrink_slab(struct shrink_control *shrink,
			  unsigned long nr_pages_scanned,
			  unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (nr_pages_scanned == 0)
		nr_pages_scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem)) {
		/* The list is being modified: report nominal progress and bail. */
		ret = 1;
		goto out;
	}

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		long total_scan;
		long max_pass;
		int shrink_ret = 0;
		long nr;
		long new_nr;
		long batch_size = shrinker->batch ? shrinker->batch
						  : SHRINK_BATCH;

		/* nr_to_scan == 0: query the object count only. */
		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
		if (max_pass <= 0)
			continue;

		/*
		 * Atomically claim the work deferred by earlier calls and
		 * zero the counter; whatever is left over at the end is
		 * added back below.
		 */
		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);

		total_scan = nr;
		delta = (4 * nr_pages_scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		total_scan += delta;
		if (total_scan < 0) {
			printk(KERN_ERR "shrink_slab: %pF negative objects to "
			       "delete nr=%ld\n",
			       shrinker->shrink, total_scan);
			total_scan = max_pass;
		}

		/*
		 * When this call contributes little new work (less than a
		 * quarter of the cache), cap the total - including the
		 * deferred part - at half of the cache so that accumulated
		 * deferred work cannot empty it in one go.
		 */
		if (delta < max_pass / 4)
			total_scan = min(total_scan, max_pass / 2);

		/* Never ask for more than twice the current object count. */
		if (total_scan > max_pass * 2)
			total_scan = max_pass * 2;

		trace_mm_shrink_slab_start(shrinker, shrink, nr,
					nr_pages_scanned, lru_pages,
					max_pass, delta, total_scan);

		while (total_scan >= batch_size) {
			int nr_before;

			/* Sample the count before and after to measure progress. */
			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
			shrink_ret = do_shrinker_shrink(shrinker, shrink,
							batch_size);
			/* -1 from the callback ends work on this shrinker. */
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, batch_size);
			total_scan -= batch_size;

			cond_resched();
		}

		/*
		 * Carry any unissued work over to the next call.  The add is
		 * atomic because other callers may have updated the counter
		 * since the xchg above.
		 */
		if (total_scan > 0)
			new_nr = atomic_long_add_return(total_scan,
					&shrinker->nr_in_batch);
		else
			new_nr = atomic_long_read(&shrinker->nr_in_batch);

		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
	}
	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return ret;
}
366
367static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync)
369{
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371
372
373
374
375
376
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382
383
384
385
386
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
388 sc->reclaim_mode |= syncmode;
389 else if (sc->order && priority < DEF_PRIORITY - 2)
390 sc->reclaim_mode |= syncmode;
391 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393}
394
/*
 * reset_reclaim_mode - fall back to the default SINGLE|ASYNC mode.
 * shrink_page_list() calls this whenever a page could not be reclaimed
 * through one of its "keep"/"cull" paths.
 */
static void reset_reclaim_mode(struct scan_control *sc)
{
	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}
399
/*
 * is_page_cache_freeable - true when the page's reference count is exactly
 * two plus one more if the page has private data attached.
 *
 * NOTE(review): the two baseline references are presumably the page cache's
 * and the caller's (the page was isolated with a reference) - confirm.
 */
static inline int is_page_cache_freeable(struct page *page)
{
	int expected_refs = 2 + page_has_private(page);

	return page_count(page) == expected_refs;
}
409
410static int may_write_to_queue(struct backing_dev_info *bdi,
411 struct scan_control *sc)
412{
413 if (current->flags & PF_SWAPWRITE)
414 return 1;
415 if (!bdi_write_congested(bdi))
416 return 1;
417 if (bdi == current->backing_dev_info)
418 return 1;
419
420
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0;
424}
425
426
427
428
429
430
431
432
433
434
435
436
437
/*
 * handle_write_error - record a ->writepage() failure on the mapping.
 * @mapping: address space the page belonged to when the write was issued
 * @page: page whose write failed
 * @error: negative error code returned by ->writepage()
 *
 * The page is locked and its mapping re-checked first, so the error is only
 * recorded if the page still belongs to @mapping.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}
446
447
/* Possible outcomes of pageout(), acted upon by shrink_page_list(). */
typedef enum {
	/* Could not write the page now; leave it on the inactive list. */
	PAGE_KEEP,
	/* Page should be moved to the active list. */
	PAGE_ACTIVATE,
	/* ->writepage() was called; the page may still be under writeback. */
	PAGE_SUCCESS,
	/* Page turned out to be clean; the caller may try to free it. */
	PAGE_CLEAN,
} pageout_t;
458
459
460
461
462
463static pageout_t pageout(struct page *page, struct address_space *mapping,
464 struct scan_control *sc)
465{
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482 if (!is_page_cache_freeable(page))
483 return PAGE_KEEP;
484 if (!mapping) {
485
486
487
488
489 if (page_has_private(page)) {
490 if (try_to_free_buffers(page)) {
491 ClearPageDirty(page);
492 printk("%s: orphaned page\n", __func__);
493 return PAGE_CLEAN;
494 }
495 }
496 return PAGE_KEEP;
497 }
498 if (mapping->a_ops->writepage == NULL)
499 return PAGE_ACTIVATE;
500 if (!may_write_to_queue(mapping->backing_dev_info, sc))
501 return PAGE_KEEP;
502
503 if (clear_page_dirty_for_io(page)) {
504 int res;
505 struct writeback_control wbc = {
506 .sync_mode = WB_SYNC_NONE,
507 .nr_to_write = SWAP_CLUSTER_MAX,
508 .range_start = 0,
509 .range_end = LLONG_MAX,
510 .for_reclaim = 1,
511 };
512
513 SetPageReclaim(page);
514 res = mapping->a_ops->writepage(page, &wbc);
515 if (res < 0)
516 handle_write_error(mapping, page, res);
517 if (res == AOP_WRITEPAGE_ACTIVATE) {
518 ClearPageReclaim(page);
519 return PAGE_ACTIVATE;
520 }
521
522 if (!PageWriteback(page)) {
523
524 ClearPageReclaim(page);
525 }
526 trace_mm_vmscan_writepage(page,
527 trace_reclaim_flags(page, sc->reclaim_mode));
528 inc_zone_page_state(page, NR_VMSCAN_WRITE);
529 return PAGE_SUCCESS;
530 }
531
532 return PAGE_CLEAN;
533}
534
535
536
537
538
/*
 * __remove_mapping - detach a locked page from its mapping if nobody else
 * is using it.
 * @mapping: the address space @page belongs to (must equal page_mapping())
 * @page: locked page
 *
 * Returns 1 if the page was removed from the swap cache or page cache; the
 * page's reference count is then left frozen (remove_mapping() and
 * shrink_page_list() deal with that).  Returns 0, with the page untouched,
 * if the page has extra references or is dirty.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);

	/*
	 * Proceed only if the reference count can be frozen from exactly 2.
	 * NOTE(review): the two references are presumably the cache's and the
	 * caller's - confirm.  The dirty check is made only after the freeze
	 * succeeded, and the freeze is undone if the page is dirty.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;

	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		/* Read the swap entry before the page leaves the swap cache. */
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swapcache_free(swap, page);
	} else {
		void (*freepage)(struct page *);

		/* Fetch the callback while still holding tree_lock. */
		freepage = mapping->a_ops->freepage;

		__delete_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);

		if (freepage != NULL)
			freepage(page);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}
602
603
604
605
606
607
608
/*
 * remove_mapping - try to detach @page from @mapping.
 *
 * On success the page has been removed from its cache by
 * __remove_mapping(), its frozen reference count is restored to 1 and
 * 1 is returned.  On failure the page is left as it was and 0 is returned.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (!__remove_mapping(mapping, page))
		return 0;

	/* Unfreeze to a single reference, now owned by the caller. */
	page_unfreeze_refs(page, 1);
	return 1;
}
622
623
624
625
626
627
628
629
630
631
/*
 * putback_lru_page - put a previously isolated page back on an LRU list.
 * @page: page that is not on any LRU (PageLRU clear) and on which the
 *        caller holds a reference
 *
 * The page goes to the active/inactive list matching its type when it is
 * evictable, otherwise to the unevictable list.  The reference held for the
 * isolation is dropped before returning.
 */
void putback_lru_page(struct page *page)
{
	int lru;
	int active = !!TestClearPageActive(page);
	int was_unevictable = PageUnevictable(page);

	VM_BUG_ON(PageLRU(page));

redo:
	ClearPageUnevictable(page);

	if (page_evictable(page, NULL)) {
		/*
		 * Evictable: queue it for the normal LRU selected by its
		 * base type and the active state sampled on entry.
		 */
		lru = active + page_lru_base_type(page);
		lru_cache_add_lru(page, lru);
	} else {
		/* Not evictable: place it on the unevictable list. */
		lru = LRU_UNEVICTABLE;
		add_page_to_unevictable_list(page);

		/*
		 * Barrier before page_evictable() is re-checked below.
		 * NOTE(review): presumably pairs with a barrier on the path
		 * that makes the page evictable again - confirm.
		 */
		smp_mb();
	}

	/*
	 * The page may have become evictable while it was being added to the
	 * unevictable list.  If so, and it can be isolated again, drop the
	 * reference taken by isolate_lru_page() and start over.
	 */
	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
		if (!isolate_lru_page(page)) {
			put_page(page);
			goto redo;
		}
		/*
		 * Isolation failed: someone else has taken the page off the
		 * LRU; nothing more is done for it here.
		 */
	}

	if (was_unevictable && lru != LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGCULLED);

	/* Drop the reference that came with the isolation. */
	put_page(page);
}
695
/* Verdicts returned by page_check_references(). */
enum page_references {
	/* Go ahead and try to reclaim the page. */
	PAGEREF_RECLAIM,
	/* Reclaim only if clean: shrink_page_list() keeps such a page if dirty. */
	PAGEREF_RECLAIM_CLEAN,
	/* Leave the page on the inactive list. */
	PAGEREF_KEEP,
	/* Move the page to the active list. */
	PAGEREF_ACTIVATE,
};
702
703static enum page_references page_check_references(struct page *page,
704 struct mem_cgroup_zone *mz,
705 struct scan_control *sc)
706{
707 int referenced_ptes, referenced_page;
708 unsigned long vm_flags;
709
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
711 referenced_page = TestClearPageReferenced(page);
712
713
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717
718
719
720
721 if (vm_flags & VM_LOCKED)
722 return PAGEREF_RECLAIM;
723
724 if (referenced_ptes) {
725 if (PageAnon(page))
726 return PAGEREF_ACTIVATE;
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741 SetPageReferenced(page);
742
743 if (referenced_page || referenced_ptes > 1)
744 return PAGEREF_ACTIVATE;
745
746
747
748
749 if (vm_flags & VM_EXEC)
750 return PAGEREF_ACTIVATE;
751
752 return PAGEREF_KEEP;
753 }
754
755
756 if (referenced_page && !PageSwapBacked(page))
757 return PAGEREF_RECLAIM_CLEAN;
758
759 return PAGEREF_RECLAIM;
760}
761
762
763
764
/*
 * shrink_page_list - try to free every page on an isolated list.
 * @page_list: pages already taken off the LRU; on return it holds the pages
 *             that were NOT freed and must be put back by the caller
 * @mz: cgroup/zone the pages came from
 * @sc: scan control; ->nr_scanned is updated, ->reclaim_mode may be reset
 * @priority: current scan priority
 * @ret_nr_dirty: incremented by the number of dirty pages encountered
 * @ret_nr_writeback: incremented by the number of pages found under writeback
 *
 * Returns the number of pages reclaimed.
 */
static unsigned long shrink_page_list(struct list_head *page_list,
				      struct mem_cgroup_zone *mz,
				      struct scan_control *sc,
				      int priority,
				      unsigned long *ret_nr_dirty,
				      unsigned long *ret_nr_writeback)
{
	LIST_HEAD(ret_pages);
	LIST_HEAD(free_pages);
	int pgactivate = 0;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_writeback = 0;

	cond_resched();

	while (!list_empty(page_list)) {
		enum page_references references;
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		/* Never block on the page lock; skip pages locked by others. */
		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(page_zone(page) != mz->zone);

		sc->nr_scanned++;

		if (unlikely(!page_evictable(page, NULL)))
			goto cull_mlocked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Mapped and swap cache pages are counted twice as scanned. */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		if (PageWriteback(page)) {
			nr_writeback++;
			/*
			 * Wait for the writeback only in synchronous mode and
			 * only if the fs may be entered; otherwise give the
			 * page back without resetting the reclaim mode.
			 */
			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
			    may_enter_fs)
				wait_on_page_writeback(page);
			else {
				unlock_page(page);
				goto keep_lumpy;
			}
		}

		references = page_check_references(page, mz, sc);
		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/*
		 * An anonymous page not yet in the swap cache needs swap
		 * space allocated before it can be reclaimed.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page))
				goto activate_locked;
			may_enter_fs = 1;
		}

		mapping = page_mapping(page);

		/*
		 * The page is mapped into page tables of one or more
		 * processes: try to unmap it.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, TTU_UNMAP)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			nr_dirty++;

			/*
			 * Dirty file pages are written from here only by
			 * kswapd, and only once priority has dropped below
			 * DEF_PRIORITY - 2.  Otherwise the page is tagged
			 * with PG_reclaim and kept.
			 */
			if (page_is_file_cache(page) &&
					(!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
				/*
				 * NOTE(review): PG_reclaim presumably makes the
				 * page get reclaimed as soon as its writeback
				 * completes - confirm.
				 */
				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto keep_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				nr_congested++;
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep_lumpy;
				if (PageDirty(page))
					goto keep;

				/*
				 * The write appears to have completed already.
				 * Re-take the page lock (the code assumes
				 * ->writepage() released it) and re-check the
				 * page state before trying to free it.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
				/* fall through */
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has private data (e.g. buffers) it must be
		 * released first.  A page with no mapping whose only
		 * remaining reference is ours can then be freed directly.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * Someone else took a reference in the
					 * meantime.  Our reference is gone and
					 * the page is on none of our lists, so
					 * it is counted as reclaimed and left
					 * to the other holder.
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * __remove_mapping() succeeded, so the page is about to be
		 * freed; the non-atomic flag clear is used to drop the lock
		 * bit.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;

		/* Freed in one batch after the loop. */
		list_add(&page->lru, &free_pages);
		continue;

cull_mlocked:
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		reset_reclaim_mode(sc);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		reset_reclaim_mode(sc);
keep_lumpy:
		/* Entered directly, this label leaves the reclaim mode alone. */
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}

	/*
	 * Flag the zone as congested when, during global reclaim, every
	 * dirty page encountered ended in PAGE_KEEP from pageout().
	 */
	if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
		zone_set_flag(mz->zone, ZONE_CONGESTED);

	free_hot_cold_page_list(&free_pages, 1);

	/* Hand the pages that were kept back to the caller. */
	list_splice(&ret_pages, page_list);
	count_vm_events(PGACTIVATE, pgactivate);
	*ret_nr_dirty += nr_dirty;
	*ret_nr_writeback += nr_writeback;
	return nr_reclaimed;
}
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
/*
 * __isolate_lru_page - try to take one page off the LRU for the given mode.
 * @page: candidate page
 * @mode: ISOLATE_* flags describing which pages are acceptable
 * @file: non-zero to accept only file cache pages, zero for only anon pages
 *        (ignored when both ISOLATE_ACTIVE and ISOLATE_INACTIVE are set)
 *
 * On success the page's LRU flag is cleared and a reference is taken; the
 * caller is responsible for removing the page from its list.
 *
 * Returns 0 on success, -EINVAL if the page is not on the LRU, is of the
 * wrong kind (active state / file-vs-anon) or is unevictable, and -EBUSY if
 * the page is acceptable in kind but cannot be taken right now.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
{
	bool all_lru_mode;
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
		(ISOLATE_ACTIVE|ISOLATE_INACTIVE);

	/*
	 * Unless both active and inactive pages were requested, the page's
	 * active state and its file/anon type must match what was asked for.
	 */
	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
		return ret;

	if (!all_lru_mode && !!page_is_file_cache(page) != file)
		return ret;

	/* Unevictable pages are never isolated here. */
	if (PageUnevictable(page))
		return ret;

	ret = -EBUSY;

	/*
	 * ISOLATE_CLEAN and ISOLATE_ASYNC_MIGRATE both refuse pages under
	 * writeback.  For dirty pages, ISOLATE_CLEAN refuses outright, while
	 * ISOLATE_ASYNC_MIGRATE accepts them only if the page has no mapping
	 * or the mapping provides a ->migratepage() callback.
	 */
	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
		/* All the caller can do on PageWriteback is block */
		if (PageWriteback(page))
			return ret;

		if (PageDirty(page)) {
			struct address_space *mapping;

			/* ISOLATE_CLEAN means only clean pages */
			if (mode & ISOLATE_CLEAN)
				return ret;

			/*
			 * NOTE(review): page_mapping() is called here without
			 * the page lock; presumably the result is used only
			 * as a hint - confirm.
			 */
			mapping = page_mapping(page);
			if (mapping && !mapping->a_ops->migratepage)
				return ret;
		}
	}

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return ret;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * A reference was obtained, so the page is not being freed
		 * elsewhere.  Clear the LRU flag; the caller removes the
		 * page from the list.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
/*
 * isolate_lru_pages - move pages from one LRU list onto a private list.
 * @nr_to_scan: maximum number of pages to scan
 * @mz: cgroup/zone whose LRU list is scanned
 * @dst: list the isolated pages are moved to
 * @nr_scanned: set to the number of pages scanned
 * @order: allocation order; when non-zero, the pages in the naturally
 *         aligned 2^order pfn block around each isolated page are
 *         isolated as well where possible ("lumpy" isolation)
 * @mode: ISOLATE_* flags passed on to __isolate_lru_page()
 * @active: non-zero to scan the active list, zero for the inactive list
 * @file: non-zero to scan the file list, zero for the anon list
 *
 * Both callers in this file invoke this with zone->lru_lock held.
 *
 * Returns the number of base pages moved to @dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct mem_cgroup_zone *mz, struct list_head *dst,
		unsigned long *nr_scanned, int order, isolate_mode_t mode,
		int active, int file)
{
	struct lruvec *lruvec;
	struct list_head *src;
	unsigned long nr_taken = 0;
	unsigned long nr_lumpy_taken = 0;
	unsigned long nr_lumpy_dirty = 0;
	unsigned long nr_lumpy_failed = 0;
	unsigned long scan;
	int lru = LRU_BASE;

	lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
	if (active)
		lru += LRU_ACTIVE;
	if (file)
		lru += LRU_FILE;
	src = &lruvec->lists[lru];

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		unsigned long pfn;
		unsigned long end_pfn;
		unsigned long page_pfn;
		int zone_id;

		/* Take candidates from the tail of the list. */
		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode, file)) {
		case 0:
			mem_cgroup_lru_del(page);
			list_move(&page->lru, dst);
			nr_taken += hpage_nr_pages(page);
			break;

		case -EBUSY:
			/* Cannot be taken now: rotate it to the list head. */
			list_move(&page->lru, src);
			continue;

		default:
			/* A page of the wrong kind on this list is a bug. */
			BUG();
		}

		if (!order)
			continue;

		/*
		 * Lumpy isolation: walk every pfn of the order-aligned block
		 * containing the page just taken and try to isolate those
		 * pages too, regardless of which list they are on.
		 */
		zone_id = page_zone_id(page);
		page_pfn = page_to_pfn(page);
		pfn = page_pfn & ~((1 << order) - 1);
		end_pfn = pfn + (1 << order);
		for (; pfn < end_pfn; pfn++) {
			struct page *cursor_page;

			/* The target page is in the block, ignore it. */
			if (unlikely(pfn == page_pfn))
				continue;

			/* Avoid holes within the zone. */
			if (unlikely(!pfn_valid_within(pfn)))
				break;

			cursor_page = pfn_to_page(pfn);

			/* Check that we have not crossed a zone boundary. */
			if (unlikely(page_zone_id(cursor_page) != zone_id))
				break;

			/*
			 * With no swap space left, a swap-backed page that is
			 * not already in the swap cache cannot be reclaimed,
			 * so the block cannot be completed: give up on it.
			 */
			if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
			    !PageSwapCache(cursor_page))
				break;

			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
				unsigned int isolated_pages;

				mem_cgroup_lru_del(cursor_page);
				list_move(&cursor_page->lru, dst);
				isolated_pages = hpage_nr_pages(cursor_page);
				nr_taken += isolated_pages;
				nr_lumpy_taken += isolated_pages;
				if (PageDirty(cursor_page))
					nr_lumpy_dirty += isolated_pages;
				/* Lumpy pages count towards the scan total. */
				scan++;
				/* Skip the tail pages of a huge page. */
				pfn += isolated_pages - 1;
			} else {
				/*
				 * Isolation failed.  A non-tail page with a
				 * zero reference count is skipped and the walk
				 * goes on (NOTE(review): presumably it is
				 * free or about to be freed - confirm); any
				 * other page ends the walk of this block.
				 */
				if (!PageTail(cursor_page) &&
				    !atomic_read(&cursor_page->_count))
					continue;
				break;
			}
		}

		/* If we break out of the loop above, lumpy reclaim failed */
		if (pfn < end_pfn)
			nr_lumpy_failed++;
	}

	*nr_scanned = scan;

	trace_mm_vmscan_lru_isolate(order,
			nr_to_scan, scan,
			nr_taken,
			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
			mode, file);
	return nr_taken;
}
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311int isolate_lru_page(struct page *page)
1312{
1313 int ret = -EBUSY;
1314
1315 VM_BUG_ON(!page_count(page));
1316
1317 if (PageLRU(page)) {
1318 struct zone *zone = page_zone(page);
1319
1320 spin_lock_irq(&zone->lru_lock);
1321 if (PageLRU(page)) {
1322 int lru = page_lru(page);
1323 ret = 0;
1324 get_page(page);
1325 ClearPageLRU(page);
1326
1327 del_page_from_lru_list(zone, page, lru);
1328 }
1329 spin_unlock_irq(&zone->lru_lock);
1330 }
1331 return ret;
1332}
1333
1334
1335
1336
1337static int too_many_isolated(struct zone *zone, int file,
1338 struct scan_control *sc)
1339{
1340 unsigned long inactive, isolated;
1341
1342 if (current_is_kswapd())
1343 return 0;
1344
1345 if (!global_reclaim(sc))
1346 return 0;
1347
1348 if (file) {
1349 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1350 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1351 } else {
1352 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1353 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1354 }
1355
1356 return isolated > inactive;
1357}
1358
/*
 * putback_inactive_pages - return the pages left over from
 * shrink_page_list() to their LRU lists.
 * @mz: cgroup/zone the pages belong to
 * @page_list: pages to put back; on return it holds only the pages whose
 *             last reference was dropped here, for the caller to free
 *
 * Called with zone->lru_lock held (shrink_inactive_list() takes it); the
 * lock is dropped and re-taken around putback_lru_page() and around
 * compound page destructors.
 */
static noinline_for_stack void
putback_inactive_pages(struct mem_cgroup_zone *mz,
		       struct list_head *page_list)
{
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
	struct zone *zone = mz->zone;
	LIST_HEAD(pages_to_free);

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		int lru;

		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		if (unlikely(!page_evictable(page, NULL))) {
			/*
			 * putback_lru_page() is called without the lock held
			 * (it may take lru_lock itself via
			 * isolate_lru_page()).
			 */
			spin_unlock_irq(&zone->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&zone->lru_lock);
			continue;
		}
		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(zone, page, lru);
		if (is_active_lru(lru)) {
			/* Page was activated during the scan: count it as rotated. */
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}
		if (put_page_testzero(page)) {
			/* That was the last reference: take it off again and free. */
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(zone, page, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);
		}
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, page_list);
}
1409
/*
 * update_isolated_counts - account for pages just isolated by
 * shrink_inactive_list().
 * @mz: cgroup/zone the pages came from
 * @page_list: the isolated pages
 * @nr_anon: set to the number of anonymous pages on @page_list
 * @nr_file: set to the number of file pages on @page_list
 *
 * Clears PG_active on any active page found (counting it as deactivated),
 * subtracts the pages from the zone's four LRU size counters and adds them
 * to the recent_scanned statistics.  The NR_ISOLATED_* counters are updated
 * by the caller.  The caller in this file holds zone->lru_lock.
 */
static noinline_for_stack void
update_isolated_counts(struct mem_cgroup_zone *mz,
		       struct list_head *page_list,
		       unsigned long *nr_anon,
		       unsigned long *nr_file)
{
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
	struct zone *zone = mz->zone;
	unsigned int count[NR_LRU_LISTS] = { 0, };
	unsigned long nr_active = 0;
	struct page *page;
	int lru;

	/*
	 * Count pages and clear active flags
	 */
	list_for_each_entry(page, page_list, lru) {
		int numpages = hpage_nr_pages(page);
		lru = page_lru_base_type(page);
		if (PageActive(page)) {
			lru += LRU_ACTIVE;
			ClearPageActive(page);
			nr_active += numpages;
		}
		count[lru] += numpages;
	}

	__count_vm_events(PGDEACTIVATE, nr_active);

	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
			      -count[LRU_ACTIVE_FILE]);
	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
			      -count[LRU_INACTIVE_FILE]);
	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
			      -count[LRU_ACTIVE_ANON]);
	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
			      -count[LRU_INACTIVE_ANON]);

	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];

	/* Index 0 is anon, index 1 is file. */
	reclaim_stat->recent_scanned[0] += *nr_anon;
	reclaim_stat->recent_scanned[1] += *nr_file;
}
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463static inline bool should_reclaim_stall(unsigned long nr_taken,
1464 unsigned long nr_freed,
1465 int priority,
1466 struct scan_control *sc)
1467{
1468 int lumpy_stall_priority;
1469
1470
1471 if (current_is_kswapd())
1472 return false;
1473
1474
1475 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1476 return false;
1477
1478
1479 if (nr_freed == nr_taken)
1480 return false;
1481
1482
1483
1484
1485
1486
1487
1488 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1489 lumpy_stall_priority = DEF_PRIORITY;
1490 else
1491 lumpy_stall_priority = DEF_PRIORITY / 3;
1492
1493 return priority <= lumpy_stall_priority;
1494}
1495
1496
1497
1498
1499
/*
 * shrink_inactive_list - isolate pages from an inactive LRU list and try to
 * reclaim them.
 * @nr_to_scan: number of pages to scan
 * @mz: cgroup/zone to reclaim from
 * @sc: scan control
 * @priority: current scan priority
 * @file: non-zero for the file list, zero for the anon list
 *
 * Returns the number of pages reclaimed.  If a fatal signal is pending while
 * waiting for other reclaimers, SWAP_CLUSTER_MAX is returned instead without
 * anything having been scanned.
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
		     struct scan_control *sc, int priority, int file)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_taken;
	unsigned long nr_anon;
	unsigned long nr_file;
	unsigned long nr_dirty = 0;
	unsigned long nr_writeback = 0;
	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
	struct zone *zone = mz->zone;

	/* Throttle direct reclaimers while too many pages are isolated. */
	while (unlikely(too_many_isolated(zone, file, sc))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	set_reclaim_mode(priority, sc, false);
	/* Lumpy reclaim may take active pages as well. */
	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
		reclaim_mode |= ISOLATE_ACTIVE;

	lru_add_drain();

	if (!sc->may_unmap)
		reclaim_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		reclaim_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
				     &nr_scanned, sc->order,
				     reclaim_mode, 0, file);
	if (global_reclaim(sc)) {
		zone->pages_scanned += nr_scanned;
		if (current_is_kswapd())
			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
					       nr_scanned);
		else
			__count_zone_vm_events(PGSCAN_DIRECT, zone,
					       nr_scanned);
	}

	if (nr_taken == 0) {
		spin_unlock_irq(&zone->lru_lock);
		return 0;
	}

	update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);

	__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);

	spin_unlock_irq(&zone->lru_lock);

	nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
						&nr_dirty, &nr_writeback);

	/* Check if we should synchronously wait for writeback */
	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
		set_reclaim_mode(priority, sc, true);
		nr_reclaimed += shrink_page_list(&page_list, mz, sc,
					priority, &nr_dirty, &nr_writeback);
	}

	spin_lock_irq(&zone->lru_lock);

	if (current_is_kswapd())
		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);

	/* Whatever could not be freed goes back on the LRU lists. */
	putback_inactive_pages(mz, &page_list);

	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);

	spin_unlock_irq(&zone->lru_lock);

	/* putback_inactive_pages() left the pages to free on page_list. */
	free_hot_cold_page_list(&page_list, 1);

	/*
	 * Throttle when a large enough share of the isolated pages was found
	 * under writeback.  The threshold is nr_taken shifted right by
	 * (DEF_PRIORITY - priority), so it halves with every step that
	 * priority drops below DEF_PRIORITY:
	 *
	 *   priority == DEF_PRIORITY      all isolated pages under writeback
	 *   priority == DEF_PRIORITY - 1  at least half of them
	 *   priority == DEF_PRIORITY - 2  at least a quarter of them
	 *   ...
	 *
	 * Once the shift reduces the threshold to zero, any page under
	 * writeback triggers the wait.
	 */
	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
		zone_idx(zone),
		nr_scanned, nr_reclaimed,
		priority,
		trace_shrink_flags(file, sc->reclaim_mode));
	return nr_reclaimed;
}
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
/*
 * move_active_pages_to_lru - put a list of isolated pages onto LRU list @lru.
 * @zone: zone the pages belong to
 * @list: isolated pages to move; empty on return
 * @pages_to_free: pages whose last reference was dropped here are added to
 *                 this list for the caller to free
 * @lru: destination LRU list
 *
 * Called with zone->lru_lock held (shrink_active_list() takes it); the lock
 * is dropped and re-taken around compound page destructors.  The zone's
 * counter for @lru is raised by the number of pages moved, and PGDEACTIVATE
 * is counted when @lru is an inactive list.
 */
static void move_active_pages_to_lru(struct zone *zone,
				     struct list_head *list,
				     struct list_head *pages_to_free,
				     enum lru_list lru)
{
	unsigned long pgmoved = 0;
	struct page *page;

	while (!list_empty(list)) {
		struct lruvec *lruvec;

		page = lru_to_page(list);

		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);

		lruvec = mem_cgroup_lru_add_list(zone, page, lru);
		list_move(&page->lru, &lruvec->lists[lru]);
		pgmoved += hpage_nr_pages(page);

		/*
		 * Drop the isolation reference.  If it was the last one,
		 * take the page straight off the LRU again and free it.
		 */
		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(zone, page, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, pages_to_free);
		}
	}
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
	if (!is_active_lru(lru))
		__count_vm_events(PGDEACTIVATE, pgmoved);
}
1675
/*
 * shrink_active_list - move pages from an active LRU list to the matching
 * inactive list.
 * @nr_to_scan: number of pages to scan
 * @mz: cgroup/zone to work on
 * @sc: scan control
 * @priority: current scan priority (not used by the code below)
 * @file: non-zero for the file lists, zero for the anon lists
 *
 * Pages are isolated from the active list under zone->lru_lock, examined
 * with the lock dropped, then put back under the lock.  Only referenced,
 * executable file cache pages stay active; every other evictable page is
 * deactivated.  Unevictable pages go through putback_lru_page().
 */
static void shrink_active_list(unsigned long nr_to_scan,
			       struct mem_cgroup_zone *mz,
			       struct scan_control *sc,
			       int priority, int file)
{
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
	unsigned long nr_rotated = 0;
	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
	struct zone *zone = mz->zone;

	lru_add_drain();

	if (!sc->may_unmap)
		reclaim_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		reclaim_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
				     &nr_scanned, sc->order,
				     reclaim_mode, 1, file);
	if (global_reclaim(sc))
		zone->pages_scanned += nr_scanned;

	reclaim_stat->recent_scanned[file] += nr_taken;

	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
	/* NR_ISOLATED_ANON + file selects the anon or file isolated counter. */
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page, NULL))) {
			putback_lru_page(page);
			continue;
		}

		/*
		 * When buffer heads are over their limit, opportunistically
		 * strip them from the page.  page_has_private() is re-checked
		 * once the page lock is held.
		 */
		if (unlikely(buffer_heads_over_limit)) {
			if (page_has_private(page) && trylock_page(page)) {
				if (page_has_private(page))
					try_to_release_page(page, 0);
				unlock_page(page);
			}
		}

		if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
			nr_rotated += hpage_nr_pages(page);
			/*
			 * Referenced executable file cache pages get one
			 * more trip around the active list.  All other
			 * referenced pages are deactivated like unreferenced
			 * ones, but still count as rotated.
			 */
			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		ClearPageActive(page);	/* we are de-activating */
		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Every referenced page is counted as rotated, including those that
	 * are being deactivated.
	 * NOTE(review): presumably recent_rotated feeds the anon/file scan
	 * balance computed elsewhere - confirm.
	 */
	reclaim_stat->recent_rotated[file] += nr_rotated;

	/* l_hold is empty by now and is reused to collect pages to free. */
	move_active_pages_to_lru(zone, &l_active, &l_hold,
						LRU_ACTIVE + file * LRU_FILE);
	move_active_pages_to_lru(zone, &l_inactive, &l_hold,
						LRU_BASE   + file * LRU_FILE);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&l_hold, 1);
}
1778
1779#ifdef CONFIG_SWAP
1780static int inactive_anon_is_low_global(struct zone *zone)
1781{
1782 unsigned long active, inactive;
1783
1784 active = zone_page_state(zone, NR_ACTIVE_ANON);
1785 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1786
1787 if (inactive * zone->inactive_ratio < active)
1788 return 1;
1789
1790 return 0;
1791}
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1802{
1803
1804
1805
1806
1807 if (!total_swap_pages)
1808 return 0;
1809
1810 if (!scanning_global_lru(mz))
1811 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
1812 mz->zone);
1813
1814 return inactive_anon_is_low_global(mz->zone);
1815}
1816#else
/* !CONFIG_SWAP variant: the inactive anon list is never reported as low. */
static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
{
	return 0;
}
1821#endif
1822
1823static int inactive_file_is_low_global(struct zone *zone)
1824{
1825 unsigned long active, inactive;
1826
1827 active = zone_page_state(zone, NR_ACTIVE_FILE);
1828 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1829
1830 return (active > inactive);
1831}
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847static int inactive_file_is_low(struct mem_cgroup_zone *mz)
1848{
1849 if (!scanning_global_lru(mz))
1850 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
1851 mz->zone);
1852
1853 return inactive_file_is_low_global(mz->zone);
1854}
1855
/* Dispatch to the file (@file != 0) or anon (@file == 0) "is low" check. */
static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
{
	return file ? inactive_file_is_low(mz) : inactive_anon_is_low(mz);
}
1863
1864static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1865 struct mem_cgroup_zone *mz,
1866 struct scan_control *sc, int priority)
1867{
1868 int file = is_file_lru(lru);
1869
1870 if (is_active_lru(lru)) {
1871 if (inactive_list_is_low(mz, file))
1872 shrink_active_list(nr_to_scan, mz, sc, priority, file);
1873 return 0;
1874 }
1875
1876 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
1877}
1878
1879static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1880 struct scan_control *sc)
1881{
1882 if (global_reclaim(sc))
1883 return vm_swappiness;
1884 return mem_cgroup_swappiness(mz->mem_cgroup);
1885}
1886
1887
1888
1889
1890
1891
1892
1893
1894
/*
 * Work out how many pages to scan on each evictable LRU list of @mz.
 *
 * The result is written to @nr, indexed by enum lru_list.  Each list's
 * size is shifted right by @priority and then scaled by
 * fraction[is_file] / denominator, where fraction[0] applies to the anon
 * lists and fraction[1] to the file lists.
 */
static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
					unsigned long *nr, int priority)
{
	unsigned long anon, file, free;
	unsigned long anon_prio, file_prio;
	unsigned long ap, fp;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
	u64 fraction[2], denominator;
	enum lru_list lru;
	int noswap = 0;
	bool force_scan = false;

	/*
	 * force_scan substitutes SWAP_CLUSTER_MAX for a scan target that the
	 * priority shift below rounded down to zero.  It is enabled for
	 * kswapd on an all_unreclaimable zone and for every non-global
	 * (memory cgroup) reclaim.
	 */
	if (current_is_kswapd() && mz->zone->all_unreclaimable)
		force_scan = true;
	if (!global_reclaim(sc))
		force_scan = true;

	/* Swapping is not allowed or no swap is left: scan file lists only. */
	if (!sc->may_swap || (nr_swap_pages <= 0)) {
		noswap = 1;
		fraction[0] = 0;
		fraction[1] = 1;
		denominator = 1;
		goto out;
	}

	anon  = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
		zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
	file  = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
		zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);

	if (global_reclaim(sc)) {
		free  = zone_page_state(mz->zone, NR_FREE_PAGES);
		/*
		 * File plus free pages do not even reach the high watermark:
		 * scan the anon lists only.
		 */
		if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
			fraction[0] = 1;
			fraction[1] = 0;
			denominator = 1;
			goto out;
		}
	}

	/*
	 * Base weights: anon is weighted by swappiness, file by
	 * 200 - swappiness.
	 */
	anon_prio = vmscan_swappiness(mz, sc);
	file_prio = 200 - vmscan_swappiness(mz, sc);

	/*
	 * The recent_scanned/recent_rotated counters are halved whenever
	 * recent_scanned exceeds a quarter of the matching list size.  The
	 * counters are updated and read under zone->lru_lock.
	 */
	spin_lock_irq(&mz->zone->lru_lock);
	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
		reclaim_stat->recent_scanned[0] /= 2;
		reclaim_stat->recent_rotated[0] /= 2;
	}

	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
		reclaim_stat->recent_scanned[1] /= 2;
		reclaim_stat->recent_rotated[1] /= 2;
	}

	/*
	 * Scale each weight by scanned/rotated, so a list whose pages were
	 * mostly rotated gets a smaller share.  The +1 terms keep both the
	 * product and the divisor non-zero.
	 */
	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
	ap /= reclaim_stat->recent_rotated[0] + 1;

	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
	fp /= reclaim_stat->recent_rotated[1] + 1;
	spin_unlock_irq(&mz->zone->lru_lock);

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp + 1;
out:
	for_each_evictable_lru(lru) {
		/* NB: this 'file' shadows the page count declared above. */
		int file = is_file_lru(lru);
		unsigned long scan;

		scan = zone_nr_lru_pages(mz, lru);
		/*
		 * At priority 0 with swap usable the whole list is scanned,
		 * without any scaling.
		 */
		if (priority || noswap) {
			scan >>= priority;
			if (!scan && force_scan)
				scan = SWAP_CLUSTER_MAX;
			scan = div64_u64(scan * fraction[file], denominator);
		}
		nr[lru] = scan;
	}
}
2007
2008
2009
2010
2011
2012
2013
2014
2015static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2016 unsigned long nr_reclaimed,
2017 unsigned long nr_scanned,
2018 struct scan_control *sc)
2019{
2020 unsigned long pages_for_compaction;
2021 unsigned long inactive_lru_pages;
2022
2023
2024 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
2025 return false;
2026
2027
2028 if (sc->gfp_mask & __GFP_REPEAT) {
2029
2030
2031
2032
2033
2034
2035 if (!nr_reclaimed && !nr_scanned)
2036 return false;
2037 } else {
2038
2039
2040
2041
2042
2043
2044
2045
2046 if (!nr_reclaimed)
2047 return false;
2048 }
2049
2050
2051
2052
2053
2054 pages_for_compaction = (2UL << sc->order);
2055 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
2056 if (nr_swap_pages > 0)
2057 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
2058 if (sc->nr_reclaimed < pages_for_compaction &&
2059 inactive_lru_pages > pages_for_compaction)
2060 return true;
2061
2062
2063 switch (compaction_suitable(mz->zone, sc->order)) {
2064 case COMPACT_PARTIAL:
2065 case COMPACT_CONTINUE:
2066 return false;
2067 default:
2068 return true;
2069 }
2070}
2071
2072
2073
2074
2075static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2076 struct scan_control *sc)
2077{
2078 unsigned long nr[NR_LRU_LISTS];
2079 unsigned long nr_to_scan;
2080 enum lru_list lru;
2081 unsigned long nr_reclaimed, nr_scanned;
2082 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2083 struct blk_plug plug;
2084
2085restart:
2086 nr_reclaimed = 0;
2087 nr_scanned = sc->nr_scanned;
2088 get_scan_count(mz, sc, nr, priority);
2089
2090 blk_start_plug(&plug);
2091 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2092 nr[LRU_INACTIVE_FILE]) {
2093 for_each_evictable_lru(lru) {
2094 if (nr[lru]) {
2095 nr_to_scan = min_t(unsigned long,
2096 nr[lru], SWAP_CLUSTER_MAX);
2097 nr[lru] -= nr_to_scan;
2098
2099 nr_reclaimed += shrink_list(lru, nr_to_scan,
2100 mz, sc, priority);
2101 }
2102 }
2103
2104
2105
2106
2107
2108
2109
2110
2111 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
2112 break;
2113 }
2114 blk_finish_plug(&plug);
2115 sc->nr_reclaimed += nr_reclaimed;
2116
2117
2118
2119
2120
2121 if (inactive_anon_is_low(mz))
2122 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
2123
2124
2125 if (should_continue_reclaim(mz, nr_reclaimed,
2126 sc->nr_scanned - nr_scanned, sc))
2127 goto restart;
2128
2129 throttle_vm_writeout(sc->gfp_mask);
2130}
2131
2132static void shrink_zone(int priority, struct zone *zone,
2133 struct scan_control *sc)
2134{
2135 struct mem_cgroup *root = sc->target_mem_cgroup;
2136 struct mem_cgroup_reclaim_cookie reclaim = {
2137 .zone = zone,
2138 .priority = priority,
2139 };
2140 struct mem_cgroup *memcg;
2141
2142 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2143 do {
2144 struct mem_cgroup_zone mz = {
2145 .mem_cgroup = memcg,
2146 .zone = zone,
2147 };
2148
2149 shrink_mem_cgroup_zone(priority, &mz, sc);
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160 if (!global_reclaim(sc)) {
2161 mem_cgroup_iter_break(root, memcg);
2162 break;
2163 }
2164 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2165 } while (memcg);
2166}
2167
2168
2169static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2170{
2171 unsigned long balance_gap, watermark;
2172 bool watermark_ok;
2173
2174
2175 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2176 return false;
2177
2178
2179
2180
2181
2182
2183
2184 balance_gap = min(low_wmark_pages(zone),
2185 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2186 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2187 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2188 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2189
2190
2191
2192
2193
2194 if (compaction_deferred(zone))
2195 return watermark_ok;
2196
2197
2198 if (!compaction_suitable(zone, sc->order))
2199 return false;
2200
2201 return watermark_ok;
2202}
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225static bool shrink_zones(int priority, struct zonelist *zonelist,
2226 struct scan_control *sc)
2227{
2228 struct zoneref *z;
2229 struct zone *zone;
2230 unsigned long nr_soft_reclaimed;
2231 unsigned long nr_soft_scanned;
2232 bool aborted_reclaim = false;
2233
2234
2235
2236
2237
2238
2239 if (buffer_heads_over_limit)
2240 sc->gfp_mask |= __GFP_HIGHMEM;
2241
2242 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2243 gfp_zone(sc->gfp_mask), sc->nodemask) {
2244 if (!populated_zone(zone))
2245 continue;
2246
2247
2248
2249
2250 if (global_reclaim(sc)) {
2251 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2252 continue;
2253 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2254 continue;
2255 if (COMPACTION_BUILD) {
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265 if (compaction_ready(zone, sc)) {
2266 aborted_reclaim = true;
2267 continue;
2268 }
2269 }
2270
2271
2272
2273
2274
2275
2276 nr_soft_scanned = 0;
2277 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2278 sc->order, sc->gfp_mask,
2279 &nr_soft_scanned);
2280 sc->nr_reclaimed += nr_soft_reclaimed;
2281 sc->nr_scanned += nr_soft_scanned;
2282
2283 }
2284
2285 shrink_zone(priority, zone, sc);
2286 }
2287
2288 return aborted_reclaim;
2289}
2290
2291static bool zone_reclaimable(struct zone *zone)
2292{
2293 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2294}
2295
2296
2297static bool all_unreclaimable(struct zonelist *zonelist,
2298 struct scan_control *sc)
2299{
2300 struct zoneref *z;
2301 struct zone *zone;
2302
2303 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2304 gfp_zone(sc->gfp_mask), sc->nodemask) {
2305 if (!populated_zone(zone))
2306 continue;
2307 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2308 continue;
2309 if (!zone->all_unreclaimable)
2310 return false;
2311 }
2312
2313 return true;
2314}
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
/*
 * Main entry point for direct and cgroup reclaim.
 *
 * Scans the zones of @zonelist at priorities DEF_PRIORITY down to 0 until
 * sc->nr_to_reclaim pages have been reclaimed.
 *
 * Returns sc->nr_reclaimed if anything was reclaimed.  Otherwise:
 *   0 if the OOM killer is disabled;
 *   1 if reclaim was aborted in favour of compaction;
 *   1 for global reclaim while some zone is still reclaimable;
 *   0 in every other case.
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc,
					struct shrink_control *shrink)
{
	int priority;
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;

	get_mems_allowed();
	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		sc->nr_scanned = 0;
		/* The swap token is disabled on the final (priority 0) pass. */
		if (!priority)
			disable_swap_token(sc->target_mem_cgroup);
		aborted_reclaim = shrink_zones(priority, zonelist, sc);

		/*
		 * Slab caches are only shrunk for global reclaim.  lru_pages
		 * is the number of reclaimable LRU pages in the zones the
		 * current cpuset allows, passed to shrink_slab() together
		 * with the number of pages just scanned.
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;
			for_each_zone_zonelist(zone, z, zonelist,
					gfp_zone(sc->gfp_mask)) {
				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
					continue;

				lru_pages += zone_reclaimable_pages(zone);
			}

			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
		total_scanned += sc->nr_scanned;
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;

		/*
		 * Once more than 1.5 times the reclaim target has been
		 * scanned, wake the flusher threads and let reclaim write
		 * pages itself from now on.  In laptop mode the flushers are
		 * asked for 0 pages instead of total_scanned.
		 */
		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
		if (total_scanned > writeback_threshold) {
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
		}

		/*
		 * Outside hibernation, and only once the first few priorities
		 * have failed, wait up to HZ/10 if the preferred zone is
		 * congested.
		 */
		if (!sc->hibernation_mode && sc->nr_scanned &&
		    priority < DEF_PRIORITY - 2) {
			struct zone *preferred_zone;

			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
						&cpuset_current_mems_allowed,
						&preferred_zone);
			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
		}
	}

out:
	delayacct_freepages_end();
	put_mems_allowed();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/*
	 * Nothing was reclaimed.  With the OOM killer disabled, report the
	 * failure as it is.
	 */
	if (oom_killer_disabled)
		return 0;

	/* Reclaim was aborted for compaction: report progress (non-zero). */
	if (aborted_reclaim)
		return 1;

	/* Global reclaim with a zone still reclaimable: report progress too. */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}
2431
2432unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2433 gfp_t gfp_mask, nodemask_t *nodemask)
2434{
2435 unsigned long nr_reclaimed;
2436 struct scan_control sc = {
2437 .gfp_mask = gfp_mask,
2438 .may_writepage = !laptop_mode,
2439 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2440 .may_unmap = 1,
2441 .may_swap = 1,
2442 .order = order,
2443 .target_mem_cgroup = NULL,
2444 .nodemask = nodemask,
2445 };
2446 struct shrink_control shrink = {
2447 .gfp_mask = sc.gfp_mask,
2448 };
2449
2450 trace_mm_vmscan_direct_reclaim_begin(order,
2451 sc.may_writepage,
2452 gfp_mask);
2453
2454 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2455
2456 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2457
2458 return nr_reclaimed;
2459}
2460
2461#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2462
2463unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2464 gfp_t gfp_mask, bool noswap,
2465 struct zone *zone,
2466 unsigned long *nr_scanned)
2467{
2468 struct scan_control sc = {
2469 .nr_scanned = 0,
2470 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2471 .may_writepage = !laptop_mode,
2472 .may_unmap = 1,
2473 .may_swap = !noswap,
2474 .order = 0,
2475 .target_mem_cgroup = memcg,
2476 };
2477 struct mem_cgroup_zone mz = {
2478 .mem_cgroup = memcg,
2479 .zone = zone,
2480 };
2481
2482 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2483 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2484
2485 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
2486 sc.may_writepage,
2487 sc.gfp_mask);
2488
2489
2490
2491
2492
2493
2494
2495
2496 shrink_mem_cgroup_zone(0, &mz, &sc);
2497
2498 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2499
2500 *nr_scanned = sc.nr_scanned;
2501 return sc.nr_reclaimed;
2502}
2503
2504unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2505 gfp_t gfp_mask,
2506 bool noswap)
2507{
2508 struct zonelist *zonelist;
2509 unsigned long nr_reclaimed;
2510 int nid;
2511 struct scan_control sc = {
2512 .may_writepage = !laptop_mode,
2513 .may_unmap = 1,
2514 .may_swap = !noswap,
2515 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2516 .order = 0,
2517 .target_mem_cgroup = memcg,
2518 .nodemask = NULL,
2519 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2520 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2521 };
2522 struct shrink_control shrink = {
2523 .gfp_mask = sc.gfp_mask,
2524 };
2525
2526
2527
2528
2529
2530
2531 nid = mem_cgroup_select_victim_node(memcg);
2532
2533 zonelist = NODE_DATA(nid)->node_zonelists;
2534
2535 trace_mm_vmscan_memcg_reclaim_begin(0,
2536 sc.may_writepage,
2537 sc.gfp_mask);
2538
2539 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2540
2541 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2542
2543 return nr_reclaimed;
2544}
2545#endif
2546
2547static void age_active_anon(struct zone *zone, struct scan_control *sc,
2548 int priority)
2549{
2550 struct mem_cgroup *memcg;
2551
2552 if (!total_swap_pages)
2553 return;
2554
2555 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2556 do {
2557 struct mem_cgroup_zone mz = {
2558 .mem_cgroup = memcg,
2559 .zone = zone,
2560 };
2561
2562 if (inactive_anon_is_low(&mz))
2563 shrink_active_list(SWAP_CLUSTER_MAX, &mz,
2564 sc, priority, 0);
2565
2566 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2567 } while (memcg);
2568}
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2587 int classzone_idx)
2588{
2589 unsigned long present_pages = 0;
2590 int i;
2591
2592 for (i = 0; i <= classzone_idx; i++)
2593 present_pages += pgdat->node_zones[i].present_pages;
2594
2595
2596 return balanced_pages >= (present_pages >> 2);
2597}
2598
2599
2600static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2601 int classzone_idx)
2602{
2603 int i;
2604 unsigned long balanced = 0;
2605 bool all_zones_ok = true;
2606
2607
2608 if (remaining)
2609 return true;
2610
2611
2612 for (i = 0; i <= classzone_idx; i++) {
2613 struct zone *zone = pgdat->node_zones + i;
2614
2615 if (!populated_zone(zone))
2616 continue;
2617
2618
2619
2620
2621
2622
2623
2624 if (zone->all_unreclaimable) {
2625 balanced += zone->present_pages;
2626 continue;
2627 }
2628
2629 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2630 i, 0))
2631 all_zones_ok = false;
2632 else
2633 balanced += zone->present_pages;
2634 }
2635
2636
2637
2638
2639
2640
2641 if (order)
2642 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2643 else
2644 return !all_zones_ok;
2645}
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
/*
 * kswapd's reclaim loop for one node: reclaim until the zones of @pgdat
 * meet their high watermark for allocations of @order.
 *
 * Returns the order that balancing finished with; it is reset to 0 when
 * high-order reclaim is given up.  On return *@classzone_idx holds the
 * index of the highest zone that was worked on (end_zone).
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
	int all_zones_ok;
	unsigned long balanced;
	int priority;
	int i;
	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
	unsigned long total_scanned;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_unmap = 1,
		.may_swap = 1,
		/*
		 * No upper bound on reclaim here; the loop below decides
		 * when to stop.
		 */
		.nr_to_reclaim = ULONG_MAX,
		.order = order,
		.target_mem_cgroup = NULL,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
loop_again:
	total_scanned = 0;
	sc.nr_reclaimed = 0;
	sc.may_writepage = !laptop_mode;
	count_vm_event(PAGEOUTRUN);

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		unsigned long lru_pages = 0;
		int has_under_min_watermark_zone = 0;

		/* The swap token is disabled on the final (priority 0) pass. */
		if (!priority)
			disable_swap_token(NULL);

		all_zones_ok = 1;
		balanced = 0;

		/*
		 * Scan from the highest zone downwards for the first zone
		 * that needs work; its index becomes end_zone.
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			/*
			 * Age the active anon list of every zone looked at
			 * here, whether or not the zone needs reclaim.
			 */
			age_active_anon(zone, &sc, priority);

			/*
			 * With too many buffer heads, a highmem zone is
			 * reclaimed from regardless of its watermark.
			 */
			if (buffer_heads_over_limit && is_highmem_idx(i)) {
				end_zone = i;
				break;
			}

			if (!zone_watermark_ok_safe(zone, order,
					high_wmark_pages(zone), 0, 0)) {
				end_zone = i;
				break;
			} else {
				/* The zone is fine: clear its congested flag. */
				zone_clear_flag(zone, ZONE_CONGESTED);
			}
		}
		/* No zone needed work: the node is balanced. */
		if (i < 0)
			goto out;

		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			lru_pages += zone_reclaimable_pages(zone);
		}

		/*
		 * Now reclaim from the zones, lowest zone first, up to and
		 * including end_zone.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;
			int nr_slab;
			unsigned long balance_gap;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			sc.nr_scanned = 0;

			nr_soft_scanned = 0;
			/*
			 * Run soft-limit reclaim for the zone first.
			 */
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
							order, sc.gfp_mask,
							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;
			total_scanned += nr_soft_scanned;

			/*
			 * The zone is shrunk unless it is above the high
			 * watermark plus a balance gap.  The gap is the
			 * smaller of the low watermark and the zone size
			 * divided (rounding up) by
			 * KSWAPD_ZONE_BALANCE_GAP_RATIO.
			 */
			balance_gap = min(low_wmark_pages(zone),
				(zone->present_pages +
					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
				KSWAPD_ZONE_BALANCE_GAP_RATIO);
			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
				    !zone_watermark_ok_safe(zone, order,
					high_wmark_pages(zone) + balance_gap,
					end_zone, 0)) {
				shrink_zone(priority, zone, &sc);

				reclaim_state->reclaimed_slab = 0;
				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
				total_scanned += sc.nr_scanned;

				/*
				 * Nothing came out of the slab caches and the
				 * zone has been scanned beyond the
				 * zone_reclaimable() limit: give up on it.
				 */
				if (nr_slab == 0 && !zone_reclaimable(zone))
					zone->all_unreclaimable = 1;
			}

			/*
			 * Allow page writeout once scanning has clearly
			 * outpaced reclaim (scanned more than 1.5 times the
			 * number of pages reclaimed).
			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
				sc.may_writepage = 1;

			if (zone->all_unreclaimable) {
				/* Drop a hopeless top zone from the range. */
				if (end_zone && end_zone == i)
					end_zone--;
				continue;
			}

			if (!zone_watermark_ok_safe(zone, order,
					high_wmark_pages(zone), end_zone, 0)) {
				all_zones_ok = 0;
				/*
				 * Remember zones that are even below the min
				 * watermark, so that the congestion wait
				 * below is skipped.
				 */
				if (!zone_watermark_ok_safe(zone, order,
					    min_wmark_pages(zone), end_zone, 0))
					has_under_min_watermark_zone = 1;
			} else {
				/*
				 * The zone meets its high watermark: clear the
				 * congested flag, and count its pages as
				 * balanced if it lies within the caller's
				 * classzone.
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
				if (i <= *classzone_idx)
					balanced += zone->present_pages;
			}

		}
		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
			break;		/* kswapd: all done */
		/*
		 * Still unbalanced after the first few priorities: wait for
		 * some I/O to complete, unless a zone is under its min
		 * watermark.
		 */
		if (total_scanned && (priority < DEF_PRIORITY - 2)) {
			if (has_under_min_watermark_zone)
				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
			else
				congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

		/*
		 * Leave the priority loop once a batch of SWAP_CLUSTER_MAX
		 * pages has been reclaimed; the check after 'out' restarts
		 * from DEF_PRIORITY if the node is still unbalanced.
		 */
		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
out:

	/*
	 * Start over unless all zones are ok or, for a high order, the node
	 * as a whole is balanced.
	 */
	if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
		cond_resched();

		try_to_freeze();

		/*
		 * Less than one batch reclaimed in this whole run: drop back
		 * to order-0 reclaim before trying again.
		 */
		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
			order = sc.order = 0;

		goto loop_again;
	}

	/*
	 * After high-order balancing, make sure each zone still meets its
	 * order-0 high watermark; if one does not, restart as order-0
	 * reclaim.
	 */
	if (order) {
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			/* Confirm the zone is balanced for order-0 */
			if (!zone_watermark_ok(zone, 0,
					high_wmark_pages(zone), 0, 0)) {
				order = sc.order = 0;
				goto loop_again;
			}

			/* If balanced, clear the congested flag */
			zone_clear_flag(zone, ZONE_CONGESTED);
			if (i <= *classzone_idx)
				balanced += zone->present_pages;
		}
	}

	/*
	 * Report the order balancing ended with and the highest zone that
	 * was worked on back to the caller.
	 */
	*classzone_idx = end_zone;
	return order;
}
2954
/*
 * Put kswapd to sleep on pgdat->kswapd_wait if the node looks balanced
 * for @order and @classzone_idx.
 *
 * The sleep has two stages: a short HZ/10 nap first, and an indefinite
 * sleep only if the node is still balanced after the nap and the nap was
 * not cut short by a wakeup.
 */
static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);

	if (freezing(current) || kthread_should_stop())
		return;

	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

	/* Try a short nap first. */
	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
		remaining = schedule_timeout(HZ/10);
		finish_wait(&pgdat->kswapd_wait, &wait);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}

	/*
	 * A non-zero 'remaining' (the nap was cut short) makes
	 * sleeping_prematurely() return true, so the full sleep is only
	 * entered after an undisturbed nap with the node still balanced.
	 */
	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

		/*
		 * The per-cpu vmstat thresholds are switched to the normal
		 * calculation for the duration of the sleep and back to the
		 * pressure calculation once kswapd is woken.
		 */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
		schedule();
		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else {
		/* Count which watermark ended the attempt to sleep. */
		if (remaining)
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait);
}
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012static int kswapd(void *p)
3013{
3014 unsigned long order, new_order;
3015 unsigned balanced_order;
3016 int classzone_idx, new_classzone_idx;
3017 int balanced_classzone_idx;
3018 pg_data_t *pgdat = (pg_data_t*)p;
3019 struct task_struct *tsk = current;
3020
3021 struct reclaim_state reclaim_state = {
3022 .reclaimed_slab = 0,
3023 };
3024 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3025
3026 lockdep_set_current_reclaim_state(GFP_KERNEL);
3027
3028 if (!cpumask_empty(cpumask))
3029 set_cpus_allowed_ptr(tsk, cpumask);
3030 current->reclaim_state = &reclaim_state;
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3045 set_freezable();
3046
3047 order = new_order = 0;
3048 balanced_order = 0;
3049 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
3050 balanced_classzone_idx = classzone_idx;
3051 for ( ; ; ) {
3052 int ret;
3053
3054
3055
3056
3057
3058
3059 if (balanced_classzone_idx >= new_classzone_idx &&
3060 balanced_order == new_order) {
3061 new_order = pgdat->kswapd_max_order;
3062 new_classzone_idx = pgdat->classzone_idx;
3063 pgdat->kswapd_max_order = 0;
3064 pgdat->classzone_idx = pgdat->nr_zones - 1;
3065 }
3066
3067 if (order < new_order || classzone_idx > new_classzone_idx) {
3068
3069
3070
3071
3072 order = new_order;
3073 classzone_idx = new_classzone_idx;
3074 } else {
3075 kswapd_try_to_sleep(pgdat, balanced_order,
3076 balanced_classzone_idx);
3077 order = pgdat->kswapd_max_order;
3078 classzone_idx = pgdat->classzone_idx;
3079 new_order = order;
3080 new_classzone_idx = classzone_idx;
3081 pgdat->kswapd_max_order = 0;
3082 pgdat->classzone_idx = pgdat->nr_zones - 1;
3083 }
3084
3085 ret = try_to_freeze();
3086 if (kthread_should_stop())
3087 break;
3088
3089
3090
3091
3092
3093 if (!ret) {
3094 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
3095 balanced_classzone_idx = classzone_idx;
3096 balanced_order = balance_pgdat(pgdat, order,
3097 &balanced_classzone_idx);
3098 }
3099 }
3100 return 0;
3101}
3102
3103
3104
3105
3106void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3107{
3108 pg_data_t *pgdat;
3109
3110 if (!populated_zone(zone))
3111 return;
3112
3113 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3114 return;
3115 pgdat = zone->zone_pgdat;
3116 if (pgdat->kswapd_max_order < order) {
3117 pgdat->kswapd_max_order = order;
3118 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3119 }
3120 if (!waitqueue_active(&pgdat->kswapd_wait))
3121 return;
3122 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
3123 return;
3124
3125 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3126 wake_up_interruptible(&pgdat->kswapd_wait);
3127}
3128
3129
3130
3131
3132
3133
3134
3135
3136unsigned long global_reclaimable_pages(void)
3137{
3138 int nr;
3139
3140 nr = global_page_state(NR_ACTIVE_FILE) +
3141 global_page_state(NR_INACTIVE_FILE);
3142
3143 if (nr_swap_pages > 0)
3144 nr += global_page_state(NR_ACTIVE_ANON) +
3145 global_page_state(NR_INACTIVE_ANON);
3146
3147 return nr;
3148}
3149
3150unsigned long zone_reclaimable_pages(struct zone *zone)
3151{
3152 int nr;
3153
3154 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3155 zone_page_state(zone, NR_INACTIVE_FILE);
3156
3157 if (nr_swap_pages > 0)
3158 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3159 zone_page_state(zone, NR_INACTIVE_ANON);
3160
3161 return nr;
3162}
3163
3164#ifdef CONFIG_HIBERNATION
3165
3166
3167
3168
3169
3170
3171
3172
3173unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3174{
3175 struct reclaim_state reclaim_state;
3176 struct scan_control sc = {
3177 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3178 .may_swap = 1,
3179 .may_unmap = 1,
3180 .may_writepage = 1,
3181 .nr_to_reclaim = nr_to_reclaim,
3182 .hibernation_mode = 1,
3183 .order = 0,
3184 };
3185 struct shrink_control shrink = {
3186 .gfp_mask = sc.gfp_mask,
3187 };
3188 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3189 struct task_struct *p = current;
3190 unsigned long nr_reclaimed;
3191
3192 p->flags |= PF_MEMALLOC;
3193 lockdep_set_current_reclaim_state(sc.gfp_mask);
3194 reclaim_state.reclaimed_slab = 0;
3195 p->reclaim_state = &reclaim_state;
3196
3197 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3198
3199 p->reclaim_state = NULL;
3200 lockdep_clear_current_reclaim_state();
3201 p->flags &= ~PF_MEMALLOC;
3202
3203 return nr_reclaimed;
3204}
3205#endif
3206
3207
3208
3209
3210
3211static int __devinit cpu_callback(struct notifier_block *nfb,
3212 unsigned long action, void *hcpu)
3213{
3214 int nid;
3215
3216 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3217 for_each_node_state(nid, N_HIGH_MEMORY) {
3218 pg_data_t *pgdat = NODE_DATA(nid);
3219 const struct cpumask *mask;
3220
3221 mask = cpumask_of_node(pgdat->node_id);
3222
3223 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3224
3225 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3226 }
3227 }
3228 return NOTIFY_OK;
3229}
3230
3231
3232
3233
3234
3235int kswapd_run(int nid)
3236{
3237 pg_data_t *pgdat = NODE_DATA(nid);
3238 int ret = 0;
3239
3240 if (pgdat->kswapd)
3241 return 0;
3242
3243 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3244 if (IS_ERR(pgdat->kswapd)) {
3245
3246 BUG_ON(system_state == SYSTEM_BOOTING);
3247 printk("Failed to start kswapd on node %d\n",nid);
3248 ret = -1;
3249 }
3250 return ret;
3251}
3252
3253
3254
3255
3256void kswapd_stop(int nid)
3257{
3258 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3259
3260 if (kswapd)
3261 kthread_stop(kswapd);
3262}
3263
/*
 * Boot-time initialisation, registered through module_init() below:
 * call swap_setup(), start a kswapd thread for every node in the
 * N_HIGH_MEMORY state and register the CPU hotplug callback.
 * Always returns 0.
 */
static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_HIGH_MEMORY)
 		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}

module_init(kswapd_init)
3276
3277#ifdef CONFIG_NUMA
3278
3279
3280
3281
3282
3283
/*
 * Zone reclaim mode: a bitmask of the RECLAIM_* flags below.  It is
 * consulted by zone_pagecache_reclaimable() and __zone_reclaim().
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
/* NOTE(review): RECLAIM_ZONE is not tested in this part of the file; presumably it enables zone reclaim as a whole - confirm against the callers. */
#define RECLAIM_ZONE (1<<0)
/* When set, __zone_reclaim() allows page writeout; when clear, dirty file pages are not counted as reclaimable. */
#define RECLAIM_WRITE (1<<1)
/* When set, __zone_reclaim() allows unmapping pages and all file pages are counted as reclaimable. */
#define RECLAIM_SWAP (1<<2)

/*
 * Scan priority that __zone_reclaim() starts from; it is decremented
 * after each shrink_zone() pass until it would drop below zero or enough
 * pages have been reclaimed.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * NOTE(review): presumably the percentage used to derive
 * zone->min_unmapped_pages, the page cache threshold checked by
 * zone_reclaim() - the derivation is not visible here, confirm.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * NOTE(review): presumably the percentage used to derive
 * zone->min_slab_pages, the reclaimable slab threshold checked by
 * zone_reclaim() - the derivation is not visible here, confirm.
 */
int sysctl_min_slab_ratio = 5;
3309
3310static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3311{
3312 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3313 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3314 zone_page_state(zone, NR_ACTIVE_FILE);
3315
3316
3317
3318
3319
3320
3321 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3322}
3323
3324
3325static long zone_pagecache_reclaimable(struct zone *zone)
3326{
3327 long nr_pagecache_reclaimable;
3328 long delta = 0;
3329
3330
3331
3332
3333
3334
3335
3336 if (zone_reclaim_mode & RECLAIM_SWAP)
3337 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3338 else
3339 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3340
3341
3342 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3343 delta += zone_page_state(zone, NR_FILE_DIRTY);
3344
3345
3346 if (unlikely(delta > nr_pagecache_reclaimable))
3347 delta = nr_pagecache_reclaimable;
3348
3349 return nr_pagecache_reclaimable - delta;
3350}
3351
3352
3353
3354
3355static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3356{
3357
3358 const unsigned long nr_pages = 1 << order;
3359 struct task_struct *p = current;
3360 struct reclaim_state reclaim_state;
3361 int priority;
3362 struct scan_control sc = {
3363 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3364 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3365 .may_swap = 1,
3366 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3367 SWAP_CLUSTER_MAX),
3368 .gfp_mask = gfp_mask,
3369 .order = order,
3370 };
3371 struct shrink_control shrink = {
3372 .gfp_mask = sc.gfp_mask,
3373 };
3374 unsigned long nr_slab_pages0, nr_slab_pages1;
3375
3376 cond_resched();
3377
3378
3379
3380
3381
3382 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3383 lockdep_set_current_reclaim_state(gfp_mask);
3384 reclaim_state.reclaimed_slab = 0;
3385 p->reclaim_state = &reclaim_state;
3386
3387 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3388
3389
3390
3391
3392 priority = ZONE_RECLAIM_PRIORITY;
3393 do {
3394 shrink_zone(priority, zone, &sc);
3395 priority--;
3396 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3397 }
3398
3399 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3400 if (nr_slab_pages0 > zone->min_slab_pages) {
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411 for (;;) {
3412 unsigned long lru_pages = zone_reclaimable_pages(zone);
3413
3414
3415 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3416 break;
3417
3418
3419 nr_slab_pages1 = zone_page_state(zone,
3420 NR_SLAB_RECLAIMABLE);
3421 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3422 break;
3423 }
3424
3425
3426
3427
3428
3429 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3430 if (nr_slab_pages1 < nr_slab_pages0)
3431 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3432 }
3433
3434 p->reclaim_state = NULL;
3435 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3436 lockdep_clear_current_reclaim_state();
3437 return sc.nr_reclaimed >= nr_pages;
3438}
3439
3440int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3441{
3442 int node_id;
3443 int ret;
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3456 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3457 return ZONE_RECLAIM_FULL;
3458
3459 if (zone->all_unreclaimable)
3460 return ZONE_RECLAIM_FULL;
3461
3462
3463
3464
3465 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3466 return ZONE_RECLAIM_NOSCAN;
3467
3468
3469
3470
3471
3472
3473
3474 node_id = zone_to_nid(zone);
3475 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3476 return ZONE_RECLAIM_NOSCAN;
3477
3478 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3479 return ZONE_RECLAIM_NOSCAN;
3480
3481 ret = __zone_reclaim(zone, gfp_mask, order);
3482 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3483
3484 if (!ret)
3485 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3486
3487 return ret;
3488}
3489#endif
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
/*
 * Decide whether @page may be placed on an evictable LRU list.
 *
 * @page: the page to test
 * @vma:  the VMA the page is being considered in, or NULL if unknown
 *
 * Returns 0 when the page's mapping is marked unevictable, when the page
 * is mlocked, or when @vma is given and is_mlocked_vma() reports the page
 * as belonging to an mlocked VMA; returns 1 otherwise.
 */
int page_evictable(struct page *page, struct vm_area_struct *vma)
{
	if (mapping_unevictable(page_mapping(page)))
		return 0;

	if (PageMlocked(page))
		return 0;

	/* Only consult the VMA when the caller supplied one. */
	if (vma && is_mlocked_vma(vma, page))
		return 0;

	return 1;
}
3516
3517#ifdef CONFIG_SHMEM
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527void check_move_unevictable_pages(struct page **pages, int nr_pages)
3528{
3529 struct lruvec *lruvec;
3530 struct zone *zone = NULL;
3531 int pgscanned = 0;
3532 int pgrescued = 0;
3533 int i;
3534
3535 for (i = 0; i < nr_pages; i++) {
3536 struct page *page = pages[i];
3537 struct zone *pagezone;
3538
3539 pgscanned++;
3540 pagezone = page_zone(page);
3541 if (pagezone != zone) {
3542 if (zone)
3543 spin_unlock_irq(&zone->lru_lock);
3544 zone = pagezone;
3545 spin_lock_irq(&zone->lru_lock);
3546 }
3547
3548 if (!PageLRU(page) || !PageUnevictable(page))
3549 continue;
3550
3551 if (page_evictable(page, NULL)) {
3552 enum lru_list lru = page_lru_base_type(page);
3553
3554 VM_BUG_ON(PageActive(page));
3555 ClearPageUnevictable(page);
3556 __dec_zone_state(zone, NR_UNEVICTABLE);
3557 lruvec = mem_cgroup_lru_move_lists(zone, page,
3558 LRU_UNEVICTABLE, lru);
3559 list_move(&page->lru, &lruvec->lists[lru]);
3560 __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
3561 pgrescued++;
3562 }
3563 }
3564
3565 if (zone) {
3566 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3567 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3568 spin_unlock_irq(&zone->lru_lock);
3569 }
3570}
3571#endif
3572
/*
 * Print a one-time warning, tagged with the name of the calling task,
 * that the scan_unevictable_pages sysctl/node interface is disabled.
 * printk_once() limits the message to a single emission.
 */
static void warn_scan_unevictable_pages(void)
{
	printk_once(KERN_WARNING
		    "%s: The scan_unevictable_pages sysctl/node-interface has been "
		    "disabled for lack of a legitimate use case. If you have "
		    "one, please send an email to linux-mm@kvack.org.\n",
		    current->comm);
}
3581
3582
3583
3584
3585
3586unsigned long scan_unevictable_pages;
3587
3588int scan_unevictable_handler(struct ctl_table *table, int write,
3589 void __user *buffer,
3590 size_t *length, loff_t *ppos)
3591{
3592 warn_scan_unevictable_pages();
3593 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3594 scan_unevictable_pages = 0;
3595 return 0;
3596}
3597
3598#ifdef CONFIG_NUMA
3599
3600
3601
3602
3603
3604static ssize_t read_scan_unevictable_node(struct device *dev,
3605 struct device_attribute *attr,
3606 char *buf)
3607{
3608 warn_scan_unevictable_pages();
3609 return sprintf(buf, "0\n");
3610}
3611
3612static ssize_t write_scan_unevictable_node(struct device *dev,
3613 struct device_attribute *attr,
3614 const char *buf, size_t count)
3615{
3616 warn_scan_unevictable_pages();
3617 return 1;
3618}
3619
3620
/*
 * Per-node "scan_unevictable_pages" device attribute: readable by
 * everyone, writable by the owner, backed by the two handlers above.
 */
static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
		   read_scan_unevictable_node,
		   write_scan_unevictable_node);
3624
3625int scan_unevictable_register_node(struct node *node)
3626{
3627 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3628}
3629
3630void scan_unevictable_unregister_node(struct node *node)
3631{
3632 device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3633}
3634#endif
3635