/*
 *  linux/mm/vmscan.c
 *
 *  Page reclaim: scanning of the zone LRU lists, invocation of the slab
 *  shrinkers, and the per-node kswapd background reclaim daemon.
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

/*
 * reclaim_mode determines how the inactive list is shrunk:
 * RECLAIM_MODE_SINGLE:       Reclaim only order-0 pages
 * RECLAIM_MODE_ASYNC:        Do not block
 * RECLAIM_MODE_SYNC:         Allow blocking e.g. call wait_on_page_writeback
 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
 *                            page from the LRU and reclaim all pages within
 *                            a naturally aligned range
 * RECLAIM_MODE_COMPACTION:   For high-order allocations, reclaim a number of
 *                            order-0 pages and then compact the zone
 */
typedef unsigned __bitwise__ reclaim_mode_t;
#define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
#define RECLAIM_MODE_ASYNC		((__force reclaim_mode_t)0x02u)
#define RECLAIM_MODE_SYNC		((__force reclaim_mode_t)0x04u)
#define RECLAIM_MODE_LUMPYRECLAIM	((__force reclaim_mode_t)0x08u)
#define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	unsigned long hibernation_mode;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can mapped pages be reclaimed? */
	int may_unmap;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	int order;

	/*
	 * Intend to reclaim enough continuous memory rather than reclaim
	 * enough amount of memory, i.e. mode for high order allocation.
	 */
	reclaim_mode_t reclaim_mode;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Nodemask of nodes allowed by caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;
};

struct mem_cgroup_zone {
	struct mem_cgroup *mem_cgroup;
	struct zone *zone;
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}

static bool scanning_global_lru(struct mem_cgroup_zone *mz)
{
	return !mz->mem_cgroup;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

static bool scanning_global_lru(struct mem_cgroup_zone *mz)
{
	return true;
}
#endif

static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
{
	if (!scanning_global_lru(mz))
		return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);

	return &mz->zone->reclaim_stat;
}

static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
				       enum lru_list lru)
{
	if (!scanning_global_lru(mz))
		return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
						    zone_to_nid(mz->zone),
						    zone_idx(mz->zone),
						    BIT(lru));

	return zone_page_state(mz->zone, NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm.
 */
void register_shrinker(struct shrinker *shrinker)
{
	atomic_long_set(&shrinker->nr_in_batch, 0);
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);

static inline int do_shrinker_shrink(struct shrinker *shrinker,
				     struct shrink_control *sc,
				     unsigned long nr_to_scan)
{
	sc->nr_to_scan = nr_to_scan;
	return (*shrinker->shrink)(shrinker, sc);
}

#define SHRINK_BATCH 128

/*
 * Call the shrink functions to age shrinkable caches.
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid having to scan the dcache and icache, but in
 * fact this is all we can do.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrink,
			  unsigned long nr_pages_scanned,
			  unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (nr_pages_scanned == 0)
		nr_pages_scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem)) {
		/* Assume we'll be able to shrink next time */
		ret = 1;
		goto out;
	}

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		long total_scan;
		long max_pass;
		int shrink_ret = 0;
		long nr;
		long new_nr;
		long batch_size = shrinker->batch ? shrinker->batch
						  : SHRINK_BATCH;

		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
		if (max_pass <= 0)
			continue;

		/*
		 * Copy the current shrinker scan count into a local variable
		 * and zero it so that other concurrent shrinker invocations
		 * don't also do this scanning work.
		 */
		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);

		total_scan = nr;
		delta = (4 * nr_pages_scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		total_scan += delta;
		if (total_scan < 0) {
			printk(KERN_ERR "shrink_slab: %pF negative objects to "
			       "delete nr=%ld\n",
			       shrinker->shrink, total_scan);
			total_scan = max_pass;
		}

		/*
		 * We need to avoid excessive windup on filesystem shrinkers
		 * due to large numbers of GFP_NOFS allocations causing the
		 * shrinkers to return -1 all the time. This results in a
		 * large nr being built up so when a shrink that can do some
		 * work comes along it empties the entire cache due to
		 * nr >>> max_pass.  This is bad for sustaining a working set
		 * in memory.
		 *
		 * Hence only allow the shrinker to scan the entire cache when
		 * a large delta change is calculated directly.
		 */
		if (delta < max_pass / 4)
			total_scan = min(total_scan, max_pass / 2);

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
		if (total_scan > max_pass * 2)
			total_scan = max_pass * 2;

		trace_mm_shrink_slab_start(shrinker, shrink, nr,
					   nr_pages_scanned, lru_pages,
					   max_pass, delta, total_scan);

		while (total_scan >= batch_size) {
			int nr_before;

			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
			shrink_ret = do_shrinker_shrink(shrinker, shrink,
							batch_size);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, batch_size);
			total_scan -= batch_size;

			cond_resched();
		}

		/*
		 * Move the unused scan count back into the shrinker in a
		 * manner that handles concurrent updates. If we exhausted the
		 * scan, there is no need to do an update.
		 */
		if (total_scan > 0)
			new_nr = atomic_long_add_return(total_scan,
					&shrinker->nr_in_batch);
		else
			new_nr = atomic_long_read(&shrinker->nr_in_batch);

		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
	}
	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return ret;
}

static void set_reclaim_mode(int priority, struct scan_control *sc,
			     bool sync)
{
	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;

	/*
	 * Initially assume we are entering either lumpy reclaim or
	 * reclaim/compaction. Depending on the order, we will either set the
	 * sync mode or just reclaim order-0 pages later.
	 */
	if (COMPACTION_BUILD)
		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
	else
		sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;

	/*
	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
	 * restricting when it is set to either costly allocations or when
	 * under memory pressure.
	 */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->reclaim_mode |= syncmode;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->reclaim_mode |= syncmode;
	else
		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

static void reset_reclaim_mode(struct scan_control *sc)
{
	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
	return page_count(page) - page_has_private(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi,
			      struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;

	/* lumpy reclaim for hugepage often needs a lot of write */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller
 * has __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
			 struct scan_control *sc)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_aio_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't report writeback
	 * completion.
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info, sc))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		trace_mm_vmscan_writepage(page,
			trace_reclaim_flags(page, sc->reclaim_mode));
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
539static int __remove_mapping(struct address_space *mapping, struct page *page)
540{
541 BUG_ON(!PageLocked(page));
542 BUG_ON(mapping != page_mapping(page));
543
544 spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
570 if (!page_freeze_refs(page, 2))
571 goto cannot_free;
572
573 if (unlikely(PageDirty(page))) {
574 page_unfreeze_refs(page, 2);
575 goto cannot_free;
576 }
577
578 if (PageSwapCache(page)) {
579 swp_entry_t swap = { .val = page_private(page) };
580 __delete_from_swap_cache(page);
581 spin_unlock_irq(&mapping->tree_lock);
582 swapcache_free(swap, page);
583 } else {
584 void (*freepage)(struct page *);
585
586 freepage = mapping->a_ops->freepage;
587
588 __delete_from_page_cache(page);
589 spin_unlock_irq(&mapping->tree_lock);
590 mem_cgroup_uncharge_cache_page(page);
591
592 if (freepage != NULL)
593 freepage(page);
594 }
595
596 return 1;
597
598cannot_free:
599 spin_unlock_irq(&mapping->tree_lock);
600 return 0;
601}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
609int remove_mapping(struct address_space *mapping, struct page *page)
610{
611 if (__remove_mapping(mapping, page)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
617 page_unfreeze_refs(page, 1);
618 return 1;
619 }
620 return 0;
621}

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
632void putback_lru_page(struct page *page)
633{
634 int lru;
635 int active = !!TestClearPageActive(page);
636 int was_unevictable = PageUnevictable(page);
637
638 VM_BUG_ON(PageLRU(page));
639
640redo:
641 ClearPageUnevictable(page);
642
643 if (page_evictable(page, NULL)) {
644
645
646
647
648
649
650 lru = active + page_lru_base_type(page);
651 lru_cache_add_lru(page, lru);
652 } else {
653
654
655
656
657 lru = LRU_UNEVICTABLE;
658 add_page_to_unevictable_list(page);
659
660
661
662
663
664
665
666
667
668
669 smp_mb();
670 }
671
672
673
674
675
676
677 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
678 if (!isolate_lru_page(page)) {
679 put_page(page);
680 goto redo;
681 }
682
683
684
685
686 }
687
688 if (was_unevictable && lru != LRU_UNEVICTABLE)
689 count_vm_event(UNEVICTABLE_PGRESCUED);
690 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
691 count_vm_event(UNEVICTABLE_PGCULLED);
692
693 put_page(page);
694}
695
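/*
 * Possible outcomes of checking a page's references in shrink_page_list():
 * reclaim it, reclaim only if it is clean, keep it on the inactive list for
 * another pass, or promote it back to the active list.
 */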
696enum page_references {
697 PAGEREF_RECLAIM,
698 PAGEREF_RECLAIM_CLEAN,
699 PAGEREF_KEEP,
700 PAGEREF_ACTIVATE,
701};
702
703static enum page_references page_check_references(struct page *page,
704 struct mem_cgroup_zone *mz,
705 struct scan_control *sc)
706{
707 int referenced_ptes, referenced_page;
708 unsigned long vm_flags;
709
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
711 referenced_page = TestClearPageReferenced(page);
712
	/* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;

	/*
	 * Mlock lost the isolation race with us.  Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
721 if (vm_flags & VM_LOCKED)
722 return PAGEREF_RECLAIM;
723
724 if (referenced_ptes) {
725 if (PageAnon(page))
726 return PAGEREF_ACTIVATE;

		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
741 SetPageReferenced(page);
742
743 if (referenced_page || referenced_ptes > 1)
744 return PAGEREF_ACTIVATE;

		/*
		 * Activate file-backed executable pages after first usage.
		 */
749 if (vm_flags & VM_EXEC)
750 return PAGEREF_ACTIVATE;
751
752 return PAGEREF_KEEP;
753 }
754
755
756 if (referenced_page && !PageSwapBacked(page))
757 return PAGEREF_RECLAIM_CLEAN;
758
759 return PAGEREF_RECLAIM;
760}

/*
 * shrink_page_list() returns the number of reclaimed pages.
 */
765static unsigned long shrink_page_list(struct list_head *page_list,
766 struct mem_cgroup_zone *mz,
767 struct scan_control *sc,
768 int priority,
769 unsigned long *ret_nr_dirty,
770 unsigned long *ret_nr_writeback)
771{
772 LIST_HEAD(ret_pages);
773 LIST_HEAD(free_pages);
774 int pgactivate = 0;
775 unsigned long nr_dirty = 0;
776 unsigned long nr_congested = 0;
777 unsigned long nr_reclaimed = 0;
778 unsigned long nr_writeback = 0;
779
780 cond_resched();
781
782 while (!list_empty(page_list)) {
783 enum page_references references;
784 struct address_space *mapping;
785 struct page *page;
786 int may_enter_fs;
787
788 cond_resched();
789
790 page = lru_to_page(page_list);
791 list_del(&page->lru);
792
793 if (!trylock_page(page))
794 goto keep;
795
796 VM_BUG_ON(PageActive(page));
797 VM_BUG_ON(page_zone(page) != mz->zone);
798
799 sc->nr_scanned++;
800
801 if (unlikely(!page_evictable(page, NULL)))
802 goto cull_mlocked;
803
804 if (!sc->may_unmap && page_mapped(page))
805 goto keep_locked;
806
807
808 if (page_mapped(page) || PageSwapCache(page))
809 sc->nr_scanned++;
810
811 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
812 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
813
814 if (PageWriteback(page)) {
815 nr_writeback++;
			/*
			 * Synchronous reclaim cannot queue pages for
			 * writeback due to the possibility of stack overflow
			 * but if it encounters a page under writeback, wait
			 * for the IO to complete.
			 */
822 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
823 may_enter_fs)
824 wait_on_page_writeback(page);
825 else {
826 unlock_page(page);
827 goto keep_lumpy;
828 }
829 }
830
831 references = page_check_references(page, mz, sc);
832 switch (references) {
833 case PAGEREF_ACTIVATE:
834 goto activate_locked;
835 case PAGEREF_KEEP:
836 goto keep_locked;
837 case PAGEREF_RECLAIM:
838 case PAGEREF_RECLAIM_CLEAN:
839 ;
840 }
841
842
843
844
845
846 if (PageAnon(page) && !PageSwapCache(page)) {
847 if (!(sc->gfp_mask & __GFP_IO))
848 goto keep_locked;
849 if (!add_to_swap(page))
850 goto activate_locked;
851 may_enter_fs = 1;
852 }
853
854 mapping = page_mapping(page);
855
856
857
858
859
860 if (page_mapped(page) && mapping) {
861 switch (try_to_unmap(page, TTU_UNMAP)) {
862 case SWAP_FAIL:
863 goto activate_locked;
864 case SWAP_AGAIN:
865 goto keep_locked;
866 case SWAP_MLOCK:
867 goto cull_mlocked;
868 case SWAP_SUCCESS:
869 ;
870 }
871 }
872
873 if (PageDirty(page)) {
874 nr_dirty++;

			/*
			 * Only kswapd can writeback filesystem pages to
			 * avoid risk of stack overflow but do not writeback
			 * unless under significant pressure.
			 */
881 if (page_is_file_cache(page) &&
882 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principle to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty.
				 */
889 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
890 SetPageReclaim(page);
891
892 goto keep_locked;
893 }
894
895 if (references == PAGEREF_RECLAIM_CLEAN)
896 goto keep_locked;
897 if (!may_enter_fs)
898 goto keep_locked;
899 if (!sc->may_writepage)
900 goto keep_locked;
901
902
903 switch (pageout(page, mapping, sc)) {
904 case PAGE_KEEP:
905 nr_congested++;
906 goto keep_locked;
907 case PAGE_ACTIVATE:
908 goto activate_locked;
909 case PAGE_SUCCESS:
910 if (PageWriteback(page))
911 goto keep_lumpy;
912 if (PageDirty(page))
913 goto keep;
914
915
916
917
918
919 if (!trylock_page(page))
920 goto keep;
921 if (PageDirty(page) || PageWriteback(page))
922 goto keep_locked;
923 mapping = page_mapping(page);
924 case PAGE_CLEAN:
925 ;
926 }
927 }
928

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is
		 * actually clean (all its buffers are clean).  This happens
		 * if the buffers were written out directly, with
		 * submit_bh().  ext3 will do this, as well as the blockdev
		 * mapping.  try_to_release_page() will discover that
		 * cleanness and will drop the buffers and mark the page
		 * clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These
		 * are the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers
		 * here and if that worked, and the page is no longer mapped
		 * into process address space (page_count == 1) it can be
		 * freed.  Otherwise, leave the page on the LRU so it is
		 * swappable.
		 */
950 if (page_has_private(page)) {
951 if (!try_to_release_page(page, sc->gfp_mask))
952 goto activate_locked;
953 if (!mapping && page_count(page) == 1) {
954 unlock_page(page);
955 if (put_page_testzero(page))
956 goto free_it;
957 else {
958
959
960
961
962
963
964
965 nr_reclaimed++;
966 continue;
967 }
968 }
969 }
970
971 if (!mapping || !__remove_mapping(mapping, page))
972 goto keep_locked;
973
974
975
976
977
978
979
980
981 __clear_page_locked(page);
982free_it:
983 nr_reclaimed++;
984
985
986
987
988
989 list_add(&page->lru, &free_pages);
990 continue;
991
992cull_mlocked:
993 if (PageSwapCache(page))
994 try_to_free_swap(page);
995 unlock_page(page);
996 putback_lru_page(page);
997 reset_reclaim_mode(sc);
998 continue;
999
1000activate_locked:
1001
1002 if (PageSwapCache(page) && vm_swap_full())
1003 try_to_free_swap(page);
1004 VM_BUG_ON(PageActive(page));
1005 SetPageActive(page);
1006 pgactivate++;
1007keep_locked:
1008 unlock_page(page);
1009keep:
1010 reset_reclaim_mode(sc);
1011keep_lumpy:
1012 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 }

	/*
	 * Tag a zone as congested if all the dirty pages encountered were
	 * backed by a congested BDI. In this case, reclaimers should just
	 * back off and wait for congestion to clear because further reclaim
	 * will encounter the same problem.
	 */
1022 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
1023 zone_set_flag(mz->zone, ZONE_CONGESTED);
1024
1025 free_hot_cold_page_list(&free_pages, 1);
1026
1027 list_splice(&ret_pages, page_list);
1028 count_vm_events(PGACTIVATE, pgactivate);
1029 *ret_nr_dirty += nr_dirty;
1030 *ret_nr_writeback += nr_writeback;
1031 return nr_reclaimed;
1032}

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
1044int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1045{
1046 bool all_lru_mode;
1047 int ret = -EINVAL;
1048
1049
1050 if (!PageLRU(page))
1051 return ret;
1052
1053 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1054 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1055
1056
1057
1058
1059
1060
1061 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1062 return ret;
1063
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret;
1066
1067
1068
1069
1070
1071
1072 if (PageUnevictable(page))
1073 return ret;
1074
1075 ret = -EBUSY;
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1089
1090 if (PageWriteback(page))
1091 return ret;
1092
1093 if (PageDirty(page)) {
1094 struct address_space *mapping;
1095
1096
1097 if (mode & ISOLATE_CLEAN)
1098 return ret;
1099
1100
1101
1102
1103
1104
1105 mapping = page_mapping(page);
1106 if (mapping && !mapping->a_ops->migratepage)
1107 return ret;
1108 }
1109 }
1110
1111 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1112 return ret;
1113
1114 if (likely(get_page_unless_zero(page))) {
1115
1116
1117
1118
1119
1120 ClearPageLRU(page);
1121 ret = 0;
1122 }
1123
1124 return ret;
1125}

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @mz:		The mem_cgroup_zone to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @order:	The caller's attempted allocation order.
 * @mode:	One of the LRU isolation modes.
 * @active:	True [1] if isolating active pages.
 * @file:	True [1] if isolating file [!anon] pages.
 *
 * returns how many pages were moved onto *@dst.
 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst,
1150 unsigned long *nr_scanned, int order, isolate_mode_t mode,
1151 int active, int file)
1152{
1153 struct lruvec *lruvec;
1154 struct list_head *src;
1155 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan;
1160 int lru = LRU_BASE;
1161
1162 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1163 if (active)
1164 lru += LRU_ACTIVE;
1165 if (file)
1166 lru += LRU_FILE;
1167 src = &lruvec->lists[lru];
1168
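	/*
	 * Scan up to nr_to_scan pages from the tail of the source list,
	 * moving the ones we can isolate onto @dst and counting how many
	 * base pages were taken.
	 */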
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page;
1171 unsigned long pfn;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175
1176 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags);
1178
1179 VM_BUG_ON(!PageLRU(page));
1180
1181 switch (__isolate_lru_page(page, mode, file)) {
1182 case 0:
1183 mem_cgroup_lru_del(page);
1184 list_move(&page->lru, dst);
1185 nr_taken += hpage_nr_pages(page);
1186 break;
1187
1188 case -EBUSY:
1189
1190 list_move(&page->lru, src);
1191 continue;
1192
1193 default:
1194 BUG();
1195 }
1196
1197 if (!order)
1198 continue;

		/*
		 * Lumpy reclaim: for a high-order request, also try to
		 * isolate the other pages in the naturally aligned,
		 * order-sized pfn block around the page we just took, so
		 * that a physically contiguous range can be freed.  Only
		 * pages in the same zone that pass __isolate_lru_page()
		 * are taken.
		 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << order) - 1);
1212 end_pfn = pfn + (1 << order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230
1231
1232
1233
1234
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 }
1275
1276 *nr_scanned = scan;
1277
1278 trace_mm_vmscan_lru_isolate(order,
1279 nr_to_scan, scan,
1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file);
1283 return nr_taken;
1284}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
1311int isolate_lru_page(struct page *page)
1312{
1313 int ret = -EBUSY;
1314
1315 VM_BUG_ON(!page_count(page));
1316
1317 if (PageLRU(page)) {
1318 struct zone *zone = page_zone(page);
1319
1320 spin_lock_irq(&zone->lru_lock);
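		/*
		 * Re-check PageLRU under zone->lru_lock: the page may have
		 * been taken off the LRU by somebody else in the meantime.
		 */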
1321 if (PageLRU(page)) {
1322 int lru = page_lru(page);
1323 ret = 0;
1324 get_page(page);
1325 ClearPageLRU(page);
1326
1327 del_page_from_lru_list(zone, page, lru);
1328 }
1329 spin_unlock_irq(&zone->lru_lock);
1330 }
1331 return ret;
1332}

/*
 * Are there way too many processes in the direct reclaim path already?
 */
1337static int too_many_isolated(struct zone *zone, int file,
1338 struct scan_control *sc)
1339{
1340 unsigned long inactive, isolated;
1341
1342 if (current_is_kswapd())
1343 return 0;
1344
1345 if (!global_reclaim(sc))
1346 return 0;
1347
1348 if (file) {
1349 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1350 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1351 } else {
1352 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1353 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1354 }
1355
1356 return isolated > inactive;
1357}
1358
1359static noinline_for_stack void
1360putback_inactive_pages(struct mem_cgroup_zone *mz,
1361 struct list_head *page_list)
1362{
1363 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1364 struct zone *zone = mz->zone;
1365 LIST_HEAD(pages_to_free);
1366
1367
1368
1369
1370 while (!list_empty(page_list)) {
1371 struct page *page = lru_to_page(page_list);
1372 int lru;
1373
1374 VM_BUG_ON(PageLRU(page));
1375 list_del(&page->lru);
1376 if (unlikely(!page_evictable(page, NULL))) {
1377 spin_unlock_irq(&zone->lru_lock);
1378 putback_lru_page(page);
1379 spin_lock_irq(&zone->lru_lock);
1380 continue;
1381 }
1382 SetPageLRU(page);
1383 lru = page_lru(page);
1384 add_page_to_lru_list(zone, page, lru);
1385 if (is_active_lru(lru)) {
1386 int file = is_file_lru(lru);
1387 int numpages = hpage_nr_pages(page);
1388 reclaim_stat->recent_rotated[file] += numpages;
1389 }
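		/*
		 * Drop the isolation reference.  If that was the last
		 * reference, take the page back off the LRU and free it
		 * (dropping the lock around the compound destructor).
		 */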
1390 if (put_page_testzero(page)) {
1391 __ClearPageLRU(page);
1392 __ClearPageActive(page);
1393 del_page_from_lru_list(zone, page, lru);
1394
1395 if (unlikely(PageCompound(page))) {
1396 spin_unlock_irq(&zone->lru_lock);
1397 (*get_compound_page_dtor(page))(page);
1398 spin_lock_irq(&zone->lru_lock);
1399 } else
1400 list_add(&page->lru, &pages_to_free);
1401 }
1402 }
1403
1404
1405
1406
1407 list_splice(&pages_to_free, page_list);
1408}
1409
1410static noinline_for_stack void
1411update_isolated_counts(struct mem_cgroup_zone *mz,
1412 struct list_head *page_list,
1413 unsigned long *nr_anon,
1414 unsigned long *nr_file)
1415{
1416 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1417 struct zone *zone = mz->zone;
1418 unsigned int count[NR_LRU_LISTS] = { 0, };
1419 unsigned long nr_active = 0;
1420 struct page *page;
1421 int lru;
1422
1423
1424
1425
1426 list_for_each_entry(page, page_list, lru) {
1427 int numpages = hpage_nr_pages(page);
1428 lru = page_lru_base_type(page);
1429 if (PageActive(page)) {
1430 lru += LRU_ACTIVE;
1431 ClearPageActive(page);
1432 nr_active += numpages;
1433 }
1434 count[lru] += numpages;
1435 }
1436
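	/*
	 * Account the PageActive clearing above as deactivations and remove
	 * the isolated pages from the zone's per-LRU counters.
	 */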
1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1440 -count[LRU_ACTIVE_FILE]);
1441 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1442 -count[LRU_INACTIVE_FILE]);
1443 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1444 -count[LRU_ACTIVE_ANON]);
1445 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1446 -count[LRU_INACTIVE_ANON]);
1447
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450
1451 reclaim_stat->recent_scanned[0] += *nr_anon;
1452 reclaim_stat->recent_scanned[1] += *nr_file;
1453}

/*
 * Returns true if a direct reclaim should wait on pages under writeback.
 *
 * If we are direct reclaiming for contiguous pages and we do not reclaim
 * everything in the list, try again and wait for writeback IO to complete.
 * This will stall high-order allocations noticeably. Only do that when really
 * needing to free the pages under high memory pressure.
 */
1463static inline bool should_reclaim_stall(unsigned long nr_taken,
1464 unsigned long nr_freed,
1465 int priority,
1466 struct scan_control *sc)
1467{
1468 int lumpy_stall_priority;
1469
1470
1471 if (current_is_kswapd())
1472 return false;
1473
1474
1475 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1476 return false;
1477
1478
1479 if (nr_freed == nr_taken)
1480 return false;
1481
1482
1483
1484
1485
1486
1487
1488 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1489 lumpy_stall_priority = DEF_PRIORITY;
1490 else
1491 lumpy_stall_priority = DEF_PRIORITY / 3;
1492
1493 return priority <= lumpy_stall_priority;
1494}

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages.
 */
1500static noinline_for_stack unsigned long
1501shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1502 struct scan_control *sc, int priority, int file)
1503{
1504 LIST_HEAD(page_list);
1505 unsigned long nr_scanned;
1506 unsigned long nr_reclaimed = 0;
1507 unsigned long nr_taken;
1508 unsigned long nr_anon;
1509 unsigned long nr_file;
1510 unsigned long nr_dirty = 0;
1511 unsigned long nr_writeback = 0;
1512 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1513 struct zone *zone = mz->zone;
1514
1515 while (unlikely(too_many_isolated(zone, file, sc))) {
1516 congestion_wait(BLK_RW_ASYNC, HZ/10);
1517
1518
1519 if (fatal_signal_pending(current))
1520 return SWAP_CLUSTER_MAX;
1521 }
1522
1523 set_reclaim_mode(priority, sc, false);
1524 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1525 reclaim_mode |= ISOLATE_ACTIVE;
1526
1527 lru_add_drain();
1528
1529 if (!sc->may_unmap)
1530 reclaim_mode |= ISOLATE_UNMAPPED;
1531 if (!sc->may_writepage)
1532 reclaim_mode |= ISOLATE_CLEAN;
1533
1534 spin_lock_irq(&zone->lru_lock);
1535
1536 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list,
1537 &nr_scanned, sc->order,
1538 reclaim_mode, 0, file);
1539 if (global_reclaim(sc)) {
1540 zone->pages_scanned += nr_scanned;
1541 if (current_is_kswapd())
1542 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1543 nr_scanned);
1544 else
1545 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1546 nr_scanned);
1547 }
1548
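	/* If nothing was isolated, drop the lru_lock and give up. */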
1549 if (nr_taken == 0) {
1550 spin_unlock_irq(&zone->lru_lock);
1551 return 0;
1552 }
1553
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1555
1556 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1557 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1558
1559 spin_unlock_irq(&zone->lru_lock);
1560
1561 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1562 &nr_dirty, &nr_writeback);
1563
1564
1565 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1566 set_reclaim_mode(priority, sc, true);
1567 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1568 priority, &nr_dirty, &nr_writeback);
1569 }
1570
1571 spin_lock_irq(&zone->lru_lock);
1572
1573 if (current_is_kswapd())
1574 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1575 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1576
1577 putback_inactive_pages(mz, &page_list);
1578
1579 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1580 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1581
1582 spin_unlock_irq(&zone->lru_lock);
1583
1584 free_hot_cold_page_list(&page_list, 1);

	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
	 * that the long-lived page allocation rate is exceeding the page
	 * laundering rate. Either the global limits are not being effective
	 * at throttling processes due to the page distribution throughout
	 * zones or there is heavy usage of a slow backing device. The
	 * only option is to throttle from reclaim context which is not ideal
	 * as there is no guarantee the dirtying process is throttled in the
	 * same way balance_dirty_pages() manages.
	 *
	 * This scales the number of dirty pages that must be under writeback
	 * before throttling depending on priority. It is a simple backoff
	 * function that has the most effect in the range DEF_PRIORITY to
	 * DEF_PRIORITY-2, which is the range where reclaim is considered to
	 * be in trouble.
	 *
	 * The check below throttles only when at least
	 * nr_taken >> (DEF_PRIORITY - priority) of the isolated pages were
	 * found under writeback.
	 */
1609 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1610 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1611
1612 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1613 zone_idx(zone),
1614 nr_scanned, nr_reclaimed,
1615 priority,
1616 trace_shrink_flags(file, sc->reclaim_mode));
1617 return nr_reclaimed;
1618}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
1638static void move_active_pages_to_lru(struct zone *zone,
1639 struct list_head *list,
1640 struct list_head *pages_to_free,
1641 enum lru_list lru)
1642{
1643 unsigned long pgmoved = 0;
1644 struct page *page;
1645
1646 if (buffer_heads_over_limit) {
1647 spin_unlock_irq(&zone->lru_lock);
1648 list_for_each_entry(page, list, lru) {
1649 if (page_has_private(page) && trylock_page(page)) {
1650 if (page_has_private(page))
1651 try_to_release_page(page, 0);
1652 unlock_page(page);
1653 }
1654 }
1655 spin_lock_irq(&zone->lru_lock);
1656 }
1657
1658 while (!list_empty(list)) {
1659 struct lruvec *lruvec;
1660
1661 page = lru_to_page(list);
1662
1663 VM_BUG_ON(PageLRU(page));
1664 SetPageLRU(page);
1665
1666 lruvec = mem_cgroup_lru_add_list(zone, page, lru);
1667 list_move(&page->lru, &lruvec->lists[lru]);
1668 pgmoved += hpage_nr_pages(page);
1669
1670 if (put_page_testzero(page)) {
1671 __ClearPageLRU(page);
1672 __ClearPageActive(page);
1673 del_page_from_lru_list(zone, page, lru);
1674
1675 if (unlikely(PageCompound(page))) {
1676 spin_unlock_irq(&zone->lru_lock);
1677 (*get_compound_page_dtor(page))(page);
1678 spin_lock_irq(&zone->lru_lock);
1679 } else
1680 list_add(&page->lru, pages_to_free);
1681 }
1682 }
1683 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1684 if (!is_active_lru(lru))
1685 __count_vm_events(PGDEACTIVATE, pgmoved);
1686}
1687
1688static void shrink_active_list(unsigned long nr_to_scan,
1689 struct mem_cgroup_zone *mz,
1690 struct scan_control *sc,
1691 int priority, int file)
1692{
1693 unsigned long nr_taken;
1694 unsigned long nr_scanned;
1695 unsigned long vm_flags;
1696 LIST_HEAD(l_hold);
1697 LIST_HEAD(l_active);
1698 LIST_HEAD(l_inactive);
1699 struct page *page;
1700 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1701 unsigned long nr_rotated = 0;
1702 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1703 struct zone *zone = mz->zone;
1704
1705 lru_add_drain();
1706
1707 if (!sc->may_unmap)
1708 reclaim_mode |= ISOLATE_UNMAPPED;
1709 if (!sc->may_writepage)
1710 reclaim_mode |= ISOLATE_CLEAN;
1711
1712 spin_lock_irq(&zone->lru_lock);
1713
1714 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold,
1715 &nr_scanned, sc->order,
1716 reclaim_mode, 1, file);
1717 if (global_reclaim(sc))
1718 zone->pages_scanned += nr_scanned;
1719
1720 reclaim_stat->recent_scanned[file] += nr_taken;
1721
1722 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1723 if (file)
1724 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1725 else
1726 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1727 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1728 spin_unlock_irq(&zone->lru_lock);
1729
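	/*
	 * Walk the isolated pages: referenced executable file pages get
	 * another trip around the active list, everything else is
	 * deactivated.
	 */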
1730 while (!list_empty(&l_hold)) {
1731 cond_resched();
1732 page = lru_to_page(&l_hold);
1733 list_del(&page->lru);
1734
1735 if (unlikely(!page_evictable(page, NULL))) {
1736 putback_lru_page(page);
1737 continue;
1738 }
1739
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page);
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list. So
			 * that executable code gets better chances to stay in
			 * memory under moderate memory pressure.  Anon pages
			 * are not likely to be evicted by use-once streaming
			 * IO, plus JVM can create lots of anon VM_EXEC pages,
			 * so we ignore them here.
			 */
1751 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1752 list_add(&page->lru, &l_active);
1753 continue;
1754 }
1755 }
1756
1757 ClearPageActive(page);
1758 list_add(&page->lru, &l_inactive);
1759 }
1760
1761
1762
1763
1764 spin_lock_irq(&zone->lru_lock);
1765
1766
1767
1768
1769
1770
1771 reclaim_stat->recent_rotated[file] += nr_rotated;
1772
1773 move_active_pages_to_lru(zone, &l_active, &l_hold,
1774 LRU_ACTIVE + file * LRU_FILE);
1775 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1776 LRU_BASE + file * LRU_FILE);
1777 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1778 spin_unlock_irq(&zone->lru_lock);
1779
1780 free_hot_cold_page_list(&l_hold, 1);
1781}
1782
1783#ifdef CONFIG_SWAP
1784static int inactive_anon_is_low_global(struct zone *zone)
1785{
1786 unsigned long active, inactive;
1787
1788 active = zone_page_state(zone, NR_ACTIVE_ANON);
1789 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1790
1791 if (inactive * zone->inactive_ratio < active)
1792 return 1;
1793
1794 return 0;
1795}

/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @mz: memory cgroup and zone to check
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
1805static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1806{
1807
1808
1809
1810
1811 if (!total_swap_pages)
1812 return 0;
1813
1814 if (!scanning_global_lru(mz))
1815 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
1816 mz->zone);
1817
1818 return inactive_anon_is_low_global(mz->zone);
1819}
1820#else
1821static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1822{
1823 return 0;
1824}
1825#endif
1826
1827static int inactive_file_is_low_global(struct zone *zone)
1828{
1829 unsigned long active, inactive;
1830
1831 active = zone_page_state(zone, NR_ACTIVE_FILE);
1832 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1833
1834 return (active > inactive);
1835}

/**
 * inactive_file_is_low - check if file pages need to be deactivated
 * @mz: memory cgroup and zone to check
 *
 * When the system is doing streaming IO, memory pressure here
 * ensures that active file pages get deactivated, until more
 * than half of the file pages are on the inactive list.
 *
 * Once we get to that situation, protect the system's working
 * set from being evicted by disabling active file page aging.
 *
 * This uses a different ratio than the anonymous pages, because
 * the page cache uses a use-once replacement algorithm.
 */
1851static int inactive_file_is_low(struct mem_cgroup_zone *mz)
1852{
1853 if (!scanning_global_lru(mz))
1854 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
1855 mz->zone);
1856
1857 return inactive_file_is_low_global(mz->zone);
1858}
1859
1860static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
1861{
1862 if (file)
1863 return inactive_file_is_low(mz);
1864 else
1865 return inactive_anon_is_low(mz);
1866}
1867
1868static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1869 struct mem_cgroup_zone *mz,
1870 struct scan_control *sc, int priority)
1871{
1872 int file = is_file_lru(lru);
1873
1874 if (is_active_lru(lru)) {
1875 if (inactive_list_is_low(mz, file))
1876 shrink_active_list(nr_to_scan, mz, sc, priority, file);
1877 return 0;
1878 }
1879
1880 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
1881}
1882
1883static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1884 struct scan_control *sc)
1885{
1886 if (global_reclaim(sc))
1887 return vm_swappiness;
1888 return mem_cgroup_swappiness(mz->mem_cgroup);
1889}

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned that we rotated back
 * onto the active list instead of evicting.
 *
 * nr[0] = anon pages to scan; nr[1] = file pages to scan
 */
1899static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1900 unsigned long *nr, int priority)
1901{
1902 unsigned long anon, file, free;
1903 unsigned long anon_prio, file_prio;
1904 unsigned long ap, fp;
1905 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1906 u64 fraction[2], denominator;
1907 enum lru_list lru;
1908 int noswap = 0;
1909 bool force_scan = false;
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921 if (current_is_kswapd() && mz->zone->all_unreclaimable)
1922 force_scan = true;
1923 if (!global_reclaim(sc))
1924 force_scan = true;
1925
1926
1927 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1928 noswap = 1;
1929 fraction[0] = 0;
1930 fraction[1] = 1;
1931 denominator = 1;
1932 goto out;
1933 }
1934
1935 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
1936 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
1937 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
1938 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
1939
1940 if (global_reclaim(sc)) {
1941 free = zone_page_state(mz->zone, NR_FREE_PAGES);
1942
1943
1944 if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
1945 fraction[0] = 1;
1946 fraction[1] = 0;
1947 denominator = 1;
1948 goto out;
1949 }
1950 }
1951
1952
1953
1954
1955
1956 anon_prio = vmscan_swappiness(mz, sc);
1957 file_prio = 200 - vmscan_swappiness(mz, sc);

	/*
	 * OK, so we have swap space and a fair amount of page cache
	 * pages.  We use the recently rotated / recently scanned
	 * ratios to determine how valuable each cache is.
	 *
	 * Because workloads change over time (and to avoid overflow)
	 * we keep these statistics as a floating average, which ends
	 * up weighing recent references more than old ones.
	 *
	 * anon in [0], file in [1]
	 */
1970 spin_lock_irq(&mz->zone->lru_lock);
1971 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1972 reclaim_stat->recent_scanned[0] /= 2;
1973 reclaim_stat->recent_rotated[0] /= 2;
1974 }
1975
1976 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1977 reclaim_stat->recent_scanned[1] /= 2;
1978 reclaim_stat->recent_rotated[1] /= 2;
1979 }
1980
1981
1982
1983
1984
1985
1986 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1987 ap /= reclaim_stat->recent_rotated[0] + 1;
1988
1989 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1990 fp /= reclaim_stat->recent_rotated[1] + 1;
1991 spin_unlock_irq(&mz->zone->lru_lock);
1992
1993 fraction[0] = ap;
1994 fraction[1] = fp;
1995 denominator = ap + fp + 1;
1996out:
1997 for_each_evictable_lru(lru) {
1998 int file = is_file_lru(lru);
1999 unsigned long scan;
2000
2001 scan = zone_nr_lru_pages(mz, lru);
2002 if (priority || noswap) {
2003 scan >>= priority;
2004 if (!scan && force_scan)
2005 scan = SWAP_CLUSTER_MAX;
2006 scan = div64_u64(scan * fraction[file], denominator);
2007 }
2008 nr[lru] = scan;
2009 }
2010}

/*
 * Reclaim/compaction works by reclaiming a number of order-0 pages and then
 * compacting the zone.  Returns true if reclaim should continue because not
 * enough pages have been reclaimed yet for compaction of the requested order
 * to be likely to succeed.
 */
2019static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2020 unsigned long nr_reclaimed,
2021 unsigned long nr_scanned,
2022 struct scan_control *sc)
2023{
2024 unsigned long pages_for_compaction;
2025 unsigned long inactive_lru_pages;
2026
2027
2028 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
2029 return false;
2030
2031
2032 if (sc->gfp_mask & __GFP_REPEAT) {
2033
2034
2035
2036
2037
2038
2039 if (!nr_reclaimed && !nr_scanned)
2040 return false;
2041 } else {
2042
2043
2044
2045
2046
2047
2048
2049
2050 if (!nr_reclaimed)
2051 return false;
2052 }
2053
2054
2055
2056
2057
2058 pages_for_compaction = (2UL << sc->order);
2059 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
2060 if (nr_swap_pages > 0)
2061 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
2062 if (sc->nr_reclaimed < pages_for_compaction &&
2063 inactive_lru_pages > pages_for_compaction)
2064 return true;
2065
2066
2067 switch (compaction_suitable(mz->zone, sc->order)) {
2068 case COMPACT_PARTIAL:
2069 case COMPACT_CONTINUE:
2070 return false;
2071 default:
2072 return true;
2073 }
2074}

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
2079static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2080 struct scan_control *sc)
2081{
2082 unsigned long nr[NR_LRU_LISTS];
2083 unsigned long nr_to_scan;
2084 enum lru_list lru;
2085 unsigned long nr_reclaimed, nr_scanned;
2086 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2087 struct blk_plug plug;
2088
2089restart:
2090 nr_reclaimed = 0;
2091 nr_scanned = sc->nr_scanned;
2092 get_scan_count(mz, sc, nr, priority);
2093
2094 blk_start_plug(&plug);
2095 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2096 nr[LRU_INACTIVE_FILE]) {
2097 for_each_evictable_lru(lru) {
2098 if (nr[lru]) {
2099 nr_to_scan = min_t(unsigned long,
2100 nr[lru], SWAP_CLUSTER_MAX);
2101 nr[lru] -= nr_to_scan;
2102
2103 nr_reclaimed += shrink_list(lru, nr_to_scan,
2104 mz, sc, priority);
2105 }
2106 }
2107
2108
2109
2110
2111
2112
2113
2114
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
2116 break;
2117 }
2118 blk_finish_plug(&plug);
2119 sc->nr_reclaimed += nr_reclaimed;
2120
2121
2122
2123
2124
2125 if (inactive_anon_is_low(mz))
2126 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
2127
2128
2129 if (should_continue_reclaim(mz, nr_reclaimed,
2130 sc->nr_scanned - nr_scanned, sc))
2131 goto restart;
2132
2133 throttle_vm_writeout(sc->gfp_mask);
2134}
2135
2136static void shrink_zone(int priority, struct zone *zone,
2137 struct scan_control *sc)
2138{
2139 struct mem_cgroup *root = sc->target_mem_cgroup;
2140 struct mem_cgroup_reclaim_cookie reclaim = {
2141 .zone = zone,
2142 .priority = priority,
2143 };
2144 struct mem_cgroup *memcg;
2145
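	/*
	 * Walk the memcg hierarchy below the reclaim target (all memcgs for
	 * global reclaim) and shrink each one's LRU lists for this zone.
	 */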
2146 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2147 do {
2148 struct mem_cgroup_zone mz = {
2149 .mem_cgroup = memcg,
2150 .zone = zone,
2151 };
2152
2153 shrink_mem_cgroup_zone(priority, &mz, sc);
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164 if (!global_reclaim(sc)) {
2165 mem_cgroup_iter_break(root, memcg);
2166 break;
2167 }
2168 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2169 } while (memcg);
2170}
2171
2172
2173static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2174{
2175 unsigned long balance_gap, watermark;
2176 bool watermark_ok;
2177
2178
2179 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2180 return false;
2181
2182
2183
2184
2185
2186
2187
2188 balance_gap = min(low_wmark_pages(zone),
2189 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2190 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2191 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2192 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2193
2194
2195
2196
2197
2198 if (compaction_deferred(zone))
2199 return watermark_ok;
2200
2201
2202 if (!compaction_suitable(zone, sc->order))
2203 return false;
2204
2205 return watermark_ok;
2206}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin. This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
2229static bool shrink_zones(int priority, struct zonelist *zonelist,
2230 struct scan_control *sc)
2231{
2232 struct zoneref *z;
2233 struct zone *zone;
2234 unsigned long nr_soft_reclaimed;
2235 unsigned long nr_soft_scanned;
2236 bool aborted_reclaim = false;
2237
2238 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2239 gfp_zone(sc->gfp_mask), sc->nodemask) {
2240 if (!populated_zone(zone))
2241 continue;
2242
2243
2244
2245
2246 if (global_reclaim(sc)) {
2247 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2248 continue;
2249 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2250 continue;
2251 if (COMPACTION_BUILD) {
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261 if (compaction_ready(zone, sc)) {
2262 aborted_reclaim = true;
2263 continue;
2264 }
2265 }
2266
2267
2268
2269
2270
2271
2272 nr_soft_scanned = 0;
2273 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2274 sc->order, sc->gfp_mask,
2275 &nr_soft_scanned);
2276 sc->nr_reclaimed += nr_soft_reclaimed;
2277 sc->nr_scanned += nr_soft_scanned;
2278
2279 }
2280
2281 shrink_zone(priority, zone, sc);
2282 }
2283
2284 return aborted_reclaim;
2285}
2286
2287static bool zone_reclaimable(struct zone *zone)
2288{
2289 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2290}
2291
2292
2293static bool all_unreclaimable(struct zonelist *zonelist,
2294 struct scan_control *sc)
2295{
2296 struct zoneref *z;
2297 struct zone *zone;
2298
2299 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2300 gfp_zone(sc->gfp_mask), sc->nodemask) {
2301 if (!populated_zone(zone))
2302 continue;
2303 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2304 continue;
2305 if (!zone->all_unreclaimable)
2306 return false;
2307 }
2308
2309 return true;
2310}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
2328static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2329 struct scan_control *sc,
2330 struct shrink_control *shrink)
2331{
2332 int priority;
2333 unsigned long total_scanned = 0;
2334 struct reclaim_state *reclaim_state = current->reclaim_state;
2335 struct zoneref *z;
2336 struct zone *zone;
2337 unsigned long writeback_threshold;
2338 bool aborted_reclaim;
2339
2340 get_mems_allowed();
2341 delayacct_freepages_start();
2342
2343 if (global_reclaim(sc))
2344 count_vm_event(ALLOCSTALL);
2345
2346 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2347 sc->nr_scanned = 0;
2348 if (!priority)
2349 disable_swap_token(sc->target_mem_cgroup);
2350 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2351
2352
2353
2354
2355
2356 if (global_reclaim(sc)) {
2357 unsigned long lru_pages = 0;
2358 for_each_zone_zonelist(zone, z, zonelist,
2359 gfp_zone(sc->gfp_mask)) {
2360 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2361 continue;
2362
2363 lru_pages += zone_reclaimable_pages(zone);
2364 }
2365
2366 shrink_slab(shrink, sc->nr_scanned, lru_pages);
2367 if (reclaim_state) {
2368 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2369 reclaim_state->reclaimed_slab = 0;
2370 }
2371 }
2372 total_scanned += sc->nr_scanned;
2373 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2374 goto out;
2375
2376
2377
2378
2379
2380
2381
2382
2383 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2384 if (total_scanned > writeback_threshold) {
2385 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2386 WB_REASON_TRY_TO_FREE_PAGES);
2387 sc->may_writepage = 1;
2388 }
2389
2390
2391 if (!sc->hibernation_mode && sc->nr_scanned &&
2392 priority < DEF_PRIORITY - 2) {
2393 struct zone *preferred_zone;
2394
2395 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2396 &cpuset_current_mems_allowed,
2397 &preferred_zone);
2398 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2399 }
2400 }
2401
2402out:
2403 delayacct_freepages_end();
2404 put_mems_allowed();
2405
2406 if (sc->nr_reclaimed)
2407 return sc->nr_reclaimed;
2408
2409
2410
2411
2412
2413
2414 if (oom_killer_disabled)
2415 return 0;
2416
2417
2418 if (aborted_reclaim)
2419 return 1;
2420
2421
2422 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2423 return 1;
2424
2425 return 0;
2426}
2427
2428unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2429 gfp_t gfp_mask, nodemask_t *nodemask)
2430{
2431 unsigned long nr_reclaimed;
2432 struct scan_control sc = {
2433 .gfp_mask = gfp_mask,
2434 .may_writepage = !laptop_mode,
2435 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2436 .may_unmap = 1,
2437 .may_swap = 1,
2438 .order = order,
2439 .target_mem_cgroup = NULL,
2440 .nodemask = nodemask,
2441 };
2442 struct shrink_control shrink = {
2443 .gfp_mask = sc.gfp_mask,
2444 };
2445
2446 trace_mm_vmscan_direct_reclaim_begin(order,
2447 sc.may_writepage,
2448 gfp_mask);
2449
2450 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2451
2452 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2453
2454 return nr_reclaimed;
2455}
2456
2457#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2458
2459unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2460 gfp_t gfp_mask, bool noswap,
2461 struct zone *zone,
2462 unsigned long *nr_scanned)
2463{
2464 struct scan_control sc = {
2465 .nr_scanned = 0,
2466 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2467 .may_writepage = !laptop_mode,
2468 .may_unmap = 1,
2469 .may_swap = !noswap,
2470 .order = 0,
2471 .target_mem_cgroup = memcg,
2472 };
2473 struct mem_cgroup_zone mz = {
2474 .mem_cgroup = memcg,
2475 .zone = zone,
2476 };
2477
2478 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2479 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2480
2481 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
2482 sc.may_writepage,
2483 sc.gfp_mask);
2484
2485
2486
2487
2488
2489
2490
2491
2492 shrink_mem_cgroup_zone(0, &mz, &sc);
2493
2494 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2495
2496 *nr_scanned = sc.nr_scanned;
2497 return sc.nr_reclaimed;
2498}
2499
2500unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2501 gfp_t gfp_mask,
2502 bool noswap)
2503{
2504 struct zonelist *zonelist;
2505 unsigned long nr_reclaimed;
2506 int nid;
2507 struct scan_control sc = {
2508 .may_writepage = !laptop_mode,
2509 .may_unmap = 1,
2510 .may_swap = !noswap,
2511 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2512 .order = 0,
2513 .target_mem_cgroup = memcg,
2514 .nodemask = NULL,
2515 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2516 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2517 };
2518 struct shrink_control shrink = {
2519 .gfp_mask = sc.gfp_mask,
2520 };
2521
2522
2523
2524
2525
2526
2527 nid = mem_cgroup_select_victim_node(memcg);
2528
2529 zonelist = NODE_DATA(nid)->node_zonelists;
2530
2531 trace_mm_vmscan_memcg_reclaim_begin(0,
2532 sc.may_writepage,
2533 sc.gfp_mask);
2534
2535 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2536
2537 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2538
2539 return nr_reclaimed;
2540}
2541#endif
2542
2543static void age_active_anon(struct zone *zone, struct scan_control *sc,
2544 int priority)
2545{
2546 struct mem_cgroup *memcg;
2547
2548 if (!total_swap_pages)
2549 return;
2550
2551 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2552 do {
2553 struct mem_cgroup_zone mz = {
2554 .mem_cgroup = memcg,
2555 .zone = zone,
2556 };
2557
2558 if (inactive_anon_is_low(&mz))
2559 shrink_active_list(SWAP_CLUSTER_MAX, &mz,
2560 sc, priority, 0);
2561
2562 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2563 } while (memcg);
2564}

/*
 * pgdat_balanced is used when checking if a node is balanced for high-order
 * allocations.  Only zones that meet watermarks and are in a zone allowed by
 * the caller's classzone_idx are added to balanced_pages.  The total of
 * balanced pages must be at least 25% of the zones allowed by classzone_idx
 * for the node to be considered balanced.  Forcing all zones to be balanced
 * for high orders can cause excessive reclaim when there are imbalanced
 * zones.
 */
2582static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2583 int classzone_idx)
2584{
2585 unsigned long present_pages = 0;
2586 int i;
2587
2588 for (i = 0; i <= classzone_idx; i++)
2589 present_pages += pgdat->node_zones[i].present_pages;
2590
2591
2592 return balanced_pages >= (present_pages >> 2);
2593}
2594
2595
2596static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2597 int classzone_idx)
2598{
2599 int i;
2600 unsigned long balanced = 0;
2601 bool all_zones_ok = true;
2602
2603
2604 if (remaining)
2605 return true;
2606
2607
2608 for (i = 0; i <= classzone_idx; i++) {
2609 struct zone *zone = pgdat->node_zones + i;
2610
2611 if (!populated_zone(zone))
2612 continue;
2613
2614
2615
2616
2617
2618
2619
2620 if (zone->all_unreclaimable) {
2621 balanced += zone->present_pages;
2622 continue;
2623 }
2624
2625 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2626 i, 0))
2627 all_zones_ok = false;
2628 else
2629 balanced += zone->present_pages;
2630 }
2631
2632
2633
2634
2635
2636
2637 if (order)
2638 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2639 else
2640 return !all_zones_ok;
2641}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * should be reclaimed for the benefit of callers of the lower zones.
 */
2664static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2665 int *classzone_idx)
2666{
2667 int all_zones_ok;
2668 unsigned long balanced;
2669 int priority;
2670 int i;
2671 int end_zone = 0;
2672 unsigned long total_scanned;
2673 struct reclaim_state *reclaim_state = current->reclaim_state;
2674 unsigned long nr_soft_reclaimed;
2675 unsigned long nr_soft_scanned;
2676 struct scan_control sc = {
2677 .gfp_mask = GFP_KERNEL,
2678 .may_unmap = 1,
2679 .may_swap = 1,
2680
2681
2682
2683
2684 .nr_to_reclaim = ULONG_MAX,
2685 .order = order,
2686 .target_mem_cgroup = NULL,
2687 };
2688 struct shrink_control shrink = {
2689 .gfp_mask = sc.gfp_mask,
2690 };
2691loop_again:
2692 total_scanned = 0;
2693 sc.nr_reclaimed = 0;
2694 sc.may_writepage = !laptop_mode;
2695 count_vm_event(PAGEOUTRUN);
2696
2697 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2698 unsigned long lru_pages = 0;
2699 int has_under_min_watermark_zone = 0;
2700
2701
2702 if (!priority)
2703 disable_swap_token(NULL);
2704
2705 all_zones_ok = 1;
2706 balanced = 0;
2707
2708
2709
2710
2711
2712 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
2713 struct zone *zone = pgdat->node_zones + i;
2714
2715 if (!populated_zone(zone))
2716 continue;
2717
2718 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2719 continue;
2720
2721
2722
2723
2724
2725 age_active_anon(zone, &sc, priority);
2726
2727 if (!zone_watermark_ok_safe(zone, order,
2728 high_wmark_pages(zone), 0, 0)) {
2729 end_zone = i;
2730 break;
2731 } else {
2732
2733 zone_clear_flag(zone, ZONE_CONGESTED);
2734 }
2735 }
2736 if (i < 0)
2737 goto out;
2738
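		/*
		 * Sum the reclaimable LRU pages of all zones up to end_zone;
		 * this is used below to scale slab shrinking against LRU
		 * scanning.
		 */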
2739 for (i = 0; i <= end_zone; i++) {
2740 struct zone *zone = pgdat->node_zones + i;
2741
2742 lru_pages += zone_reclaimable_pages(zone);
2743 }
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754 for (i = 0; i <= end_zone; i++) {
2755 struct zone *zone = pgdat->node_zones + i;
2756 int nr_slab;
2757 unsigned long balance_gap;
2758
2759 if (!populated_zone(zone))
2760 continue;
2761
2762 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2763 continue;
2764
2765 sc.nr_scanned = 0;
2766
2767 nr_soft_scanned = 0;
2768
2769
2770
2771 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2772 order, sc.gfp_mask,
2773 &nr_soft_scanned);
2774 sc.nr_reclaimed += nr_soft_reclaimed;
2775 total_scanned += nr_soft_scanned;
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785 balance_gap = min(low_wmark_pages(zone),
2786 (zone->present_pages +
2787 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2788 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2789 if (!zone_watermark_ok_safe(zone, order,
2790 high_wmark_pages(zone) + balance_gap,
2791 end_zone, 0)) {
2792 shrink_zone(priority, zone, &sc);
2793
2794 reclaim_state->reclaimed_slab = 0;
2795 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2796 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2797 total_scanned += sc.nr_scanned;
2798
2799 if (nr_slab == 0 && !zone_reclaimable(zone))
2800 zone->all_unreclaimable = 1;
2801 }
2802
2803
2804
2805
2806
2807
2808 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
2809 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2810 sc.may_writepage = 1;
2811
2812 if (zone->all_unreclaimable) {
2813 if (end_zone && end_zone == i)
2814 end_zone--;
2815 continue;
2816 }
2817
2818 if (!zone_watermark_ok_safe(zone, order,
2819 high_wmark_pages(zone), end_zone, 0)) {
2820 all_zones_ok = 0;
2821
2822
2823
2824
2825
2826 if (!zone_watermark_ok_safe(zone, order,
2827 min_wmark_pages(zone), end_zone, 0))
2828 has_under_min_watermark_zone = 1;
2829 } else {
2830
2831
2832
2833
2834
2835
2836
2837 zone_clear_flag(zone, ZONE_CONGESTED);
2838 if (i <= *classzone_idx)
2839 balanced += zone->present_pages;
2840 }
2841
2842 }
2843 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2844 break;
2845
2846
2847
2848
2849 if (total_scanned && (priority < DEF_PRIORITY - 2)) {
2850 if (has_under_min_watermark_zone)
2851 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2852 else
2853 congestion_wait(BLK_RW_ASYNC, HZ/10);
2854 }
2855
2856
2857
2858
2859
2860
2861
2862 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2863 break;
2864 }
2865out:
2866
2867
2868
2869
2870
2871
2872 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2873 cond_resched();
2874
2875 try_to_freeze();
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
2892 order = sc.order = 0;
2893
2894 goto loop_again;
2895 }
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905 if (order) {
2906 for (i = 0; i <= end_zone; i++) {
2907 struct zone *zone = pgdat->node_zones + i;
2908
2909 if (!populated_zone(zone))
2910 continue;
2911
2912 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2913 continue;
2914
2915
2916 if (!zone_watermark_ok(zone, 0,
2917 high_wmark_pages(zone), 0, 0)) {
2918 order = sc.order = 0;
2919 goto loop_again;
2920 }
2921
2922
2923 zone_clear_flag(zone, ZONE_CONGESTED);
2924 if (i <= *classzone_idx)
2925 balanced += zone->present_pages;
2926 }
2927 }
2928
2929
2930
2931
2932
2933
2934
2935 *classzone_idx = end_zone;
2936 return order;
2937}
2938
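/*
 * Prepare kswapd for sleeping: doze for a short interval first and, if that
 * sleep was not judged premature, sleep fully until woken by wakeup_kswapd().
 */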
2939static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2940{
2941 long remaining = 0;
2942 DEFINE_WAIT(wait);
2943
2944 if (freezing(current) || kthread_should_stop())
2945 return;
2946
2947 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

        /* Try to sleep for a short interval first */
2950 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2951 remaining = schedule_timeout(HZ/10);
2952 finish_wait(&pgdat->kswapd_wait, &wait);
2953 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2954 }
2955
        /*
         * After the short sleep, check whether it was premature.  If not,
         * go fully to sleep until explicitly woken up.
         */
2960 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2961 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2962
                /*
                 * vmstat counters are not perfectly accurate and the
                 * estimated value for counters such as NR_FREE_PAGES can
                 * deviate from the true value by nr_online_cpus * threshold.
                 * To avoid breaching the zone watermarks while under
                 * pressure, the per-cpu thresholds are reduced while kswapd
                 * is awake and restored to normal before it sleeps.
                 */
2971 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2972 schedule();
2973 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2974 } else {
2975 if (remaining)
2976 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2977 else
2978 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2979 }
2980 finish_wait(&pgdat->kswapd_wait, &wait);
2981}
2982
/*
 * The background pageout daemon, started as a kernel thread from the init
 * process.
 *
 * This trickles out pages so that we have _some_ free memory available even
 * if there is no other activity that frees anything up.  This is needed for
 * things like routing etc, where we otherwise might have a really idle
 * system with no pageouts occurring at all.
 *
 * If there are applications that are active memory allocators (most normal
 * use), this basically shouldn't matter.
 */
2996static int kswapd(void *p)
2997{
2998 unsigned long order, new_order;
2999 unsigned balanced_order;
3000 int classzone_idx, new_classzone_idx;
3001 int balanced_classzone_idx;
3002 pg_data_t *pgdat = (pg_data_t*)p;
3003 struct task_struct *tsk = current;
3004
3005 struct reclaim_state reclaim_state = {
3006 .reclaimed_slab = 0,
3007 };
3008 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3009
3010 lockdep_set_current_reclaim_state(GFP_KERNEL);
3011
3012 if (!cpumask_empty(cpumask))
3013 set_cpus_allowed_ptr(tsk, cpumask);
3014 current->reclaim_state = &reclaim_state;
3015
        /*
         * Tell the memory management code that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()").  kswapd should never get
         * caught in the normal page freeing logic.
         *
         * kswapd normally doesn't need memory itself, but sometimes a small
         * amount is needed in order to be able to page out something else,
         * and PF_MEMALLOC essentially protects us from recursively trying
         * to free more memory while freeing the first piece.
         */
3028 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3029 set_freezable();
3030
3031 order = new_order = 0;
3032 balanced_order = 0;
3033 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
3034 balanced_classzone_idx = classzone_idx;
3035 for ( ; ; ) {
3036 int ret;
3037
                /*
                 * If the last balance_pgdat() was unsuccessful it is
                 * unlikely a new request of a similar or harder type will
                 * succeed soon, so consider sleeping on the basis of what
                 * was actually balanced.  Only pick up a freshly queued
                 * request when the previous one was satisfied.
                 */
3043 if (balanced_classzone_idx >= new_classzone_idx &&
3044 balanced_order == new_order) {
3045 new_order = pgdat->kswapd_max_order;
3046 new_classzone_idx = pgdat->classzone_idx;
3047 pgdat->kswapd_max_order = 0;
3048 pgdat->classzone_idx = pgdat->nr_zones - 1;
3049 }
3050
3051 if (order < new_order || classzone_idx > new_classzone_idx) {
                        /*
                         * Don't sleep if someone wants a larger order
                         * allocation or has tighter zone constraints.
                         */
3056 order = new_order;
3057 classzone_idx = new_classzone_idx;
3058 } else {
3059 kswapd_try_to_sleep(pgdat, balanced_order,
3060 balanced_classzone_idx);
3061 order = pgdat->kswapd_max_order;
3062 classzone_idx = pgdat->classzone_idx;
3063 new_order = order;
3064 new_classzone_idx = classzone_idx;
3065 pgdat->kswapd_max_order = 0;
3066 pgdat->classzone_idx = pgdat->nr_zones - 1;
3067 }
3068
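                /* Handle freezer and kthread-stop requests before reclaiming */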
3069 ret = try_to_freeze();
3070 if (kthread_should_stop())
3071 break;
3072
                /*
                 * Skip balance_pgdat() when we have just returned from the
                 * refrigerator; this speeds up thawing.
                 */
3077 if (!ret) {
3078 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
3079 balanced_classzone_idx = classzone_idx;
3080 balanced_order = balance_pgdat(pgdat, order,
3081 &balanced_classzone_idx);
3082 }
3083 }
3084 return 0;
3085}
3086
/*
 * A zone is low on free memory: wake its node's kswapd to do some work.
 */
3090void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3091{
3092 pg_data_t *pgdat;
3093
3094 if (!populated_zone(zone))
3095 return;
3096
3097 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3098 return;
3099 pgdat = zone->zone_pgdat;
3100 if (pgdat->kswapd_max_order < order) {
3101 pgdat->kswapd_max_order = order;
3102 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3103 }
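        /* kswapd is not waiting, so it is already awake and working */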
3104 if (!waitqueue_active(&pgdat->kswapd_wait))
3105 return;
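        /* Only bother kswapd if the zone is actually below its low watermark */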
3106 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
3107 return;
3108
3109 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3110 wake_up_interruptible(&pgdat->kswapd_wait);
3111}
3112
/*
 * Estimate the number of reclaimable pages.  The count is approximate:
 * mlocked pages are only moved to the unevictable list when encountered,
 * mapped pages may need several passes, and dirty pages are not instantly
 * reclaimable.  Anonymous pages only count when swap space is available.
 */
3120unsigned long global_reclaimable_pages(void)
3121{
        unsigned long nr;
3123
3124 nr = global_page_state(NR_ACTIVE_FILE) +
3125 global_page_state(NR_INACTIVE_FILE);
3126
3127 if (nr_swap_pages > 0)
3128 nr += global_page_state(NR_ACTIVE_ANON) +
3129 global_page_state(NR_INACTIVE_ANON);
3130
3131 return nr;
3132}
3133
3134unsigned long zone_reclaimable_pages(struct zone *zone)
3135{
        unsigned long nr;
3137
3138 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3139 zone_page_state(zone, NR_INACTIVE_FILE);
3140
3141 if (nr_swap_pages > 0)
3142 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3143 zone_page_state(zone, NR_INACTIVE_ANON);
3144
3145 return nr;
3146}
3147
3148#ifdef CONFIG_HIBERNATION
/*
 * Try to free 'nr_to_reclaim' pages of memory for hibernation.
 *
 * Rather than trying to age LRUs, the aim is to preserve the overall LRU
 * order by reclaiming preferentially inactive > active > active referenced
 * > active mapped pages.
 *
 * Returns the number of pages actually reclaimed.
 */
3157unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3158{
3159 struct reclaim_state reclaim_state;
3160 struct scan_control sc = {
3161 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3162 .may_swap = 1,
3163 .may_unmap = 1,
3164 .may_writepage = 1,
3165 .nr_to_reclaim = nr_to_reclaim,
3166 .hibernation_mode = 1,
3167 .order = 0,
3168 };
3169 struct shrink_control shrink = {
3170 .gfp_mask = sc.gfp_mask,
3171 };
3172 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3173 struct task_struct *p = current;
3174 unsigned long nr_reclaimed;
3175
3176 p->flags |= PF_MEMALLOC;
3177 lockdep_set_current_reclaim_state(sc.gfp_mask);
3178 reclaim_state.reclaimed_slab = 0;
3179 p->reclaim_state = &reclaim_state;
3180
3181 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3182
3183 p->reclaim_state = NULL;
3184 lockdep_clear_current_reclaim_state();
3185 p->flags &= ~PF_MEMALLOC;
3186
3187 return nr_reclaimed;
3188}
3189#endif
3190
/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but not
 * required for correctness.  So if the last cpu in a node goes away, kswapd
 * gets changed to run anywhere: as the first one comes back, restore the
 * cpu bindings.
 */
3195static int __devinit cpu_callback(struct notifier_block *nfb,
3196 unsigned long action, void *hcpu)
3197{
3198 int nid;
3199
3200 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3201 for_each_node_state(nid, N_HIGH_MEMORY) {
3202 pg_data_t *pgdat = NODE_DATA(nid);
3203 const struct cpumask *mask;
3204
3205 mask = cpumask_of_node(pgdat->node_id);
3206
3207 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
                                /* One of our CPUs came online: restore the mask */
3209 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3210 }
3211 }
3212 return NOTIFY_OK;
3213}
3214
/*
 * This kswapd start function is called by init and by node hot-add.  On
 * hot-add, kswapd is moved to the proper cpus once they come online (see
 * cpu_callback() above).
 */
3219int kswapd_run(int nid)
3220{
3221 pg_data_t *pgdat = NODE_DATA(nid);
3222 int ret = 0;
3223
3224 if (pgdat->kswapd)
3225 return 0;
3226
3227 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3228 if (IS_ERR(pgdat->kswapd)) {
                /* Failure to start kswapd during boot is fatal */
                BUG_ON(system_state == SYSTEM_BOOTING);
                printk(KERN_ERR "Failed to start kswapd on node %d\n", nid);
3232 ret = -1;
3233 }
3234 return ret;
3235}
3236
/*
 * Called by memory hotplug when all memory in a node is offlined.
 */
3240void kswapd_stop(int nid)
3241{
3242 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3243
3244 if (kswapd)
3245 kthread_stop(kswapd);
3246}
3247
3248static int __init kswapd_init(void)
3249{
3250 int nid;
3251
3252 swap_setup();
3253 for_each_node_state(nid, N_HIGH_MEMORY)
3254 kswapd_run(nid);
3255 hotcpu_notifier(cpu_callback, 0);
3256 return 0;
3257}
3258
3259module_init(kswapd_init)
3260
3261#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero, call zone_reclaim() when the number of free pages falls
 * below the watermarks.
 */
3268int zone_reclaim_mode __read_mostly;
3269
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)   /* Reclaim from the zone's page cache */
#define RECLAIM_WRITE (1<<1)  /* Allow writing out dirty pages during reclaim */
#define RECLAIM_SWAP (1<<2)   /* Allow unmapping and swapping during reclaim */
3274
/*
 * Priority for ZONE_RECLAIM.  This determines the fraction of pages of a
 * zone scanned per zone_reclaim() attempt: priority 4 scans 1/16th of the
 * zone.
 */
3280#define ZONE_RECLAIM_PRIORITY 4
3281
/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
3286int sysctl_min_unmapped_ratio = 1;
3287
/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
3292int sysctl_min_slab_ratio = 5;
3293
3294static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3295{
3296 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3297 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3298 zone_page_state(zone, NR_ACTIVE_FILE);

        /*
         * It's possible for there to be more file mapped pages than
         * accounted for by the pages on the file LRU lists because
         * tmpfs pages accounted for as ANON can also be FILE_MAPPED.
         */
3305 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3306}
3307
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
3309static long zone_pagecache_reclaimable(struct zone *zone)
3310{
3311 long nr_pagecache_reclaimable;
3312 long delta = 0;
3313
        /*
         * If RECLAIM_SWAP is set, then all file pages are considered
         * potentially reclaimable.  Otherwise, we have to worry about pages
         * like swapcache and zone_unmapped_file_pages() provides a better
         * estimate.
         */
3320 if (zone_reclaim_mode & RECLAIM_SWAP)
3321 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3322 else
3323 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3324
        /* If we can't write back pages, don't count dirty pages as reclaimable */
3326 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3327 delta += zone_page_state(zone, NR_FILE_DIRTY);
3328
        /* Watch for any possible underflow due to delta */
3330 if (unlikely(delta > nr_pagecache_reclaimable))
3331 delta = nr_pagecache_reclaimable;
3332
3333 return nr_pagecache_reclaimable - delta;
3334}
3335
/*
 * Try to free up some pages from this zone through reclaim.
 */
3339static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3340{
        /* Minimum pages needed in order to stay on node */
3342 const unsigned long nr_pages = 1 << order;
3343 struct task_struct *p = current;
3344 struct reclaim_state reclaim_state;
3345 int priority;
3346 struct scan_control sc = {
3347 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3348 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3349 .may_swap = 1,
3350 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3351 SWAP_CLUSTER_MAX),
3352 .gfp_mask = gfp_mask,
3353 .order = order,
3354 };
3355 struct shrink_control shrink = {
3356 .gfp_mask = sc.gfp_mask,
3357 };
3358 unsigned long nr_slab_pages0, nr_slab_pages1;
3359
3360 cond_resched();
3361
        /*
         * We need to be able to allocate from the reserves for RECLAIM_SWAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
         * and RECLAIM_SWAP.
         */
3366 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3367 lockdep_set_current_reclaim_state(gfp_mask);
3368 reclaim_state.reclaimed_slab = 0;
3369 p->reclaim_state = &reclaim_state;
3370
3371 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink_zone() with increasing
                 * priorities until we have enough memory freed.
                 */
3376 priority = ZONE_RECLAIM_PRIORITY;
3377 do {
3378 shrink_zone(priority, zone, &sc);
3379 priority--;
3380 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3381 }
3382
3383 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3384 if (nr_slab_pages0 > zone->min_slab_pages) {
                /*
                 * shrink_slab() does not currently allow us to determine
                 * how many pages were freed in this zone.  So we take the
                 * current number of slab pages and shake the slab until it
                 * is reduced by the same nr_pages that we used for
                 * reclaiming unmapped pages.
                 *
                 * Note that shrink_slab() frees memory on all zones and may
                 * take a long time.
                 */
3395 for (;;) {
3396 unsigned long lru_pages = zone_reclaimable_pages(zone);
3397
                        /* No reclaimable slab or very low memory pressure */
3399 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3400 break;
3401
                        /* Stop once enough slab pages have been freed */
3403 nr_slab_pages1 = zone_page_state(zone,
3404 NR_SLAB_RECLAIMABLE);
3405 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3406 break;
3407 }
3408
                /*
                 * Update nr_reclaimed by the number of slab pages we
                 * reclaimed from this zone.
                 */
3413 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3414 if (nr_slab_pages1 < nr_slab_pages0)
3415 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3416 }
3417
3418 p->reclaim_state = NULL;
3419 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3420 lockdep_clear_current_reclaim_state();
3421 return sc.nr_reclaimed >= nr_pages;
3422}
3423
3424int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3425{
3426 int node_id;
3427 int ret;
3428
        /*
         * Zone reclaim reclaims unmapped file backed pages and slab pages
         * if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for file
         * I/O, otherwise pages read by file I/O will be immediately thrown
         * out if the zone is overallocated.  So we do not reclaim if less
         * than a specified percentage of the zone is used by unmapped file
         * backed pages.
         */
3439 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3440 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3441 return ZONE_RECLAIM_FULL;
3442
3443 if (zone->all_unreclaimable)
3444 return ZONE_RECLAIM_FULL;
3445
        /*
         * Do not scan if the allocation should not be delayed.
         */
3449 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3450 return ZONE_RECLAIM_NOSCAN;
3451
        /*
         * Only run zone reclaim on the local zone or on zones that do not
         * have associated processors.  This favors the local processor over
         * remote processors and spreads off-node memory allocations as wide
         * as possible.
         */
3458 node_id = zone_to_nid(zone);
3459 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3460 return ZONE_RECLAIM_NOSCAN;
3461
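        /* Only one task reclaims from a given zone at a time */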
3462 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3463 return ZONE_RECLAIM_NOSCAN;
3464
3465 ret = __zone_reclaim(zone, gfp_mask, order);
3466 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3467
3468 if (!ret)
3469 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3470
3471 return ret;
3472}
3473#endif
3474
/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 * @vma: the VMA in which the page is or will be mapped, may be NULL
 *
 * Test whether the page is evictable, i.e. should be placed on the
 * active/inactive lists rather than the unevictable list.
 *
 * Reasons a page might not be evictable:
 * (1) the page's mapping is marked unevictable
 * (2) the page is part of an mlocked VMA
 *
 * Returns 1 if the page is evictable, 0 otherwise.
 */
3489int page_evictable(struct page *page, struct vm_area_struct *vma)
3490{
3491
3492 if (mapping_unevictable(page_mapping(page)))
3493 return 0;
3494
3495 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
3496 return 0;
3497
3498 return 1;
3499}
3500
3501#ifdef CONFIG_SHMEM
/*
 * check_move_unevictable_pages - check pages for evictability and move them
 * to the appropriate zone lru list
 * @pages:      array of pages to check
 * @nr_pages:   number of pages to check
 *
 * Checks pages for evictability and moves newly evictable pages back onto
 * the regular lru lists.
 */
3511void check_move_unevictable_pages(struct page **pages, int nr_pages)
3512{
3513 struct lruvec *lruvec;
3514 struct zone *zone = NULL;
3515 int pgscanned = 0;
3516 int pgrescued = 0;
3517 int i;
3518
3519 for (i = 0; i < nr_pages; i++) {
3520 struct page *page = pages[i];
3521 struct zone *pagezone;
3522
3523 pgscanned++;
3524 pagezone = page_zone(page);
3525 if (pagezone != zone) {
3526 if (zone)
3527 spin_unlock_irq(&zone->lru_lock);
3528 zone = pagezone;
3529 spin_lock_irq(&zone->lru_lock);
3530 }
3531
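                /* Only pages still on the unevictable LRU need to be checked */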
3532 if (!PageLRU(page) || !PageUnevictable(page))
3533 continue;
3534
3535 if (page_evictable(page, NULL)) {
3536 enum lru_list lru = page_lru_base_type(page);
3537
3538 VM_BUG_ON(PageActive(page));
3539 ClearPageUnevictable(page);
3540 __dec_zone_state(zone, NR_UNEVICTABLE);
3541 lruvec = mem_cgroup_lru_move_lists(zone, page,
3542 LRU_UNEVICTABLE, lru);
3543 list_move(&page->lru, &lruvec->lists[lru]);
3544 __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
3545 pgrescued++;
3546 }
3547 }
3548
3549 if (zone) {
3550 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3551 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3552 spin_unlock_irq(&zone->lru_lock);
3553 }
3554}
3555#endif
3556
3557static void warn_scan_unevictable_pages(void)
3558{
3559 printk_once(KERN_WARNING
3560 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3561 "disabled for lack of a legitimate use case. If you have "
3562 "one, please send an email to linux-mm@kvack.org.\n",
3563 current->comm);
3564}
3565
/*
 * scan_unevictable_pages [vm] sysctl handler.  The knob is retained only
 * for compatibility; writes are accepted and ignored (see the warning
 * above).
 */
3570unsigned long scan_unevictable_pages;
3571
3572int scan_unevictable_handler(struct ctl_table *table, int write,
3573 void __user *buffer,
3574 size_t *length, loff_t *ppos)
3575{
3576 warn_scan_unevictable_pages();
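        /* Parse and discard the value: the knob is deliberately a no-op */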
3577 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3578 scan_unevictable_pages = 0;
3579 return 0;
3580}
3581
3582#ifdef CONFIG_NUMA
/*
 * Per-node 'scan_unevictable_pages' sysfs attribute.  Like the sysctl
 * above, it is now a no-op that only emits the deprecation warning.
 */
3588static ssize_t read_scan_unevictable_node(struct device *dev,
3589 struct device_attribute *attr,
3590 char *buf)
3591{
3592 warn_scan_unevictable_pages();
3593 return sprintf(buf, "0\n");
3594}
3595
3596static ssize_t write_scan_unevictable_node(struct device *dev,
3597 struct device_attribute *attr,
3598 const char *buf, size_t count)
3599{
3600 warn_scan_unevictable_pages();
3601 return 1;
3602}
3603
3604
3605static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3606 read_scan_unevictable_node,
3607 write_scan_unevictable_node);
3608
3609int scan_unevictable_register_node(struct node *node)
3610{
3611 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3612}
3613
3614void scan_unevictable_unregister_node(struct node *node)
3615{
3616 device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3617}
3618#endif
3619