1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/vmstat.h>
23#include <linux/file.h>
24#include <linux/writeback.h>
25#include <linux/blkdev.h>
26#include <linux/buffer_head.h>
27
28#include <linux/mm_inline.h>
29#include <linux/pagevec.h>
30#include <linux/backing-dev.h>
31#include <linux/rmap.h>
32#include <linux/topology.h>
33#include <linux/cpu.h>
34#include <linux/cpuset.h>
35#include <linux/notifier.h>
36#include <linux/rwsem.h>
37#include <linux/delay.h>
38#include <linux/kthread.h>
39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
43
44#include <asm/tlbflush.h>
45#include <asm/div64.h>
46
47#include <linux/swapops.h>
48
49#include "internal.h"
50
/*
 * Parameters and running totals for one reclaim invocation.  A pointer to
 * this structure is passed down through the shrink_*() functions in this
 * file.
 */
struct scan_control {
	/* Pages examined so far; bumped per page in shrink_page_list() */
	unsigned long nr_scanned;

	/* Pages freed so far; updated at the end of shrink_zone() */
	unsigned long nr_reclaimed;

	/* Caller's allocation context; tested for __GFP_FS and __GFP_IO */
	gfp_t gfp_mask;

	/* Non-zero: shrink_page_list() may hand dirty pages to pageout() */
	int may_writepage;

	/* Zero: shrink_page_list() keeps any page that is still mapped */
	int may_unmap;

	/* Zero: shrink_zone() sets the anon scan percentage to 0 */
	int may_swap;

	/*
	 * Batch size: number of pages isolated per shrink_inactive_list()
	 * iteration, and the reclaim target tested by
	 * do_try_to_free_pages().
	 */
	int swap_cluster_max;

	/* Anon priority used by get_scan_ratio(); file gets 200 - swappiness */
	int swappiness;

	/* Set by shrink_zones(); cleared once some zone is worth scanning */
	int all_unreclaimable;

	/* Allocation order; non-zero may enable lumpy reclaim */
	int order;

	/* Memory cgroup being reclaimed from; NULL means the global LRU */
	struct mem_cgroup *mem_cgroup;

	/* Nodes to walk in shrink_zones() */
	nodemask_t *nodemask;

	/* Callback that pulls pages off an LRU onto the private list @dst */
	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
			unsigned long *scanned, int order, int mode,
			struct zone *z, struct mem_cgroup *mem_cont,
			int active, int file);
};
96
/* The struct page whose ->lru member is the ->prev (tail) entry of @_head */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
98
/*
 * prefetch_prev_lru_page - prefetch @_field of the page that precedes
 * @_page on its LRU list, unless @_page's predecessor is the list head
 * @_base.  Expands to an empty statement when the architecture does not
 * define ARCH_HAS_PREFETCH.
 */
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
112
/*
 * prefetchw_prev_lru_page - same as prefetch_prev_lru_page() but uses
 * prefetchw() on @_field of the preceding page.  Expands to an empty
 * statement when ARCH_HAS_PREFETCHW is not defined.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
126
127
128
129
/*
 * Global swappiness setting, initialised to 60.
 * NOTE(review): presumably the source of scan_control.swappiness for
 * global reclaim -- the initialiser is outside this view, confirm.
 */
int vm_swappiness = 60;
/* Not written anywhere in the visible part of this file */
long vm_total_pages;

/* Registered shrinkers, and the rwsem that protects the list */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
135
/*
 * True when @sc describes global reclaim, i.e. no memory cgroup is set.
 * Without CONFIG_CGROUP_MEM_RES_CTLR reclaim is always global.
 */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
#else
#define scanning_global_lru(sc) (1)
#endif
141
142static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
143 struct scan_control *sc)
144{
145 if (!scanning_global_lru(sc))
146 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
147
148 return &zone->reclaim_stat;
149}
150
151static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
152 enum lru_list lru)
153{
154 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
156
157 return zone_page_state(zone, NR_LRU_BASE + lru);
158}
159
160
161
162
163
164void register_shrinker(struct shrinker *shrinker)
165{
166 shrinker->nr = 0;
167 down_write(&shrinker_rwsem);
168 list_add_tail(&shrinker->list, &shrinker_list);
169 up_write(&shrinker_rwsem);
170}
171EXPORT_SYMBOL(register_shrinker);
172
173
174
175
176void unregister_shrinker(struct shrinker *shrinker)
177{
178 down_write(&shrinker_rwsem);
179 list_del(&shrinker->list);
180 up_write(&shrinker_rwsem);
181}
182EXPORT_SYMBOL(unregister_shrinker);
183
/* Number of objects shrink_slab() asks a shrinker to scan per callback */
#define SHRINK_BATCH 128
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/*
 * Walk every registered shrinker and ask it to scan a number of objects
 * derived from the amount of LRU scanning the caller just did.
 *
 * @scanned:   LRU pages scanned by the caller; 0 is replaced by
 *             SWAP_CLUSTER_MAX.
 * @gfp_mask:  passed unchanged to every shrinker callback.
 * @lru_pages: divisor of the pressure calculation (lru_pages + 1 is used,
 *             so 0 does not divide by zero).
 *
 * Returns 1 without scanning anything if shrinker_rwsem cannot be taken
 * for read immediately.  Otherwise returns the sum, over all batches, of
 * the decrease in the value reported by the shrinker callback.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (scanned == 0)
		scanned = SWAP_CLUSTER_MAX;

	/* Never block here: a writer is (un)registering a shrinker. */
	if (!down_read_trylock(&shrinker_rwsem))
		return 1;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		unsigned long total_scan;
		/*
		 * A callback invoked with 0 objects to scan only reports a
		 * count (presumably the number of freeable objects -- confirm
		 * against the shrinker implementations).
		 */
		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);

		/* delta = 4 * scanned / seeks * max_pass / (lru_pages + 1) */
		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
		if (shrinker->nr < 0) {
			/* The accumulated count went negative; reset it. */
			printk(KERN_ERR "shrink_slab: %pF negative objects to "
			       "delete nr=%ld\n",
			       shrinker->shrink, shrinker->nr);
			shrinker->nr = max_pass;
		}

		/* Cap the backlog at twice what the callback reported. */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;

		total_scan = shrinker->nr;
		shrinker->nr = 0;

		/* Scan in SHRINK_BATCH sized chunks. */
		while (total_scan >= SHRINK_BATCH) {
			long this_scan = SHRINK_BATCH;
			int shrink_ret;
			int nr_before;

			nr_before = (*shrinker->shrink)(0, gfp_mask);
			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
			/* -1 from the callback stops work on this shrinker */
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, this_scan);
			total_scan -= this_scan;

			cond_resched();
		}

		/* Carry the sub-batch remainder over to the next call. */
		shrinker->nr += total_scan;
	}
	up_read(&shrinker_rwsem);
	return ret;
}
265
266
/*
 * Non-zero when @page is mapped, is in the swap cache, or belongs to an
 * address_space for which mapping_mapped() is true.  A page with no
 * mapping that is neither mapped nor in swap cache yields 0.
 */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *as;

	if (page_mapped(page) || PageSwapCache(page))
		return 1;

	as = page_mapping(page);
	return as ? mapping_mapped(as) : 0;
}
286
/*
 * True when the page's reference count is exactly 2, or exactly 3 for a
 * page that has private data attached.
 * NOTE(review): the two base references are presumably the page cache and
 * the caller that isolated the page -- confirm.
 */
static inline int is_page_cache_freeable(struct page *page)
{
	int expected = 2;

	if (page_has_private(page))
		expected++;

	return page_count(page) == expected;
}
291
292static int may_write_to_queue(struct backing_dev_info *bdi)
293{
294 if (current->flags & PF_SWAPWRITE)
295 return 1;
296 if (!bdi_write_congested(bdi))
297 return 1;
298 if (bdi == current->backing_dev_info)
299 return 1;
300 return 0;
301}
302
303
304
305
306
307
308
309
310
311
312
313
314
/*
 * Record a writepage failure on @mapping.  The page is locked first, and
 * the error is only recorded if the page still belongs to @mapping once
 * the lock is held.
 */
static void handle_write_error(struct address_space *mapping,
			       struct page *page, int error)
{
	int still_ours;

	lock_page(page);
	still_ours = (page_mapping(page) == mapping);
	if (still_ours)
		mapping_set_error(mapping, error);
	unlock_page(page);
}
323
324
/*
 * Whether pageout()/shrink_page_list() may wait for page writeback to
 * finish (PAGEOUT_IO_SYNC) or must not wait (PAGEOUT_IO_ASYNC).
 */
enum pageout_io {
	PAGEOUT_IO_ASYNC,
	PAGEOUT_IO_SYNC,
};
329
330
/* Result codes of pageout() */
typedef enum {
	/* Page could not be written now; caller keeps it where it is */
	PAGE_KEEP,
	/* No ->writepage, or it returned AOP_WRITEPAGE_ACTIVATE */
	PAGE_ACTIVATE,
	/* ->writepage was called on the page */
	PAGE_SUCCESS,
	/* Page was not dirty, or was an orphan whose buffers were freed */
	PAGE_CLEAN,
} pageout_t;
341
342
343
344
345
/*
 * Try to start writeback of a dirty page.
 *
 * @page:           page to write; shrink_page_list() calls this with the
 *                  page locked.
 * @mapping:        the page's address_space, or NULL.
 * @sync_writeback: PAGEOUT_IO_SYNC to wait for the writeback to finish.
 *
 * Returns one of the pageout_t codes.
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
						enum pageout_io sync_writeback)
{
	/* Someone else holds a reference we cannot account for. */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * No mapping: the only thing that can be done is to drop
		 * buffers attached to the page, after which it is clean.
		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		/* Synchronous mode: wait for the I/O just started. */
		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
			wait_on_page_writeback(page);

		if (!PageWriteback(page)) {
			/* Writeback is not (or no longer) in flight. */
			ClearPageReclaim(page);
		}
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	/* The page was not dirty after all. */
	return PAGE_CLEAN;
}
425
426
427
428
429
/*
 * Detach a locked page from its mapping (swap cache or page cache).
 *
 * Returns 1 on success, in which case the page's reference count is left
 * frozen; callers must unfreeze or free it.  Returns 0 if the reference
 * count could not be frozen at 2 or the page turned out to be dirty; the
 * page is then left untouched.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);

	/* Only proceed if exactly two references exist. */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;

	/* Dirtied in the meantime: undo the freeze and give up. */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		/* Read the swap entry before the page leaves swap cache. */
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swapcache_free(swap, page);
	} else {
		__remove_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}
486
487
488
489
490
491
492
/*
 * Detach @page from @mapping via __remove_mapping().  On success the
 * frozen reference count is unfrozen to 1 and 1 is returned; on failure
 * 0 is returned.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (!__remove_mapping(mapping, page))
		return 0;

	/* __remove_mapping() left the refcount frozen; set it to 1. */
	page_unfreeze_refs(page, 1);
	return 1;
}
506
507
508
509
510
511
512
513
514
515
/*
 * Put a previously isolated page back on the appropriate LRU list and
 * drop the reference taken at isolation.
 *
 * The page must not have PageLRU set.  The active flag is cleared on
 * entry and decides between the active and inactive list if the page is
 * evictable; otherwise the page goes to the unevictable list.
 */
void putback_lru_page(struct page *page)
{
	int lru;
	int active = !!TestClearPageActive(page);
	int was_unevictable = PageUnevictable(page);

	VM_BUG_ON(PageLRU(page));

redo:
	ClearPageUnevictable(page);

	if (page_evictable(page, NULL)) {
		/* Evictable: queue on the (in)active anon or file list. */
		lru = active + page_is_file_cache(page);
		lru_cache_add_lru(page, lru);
	} else {
		/* Not evictable: park it on the unevictable list. */
		lru = LRU_UNEVICTABLE;
		add_page_to_unevictable_list(page);
	}

	/*
	 * The page may have become evictable after it was put on the
	 * unevictable list.  Re-check, and if so pull it back off and
	 * start over.
	 */
	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
		if (!isolate_lru_page(page)) {
			/* Drop the extra reference isolation took. */
			put_page(page);
			goto redo;
		}
		/* Isolation failed: leave the page where it is. */
	}

	if (was_unevictable && lru != LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGCULLED);

	put_page(page);
}
568
569
570
571
/*
 * Try to free every page on the private list @page_list.
 *
 * @page_list:      isolated pages; on return it holds the pages that were
 *                  NOT freed (or handed to putback_lru_page()).
 * @sc:             reclaim parameters; sc->nr_scanned is updated.
 * @sync_writeback: whether waiting for writeback is permitted.
 *
 * Returns the number of pages reclaimed.
 */
static unsigned long shrink_page_list(struct list_head *page_list,
					struct scan_control *sc,
					enum pageout_io sync_writeback)
{
	LIST_HEAD(ret_pages);
	struct pagevec freed_pvec;
	int pgactivate = 0;
	unsigned long nr_reclaimed = 0;
	unsigned long vm_flags;

	cond_resched();

	pagevec_init(&freed_pvec, 1);
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		int referenced;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		/* Never sleep on the page lock; skip contended pages. */
		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON(PageActive(page));

		sc->nr_scanned++;

		if (unlikely(!page_evictable(page, NULL)))
			goto cull_mlocked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Mapped and swap-cache pages are counted twice. */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		if (PageWriteback(page)) {
			/*
			 * Only wait for in-flight writeback in synchronous
			 * mode and when filesystem entry is allowed.
			 */
			if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
				wait_on_page_writeback(page);
			else
				goto keep_locked;
		}

		referenced = page_referenced(page, 1,
						sc->mem_cgroup, &vm_flags);

		/*
		 * Recently referenced, in-use pages are re-activated, except
		 * for costly high-order reclaim and VM_LOCKED mappings.
		 */
		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
					referenced && page_mapping_inuse(page)
					&& !(vm_flags & VM_LOCKED))
			goto activate_locked;

		/* Anonymous page without swap space: allocate some. */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page))
				goto activate_locked;
			may_enter_fs = 1;
		}

		mapping = page_mapping(page);

		/* Remove all user mappings of the page. */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, 0)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* proceed to reclaim */
			}
		}

		if (PageDirty(page)) {
			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Dirty page: try to write it out now. */
			switch (pageout(page, mapping, sync_writeback)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				/*
				 * The "goto keep" paths below skip
				 * unlock_page(), so the page is evidently not
				 * locked here and is re-locked before use.
				 */
				if (PageWriteback(page) || PageDirty(page))
					goto keep;

				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
				/* deliberate fall through to PAGE_CLEAN */
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * Release any private data (e.g. buffers) attached to the
		 * page.  A page with no mapping and a single remaining
		 * reference can then be dropped directly.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * Somebody else took a reference in
					 * the meantime; they will free the
					 * page.  It is still counted as
					 * reclaimed and not put back on any
					 * list here.
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * __remove_mapping() succeeded, leaving the refcount frozen,
		 * so the non-atomic unlock variant is used.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;
		if (!pagevec_add(&freed_pvec, page)) {
			__pagevec_free(&freed_pvec);
			pagevec_reinit(&freed_pvec);
		}
		continue;

cull_mlocked:
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		continue;

activate_locked:
		/* Do not leave swap slots pinned when swap is nearly full. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}
	/* Hand the survivors back to the caller. */
	list_splice(&ret_pages, page_list);
	if (pagevec_count(&freed_pvec))
		__pagevec_free(&freed_pvec);
	count_vm_events(PGACTIVATE, pgactivate);
	return nr_reclaimed;
}
792
793
/*
 * Isolation modes for __isolate_lru_page(): take only inactive pages,
 * only active pages, or either kind.
 */
#define ISOLATE_INACTIVE 0
#define ISOLATE_ACTIVE 1
#define ISOLATE_BOTH 2
797
798
799
800
801
802
803
804
805
806
807
808int __isolate_lru_page(struct page *page, int mode, int file)
809{
810 int ret = -EINVAL;
811
812
813 if (!PageLRU(page))
814 return ret;
815
816
817
818
819
820
821 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
822 return ret;
823
824 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
825 return ret;
826
827
828
829
830
831
832 if (PageUnevictable(page))
833 return ret;
834
835 ret = -EBUSY;
836
837 if (likely(get_page_unless_zero(page))) {
838
839
840
841
842
843 ClearPageLRU(page);
844 ret = 0;
845 }
846
847 return ret;
848}
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
/*
 * Move up to @nr_to_scan pages from the tail of @src onto @dst.
 *
 * @nr_to_scan: maximum number of @src entries to examine.
 * @src:        LRU list to take pages from.
 * @dst:        private list receiving the isolated pages.
 * @scanned:    out: number of pages scanned.
 * @order:      when non-zero, also try to isolate the other pages of the
 *              naturally aligned 2^order block around each taken page.
 * @mode:       ISOLATE_* mode passed to __isolate_lru_page().
 * @file:       file/anon selector passed to __isolate_lru_page().
 *
 * The callers in this file hold zone->lru_lock around this function.
 * Returns the number of pages moved to @dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct list_head *src, struct list_head *dst,
		unsigned long *scanned, int order, int mode, int file)
{
	unsigned long nr_taken = 0;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		unsigned long pfn;
		unsigned long end_pfn;
		unsigned long page_pfn;
		int zone_id;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode, file)) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken++;
			break;

		case -EBUSY:
			/* Could not get a reference: rotate to the head. */
			list_move(&page->lru, src);
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			continue;

		default:
			BUG();
		}

		if (!order)
			continue;

		/*
		 * Walk the pfn range of the aligned 2^order block that
		 * contains the page just taken and try to isolate every
		 * other page in it as well.
		 */
		zone_id = page_zone_id(page);
		page_pfn = page_to_pfn(page);
		pfn = page_pfn & ~((1 << order) - 1);
		end_pfn = pfn + (1 << order);
		for (; pfn < end_pfn; pfn++) {
			struct page *cursor_page;

			/* The target page itself is already isolated. */
			if (unlikely(pfn == page_pfn))
				continue;

			/* Stop at a hole in the pfn range. */
			if (unlikely(!pfn_valid_within(pfn)))
				break;

			cursor_page = pfn_to_page(pfn);

			/* Skip pages whose zone id differs. */
			if (unlikely(page_zone_id(cursor_page) != zone_id))
				continue;
			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
				list_move(&cursor_page->lru, dst);
				mem_cgroup_del_lru(cursor_page);
				nr_taken++;
				/* Extra pages count against the scan budget */
				scan++;
			}
		}
	}

	*scanned = scan;
	return nr_taken;
}
950
951static unsigned long isolate_pages_global(unsigned long nr,
952 struct list_head *dst,
953 unsigned long *scanned, int order,
954 int mode, struct zone *z,
955 struct mem_cgroup *mem_cont,
956 int active, int file)
957{
958 int lru = LRU_BASE;
959 if (active)
960 lru += LRU_ACTIVE;
961 if (file)
962 lru += LRU_FILE;
963 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
964 mode, !!file);
965}
966
967
968
969
970
971static unsigned long clear_active_flags(struct list_head *page_list,
972 unsigned int *count)
973{
974 int nr_active = 0;
975 int lru;
976 struct page *page;
977
978 list_for_each_entry(page, page_list, lru) {
979 lru = page_is_file_cache(page);
980 if (PageActive(page)) {
981 lru += LRU_ACTIVE;
982 ClearPageActive(page);
983 nr_active++;
984 }
985 count[lru]++;
986 }
987
988 return nr_active;
989}
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016int isolate_lru_page(struct page *page)
1017{
1018 int ret = -EBUSY;
1019
1020 if (PageLRU(page)) {
1021 struct zone *zone = page_zone(page);
1022
1023 spin_lock_irq(&zone->lru_lock);
1024 if (PageLRU(page) && get_page_unless_zero(page)) {
1025 int lru = page_lru(page);
1026 ret = 0;
1027 ClearPageLRU(page);
1028
1029 del_page_from_lru_list(zone, page, lru);
1030 }
1031 spin_unlock_irq(&zone->lru_lock);
1032 }
1033 return ret;
1034}
1035
1036
1037
1038
1039
/*
 * Isolate batches of pages from an inactive list of @zone, try to free
 * them with shrink_page_list(), and put the survivors back on the LRU.
 *
 * @max_scan: keep going until at least this many pages were scanned.
 * @zone:     zone to reclaim from.
 * @sc:       reclaim parameters.
 * @priority: reclaim priority; feeds the lumpy-reclaim decision.
 * @file:     non-zero for the file list, zero for the anon list.
 *
 * Returns the number of pages reclaimed.
 */
static unsigned long shrink_inactive_list(unsigned long max_scan,
			struct zone *zone, struct scan_control *sc,
			int priority, int file)
{
	LIST_HEAD(page_list);
	struct pagevec pvec;
	unsigned long nr_scanned = 0;
	unsigned long nr_reclaimed = 0;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
	int lumpy_reclaim = 0;

	/*
	 * Lumpy reclaim is used for costly orders always, and for any
	 * non-zero order once priority has dropped below DEF_PRIORITY - 2.
	 */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		lumpy_reclaim = 1;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		lumpy_reclaim = 1;

	pagevec_init(&pvec, 1);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
		struct page *page;
		unsigned long nr_taken;
		unsigned long nr_scan;
		unsigned long nr_freed;
		unsigned long nr_active;
		unsigned int count[NR_LRU_LISTS] = { 0, };
		int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;

		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
			     &page_list, &nr_scan, sc->order, mode,
				zone, sc->mem_cgroup, 0, file);
		/* ISOLATE_BOTH may have pulled in active pages too. */
		nr_active = clear_active_flags(&page_list, count);
		__count_vm_events(PGDEACTIVATE, nr_active);

		/* The isolated pages no longer sit on any LRU list. */
		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
						-count[LRU_ACTIVE_FILE]);
		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
						-count[LRU_INACTIVE_FILE]);
		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
						-count[LRU_ACTIVE_ANON]);
		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
						-count[LRU_INACTIVE_ANON]);

		if (scanning_global_lru(sc))
			zone->pages_scanned += nr_scan;

		/* Index 0 tracks anon pages, index 1 file pages. */
		reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
		reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
		reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
		reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];

		spin_unlock_irq(&zone->lru_lock);

		nr_scanned += nr_scan;
		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);

		/*
		 * Lumpy direct reclaim that did not free everything: wait
		 * for congestion to ease, then retry synchronously.
		 */
		if (nr_freed < nr_taken && !current_is_kswapd() &&
		    lumpy_reclaim) {
			congestion_wait(BLK_RW_ASYNC, HZ/10);

			/* The first pass may have re-activated pages. */
			nr_active = clear_active_flags(&page_list, count);
			count_vm_events(PGDEACTIVATE, nr_active);

			nr_freed += shrink_page_list(&page_list, sc,
							PAGEOUT_IO_SYNC);
		}

		nr_reclaimed += nr_freed;
		/* Interrupts stay off until the "done" label below. */
		local_irq_disable();
		if (current_is_kswapd()) {
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
			__count_vm_events(KSWAPD_STEAL, nr_freed);
		} else if (scanning_global_lru(sc))
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);

		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		if (nr_taken == 0)
			goto done;

		/* IRQs are already disabled, hence plain spin_lock(). */
		spin_lock(&zone->lru_lock);

		/* Put the pages that were not freed back on the LRU. */
		while (!list_empty(&page_list)) {
			int lru;
			page = lru_to_page(&page_list);
			VM_BUG_ON(PageLRU(page));
			list_del(&page->lru);
			if (unlikely(!page_evictable(page, NULL))) {
				/* Called with the lock dropped. */
				spin_unlock_irq(&zone->lru_lock);
				putback_lru_page(page);
				spin_lock_irq(&zone->lru_lock);
				continue;
			}
			SetPageLRU(page);
			lru = page_lru(page);
			add_page_to_lru_list(zone, page, lru);
			if (PageActive(page)) {
				int file = !!page_is_file_cache(page);
				reclaim_stat->recent_rotated[file]++;
			}
			/* Release held references in pagevec batches. */
			if (!pagevec_add(&pvec, page)) {
				spin_unlock_irq(&zone->lru_lock);
				__pagevec_release(&pvec);
				spin_lock_irq(&zone->lru_lock);
			}
		}
	} while (nr_scanned < max_scan);
	spin_unlock(&zone->lru_lock);
done:
	local_irq_enable();
	pagevec_release(&pvec);
	return nr_reclaimed;
}
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1183{
1184 if (priority < zone->prev_priority)
1185 zone->prev_priority = priority;
1186}
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
/*
 * Move every page on the private list @list to LRU list @lru of @zone.
 *
 * All pages must have PageActive set and PageLRU clear.  PageActive is
 * cleared when @lru is an inactive list.  shrink_active_list() calls this
 * with zone->lru_lock held; the lock is dropped and retaken whenever the
 * pagevec of held references is released.
 */
static void move_active_pages_to_lru(struct zone *zone,
				     struct list_head *list,
				     enum lru_list lru)
{
	unsigned long pgmoved = 0;
	struct pagevec pvec;
	struct page *page;

	pagevec_init(&pvec, 1);

	while (!list_empty(list)) {
		page = lru_to_page(list);
		prefetchw_prev_lru_page(page, list, flags);

		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);

		VM_BUG_ON(!PageActive(page));
		if (!is_active_lru(lru))
			ClearPageActive(page);

		list_move(&page->lru, &zone->lru[lru].list);
		mem_cgroup_add_lru_list(page, lru);
		pgmoved++;

		/* Flush when the pagevec is full or after the last page. */
		if (!pagevec_add(&pvec, page) || list_empty(list)) {
			spin_unlock_irq(&zone->lru_lock);
			if (buffer_heads_over_limit)
				pagevec_strip(&pvec);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
	if (!is_active_lru(lru))
		__count_vm_events(PGDEACTIVATE, pgmoved);
}
1243
/*
 * Age an active list: isolate up to @nr_pages pages from the active anon
 * (@file == 0) or active file (@file != 0) list of @zone and move them to
 * the inactive list.  Referenced executable file pages are the exception
 * and go back to the active list.
 */
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
			struct scan_control *sc, int priority, int file)
{
	unsigned long pgmoved;
	unsigned long pgscanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* pages isolated from the LRU */
	LIST_HEAD(l_active);	/* pages that stay active */
	LIST_HEAD(l_inactive);	/* pages to deactivate */
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
					ISOLATE_ACTIVE, zone,
					sc->mem_cgroup, 1, file);

	/* zone->pages_scanned only tracks global reclaim. */
	if (scanning_global_lru(sc)) {
		zone->pages_scanned += pgscanned;
	}
	reclaim_stat->recent_scanned[!!file] += pgmoved;

	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
	spin_unlock_irq(&zone->lru_lock);

	/* From here on pgmoved counts referenced, in-use pages. */
	pgmoved = 0;
	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page, NULL))) {
			putback_lru_page(page);
			continue;
		}

		/* vm_flags is only read after page_referenced() set it. */
		if (page_mapping_inuse(page) &&
		    page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
			pgmoved++;

			/*
			 * Referenced, executable, file-backed pages get
			 * another trip around the active list.
			 */
			if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		list_add(&page->lru, &l_inactive);
	}

	/* Put the pages back on the zone's LRU lists. */
	spin_lock_irq(&zone->lru_lock);

	/* Referenced pages count as rotated in the reclaim statistics. */
	reclaim_stat->recent_rotated[!!file] += pgmoved;

	move_active_pages_to_lru(zone, &l_active,
						LRU_ACTIVE + file * LRU_FILE);
	move_active_pages_to_lru(zone, &l_inactive,
						LRU_BASE   + file * LRU_FILE);

	spin_unlock_irq(&zone->lru_lock);
}
1329
1330static int inactive_anon_is_low_global(struct zone *zone)
1331{
1332 unsigned long active, inactive;
1333
1334 active = zone_page_state(zone, NR_ACTIVE_ANON);
1335 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1336
1337 if (inactive * zone->inactive_ratio < active)
1338 return 1;
1339
1340 return 0;
1341}
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1352{
1353 int low;
1354
1355 if (scanning_global_lru(sc))
1356 low = inactive_anon_is_low_global(zone);
1357 else
1358 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1359 return low;
1360}
1361
1362static int inactive_file_is_low_global(struct zone *zone)
1363{
1364 unsigned long active, inactive;
1365
1366 active = zone_page_state(zone, NR_ACTIVE_FILE);
1367 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1368
1369 return (active > inactive);
1370}
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1388{
1389 int low;
1390
1391 if (scanning_global_lru(sc))
1392 low = inactive_file_is_low_global(zone);
1393 else
1394 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1395 return low;
1396}
1397
1398static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1399 struct zone *zone, struct scan_control *sc, int priority)
1400{
1401 int file = is_file_lru(lru);
1402
1403 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
1404 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1405 return 0;
1406 }
1407
1408 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1409 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1410 return 0;
1411 }
1412 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1413}
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
/*
 * Work out what share of the scanning effort goes to anon versus file
 * pages.
 *
 * @percent: out: percent[0] is the anon share and percent[1] the file
 *           share; the two add up to 100.
 */
static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
					unsigned long *percent)
{
	unsigned long anon, file, free;
	unsigned long anon_prio, file_prio;
	unsigned long ap, fp;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

	anon  = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
		zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
	file  = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
		zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);

	if (scanning_global_lru(sc)) {
		free  = zone_page_state(zone, NR_FREE_PAGES);
		/*
		 * File plus free pages do not exceed the high watermark:
		 * scan anon pages only.
		 */
		if (unlikely(file + free <= high_wmark_pages(zone))) {
			percent[0] = 100;
			percent[1] = 0;
			return;
		}
	}

	/*
	 * Decay the statistics: once more than a quarter of a list type's
	 * pages have been scanned, halve its scanned and rotated counters.
	 */
	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
		spin_lock_irq(&zone->lru_lock);
		reclaim_stat->recent_scanned[0] /= 2;
		reclaim_stat->recent_rotated[0] /= 2;
		spin_unlock_irq(&zone->lru_lock);
	}

	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
		spin_lock_irq(&zone->lru_lock);
		reclaim_stat->recent_scanned[1] /= 2;
		reclaim_stat->recent_rotated[1] /= 2;
		spin_unlock_irq(&zone->lru_lock);
	}

	/* The two priorities always sum to 200. */
	anon_prio = sc->swappiness;
	file_prio = 200 - sc->swappiness;

	/*
	 * Pressure on a list type is proportional to scanned / rotated:
	 * the more of the scanned pages were rotated back, the less the
	 * type is scanned.  The +1 terms avoid division by zero.
	 */
	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
	ap /= reclaim_stat->recent_rotated[0] + 1;

	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
	fp /= reclaim_stat->recent_rotated[1] + 1;

	/* Normalise to percentages. */
	percent[0] = 100 * ap / (ap + fp + 1);
	percent[1] = 100 - percent[0];
}
1495
1496
1497
1498
1499
/*
 * Accumulate small scan requests until they add up to a full batch.
 *
 * @nr_to_scan is added to *@nr_saved_scan.  If the total is still below
 * @swap_cluster_max it stays saved and 0 is returned; otherwise the saved
 * counter is reset to 0 and the whole total is returned.
 */
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long swap_cluster_max)
{
	unsigned long total = *nr_saved_scan + nr_to_scan;

	if (total < swap_cluster_max) {
		/* Not enough for a batch yet: remember it for next time. */
		*nr_saved_scan = total;
		return 0;
	}

	*nr_saved_scan = 0;
	return total;
}
1516
1517
1518
1519
/*
 * Run one reclaim pass over all evictable LRU lists of @zone at the given
 * @priority, adding the pages freed to sc->nr_reclaimed.
 */
static void shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	unsigned long percent[2];	/* anon at 0, file at 1 */
	enum lru_list l;
	unsigned long nr_reclaimed = sc->nr_reclaimed;
	unsigned long swap_cluster_max = sc->swap_cluster_max;
	int noswap = 0;

	/* Swapping disallowed or no swap space left: scan file pages only. */
	if (!sc->may_swap || (nr_swap_pages <= 0)) {
		noswap = 1;
		percent[0] = 0;
		percent[1] = 100;
	} else
		get_scan_ratio(zone, sc, percent);

	/* Work out the scan target for each list. */
	for_each_evictable_lru(l) {
		int file = is_file_lru(l);
		unsigned long scan;

		scan = zone_nr_pages(zone, sc, l);
		/* At priority 0 with swap usable, scan the whole list. */
		if (priority || noswap) {
			scan >>= priority;
			scan = (scan * percent[file]) / 100;
		}
		if (scanning_global_lru(sc))
			nr[l] = nr_scan_try_batch(scan,
						  &zone->lru[l].nr_saved_scan,
						  swap_cluster_max);
		else
			nr[l] = scan;
	}

	/* nr[LRU_ACTIVE_ANON] alone does not keep this loop going. */
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		for_each_evictable_lru(l) {
			if (nr[l]) {
				nr_to_scan = min(nr[l], swap_cluster_max);
				nr[l] -= nr_to_scan;

				nr_reclaimed += shrink_list(l, nr_to_scan,
							    zone, sc, priority);
			}
		}

		/*
		 * Direct reclaim below DEF_PRIORITY stops early once more
		 * than one batch has been reclaimed.
		 */
		if (nr_reclaimed > swap_cluster_max &&
			priority < DEF_PRIORITY && !current_is_kswapd())
			break;
	}

	sc->nr_reclaimed = nr_reclaimed;

	/* Age some active anon pages if the inactive anon list is low. */
	if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);

	throttle_vm_writeout(sc->gfp_mask);
}
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
/*
 * Call shrink_zone() on every eligible zone of @zonelist at @priority.
 *
 * sc->all_unreclaimable is set to 1 on entry and cleared as soon as one
 * zone is considered worth scanning.
 */
static void shrink_zones(int priority, struct zonelist *zonelist,
					struct scan_control *sc)
{
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
	struct zoneref *z;
	struct zone *zone;

	sc->all_unreclaimable = 1;
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
					sc->nodemask) {
		if (!populated_zone(zone))
			continue;

		if (scanning_global_lru(sc)) {
			/* Global reclaim honours the cpuset restrictions. */
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
			note_zone_scanning_priority(zone, priority);

			/*
			 * Unreclaimable zones are only scanned on the first
			 * (DEF_PRIORITY) pass.
			 */
			if (zone_is_all_unreclaimable(zone) &&
						priority != DEF_PRIORITY)
				continue;
			sc->all_unreclaimable = 0;
		} else {
			/* Cgroup reclaim never skips a zone here. */
			sc->all_unreclaimable = 0;
			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
							priority);
		}

		shrink_zone(priority, zone, sc);
	}
}
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
/*
 * Main direct-reclaim loop: scan the zones of @zonelist at increasing
 * pressure (priority counts down from DEF_PRIORITY to 0) until at least
 * sc->swap_cluster_max pages have been reclaimed.
 *
 * Returns sc->nr_reclaimed if the target was met.  If the loop runs out
 * of priorities, returns sc->nr_reclaimed only for global reclaim with
 * at least one reclaimable zone, and 0 otherwise.
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc)
{
	int priority;
	unsigned long ret = 0;
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long lru_pages = 0;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);

	delayacct_freepages_start();

	if (scanning_global_lru(sc))
		count_vm_event(ALLOCSTALL);

	/* Total LRU size of the allowed zones; used to scale shrink_slab(). */
	if (scanning_global_lru(sc)) {
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {

			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;

			lru_pages += zone_lru_pages(zone);
		}
	}

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		sc->nr_scanned = 0;
		if (!priority)
			disable_swap_token();
		shrink_zones(priority, zonelist, sc);

		/* Slab caches are only shrunk during global reclaim. */
		if (scanning_global_lru(sc)) {
			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
		total_scanned += sc->nr_scanned;
		if (sc->nr_reclaimed >= sc->swap_cluster_max) {
			ret = sc->nr_reclaimed;
			goto out;
		}

		/*
		 * Scanned more than 1.5 batches without reaching the target:
		 * wake pdflush and allow this task to write pages out itself
		 * from now on.
		 */
		if (total_scanned > sc->swap_cluster_max +
					sc->swap_cluster_max / 2) {
			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
			sc->may_writepage = 1;
		}

		/* At low priorities, give in-flight I/O time to complete. */
		if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}
	/* Ran out of priorities without reaching the target. */
	if (!sc->all_unreclaimable && scanning_global_lru(sc))
		ret = sc->nr_reclaimed;
out:
	/* The loop leaves priority at -1 when it runs to completion. */
	if (priority < 0)
		priority = 0;

	/* Record the priority that was reached. */
	if (scanning_global_lru(sc)) {
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {

			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;

			zone->prev_priority = priority;
		}
	} else
		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);

	delayacct_freepages_end();

	return ret;
}
1760
1761unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1762 gfp_t gfp_mask, nodemask_t *nodemask)
1763{
1764 struct scan_control sc = {
1765 .gfp_mask = gfp_mask,
1766 .may_writepage = !laptop_mode,
1767 .swap_cluster_max = SWAP_CLUSTER_MAX,
1768 .may_unmap = 1,
1769 .may_swap = 1,
1770 .swappiness = vm_swappiness,
1771 .order = order,
1772 .mem_cgroup = NULL,
1773 .isolate_pages = isolate_pages_global,
1774 .nodemask = nodemask,
1775 };
1776
1777 return do_try_to_free_pages(zonelist, &sc);
1778}
1779
1780#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1781
1782unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1783 gfp_t gfp_mask,
1784 bool noswap,
1785 unsigned int swappiness)
1786{
1787 struct scan_control sc = {
1788 .may_writepage = !laptop_mode,
1789 .may_unmap = 1,
1790 .may_swap = !noswap,
1791 .swap_cluster_max = SWAP_CLUSTER_MAX,
1792 .swappiness = swappiness,
1793 .order = 0,
1794 .mem_cgroup = mem_cont,
1795 .isolate_pages = mem_cgroup_isolate_pages,
1796 .nodemask = NULL,
1797 };
1798 struct zonelist *zonelist;
1799
1800 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1801 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1802 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1803 return do_try_to_free_pages(zonelist, &sc);
1804}
1805#endif
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
/*
 * Reclaim pages from the zones of one node until every scanned zone passes
 * zone_watermark_ok() against its high watermark for the given order.
 *
 * @pgdat: node to balance
 * @order: allocation order the watermark checks are made for; dropped to 0
 *         for the retry if a full pass reclaimed fewer than SWAP_CLUSTER_MAX
 *         pages
 *
 * Returns the number of pages reclaimed by the last pass (sc.nr_reclaimed).
 *
 * Dereferences current->reclaim_state unconditionally; kswapd() installs
 * one before calling in.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
	int all_zones_ok;
	int priority;
	int i;
	unsigned long total_scanned;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_unmap = 1,
		.may_swap = 1,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.swappiness = vm_swappiness,
		.order = order,
		.mem_cgroup = NULL,
		.isolate_pages = isolate_pages_global,
	};

	/*
	 * Per-zone priority reached during this pass; copied into
	 * zone->prev_priority only once the pass is over.
	 */
	int temp_priority[MAX_NR_ZONES];

loop_again:
	/* Each pass starts with fresh counters and the default writeback policy. */
	total_scanned = 0;
	sc.nr_reclaimed = 0;
	sc.may_writepage = !laptop_mode;
	count_vm_event(PAGEOUTRUN);

	for (i = 0; i < pgdat->nr_zones; i++)
		temp_priority[i] = DEF_PRIORITY;

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		int end_zone = 0;	/* highest zone index found below its high watermark */
		unsigned long lru_pages = 0;

		/* The swap token is disabled on the last (priority 0) pass. */
		if (!priority)
			disable_swap_token();

		all_zones_ok = 1;

		/*
		 * Walk from the highest zone downwards looking for the first
		 * one that fails its high watermark; that index becomes
		 * end_zone.
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (zone_is_all_unreclaimable(zone) &&
			    priority != DEF_PRIORITY)
				continue;

			/*
			 * Move some pages off the active anon list when the
			 * inactive anon list is reported low, whether or not
			 * this zone needs reclaim.
			 */
			if (inactive_anon_is_low(zone, &sc))
				shrink_active_list(SWAP_CLUSTER_MAX, zone,
							&sc, priority, 0);

			if (!zone_watermark_ok(zone, order,
					high_wmark_pages(zone), 0, 0)) {
				end_zone = i;
				break;
			}
		}
		/* No zone failed its watermark: nothing to do. */
		if (i < 0)
			goto out;

		/* LRU size of zones 0..end_zone, passed to shrink_slab() below. */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			lru_pages += zone_lru_pages(zone);
		}

		/*
		 * Reclaim pass: scan zones from index 0 up to and including
		 * end_zone.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;
			int nr_slab;

			if (!populated_zone(zone))
				continue;

			if (zone_is_all_unreclaimable(zone) &&
					priority != DEF_PRIORITY)
				continue;

			if (!zone_watermark_ok(zone, order,
					high_wmark_pages(zone), end_zone, 0))
				all_zones_ok = 0;
			temp_priority[i] = priority;
			sc.nr_scanned = 0;
			note_zone_scanning_priority(zone, priority);

			/*
			 * The zone's LRU is only shrunk while it is below
			 * eight times its high watermark; slab is shrunk
			 * regardless.
			 */
			if (!zone_watermark_ok(zone, order,
					8*high_wmark_pages(zone), end_zone, 0))
				shrink_zone(priority, zone, &sc);
			reclaim_state->reclaimed_slab = 0;
			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
						lru_pages);
			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
			total_scanned += sc.nr_scanned;
			if (zone_is_all_unreclaimable(zone))
				continue;
			/*
			 * shrink_slab() returned 0 and the zone has been
			 * scanned at least six times its LRU size: flag it
			 * all-unreclaimable.
			 */
			if (nr_slab == 0 && zone->pages_scanned >=
						(zone_lru_pages(zone) * 6))
					zone_set_flag(zone,
						      ZONE_ALL_UNRECLAIMABLE);

			/*
			 * Scanning is well ahead of reclaim (more than 1.5x):
			 * allow page writeback for the rest of this pass.
			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
				sc.may_writepage = 1;
		}
		if (all_zones_ok)
			break;		/* every scanned zone met its high watermark */

		/* Wait on congestion once the priority has dropped far enough. */
		if (total_scanned && priority < DEF_PRIORITY - 2)
			congestion_wait(BLK_RW_ASYNC, HZ/10);

		/*
		 * Enough progress for this pass; leave the priority loop. The
		 * !all_zones_ok check below restarts from DEF_PRIORITY.
		 */
		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
out:
	/* Publish the priorities reached during this pass. */
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->prev_priority = temp_priority[i];
	}
	if (!all_zones_ok) {
		cond_resched();

		try_to_freeze();

		/*
		 * The pass fell short of SWAP_CLUSTER_MAX pages: retry with
		 * order 0 for both the watermark checks and the scan control.
		 */
		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
			order = sc.order = 0;

		goto loop_again;
	}

	return sc.nr_reclaimed;
}
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
/*
 * Main loop of the per-node background reclaim thread.
 *
 * @p: the node's pg_data_t, as passed by kswapd_run() via kthread_run().
 *
 * Sleeps on pgdat->kswapd_wait until woken (see wakeup_kswapd()), then
 * calls balance_pgdat() for the order found in pgdat->kswapd_max_order.
 * The loop never terminates; the trailing return is unreachable.
 */
static int kswapd(void *p)
{
	unsigned long order;
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;
	DEFINE_WAIT(wait);
	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	lockdep_set_current_reclaim_state(GFP_KERNEL);

	/* Bind to the node's CPUs, unless the node has none. */
	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);
	/* balance_pgdat() reads the slab reclaim count through this. */
	current->reclaim_state = &reclaim_state;

	/*
	 * Mark this task with PF_MEMALLOC, PF_SWAPWRITE and PF_KSWAPD.
	 * zone_reclaim() in this file bails out for PF_MEMALLOC tasks.
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	order = 0;
	for ( ; ; ) {
		unsigned long new_order;

		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
		/* Fetch and reset the highest order requested so far. */
		new_order = pgdat->kswapd_max_order;
		pgdat->kswapd_max_order = 0;
		if (order < new_order) {
			/*
			 * A larger order was requested than the one just
			 * handled: skip sleeping and process it right away.
			 */
			order = new_order;
		} else {
			if (!freezing(current))
				schedule();

			/* Pick up whatever order was posted while asleep. */
			order = pgdat->kswapd_max_order;
		}
		finish_wait(&pgdat->kswapd_wait, &wait);

		if (!try_to_freeze()) {
			/*
			 * try_to_freeze() returned 0, so balance the node now;
			 * a non-zero return skips this round.
			 */
			balance_pgdat(pgdat, order);
		}
	}
	return 0;
}
2093
2094
2095
2096
2097void wakeup_kswapd(struct zone *zone, int order)
2098{
2099 pg_data_t *pgdat;
2100
2101 if (!populated_zone(zone))
2102 return;
2103
2104 pgdat = zone->zone_pgdat;
2105 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2106 return;
2107 if (pgdat->kswapd_max_order < order)
2108 pgdat->kswapd_max_order = order;
2109 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2110 return;
2111 if (!waitqueue_active(&pgdat->kswapd_wait))
2112 return;
2113 wake_up_interruptible(&pgdat->kswapd_wait);
2114}
2115
2116unsigned long global_lru_pages(void)
2117{
2118 return global_page_state(NR_ACTIVE_ANON)
2119 + global_page_state(NR_ACTIVE_FILE)
2120 + global_page_state(NR_INACTIVE_ANON)
2121 + global_page_state(NR_INACTIVE_FILE);
2122}
2123
2124#ifdef CONFIG_HIBERNATION
2125
2126
2127
2128
2129
2130
2131static void shrink_all_zones(unsigned long nr_pages, int prio,
2132 int pass, struct scan_control *sc)
2133{
2134 struct zone *zone;
2135 unsigned long nr_reclaimed = 0;
2136
2137 for_each_populated_zone(zone) {
2138 enum lru_list l;
2139
2140 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2141 continue;
2142
2143 for_each_evictable_lru(l) {
2144 enum zone_stat_item ls = NR_LRU_BASE + l;
2145 unsigned long lru_pages = zone_page_state(zone, ls);
2146
2147
2148 if (pass == 0 && (l == LRU_ACTIVE_ANON ||
2149 l == LRU_ACTIVE_FILE))
2150 continue;
2151
2152 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
2153 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
2154 unsigned long nr_to_scan;
2155
2156 zone->lru[l].nr_saved_scan = 0;
2157 nr_to_scan = min(nr_pages, lru_pages);
2158 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2159 sc, prio);
2160 if (nr_reclaimed >= nr_pages) {
2161 sc->nr_reclaimed += nr_reclaimed;
2162 return;
2163 }
2164 }
2165 }
2166 }
2167 sc->nr_reclaimed += nr_reclaimed;
2168}
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178unsigned long shrink_all_memory(unsigned long nr_pages)
2179{
2180 unsigned long lru_pages, nr_slab;
2181 int pass;
2182 struct reclaim_state reclaim_state;
2183 struct scan_control sc = {
2184 .gfp_mask = GFP_KERNEL,
2185 .may_unmap = 0,
2186 .may_writepage = 1,
2187 .isolate_pages = isolate_pages_global,
2188 .nr_reclaimed = 0,
2189 };
2190
2191 current->reclaim_state = &reclaim_state;
2192
2193 lru_pages = global_lru_pages();
2194 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2195
2196 while (nr_slab >= lru_pages) {
2197 reclaim_state.reclaimed_slab = 0;
2198 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
2199 if (!reclaim_state.reclaimed_slab)
2200 break;
2201
2202 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2203 if (sc.nr_reclaimed >= nr_pages)
2204 goto out;
2205
2206 nr_slab -= reclaim_state.reclaimed_slab;
2207 }
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217 for (pass = 0; pass < 5; pass++) {
2218 int prio;
2219
2220
2221 if (pass > 2)
2222 sc.may_unmap = 1;
2223
2224 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2225 unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
2226
2227 sc.nr_scanned = 0;
2228 sc.swap_cluster_max = nr_to_scan;
2229 shrink_all_zones(nr_to_scan, prio, pass, &sc);
2230 if (sc.nr_reclaimed >= nr_pages)
2231 goto out;
2232
2233 reclaim_state.reclaimed_slab = 0;
2234 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2235 global_lru_pages());
2236 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2237 if (sc.nr_reclaimed >= nr_pages)
2238 goto out;
2239
2240 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2241 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2242 }
2243 }
2244
2245
2246
2247
2248
2249 if (!sc.nr_reclaimed) {
2250 do {
2251 reclaim_state.reclaimed_slab = 0;
2252 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
2253 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2254 } while (sc.nr_reclaimed < nr_pages &&
2255 reclaim_state.reclaimed_slab > 0);
2256 }
2257
2258
2259out:
2260 current->reclaim_state = NULL;
2261
2262 return sc.nr_reclaimed;
2263}
2264#endif
2265
2266
2267
2268
2269
2270static int __devinit cpu_callback(struct notifier_block *nfb,
2271 unsigned long action, void *hcpu)
2272{
2273 int nid;
2274
2275 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2276 for_each_node_state(nid, N_HIGH_MEMORY) {
2277 pg_data_t *pgdat = NODE_DATA(nid);
2278 const struct cpumask *mask;
2279
2280 mask = cpumask_of_node(pgdat->node_id);
2281
2282 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2283
2284 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2285 }
2286 }
2287 return NOTIFY_OK;
2288}
2289
2290
2291
2292
2293
2294int kswapd_run(int nid)
2295{
2296 pg_data_t *pgdat = NODE_DATA(nid);
2297 int ret = 0;
2298
2299 if (pgdat->kswapd)
2300 return 0;
2301
2302 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
2303 if (IS_ERR(pgdat->kswapd)) {
2304
2305 BUG_ON(system_state == SYSTEM_BOOTING);
2306 printk("Failed to start kswapd on node %d\n",nid);
2307 ret = -1;
2308 }
2309 return ret;
2310}
2311
2312static int __init kswapd_init(void)
2313{
2314 int nid;
2315
2316 swap_setup();
2317 for_each_node_state(nid, N_HIGH_MEMORY)
2318 kswapd_run(nid);
2319 hotcpu_notifier(cpu_callback, 0);
2320 return 0;
2321}
2322
2323module_init(kswapd_init)
2324
2325#ifdef CONFIG_NUMA
2326
2327
2328
2329
2330
2331
/*
 * Zone reclaim mode: a bit mask tested against the RECLAIM_* values below
 * by zone_pagecache_reclaimable() and __zone_reclaim().
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* NOTE(review): not tested in this part of the file - presumably "zone reclaim enabled"; confirm */
#define RECLAIM_WRITE (1<<1)	/* __zone_reclaim() sets sc.may_writepage; dirty file pages count as reclaimable */
#define RECLAIM_SWAP (1<<2)	/* __zone_reclaim() sets sc.may_unmap; all file pages count as reclaimable */

/*
 * Priority at which __zone_reclaim() starts calling shrink_zone(); it then
 * counts down towards 0 until enough pages have been reclaimed.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * NOTE(review): not referenced in this part of the file. Presumably the
 * percentage from which zone->min_unmapped_pages is derived - confirm
 * against the sysctl handler.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * NOTE(review): not referenced in this part of the file. Presumably the
 * percentage from which zone->min_slab_pages is derived - confirm against
 * the sysctl handler.
 */
int sysctl_min_slab_ratio = 5;
2357
2358static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
2359{
2360 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
2361 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
2362 zone_page_state(zone, NR_ACTIVE_FILE);
2363
2364
2365
2366
2367
2368
2369 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
2370}
2371
2372
2373static long zone_pagecache_reclaimable(struct zone *zone)
2374{
2375 long nr_pagecache_reclaimable;
2376 long delta = 0;
2377
2378
2379
2380
2381
2382
2383
2384 if (zone_reclaim_mode & RECLAIM_SWAP)
2385 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
2386 else
2387 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
2388
2389
2390 if (!(zone_reclaim_mode & RECLAIM_WRITE))
2391 delta += zone_page_state(zone, NR_FILE_DIRTY);
2392
2393
2394 if (unlikely(delta > nr_pagecache_reclaimable))
2395 delta = nr_pagecache_reclaimable;
2396
2397 return nr_pagecache_reclaimable - delta;
2398}
2399
2400
2401
2402
2403static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2404{
2405
2406 const unsigned long nr_pages = 1 << order;
2407 struct task_struct *p = current;
2408 struct reclaim_state reclaim_state;
2409 int priority;
2410 struct scan_control sc = {
2411 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2412 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2413 .may_swap = 1,
2414 .swap_cluster_max = max_t(unsigned long, nr_pages,
2415 SWAP_CLUSTER_MAX),
2416 .gfp_mask = gfp_mask,
2417 .swappiness = vm_swappiness,
2418 .order = order,
2419 .isolate_pages = isolate_pages_global,
2420 };
2421 unsigned long slab_reclaimable;
2422
2423 disable_swap_token();
2424 cond_resched();
2425
2426
2427
2428
2429
2430 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2431 reclaim_state.reclaimed_slab = 0;
2432 p->reclaim_state = &reclaim_state;
2433
2434 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
2435
2436
2437
2438
2439 priority = ZONE_RECLAIM_PRIORITY;
2440 do {
2441 note_zone_scanning_priority(zone, priority);
2442 shrink_zone(priority, zone, &sc);
2443 priority--;
2444 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2445 }
2446
2447 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2448 if (slab_reclaimable > zone->min_slab_pages) {
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459 while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
2460 zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
2461 slab_reclaimable - nr_pages)
2462 ;
2463
2464
2465
2466
2467
2468 sc.nr_reclaimed += slab_reclaimable -
2469 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2470 }
2471
2472 p->reclaim_state = NULL;
2473 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2474 return sc.nr_reclaimed >= nr_pages;
2475}
2476
2477int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2478{
2479 int node_id;
2480 int ret;
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
2493 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2494 return ZONE_RECLAIM_FULL;
2495
2496 if (zone_is_all_unreclaimable(zone))
2497 return ZONE_RECLAIM_FULL;
2498
2499
2500
2501
2502 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
2503 return ZONE_RECLAIM_NOSCAN;
2504
2505
2506
2507
2508
2509
2510
2511 node_id = zone_to_nid(zone);
2512 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
2513 return ZONE_RECLAIM_NOSCAN;
2514
2515 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
2516 return ZONE_RECLAIM_NOSCAN;
2517
2518 ret = __zone_reclaim(zone, gfp_mask, order);
2519 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
2520
2521 if (!ret)
2522 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
2523
2524 return ret;
2525}
2526#endif
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
/*
 * Test whether @page may be placed on an evictable LRU list.
 *
 * @vma: optional; when non-NULL it is additionally checked with
 *       is_mlocked_vma(), but only if the page is not already PageMlocked
 *
 * Returns 0 if the page's mapping is marked unevictable or the page is
 * (or, per @vma, should be) mlocked; 1 otherwise.
 */
int page_evictable(struct page *page, struct vm_area_struct *vma)
{
	if (mapping_unevictable(page_mapping(page)))
		return 0;

	if (PageMlocked(page))
		return 0;

	if (vma && is_mlocked_vma(vma, page))
		return 0;

	return 1;
}
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565static void check_move_unevictable_page(struct page *page, struct zone *zone)
2566{
2567 VM_BUG_ON(PageActive(page));
2568
2569retry:
2570 ClearPageUnevictable(page);
2571 if (page_evictable(page, NULL)) {
2572 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2573
2574 __dec_zone_state(zone, NR_UNEVICTABLE);
2575 list_move(&page->lru, &zone->lru[l].list);
2576 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
2577 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2578 __count_vm_event(UNEVICTABLE_PGRESCUED);
2579 } else {
2580
2581
2582
2583 SetPageUnevictable(page);
2584 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2585 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
2586 if (page_evictable(page, NULL))
2587 goto retry;
2588 }
2589}
2590
2591
2592
2593
2594
2595
2596
2597
2598void scan_mapping_unevictable_pages(struct address_space *mapping)
2599{
2600 pgoff_t next = 0;
2601 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2602 PAGE_CACHE_SHIFT;
2603 struct zone *zone;
2604 struct pagevec pvec;
2605
2606 if (mapping->nrpages == 0)
2607 return;
2608
2609 pagevec_init(&pvec, 0);
2610 while (next < end &&
2611 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2612 int i;
2613 int pg_scanned = 0;
2614
2615 zone = NULL;
2616
2617 for (i = 0; i < pagevec_count(&pvec); i++) {
2618 struct page *page = pvec.pages[i];
2619 pgoff_t page_index = page->index;
2620 struct zone *pagezone = page_zone(page);
2621
2622 pg_scanned++;
2623 if (page_index > next)
2624 next = page_index;
2625 next++;
2626
2627 if (pagezone != zone) {
2628 if (zone)
2629 spin_unlock_irq(&zone->lru_lock);
2630 zone = pagezone;
2631 spin_lock_irq(&zone->lru_lock);
2632 }
2633
2634 if (PageLRU(page) && PageUnevictable(page))
2635 check_move_unevictable_page(page, zone);
2636 }
2637 if (zone)
2638 spin_unlock_irq(&zone->lru_lock);
2639 pagevec_release(&pvec);
2640
2641 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2642 }
2643
2644}
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL
2657static void scan_zone_unevictable_pages(struct zone *zone)
2658{
2659 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2660 unsigned long scan;
2661 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2662
2663 while (nr_to_scan > 0) {
2664 unsigned long batch_size = min(nr_to_scan,
2665 SCAN_UNEVICTABLE_BATCH_SIZE);
2666
2667 spin_lock_irq(&zone->lru_lock);
2668 for (scan = 0; scan < batch_size; scan++) {
2669 struct page *page = lru_to_page(l_unevictable);
2670
2671 if (!trylock_page(page))
2672 continue;
2673
2674 prefetchw_prev_lru_page(page, l_unevictable, flags);
2675
2676 if (likely(PageLRU(page) && PageUnevictable(page)))
2677 check_move_unevictable_page(page, zone);
2678
2679 unlock_page(page);
2680 }
2681 spin_unlock_irq(&zone->lru_lock);
2682
2683 nr_to_scan -= batch_size;
2684 }
2685}
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699static void scan_all_zones_unevictable_pages(void)
2700{
2701 struct zone *zone;
2702
2703 for_each_zone(zone) {
2704 scan_zone_unevictable_pages(zone);
2705 }
2706}
2707
2708
2709
2710
2711
2712unsigned long scan_unevictable_pages;
2713
2714int scan_unevictable_handler(struct ctl_table *table, int write,
2715 struct file *file, void __user *buffer,
2716 size_t *length, loff_t *ppos)
2717{
2718 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2719
2720 if (write && *(unsigned long *)table->data)
2721 scan_all_zones_unevictable_pages();
2722
2723 scan_unevictable_pages = 0;
2724 return 0;
2725}
2726
2727
2728
2729
2730
2731
2732static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2733 struct sysdev_attribute *attr,
2734 char *buf)
2735{
2736 return sprintf(buf, "0\n");
2737}
2738
2739static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2740 struct sysdev_attribute *attr,
2741 const char *buf, size_t count)
2742{
2743 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2744 struct zone *zone;
2745 unsigned long res;
2746 unsigned long req = strict_strtoul(buf, 10, &res);
2747
2748 if (!req)
2749 return 1;
2750
2751 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2752 if (!populated_zone(zone))
2753 continue;
2754 scan_zone_unevictable_pages(zone);
2755 }
2756 return 1;
2757}
2758
2759
/*
 * Per-node sysdev attribute "scan_unevictable_pages": readable by everyone
 * (S_IRUGO), writable by the owner (S_IWUSR), served by the read/write
 * methods above. Referenced below as attr_scan_unevictable_pages.
 */
static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
			read_scan_unevictable_node,
			write_scan_unevictable_node);
2763
2764int scan_unevictable_register_node(struct node *node)
2765{
2766 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2767}
2768
2769void scan_unevictable_unregister_node(struct node *node)
2770{
2771 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2772}
2773
2774