/*
 *  linux/mm/vmscan.c
 *
 *  Page reclaim: scanning of the LRU lists, writeback of dirty pages,
 *  shrinking of the slab caches, the kswapd background reclaim threads
 *  and NUMA zone reclaim.
 */
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/vmstat.h>
23#include <linux/file.h>
24#include <linux/writeback.h>
25#include <linux/blkdev.h>
26#include <linux/buffer_head.h>
27
28#include <linux/mm_inline.h>
29#include <linux/pagevec.h>
30#include <linux/backing-dev.h>
31#include <linux/rmap.h>
32#include <linux/topology.h>
33#include <linux/cpu.h>
34#include <linux/cpuset.h>
35#include <linux/notifier.h>
36#include <linux/rwsem.h>
37#include <linux/delay.h>
38#include <linux/kthread.h>
39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
43
44#include <asm/tlbflush.h>
45#include <asm/div64.h>
46
47#include <linux/swapops.h>
48
49#include "internal.h"
50
struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Allowed to write back dirty pages? */
	int may_writepage;

	/* Can mapped pages be unmapped and swapped as part of reclaim? */
	int may_swap;

	/*
	 * This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX; in that case
	 * it does not matter that we scan the whole list at once.
	 */
	int swap_cluster_max;

	/* Swappiness (0..100) to use for this reclaim pass */
	int swappiness;

	/* Set when every zone scanned turned out to be unreclaimable */
	int all_unreclaimable;

	/* Allocation order that triggered the reclaim */
	int order;

	/* Which cgroup do we reclaim from (NULL for global reclaim) */
	struct mem_cgroup *mem_cgroup;

	/* Pluggable isolate pages callback */
	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
			unsigned long *scanned, int order, int mode,
			struct zone *z, struct mem_cgroup *mem_cont,
			int active, int file);
};
87
88#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
89
90#ifdef ARCH_HAS_PREFETCH
91#define prefetch_prev_lru_page(_page, _base, _field) \
92 do { \
93 if ((_page)->lru.prev != _base) { \
94 struct page *prev; \
95 \
96 prev = lru_to_page(&(_page->lru)); \
97 prefetch(&prev->_field); \
98 } \
99 } while (0)
100#else
101#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
102#endif
103
104#ifdef ARCH_HAS_PREFETCHW
105#define prefetchw_prev_lru_page(_page, _base, _field) \
106 do { \
107 if ((_page)->lru.prev != _base) { \
108 struct page *prev; \
109 \
110 prev = lru_to_page(&(_page->lru)); \
111 prefetchw(&prev->_field); \
112 } \
113 } while (0)
114#else
115#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
116#endif
117
/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */
123
124static LIST_HEAD(shrinker_list);
125static DECLARE_RWSEM(shrinker_rwsem);
126
127#ifdef CONFIG_CGROUP_MEM_RES_CTLR
128#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
129#else
130#define scanning_global_lru(sc) (1)
131#endif
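
/*
 * scanning_global_lru() is true when we reclaim on behalf of the whole
 * system rather than a memory cgroup; the helpers below then pick the
 * zone-wide statistics instead of the per-cgroup ones.
 */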
132
133static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
134 struct scan_control *sc)
135{
136 if (!scanning_global_lru(sc))
137 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
138
139 return &zone->reclaim_stat;
140}
141
142static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
143 enum lru_list lru)
144{
145 if (!scanning_global_lru(sc))
146 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
147
148 return zone_page_state(zone, NR_LRU_BASE + lru);
149}
150
/*
 * Add a shrinker callback to be called from the vm.
 */
155void register_shrinker(struct shrinker *shrinker)
156{
157 shrinker->nr = 0;
158 down_write(&shrinker_rwsem);
159 list_add_tail(&shrinker->list, &shrinker_list);
160 up_write(&shrinker_rwsem);
161}
162EXPORT_SYMBOL(register_shrinker);
163
/*
 * Remove a previously registered shrinker.
 */
167void unregister_shrinker(struct shrinker *shrinker)
168{
169 down_write(&shrinker_rwsem);
170 list_del(&shrinker->list);
171 up_write(&shrinker_rwsem);
172}
173EXPORT_SYMBOL(unregister_shrinker);
174
175#define SHRINK_BATCH 128
176
/*
 * Call the shrink functions to age shrinkable caches.
 *
 * Here we assume it costs one seek to replace an LRU page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the LRU and of the ageable caches.  This should balance
 * the seeks generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure
 * on slab to avoid swapping.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for
 * balancing slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
195unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
196 unsigned long lru_pages)
197{
198 struct shrinker *shrinker;
199 unsigned long ret = 0;
200
201 if (scanned == 0)
202 scanned = SWAP_CLUSTER_MAX;
203
204 if (!down_read_trylock(&shrinker_rwsem))
205 return 1;
206
207 list_for_each_entry(shrinker, &shrinker_list, list) {
208 unsigned long long delta;
209 unsigned long total_scan;
210 unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
211
212 delta = (4 * scanned) / shrinker->seeks;
213 delta *= max_pass;
214 do_div(delta, lru_pages + 1);
215 shrinker->nr += delta;
216 if (shrinker->nr < 0) {
217 printk(KERN_ERR "%s: nr=%ld\n",
218 __func__, shrinker->nr);
219 shrinker->nr = max_pass;
220 }
221
		/*
		 * Avoid risking looping forever due to too large a nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
227 if (shrinker->nr > max_pass * 2)
228 shrinker->nr = max_pass * 2;
229
230 total_scan = shrinker->nr;
231 shrinker->nr = 0;
232
233 while (total_scan >= SHRINK_BATCH) {
234 long this_scan = SHRINK_BATCH;
235 int shrink_ret;
236 int nr_before;
237
238 nr_before = (*shrinker->shrink)(0, gfp_mask);
239 shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
240 if (shrink_ret == -1)
241 break;
242 if (shrink_ret < nr_before)
243 ret += nr_before - shrink_ret;
244 count_vm_events(SLABS_SCANNED, this_scan);
245 total_scan -= this_scan;
246
247 cond_resched();
248 }
249
250 shrinker->nr += total_scan;
251 }
252 up_read(&shrinker_rwsem);
253 return ret;
254}
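
/*
 * Illustrative example of the pressure calculation in shrink_slab()
 * (hypothetical numbers): with scanned = 1024 LRU pages,
 * lru_pages = 100000, and a shrinker reporting max_pass = 50000 objects
 * with seeks = 2,
 *
 *	delta = (4 * 1024 / 2) * 50000 / 100001 ~= 1023
 *
 * so roughly 2% of the cache is asked to be scanned, in SHRINK_BATCH
 * chunks of 128 objects, with any remainder carried over in shrinker->nr
 * for the next invocation.
 */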
255
/* Called without lock on whether page is mapped, so answer is unstable */
257static inline int page_mapping_inuse(struct page *page)
258{
259 struct address_space *mapping;
260
261
262 if (page_mapped(page))
263 return 1;
264
265
266 if (PageSwapCache(page))
267 return 1;
268
269 mapping = page_mapping(page);
270 if (!mapping)
271 return 0;
272
273
274 return mapping_mapped(mapping);
275}
276
277static inline int is_page_cache_freeable(struct page *page)
278{
279 return page_count(page) - !!PagePrivate(page) == 2;
280}
281
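/*
 * Decide whether reclaim may write pages to this backing device: yes for
 * dedicated swap-out threads (PF_SWAPWRITE), for uncongested queues, and
 * for the queue the caller is itself writing to (and is therefore already
 * throttled against).
 */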
282static int may_write_to_queue(struct backing_dev_info *bdi)
283{
284 if (current->flags & PF_SWAPWRITE)
285 return 1;
286 if (!bdi_write_congested(bdi))
287 return 1;
288 if (bdi == current->backing_dev_info)
289 return 1;
290 return 0;
291}
292

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a
 * subsequent fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping:
 * nothing prevents it from being freed up.  But we have a ref on the page
 * and once that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller
 * has __GFP_FS.
 */
305static void handle_write_error(struct address_space *mapping,
306 struct page *page, int error)
307{
308 lock_page(page);
309 if (page_mapping(page) == mapping)
310 mapping_set_error(mapping, error);
311 unlock_page(page);
312}
313

/* Request for sync pageout. */
enum pageout_io {
	PAGEOUT_IO_ASYNC,
	PAGEOUT_IO_SYNC,
};

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
331
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
336static pageout_t pageout(struct page *page, struct address_space *mapping,
337 enum pageout_io sync_writeback)
338{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in generic_file_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.  This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't report congestion
	 * status of the swapdev at all.
	 */
356 if (!is_page_cache_freeable(page))
357 return PAGE_KEEP;
358 if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
363 if (PagePrivate(page)) {
364 if (try_to_free_buffers(page)) {
365 ClearPageDirty(page);
366 printk("%s: orphaned page\n", __func__);
367 return PAGE_CLEAN;
368 }
369 }
370 return PAGE_KEEP;
371 }
372 if (mapping->a_ops->writepage == NULL)
373 return PAGE_ACTIVATE;
374 if (!may_write_to_queue(mapping->backing_dev_info))
375 return PAGE_KEEP;
376
377 if (clear_page_dirty_for_io(page)) {
378 int res;
379 struct writeback_control wbc = {
380 .sync_mode = WB_SYNC_NONE,
381 .nr_to_write = SWAP_CLUSTER_MAX,
382 .range_start = 0,
383 .range_end = LLONG_MAX,
384 .nonblocking = 1,
385 .for_reclaim = 1,
386 };
387
388 SetPageReclaim(page);
389 res = mapping->a_ops->writepage(page, &wbc);
390 if (res < 0)
391 handle_write_error(mapping, page, res);
392 if (res == AOP_WRITEPAGE_ACTIVATE) {
393 ClearPageReclaim(page);
394 return PAGE_ACTIVATE;
395 }
396
397
398
399
400
401
402 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
403 wait_on_page_writeback(page);
404
405 if (!PageWriteback(page)) {
406
407 ClearPageReclaim(page);
408 }
409 inc_zone_page_state(page, NR_VMSCAN_WRITE);
410 return PAGE_SUCCESS;
411 }
412
413 return PAGE_CLEAN;
414}
415
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
420static int __remove_mapping(struct address_space *mapping, struct page *page)
421{
422 BUG_ON(!PageLocked(page));
423 BUG_ON(mapping != page_mapping(page));
424
425 spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests.  When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference.  So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * arise and allows it to be handled by a normal put_page, which
	 * would have been incorrectly missed: freeze the refcount down to
	 * the pagecache reference plus ours first, and only then check
	 * PageDirty.
	 */
451 if (!page_freeze_refs(page, 2))
452 goto cannot_free;
453
454 if (unlikely(PageDirty(page))) {
455 page_unfreeze_refs(page, 2);
456 goto cannot_free;
457 }
458
459 if (PageSwapCache(page)) {
460 swp_entry_t swap = { .val = page_private(page) };
461 __delete_from_swap_cache(page);
462 spin_unlock_irq(&mapping->tree_lock);
463 swap_free(swap);
464 } else {
465 __remove_from_page_cache(page);
466 spin_unlock_irq(&mapping->tree_lock);
467 }
468
469 return 1;
470
471cannot_free:
472 spin_unlock_irq(&mapping->tree_lock);
473 return 0;
474}
475
/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
482int remove_mapping(struct address_space *mapping, struct page *page)
483{
484 if (__remove_mapping(mapping, page)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
490 page_unfreeze_refs(page, 1);
491 return 1;
492 }
493 return 0;
494}
495
/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
505#ifdef CONFIG_UNEVICTABLE_LRU
506void putback_lru_page(struct page *page)
507{
508 int lru;
509 int active = !!TestClearPageActive(page);
510 int was_unevictable = PageUnevictable(page);
511
512 VM_BUG_ON(PageLRU(page));
513
514redo:
515 ClearPageUnevictable(page);
516
517 if (page_evictable(page, NULL)) {
518
519
520
521
522
523
524 lru = active + page_is_file_cache(page);
525 lru_cache_add_lru(page, lru);
526 } else {
527
528
529
530
531 lru = LRU_UNEVICTABLE;
532 add_page_to_unevictable_list(page);
533 }
534
535
536
537
538
539
540 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
541 if (!isolate_lru_page(page)) {
542 put_page(page);
543 goto redo;
544 }
545
546
547
548
549 }
550
551 if (was_unevictable && lru != LRU_UNEVICTABLE)
552 count_vm_event(UNEVICTABLE_PGRESCUED);
553 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
554 count_vm_event(UNEVICTABLE_PGCULLED);
555
556 put_page(page);
557}
558
559#else
560
561void putback_lru_page(struct page *page)
562{
563 int lru;
564 VM_BUG_ON(PageLRU(page));
565
566 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
567 lru_cache_add_lru(page, lru);
568 put_page(page);
569}
570#endif
571
/*
 * shrink_page_list() returns the number of reclaimed pages.
 */
576static unsigned long shrink_page_list(struct list_head *page_list,
577 struct scan_control *sc,
578 enum pageout_io sync_writeback)
579{
580 LIST_HEAD(ret_pages);
581 struct pagevec freed_pvec;
582 int pgactivate = 0;
583 unsigned long nr_reclaimed = 0;
584
585 cond_resched();
586
587 pagevec_init(&freed_pvec, 1);
588 while (!list_empty(page_list)) {
589 struct address_space *mapping;
590 struct page *page;
591 int may_enter_fs;
592 int referenced;
593
594 cond_resched();
595
596 page = lru_to_page(page_list);
597 list_del(&page->lru);
598
599 if (!trylock_page(page))
600 goto keep;
601
602 VM_BUG_ON(PageActive(page));
603
604 sc->nr_scanned++;
605
606 if (unlikely(!page_evictable(page, NULL)))
607 goto cull_mlocked;
608
609 if (!sc->may_swap && page_mapped(page))
610 goto keep_locked;
611
		/* Double the slab pressure for mapped and swapcache pages */
613 if (page_mapped(page) || PageSwapCache(page))
614 sc->nr_scanned++;
615
616 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
617 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
618
619 if (PageWriteback(page)) {
			/*
			 * Synchronous reclaim is performed in two passes:
			 * first an asynchronous pass over the list to start
			 * parallel writeback, and then a second synchronous
			 * pass to wait for the IO to complete.  Wait here for
			 * any page for which writeback has already started.
			 */
628 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
629 wait_on_page_writeback(page);
630 else
631 goto keep_locked;
632 }
633
634 referenced = page_referenced(page, 1, sc->mem_cgroup);
635
636 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
637 referenced && page_mapping_inuse(page))
638 goto activate_locked;
639

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
644 if (PageAnon(page) && !PageSwapCache(page)) {
645 if (!(sc->gfp_mask & __GFP_IO))
646 goto keep_locked;
647 if (!add_to_swap(page))
648 goto activate_locked;
649 may_enter_fs = 1;
650 }
651
652 mapping = page_mapping(page);
653
		/*
		 * The page is mapped into the page tables of one or more
		 * processes.  Try to unmap it here.
		 */
658 if (page_mapped(page) && mapping) {
659 switch (try_to_unmap(page, 0)) {
660 case SWAP_FAIL:
661 goto activate_locked;
662 case SWAP_AGAIN:
663 goto keep_locked;
664 case SWAP_MLOCK:
665 goto cull_mlocked;
666 case SWAP_SUCCESS:
667 ;
668 }
669 }
670
671 if (PageDirty(page)) {
672 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
673 goto keep_locked;
674 if (!may_enter_fs)
675 goto keep_locked;
676 if (!sc->may_writepage)
677 goto keep_locked;
678
679
680 switch (pageout(page, mapping, sync_writeback)) {
681 case PAGE_KEEP:
682 goto keep_locked;
683 case PAGE_ACTIVATE:
684 goto activate_locked;
685 case PAGE_SUCCESS:
686 if (PageWriteback(page) || PageDirty(page))
687 goto keep;
688
689
690
691
692 if (!trylock_page(page))
693 goto keep;
694 if (PageDirty(page) || PageWriteback(page))
695 goto keep_locked;
696 mapping = page_mapping(page);
697 case PAGE_CLEAN:
698 ;
699 }
700 }
701
		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page.  If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set while actually
		 * being clean (all its buffers are clean).  This happens if
		 * the buffers were written out directly with submit_bh();
		 * ext3 and the blockdev mapping do this.
		 * try_to_release_page() will discover that cleanness, drop
		 * the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These
		 * are the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers
		 * here and, if that worked and the page is no longer mapped
		 * into process address space (page_count == 1), it can be
		 * freed.  Otherwise, leave the page on the LRU so it is
		 * swappable.
		 */
723 if (PagePrivate(page)) {
724 if (!try_to_release_page(page, sc->gfp_mask))
725 goto activate_locked;
726 if (!mapping && page_count(page) == 1) {
727 unlock_page(page);
728 if (put_page_testzero(page))
729 goto free_it;
730 else {
731
732
733
734
735
736
737
738 nr_reclaimed++;
739 continue;
740 }
741 }
742 }
743
744 if (!mapping || !__remove_mapping(mapping, page))
745 goto keep_locked;
746
		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache).  We can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references).
		 */
754 __clear_page_locked(page);
755free_it:
756 nr_reclaimed++;
757 if (!pagevec_add(&freed_pvec, page)) {
758 __pagevec_free(&freed_pvec);
759 pagevec_reinit(&freed_pvec);
760 }
761 continue;
762
763cull_mlocked:
764 if (PageSwapCache(page))
765 try_to_free_swap(page);
766 unlock_page(page);
767 putback_lru_page(page);
768 continue;
769
770activate_locked:
771
772 if (PageSwapCache(page) && vm_swap_full())
773 try_to_free_swap(page);
774 VM_BUG_ON(PageActive(page));
775 SetPageActive(page);
776 pgactivate++;
777keep_locked:
778 unlock_page(page);
779keep:
780 list_add(&page->lru, &ret_pages);
781 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
782 }
783 list_splice(&ret_pages, page_list);
784 if (pagevec_count(&freed_pvec))
785 __pagevec_free(&freed_pvec);
786 count_vm_events(PGACTIVATE, pgactivate);
787 return nr_reclaimed;
788}
789

/* LRU isolation modes. */
#define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
#define ISOLATE_ACTIVE 1	/* Isolate active pages. */
#define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
805int __isolate_lru_page(struct page *page, int mode, int file)
806{
807 int ret = -EINVAL;
808
809
810 if (!PageLRU(page))
811 return ret;
812
813
814
815
816
817
818 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
819 return ret;
820
821 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
822 return ret;
823
824
825
826
827
828
829 if (PageUnevictable(page))
830 return ret;
831
832 ret = -EBUSY;
833
834 if (likely(get_page_unless_zero(page))) {
835
836
837
838
839
840 ClearPageLRU(page);
841 ret = 0;
842 mem_cgroup_del_lru(page);
843 }
844
845 return ret;
846}
847
/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from the copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
 * @order:	The caller's attempted allocation order.
 * @mode:	One of the LRU isolation modes.
 * @file:	True [1] if isolating file [!anon] pages.
 *
 * returns how many pages were moved onto *@dst.
 */
868static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
869 struct list_head *src, struct list_head *dst,
870 unsigned long *scanned, int order, int mode, int file)
871{
872 unsigned long nr_taken = 0;
873 unsigned long scan;
874
875 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
876 struct page *page;
877 unsigned long pfn;
878 unsigned long end_pfn;
879 unsigned long page_pfn;
880 int zone_id;
881
882 page = lru_to_page(src);
883 prefetchw_prev_lru_page(page, src, flags);
884
885 VM_BUG_ON(!PageLRU(page));
886
887 switch (__isolate_lru_page(page, mode, file)) {
888 case 0:
889 list_move(&page->lru, dst);
890 nr_taken++;
891 break;
892
893 case -EBUSY:
894
895 list_move(&page->lru, src);
896 continue;
897
898 default:
899 BUG();
900 }
901
902 if (!order)
903 continue;
904

		/*
		 * Attempt to take all pages in the order-aligned region
		 * surrounding the tag page.  Only pages that pass the same
		 * __isolate_lru_page() checks (active state, file/anon type,
		 * evictability) as the tag page are taken; this is what makes
		 * lumpy reclaim of contiguous blocks possible for high-order
		 * allocations.
		 */
914 zone_id = page_zone_id(page);
915 page_pfn = page_to_pfn(page);
916 pfn = page_pfn & ~((1 << order) - 1);
917 end_pfn = pfn + (1 << order);
918 for (; pfn < end_pfn; pfn++) {
919 struct page *cursor_page;
920
921
922 if (unlikely(pfn == page_pfn))
923 continue;
924
925
926 if (unlikely(!pfn_valid_within(pfn)))
927 break;
928
929 cursor_page = pfn_to_page(pfn);
930
931
932 if (unlikely(page_zone_id(cursor_page) != zone_id))
933 continue;
934 switch (__isolate_lru_page(cursor_page, mode, file)) {
935 case 0:
936 list_move(&cursor_page->lru, dst);
937 nr_taken++;
938 scan++;
939 break;
940
941 case -EBUSY:
942
943 list_move(&cursor_page->lru, src);
944 default:
945 break;
946 }
947 }
948 }
949
950 *scanned = scan;
951 return nr_taken;
952}
953
954static unsigned long isolate_pages_global(unsigned long nr,
955 struct list_head *dst,
956 unsigned long *scanned, int order,
957 int mode, struct zone *z,
958 struct mem_cgroup *mem_cont,
959 int active, int file)
960{
961 int lru = LRU_BASE;
962 if (active)
963 lru += LRU_ACTIVE;
964 if (file)
965 lru += LRU_FILE;
966 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
967 mode, !!file);
968}
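
/*
 * Note how the LRU index is composed above: for example, isolating active
 * file pages selects zone->lru[LRU_ACTIVE_FILE], since
 * LRU_BASE + LRU_ACTIVE + LRU_FILE == LRU_ACTIVE_FILE in enum lru_list.
 */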
969
/*
 * clear_active_flags() is a helper for shrink_inactive_list(), clearing
 * any active bits from the pages in the list.
 */
974static unsigned long clear_active_flags(struct list_head *page_list,
975 unsigned int *count)
976{
977 int nr_active = 0;
978 int lru;
979 struct page *page;
980
981 list_for_each_entry(page, page_list, lru) {
982 lru = page_is_file_cache(page);
983 if (PageActive(page)) {
984 lru += LRU_ACTIVE;
985 ClearPageActive(page);
986 nr_active++;
987 }
988 count[lru]++;
989 }
990
991 return nr_active;
992}
993
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set.  That
 * flag may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page.  This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
1019int isolate_lru_page(struct page *page)
1020{
1021 int ret = -EBUSY;
1022
1023 if (PageLRU(page)) {
1024 struct zone *zone = page_zone(page);
1025
1026 spin_lock_irq(&zone->lru_lock);
1027 if (PageLRU(page) && get_page_unless_zero(page)) {
1028 int lru = page_lru(page);
1029 ret = 0;
1030 ClearPageLRU(page);
1031
1032 del_page_from_lru_list(zone, page, lru);
1033 }
1034 spin_unlock_irq(&zone->lru_lock);
1035 }
1036 return ret;
1037}
1038
/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the
 * number of reclaimed pages.
 */
1043static unsigned long shrink_inactive_list(unsigned long max_scan,
1044 struct zone *zone, struct scan_control *sc,
1045 int priority, int file)
1046{
1047 LIST_HEAD(page_list);
1048 struct pagevec pvec;
1049 unsigned long nr_scanned = 0;
1050 unsigned long nr_reclaimed = 0;
1051 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1052
1053 pagevec_init(&pvec, 1);
1054
1055 lru_add_drain();
1056 spin_lock_irq(&zone->lru_lock);
1057 do {
1058 struct page *page;
1059 unsigned long nr_taken;
1060 unsigned long nr_scan;
1061 unsigned long nr_freed;
1062 unsigned long nr_active;
1063 unsigned int count[NR_LRU_LISTS] = { 0, };
1064 int mode = ISOLATE_INACTIVE;
1065
		/*
		 * If we need a large contiguous chunk of memory, or have
		 * trouble getting a small set of contiguous pages, we
		 * will reclaim both active and inactive pages.
		 *
		 * We use the same threshold as pageout congestion_wait below.
		 */
1073 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1074 mode = ISOLATE_BOTH;
1075 else if (sc->order && priority < DEF_PRIORITY - 2)
1076 mode = ISOLATE_BOTH;
1077
1078 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1079 &page_list, &nr_scan, sc->order, mode,
1080 zone, sc->mem_cgroup, 0, file);
1081 nr_active = clear_active_flags(&page_list, count);
1082 __count_vm_events(PGDEACTIVATE, nr_active);
1083
1084 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1085 -count[LRU_ACTIVE_FILE]);
1086 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1087 -count[LRU_INACTIVE_FILE]);
1088 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1089 -count[LRU_ACTIVE_ANON]);
1090 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1091 -count[LRU_INACTIVE_ANON]);
1092
1093 if (scanning_global_lru(sc))
1094 zone->pages_scanned += nr_scan;
1095
1096 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1097 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1098 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1099 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1100
1101 spin_unlock_irq(&zone->lru_lock);
1102
1103 nr_scanned += nr_scan;
1104 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);

		/*
		 * If we are direct reclaiming for contiguous pages and we do
		 * not reclaim everything in the list, try again and wait
		 * for IO to complete.  This will stall high-order allocations
		 * but that should be acceptable to the caller.
		 */
1112 if (nr_freed < nr_taken && !current_is_kswapd() &&
1113 sc->order > PAGE_ALLOC_COSTLY_ORDER) {
1114 congestion_wait(WRITE, HZ/10);
1115
1116
1117
1118
1119
1120 nr_active = clear_active_flags(&page_list, count);
1121 count_vm_events(PGDEACTIVATE, nr_active);
1122
1123 nr_freed += shrink_page_list(&page_list, sc,
1124 PAGEOUT_IO_SYNC);
1125 }
1126
1127 nr_reclaimed += nr_freed;
1128 local_irq_disable();
1129 if (current_is_kswapd()) {
1130 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1131 __count_vm_events(KSWAPD_STEAL, nr_freed);
1132 } else if (scanning_global_lru(sc))
1133 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1134
1135 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
1136
1137 if (nr_taken == 0)
1138 goto done;
1139
1140 spin_lock(&zone->lru_lock);
1141
1142
1143
1144 while (!list_empty(&page_list)) {
1145 int lru;
1146 page = lru_to_page(&page_list);
1147 VM_BUG_ON(PageLRU(page));
1148 list_del(&page->lru);
1149 if (unlikely(!page_evictable(page, NULL))) {
1150 spin_unlock_irq(&zone->lru_lock);
1151 putback_lru_page(page);
1152 spin_lock_irq(&zone->lru_lock);
1153 continue;
1154 }
1155 SetPageLRU(page);
1156 lru = page_lru(page);
1157 add_page_to_lru_list(zone, page, lru);
1158 if (PageActive(page)) {
1159 int file = !!page_is_file_cache(page);
1160 reclaim_stat->recent_rotated[file]++;
1161 }
1162 if (!pagevec_add(&pvec, page)) {
1163 spin_unlock_irq(&zone->lru_lock);
1164 __pagevec_release(&pvec);
1165 spin_lock_irq(&zone->lru_lock);
1166 }
1167 }
1168 } while (nr_scanned < max_scan);
1169 spin_unlock(&zone->lru_lock);
1170done:
1171 local_irq_enable();
1172 pagevec_release(&pvec);
1173 return nr_reclaimed;
1174}
1175
/*
 * We are about to scan this zone at a certain priority level.  If that
 * priority level is smaller (ie: more urgent) than the previous priority,
 * then note that priority level within the zone.  This is done so that when
 * the next process comes in to scan this zone, it will immediately start
 * out at this priority level rather than having to build up its own
 * scanning priority.  Here, this priority affects only the reclaim-mapped
 * threshold.
 */
1184static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1185{
1186 if (priority < zone->prev_priority)
1187 zone->prev_priority = priority;
1188}
1189
/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
1209static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1210 struct scan_control *sc, int priority, int file)
1211{
1212 unsigned long pgmoved;
1213 int pgdeactivate = 0;
1214 unsigned long pgscanned;
1215 LIST_HEAD(l_hold);
1216 LIST_HEAD(l_inactive);
1217 struct page *page;
1218 struct pagevec pvec;
1219 enum lru_list lru;
1220 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1221
1222 lru_add_drain();
1223 spin_lock_irq(&zone->lru_lock);
1224 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1225 ISOLATE_ACTIVE, zone,
1226 sc->mem_cgroup, 1, file);
1227
1228
1229
1230
1231 if (scanning_global_lru(sc)) {
1232 zone->pages_scanned += pgscanned;
1233 }
1234 reclaim_stat->recent_scanned[!!file] += pgmoved;
1235
1236 if (file)
1237 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1238 else
1239 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1240 spin_unlock_irq(&zone->lru_lock);
1241
1242 pgmoved = 0;
1243 while (!list_empty(&l_hold)) {
1244 cond_resched();
1245 page = lru_to_page(&l_hold);
1246 list_del(&page->lru);
1247
1248 if (unlikely(!page_evictable(page, NULL))) {
1249 putback_lru_page(page);
1250 continue;
1251 }
1252
1253
1254 if (page_mapping_inuse(page) &&
1255 page_referenced(page, 0, sc->mem_cgroup))
1256 pgmoved++;
1257
1258 list_add(&page->lru, &l_inactive);
1259 }
1260
1261
1262
1263
1264 pagevec_init(&pvec, 1);
1265 lru = LRU_BASE + file * LRU_FILE;
1266
1267 spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though they are moved to the inactive list.  This helps
	 * balance scan pressure between file and anonymous pages in
	 * get_scan_ratio().
	 */
1274 reclaim_stat->recent_rotated[!!file] += pgmoved;
1275
1276 pgmoved = 0;
1277 while (!list_empty(&l_inactive)) {
1278 page = lru_to_page(&l_inactive);
1279 prefetchw_prev_lru_page(page, &l_inactive, flags);
1280 VM_BUG_ON(PageLRU(page));
1281 SetPageLRU(page);
1282 VM_BUG_ON(!PageActive(page));
1283 ClearPageActive(page);
1284
1285 list_move(&page->lru, &zone->lru[lru].list);
1286 mem_cgroup_add_lru_list(page, lru);
1287 pgmoved++;
1288 if (!pagevec_add(&pvec, page)) {
1289 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1290 spin_unlock_irq(&zone->lru_lock);
1291 pgdeactivate += pgmoved;
1292 pgmoved = 0;
1293 if (buffer_heads_over_limit)
1294 pagevec_strip(&pvec);
1295 __pagevec_release(&pvec);
1296 spin_lock_irq(&zone->lru_lock);
1297 }
1298 }
1299 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1300 pgdeactivate += pgmoved;
1301 if (buffer_heads_over_limit) {
1302 spin_unlock_irq(&zone->lru_lock);
1303 pagevec_strip(&pvec);
1304 spin_lock_irq(&zone->lru_lock);
1305 }
1306 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1307 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1308 spin_unlock_irq(&zone->lru_lock);
1309 if (vm_swap_full())
1310 pagevec_swap_free(&pvec);
1311
1312 pagevec_release(&pvec);
1313}
1314
1315static int inactive_anon_is_low_global(struct zone *zone)
1316{
1317 unsigned long active, inactive;
1318
1319 active = zone_page_state(zone, NR_ACTIVE_ANON);
1320 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1321
1322 if (inactive * zone->inactive_ratio < active)
1323 return 1;
1324
1325 return 0;
1326}
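
/*
 * The active/inactive anon balance above is controlled by
 * zone->inactive_ratio, which is sized relative to the zone (for example, a
 * zone of roughly 1GB typically ends up with an inactive_ratio of about 3,
 * so the active anon list is considered too large once it exceeds roughly
 * three times the inactive one).
 */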
1327
/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @zone: zone to check
 * @sc:   scan control of this context
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
1336static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1337{
1338 int low;
1339
1340 if (scanning_global_lru(sc))
1341 low = inactive_anon_is_low_global(zone);
1342 else
1343 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1344 return low;
1345}
1346
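/*
 * Shrink one LRU list.  Active file pages are only deactivated (moved to
 * the inactive file list); active anon pages are deactivated only when the
 * inactive anon list has become too small; everything else goes through
 * shrink_inactive_list().
 */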
1347static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1348 struct zone *zone, struct scan_control *sc, int priority)
1349{
1350 int file = is_file_lru(lru);
1351
1352 if (lru == LRU_ACTIVE_FILE) {
1353 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1354 return 0;
1355 }
1356
1357 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1358 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1359 return 0;
1360 }
1361 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1362}
1363
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned that we did rotate
 * back onto the active list instead of evicting.
 *
 * percent[0] specifies how much pressure to put on ram/swap backed
 * memory, while percent[1] determines pressure on the file LRUs.
 */
1373static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1374 unsigned long *percent)
1375{
1376 unsigned long anon, file, free;
1377 unsigned long anon_prio, file_prio;
1378 unsigned long ap, fp;
1379 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1380
1381
1382 if (nr_swap_pages <= 0) {
1383 percent[0] = 0;
1384 percent[1] = 100;
1385 return;
1386 }
1387
1388 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1389 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1390 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
1391 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
1392
1393 if (scanning_global_lru(sc)) {
1394 free = zone_page_state(zone, NR_FREE_PAGES);
1395
1396
1397 if (unlikely(file + free <= zone->pages_high)) {
1398 percent[0] = 100;
1399 percent[1] = 0;
1400 return;
1401 }
1402 }

	/*
	 * OK, so we have swap space and a fair amount of page cache
	 * pages.  We use the recently rotated / recently scanned
	 * ratios to determine how valuable each cache is.
	 *
	 * Because workloads change over time (and to avoid overflow)
	 * we keep these statistics as a floating average, which ends
	 * up weighing recent references more than old ones.
	 *
	 * anon in [0], file in [1]
	 */
1415 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1416 spin_lock_irq(&zone->lru_lock);
1417 reclaim_stat->recent_scanned[0] /= 2;
1418 reclaim_stat->recent_rotated[0] /= 2;
1419 spin_unlock_irq(&zone->lru_lock);
1420 }
1421
1422 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1423 spin_lock_irq(&zone->lru_lock);
1424 reclaim_stat->recent_scanned[1] /= 2;
1425 reclaim_stat->recent_rotated[1] /= 2;
1426 spin_unlock_irq(&zone->lru_lock);
1427 }
1428
1429
1430
1431
1432
1433 anon_prio = sc->swappiness;
1434 file_prio = 200 - sc->swappiness;
1435
	/*
	 * The amount of pressure on anon vs file pages is inversely
	 * proportional to the fraction of recently scanned pages on
	 * each list that were recently referenced and in active use.
	 */
1441 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1442 ap /= reclaim_stat->recent_rotated[0] + 1;
1443
1444 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1445 fp /= reclaim_stat->recent_rotated[1] + 1;
1446
1447
1448 percent[0] = 100 * ap / (ap + fp + 1);
1449 percent[1] = 100 - percent[0];
1450}
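
/*
 * Illustrative example of the balance computed above (hypothetical
 * numbers): with swappiness = 60, recent_scanned = {anon: 1000, file: 4000}
 * and recent_rotated = {anon: 900, file: 400},
 *
 *	ap = 61 * 1001 / 901  ~= 67
 *	fp = 141 * 4001 / 401 ~= 1406
 *
 * giving percent[0] ~= 4 and percent[1] ~= 96: when most scanned anon pages
 * were rotated back (still in use), almost all of the reclaim pressure is
 * shifted onto the file LRUs.
 */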
1451
/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct
 * reclaim.
 */
1456static void shrink_zone(int priority, struct zone *zone,
1457 struct scan_control *sc)
1458{
1459 unsigned long nr[NR_LRU_LISTS];
1460 unsigned long nr_to_scan;
1461 unsigned long percent[2];
1462 enum lru_list l;
1463 unsigned long nr_reclaimed = sc->nr_reclaimed;
1464 unsigned long swap_cluster_max = sc->swap_cluster_max;
1465
1466 get_scan_ratio(zone, sc, percent);
1467
1468 for_each_evictable_lru(l) {
1469 int file = is_file_lru(l);
1470 int scan;
1471
1472 scan = zone_nr_pages(zone, sc, l);
1473 if (priority) {
1474 scan >>= priority;
1475 scan = (scan * percent[file]) / 100;
1476 }
1477 if (scanning_global_lru(sc)) {
1478 zone->lru[l].nr_scan += scan;
1479 nr[l] = zone->lru[l].nr_scan;
1480 if (nr[l] >= swap_cluster_max)
1481 zone->lru[l].nr_scan = 0;
1482 else
1483 nr[l] = 0;
1484 } else
1485 nr[l] = scan;
1486 }
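
	/*
	 * Illustrative example of the scan target computed above
	 * (hypothetical numbers): an LRU list of 262144 pages scanned at
	 * priority 12 (DEF_PRIORITY) with percent[file] = 96 yields
	 * (262144 >> 12) * 96 / 100 = 61 pages, which accumulates in
	 * zone->lru[l].nr_scan until it reaches swap_cluster_max and a
	 * batch is actually scanned.
	 */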
1487
1488 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1489 nr[LRU_INACTIVE_FILE]) {
1490 for_each_evictable_lru(l) {
1491 if (nr[l]) {
1492 nr_to_scan = min(nr[l], swap_cluster_max);
1493 nr[l] -= nr_to_scan;
1494
1495 nr_reclaimed += shrink_list(l, nr_to_scan,
1496 zone, sc, priority);
1497 }
1498 }
1499
1500
1501
1502
1503
1504
1505
1506
1507 if (nr_reclaimed > swap_cluster_max &&
1508 priority < DEF_PRIORITY && !current_is_kswapd())
1509 break;
1510 }
1511
1512 sc->nr_reclaimed = nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
1518 if (inactive_anon_is_low(zone, sc))
1519 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1520
1521 throttle_vm_writeout(sc->gfp_mask);
1522}
1523
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over pages_high.  Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The zones may be over pages_high but they must go *over* pages_high to
 *    satisfy the `incremental min' zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
1538static void shrink_zones(int priority, struct zonelist *zonelist,
1539 struct scan_control *sc)
1540{
1541 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1542 struct zoneref *z;
1543 struct zone *zone;
1544
1545 sc->all_unreclaimable = 1;
1546 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1547 if (!populated_zone(zone))
1548 continue;
1549
1550
1551
1552
1553 if (scanning_global_lru(sc)) {
1554 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1555 continue;
1556 note_zone_scanning_priority(zone, priority);
1557
1558 if (zone_is_all_unreclaimable(zone) &&
1559 priority != DEF_PRIORITY)
1560 continue;
1561 sc->all_unreclaimable = 0;
1562 } else {
1563
1564
1565
1566
1567 sc->all_unreclaimable = 0;
1568 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1569 priority);
1570 }
1571
1572 shrink_zone(priority, zone, sc);
1573 }
1574}
1575
/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick pdflush and take explicit naps in the
 * hope that some of these pages can be written.  But if the allocating task
 * holds filesystem locks which prevent writeout this might not work, and the
 * allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
1592static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1593 struct scan_control *sc)
1594{
1595 int priority;
1596 unsigned long ret = 0;
1597 unsigned long total_scanned = 0;
1598 struct reclaim_state *reclaim_state = current->reclaim_state;
1599 unsigned long lru_pages = 0;
1600 struct zoneref *z;
1601 struct zone *zone;
1602 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1603
1604 delayacct_freepages_start();
1605
1606 if (scanning_global_lru(sc))
1607 count_vm_event(ALLOCSTALL);
1608
1609
1610
1611 if (scanning_global_lru(sc)) {
1612 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1613
1614 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1615 continue;
1616
1617 lru_pages += zone_lru_pages(zone);
1618 }
1619 }
1620
1621 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1622 sc->nr_scanned = 0;
1623 if (!priority)
1624 disable_swap_token();
1625 shrink_zones(priority, zonelist, sc);
1626
1627
1628
1629
1630 if (scanning_global_lru(sc)) {
1631 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1632 if (reclaim_state) {
1633 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1634 reclaim_state->reclaimed_slab = 0;
1635 }
1636 }
1637 total_scanned += sc->nr_scanned;
1638 if (sc->nr_reclaimed >= sc->swap_cluster_max) {
1639 ret = sc->nr_reclaimed;
1640 goto out;
1641 }

		/*
		 * Try to write back as many pages as we just scanned.  This
		 * tends to cause slow streaming writers to write data to the
		 * disk smoothly, at the dirtying rate, which is nice.  But
		 * that's undesirable in laptop mode, where we *want* lumpy
		 * writeout.  So in laptop mode, write out the whole world.
		 */
1650 if (total_scanned > sc->swap_cluster_max +
1651 sc->swap_cluster_max / 2) {
1652 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1653 sc->may_writepage = 1;
1654 }
1655
1656
1657 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1658 congestion_wait(WRITE, HZ/10);
1659 }
1660
1661 if (!sc->all_unreclaimable && scanning_global_lru(sc))
1662 ret = sc->nr_reclaimed;
1663out:
1664
1665
1666
1667
1668
1669
1670
1671 if (priority < 0)
1672 priority = 0;
1673
1674 if (scanning_global_lru(sc)) {
1675 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1676
1677 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1678 continue;
1679
1680 zone->prev_priority = priority;
1681 }
1682 } else
1683 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1684
1685 delayacct_freepages_end();
1686
1687 return ret;
1688}
1689
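/*
 * try_to_free_pages() packages up a scan_control for global (non-cgroup)
 * reclaim; it is the entry point used by the page allocator when a normal
 * allocation cannot be satisfied from the free lists.
 */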
1690unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1691 gfp_t gfp_mask)
1692{
1693 struct scan_control sc = {
1694 .gfp_mask = gfp_mask,
1695 .may_writepage = !laptop_mode,
1696 .swap_cluster_max = SWAP_CLUSTER_MAX,
1697 .may_swap = 1,
1698 .swappiness = vm_swappiness,
1699 .order = order,
1700 .mem_cgroup = NULL,
1701 .isolate_pages = isolate_pages_global,
1702 };
1703
1704 return do_try_to_free_pages(zonelist, &sc);
1705}
1706
1707#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1708
1709unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1710 gfp_t gfp_mask,
1711 bool noswap,
1712 unsigned int swappiness)
1713{
1714 struct scan_control sc = {
1715 .may_writepage = !laptop_mode,
1716 .may_swap = 1,
1717 .swap_cluster_max = SWAP_CLUSTER_MAX,
1718 .swappiness = swappiness,
1719 .order = 0,
1720 .mem_cgroup = mem_cont,
1721 .isolate_pages = mem_cgroup_isolate_pages,
1722 };
1723 struct zonelist *zonelist;
1724
1725 if (noswap)
1726 sc.may_swap = 0;
1727
1728 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1729 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1730 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1731 return do_try_to_free_pages(zonelist, &sc);
1732}
1733#endif
1734
/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at pages_high.
 *
 * Returns the number of pages which were actually freed.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone
 * as dead and from now on, only perform a short scan.  Basically we're
 * polling the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > pages_high, but once a zone is found to
 * have free_pages <= pages_high, we scan that zone and the lower zones
 * regardless of the number of free pages in the lower zones.  This
 * interoperates with the page allocator fallback scheme to ensure that
 * aging of pages is balanced across the zones.
 */
1756static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1757{
1758 int all_zones_ok;
1759 int priority;
1760 int i;
1761 unsigned long total_scanned;
1762 struct reclaim_state *reclaim_state = current->reclaim_state;
1763 struct scan_control sc = {
1764 .gfp_mask = GFP_KERNEL,
1765 .may_swap = 1,
1766 .swap_cluster_max = SWAP_CLUSTER_MAX,
1767 .swappiness = vm_swappiness,
1768 .order = order,
1769 .mem_cgroup = NULL,
1770 .isolate_pages = isolate_pages_global,
1771 };
1772
1773
1774
1775
1776 int temp_priority[MAX_NR_ZONES];
1777
1778loop_again:
1779 total_scanned = 0;
1780 sc.nr_reclaimed = 0;
1781 sc.may_writepage = !laptop_mode;
1782 count_vm_event(PAGEOUTRUN);
1783
1784 for (i = 0; i < pgdat->nr_zones; i++)
1785 temp_priority[i] = DEF_PRIORITY;
1786
1787 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1788 int end_zone = 0;
1789 unsigned long lru_pages = 0;
1790
1791
1792 if (!priority)
1793 disable_swap_token();
1794
1795 all_zones_ok = 1;
1796
1797
1798
1799
1800
1801 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1802 struct zone *zone = pgdat->node_zones + i;
1803
1804 if (!populated_zone(zone))
1805 continue;
1806
1807 if (zone_is_all_unreclaimable(zone) &&
1808 priority != DEF_PRIORITY)
1809 continue;
1810
			/*
			 * Do some background aging of the anon list, to give
			 * pages a chance to be referenced before reclaiming.
			 */
1815 if (inactive_anon_is_low(zone, &sc))
1816 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1817 &sc, priority, 0);
1818
1819 if (!zone_watermark_ok(zone, order, zone->pages_high,
1820 0, 0)) {
1821 end_zone = i;
1822 break;
1823 }
1824 }
1825 if (i < 0)
1826 goto out;
1827
1828 for (i = 0; i <= end_zone; i++) {
1829 struct zone *zone = pgdat->node_zones + i;
1830
1831 lru_pages += zone_lru_pages(zone);
1832 }

		/*
		 * Now scan the zone in the dma->highmem direction, stopping
		 * at the last zone which needs scanning.
		 *
		 * We do this because the page allocator works in the opposite
		 * direction.  This prevents the page allocator from
		 * allocating pages behind kswapd's direction of progress,
		 * which would cause too much scanning of the lower zones.
		 */
1843 for (i = 0; i <= end_zone; i++) {
1844 struct zone *zone = pgdat->node_zones + i;
1845 int nr_slab;
1846
1847 if (!populated_zone(zone))
1848 continue;
1849
1850 if (zone_is_all_unreclaimable(zone) &&
1851 priority != DEF_PRIORITY)
1852 continue;
1853
1854 if (!zone_watermark_ok(zone, order, zone->pages_high,
1855 end_zone, 0))
1856 all_zones_ok = 0;
1857 temp_priority[i] = priority;
1858 sc.nr_scanned = 0;
1859 note_zone_scanning_priority(zone, priority);
1860
1861
1862
1863
1864 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1865 end_zone, 0))
1866 shrink_zone(priority, zone, &sc);
1867 reclaim_state->reclaimed_slab = 0;
1868 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1869 lru_pages);
1870 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1871 total_scanned += sc.nr_scanned;
1872 if (zone_is_all_unreclaimable(zone))
1873 continue;
1874 if (nr_slab == 0 && zone->pages_scanned >=
1875 (zone_lru_pages(zone) * 6))
1876 zone_set_flag(zone,
1877 ZONE_ALL_UNRECLAIMABLE);
1878
1879
1880
1881
1882
1883 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1884 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
1885 sc.may_writepage = 1;
1886 }
1887 if (all_zones_ok)
1888 break;
1889
1890
1891
1892
1893 if (total_scanned && priority < DEF_PRIORITY - 2)
1894 congestion_wait(WRITE, HZ/10);
1895
1896
1897
1898
1899
1900
1901
1902 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
1903 break;
1904 }
1905out:
1906
1907
1908
1909
1910
1911 for (i = 0; i < pgdat->nr_zones; i++) {
1912 struct zone *zone = pgdat->node_zones + i;
1913
1914 zone->prev_priority = temp_priority[i];
1915 }
1916 if (!all_zones_ok) {
1917 cond_resched();
1918
1919 try_to_freeze();

		/*
		 * Fragmentation may mean that the system cannot be rebalanced
		 * for high-order allocations in all zones.  If little progress
		 * was made at this order, fall back to order-0 reclaim for the
		 * next pass so that kswapd does not keep thrashing trying to
		 * satisfy an allocation that fragmentation may make
		 * impossible.
		 */
1935 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
1936 order = sc.order = 0;
1937
1938 goto loop_again;
1939 }
1940
1941 return sc.nr_reclaimed;
1942}
1943
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up.  This is needed for things like routing
 * etc, where we otherwise prefer our pages over others' and
 * suddenly running out of memory would be bad.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
1957static int kswapd(void *p)
1958{
1959 unsigned long order;
1960 pg_data_t *pgdat = (pg_data_t*)p;
1961 struct task_struct *tsk = current;
1962 DEFINE_WAIT(wait);
1963 struct reclaim_state reclaim_state = {
1964 .reclaimed_slab = 0,
1965 };
1966 node_to_cpumask_ptr(cpumask, pgdat->node_id);
1967
1968 if (!cpumask_empty(cpumask))
1969 set_cpus_allowed_ptr(tsk, cpumask);
1970 current->reclaim_state = &reclaim_state;
1971
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()").  "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
1984 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1985 set_freezable();
1986
1987 order = 0;
1988 for ( ; ; ) {
1989 unsigned long new_order;
1990
1991 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1992 new_order = pgdat->kswapd_max_order;
1993 pgdat->kswapd_max_order = 0;
1994 if (order < new_order) {
1995
1996
1997
1998
1999 order = new_order;
2000 } else {
2001 if (!freezing(current))
2002 schedule();
2003
2004 order = pgdat->kswapd_max_order;
2005 }
2006 finish_wait(&pgdat->kswapd_wait, &wait);
2007
2008 if (!try_to_freeze()) {
2009
2010
2011
2012 balance_pgdat(pgdat, order);
2013 }
2014 }
2015 return 0;
2016}
2017
/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
2021void wakeup_kswapd(struct zone *zone, int order)
2022{
2023 pg_data_t *pgdat;
2024
2025 if (!populated_zone(zone))
2026 return;
2027
2028 pgdat = zone->zone_pgdat;
2029 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
2030 return;
2031 if (pgdat->kswapd_max_order < order)
2032 pgdat->kswapd_max_order = order;
2033 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2034 return;
2035 if (!waitqueue_active(&pgdat->kswapd_wait))
2036 return;
2037 wake_up_interruptible(&pgdat->kswapd_wait);
2038}
2039
2040unsigned long global_lru_pages(void)
2041{
2042 return global_page_state(NR_ACTIVE_ANON)
2043 + global_page_state(NR_ACTIVE_FILE)
2044 + global_page_state(NR_INACTIVE_ANON)
2045 + global_page_state(NR_INACTIVE_FILE);
2046}
2047
2048#ifdef CONFIG_PM
/*
 * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages'
 * pages from the LRU lists system-wide, for the given pass and priority,
 * and returns the number of reclaimed pages.
 *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages.
 */
2056static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
2057 int pass, struct scan_control *sc)
2058{
2059 struct zone *zone;
2060 unsigned long ret = 0;
2061
2062 for_each_zone(zone) {
2063 enum lru_list l;
2064
2065 if (!populated_zone(zone))
2066 continue;
2067 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2068 continue;
2069
2070 for_each_evictable_lru(l) {
2071 enum zone_stat_item ls = NR_LRU_BASE + l;
2072 unsigned long lru_pages = zone_page_state(zone, ls);
2073
2074
2075 if (pass == 0 && (l == LRU_ACTIVE_ANON ||
2076 l == LRU_ACTIVE_FILE))
2077 continue;
2078
2079 zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
2080 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2081 unsigned long nr_to_scan;
2082
2083 zone->lru[l].nr_scan = 0;
2084 nr_to_scan = min(nr_pages, lru_pages);
2085 ret += shrink_list(l, nr_to_scan, zone,
2086 sc, prio);
2087 if (ret >= nr_pages)
2088 return ret;
2089 }
2090 }
2091 }
2092 return ret;
2093}
2094
/*
 * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
2103unsigned long shrink_all_memory(unsigned long nr_pages)
2104{
2105 unsigned long lru_pages, nr_slab;
2106 unsigned long ret = 0;
2107 int pass;
2108 struct reclaim_state reclaim_state;
2109 struct scan_control sc = {
2110 .gfp_mask = GFP_KERNEL,
2111 .may_swap = 0,
2112 .swap_cluster_max = nr_pages,
2113 .may_writepage = 1,
2114 .isolate_pages = isolate_pages_global,
2115 };
2116
2117 current->reclaim_state = &reclaim_state;
2118
2119 lru_pages = global_lru_pages();
2120 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2121
2122 while (nr_slab >= lru_pages) {
2123 reclaim_state.reclaimed_slab = 0;
2124 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
2125 if (!reclaim_state.reclaimed_slab)
2126 break;
2127
2128 ret += reclaim_state.reclaimed_slab;
2129 if (ret >= nr_pages)
2130 goto out;
2131
2132 nr_slab -= reclaim_state.reclaimed_slab;
2133 }
2134
	/*
	 * We try to shrink the LRUs in 5 passes:
	 * 0 = Reclaim from inactive_list only
	 * 1 = Reclaim from active list but don't reclaim mapped
	 * 2 = 2nd pass of type 1
	 * 3 = Reclaim mapped (normal reclaim)
	 * 4 = 2nd pass of type 3
	 */
2143 for (pass = 0; pass < 5; pass++) {
2144 int prio;
2145
2146
2147 if (pass > 2)
2148 sc.may_swap = 1;
2149
2150 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2151 unsigned long nr_to_scan = nr_pages - ret;
2152
2153 sc.nr_scanned = 0;
2154 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
2155 if (ret >= nr_pages)
2156 goto out;
2157
2158 reclaim_state.reclaimed_slab = 0;
2159 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2160 global_lru_pages());
2161 ret += reclaim_state.reclaimed_slab;
2162 if (ret >= nr_pages)
2163 goto out;
2164
2165 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2166 congestion_wait(WRITE, HZ / 10);
2167 }
2168 }
2169
2170
2171
2172
2173
2174 if (!ret) {
2175 do {
2176 reclaim_state.reclaimed_slab = 0;
2177 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
2178 ret += reclaim_state.reclaimed_slab;
2179 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
2180 }
2181
2182out:
2183 current->reclaim_state = NULL;
2184
2185 return ret;
2186}
2187#endif
2188
/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
2193static int __devinit cpu_callback(struct notifier_block *nfb,
2194 unsigned long action, void *hcpu)
2195{
2196 int nid;
2197
2198 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2199 for_each_node_state(nid, N_HIGH_MEMORY) {
2200 pg_data_t *pgdat = NODE_DATA(nid);
2201 node_to_cpumask_ptr(mask, pgdat->node_id);
2202
2203 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2204
2205 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2206 }
2207 }
2208 return NOTIFY_OK;
2209}
2210
/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus once they are
 * hot-added.
 */
2215int kswapd_run(int nid)
2216{
2217 pg_data_t *pgdat = NODE_DATA(nid);
2218 int ret = 0;
2219
2220 if (pgdat->kswapd)
2221 return 0;
2222
2223 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
2224 if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		printk(KERN_ERR "Failed to start kswapd on node %d\n", nid);
2228 ret = -1;
2229 }
2230 return ret;
2231}
2232
2233static int __init kswapd_init(void)
2234{
2235 int nid;
2236
2237 swap_setup();
2238 for_each_node_state(nid, N_HIGH_MEMORY)
2239 kswapd_run(nid);
2240 hotcpu_notifier(cpu_callback, 0);
2241 return 0;
2242}
2243
2244module_init(kswapd_init)
2245
2246#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim() when the number of free pages falls below
 * the watermarks.
 */
2253int zone_reclaim_mode __read_mostly;
2254
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
2259

/*
 * Priority for ZONE_RECLAIM.  This determines the fraction of pages
 * of a node that are considered for each zone_reclaim() run.  Priority 4
 * scans 1/16th of a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4
2266
/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;
2272
/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;
2278
/*
 * Try to free up some pages from this zone through reclaim.
 */
2282static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2283{
2284
2285 const unsigned long nr_pages = 1 << order;
2286 struct task_struct *p = current;
2287 struct reclaim_state reclaim_state;
2288 int priority;
2289 struct scan_control sc = {
2290 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2291 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2292 .swap_cluster_max = max_t(unsigned long, nr_pages,
2293 SWAP_CLUSTER_MAX),
2294 .gfp_mask = gfp_mask,
2295 .swappiness = vm_swappiness,
2296 .isolate_pages = isolate_pages_global,
2297 };
2298 unsigned long slab_reclaimable;
2299
2300 disable_swap_token();
2301 cond_resched();
2302
2303
2304
2305
2306
2307 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2308 reclaim_state.reclaimed_slab = 0;
2309 p->reclaim_state = &reclaim_state;
2310
2311 if (zone_page_state(zone, NR_FILE_PAGES) -
2312 zone_page_state(zone, NR_FILE_MAPPED) >
2313 zone->min_unmapped_pages) {
2314
2315
2316
2317
2318 priority = ZONE_RECLAIM_PRIORITY;
2319 do {
2320 note_zone_scanning_priority(zone, priority);
2321 shrink_zone(priority, zone, &sc);
2322 priority--;
2323 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2324 }
2325
2326 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2327 if (slab_reclaimable > zone->min_slab_pages) {
		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone.  So we take the current
		 * number of slab pages and shake the slab until it is reduced
		 * by the same nr_pages that we used for reclaiming unmapped
		 * pages.
		 *
		 * Note that shrink_slab will free memory on all zones and may
		 * take a long time.
		 */
2338 while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
2339 zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
2340 slab_reclaimable - nr_pages)
2341 ;
2342
2343
2344
2345
2346
2347 sc.nr_reclaimed += slab_reclaimable -
2348 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2349 }
2350
2351 p->reclaim_state = NULL;
2352 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2353 return sc.nr_reclaimed >= nr_pages;
2354}
2355
2356int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2357{
2358 int node_id;
2359 int ret;

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O, otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated.  So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
2371 if (zone_page_state(zone, NR_FILE_PAGES) -
2372 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
2373 && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
2374 <= zone->min_slab_pages)
2375 return 0;
2376
2377 if (zone_is_all_unreclaimable(zone))
2378 return 0;
2379
2380
2381
2382
2383 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
2384 return 0;
2385
2386
2387
2388
2389
2390
2391
2392 node_id = zone_to_nid(zone);
2393 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
2394 return 0;
2395
2396 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
2397 return 0;
2398 ret = __zone_reclaim(zone, gfp_mask, order);
2399 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
2400
2401 return ret;
2402}
2403#endif
2404
2405#ifdef CONFIG_UNEVICTABLE_LRU

/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 * @vma: the VMA in which the page is or will be mapped, may be NULL
 *
 * Test whether the page is evictable -- i.e. should be placed on the
 * active/inactive lists vs the unevictable list.  When @vma is non-NULL,
 * it is used to determine whether the page is mapped in a locked (mlocked)
 * VMA.
 *
 * Reasons a page might not be evictable:
 * (1) the page's mapping is marked unevictable
 * (2) the page is part of an mlocked VMA
 */
2420int page_evictable(struct page *page, struct vm_area_struct *vma)
2421{
2422
2423 if (mapping_unevictable(page_mapping(page)))
2424 return 0;
2425
2426 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
2427 return 0;
2428
2429 return 1;
2430}
2431
/*
 * check_move_unevictable_page - check page for evictability and move it to
 * the appropriate zone LRU list
 * @page: page to check evictability and move to appropriate lru list
 * @zone: zone page is in
 *
 * Checks a page for evictability and moves the page to the appropriate
 * zone lru list.
 *
 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
 * have PageUnevictable set.
 */
2443static void check_move_unevictable_page(struct page *page, struct zone *zone)
2444{
2445 VM_BUG_ON(PageActive(page));
2446
2447retry:
2448 ClearPageUnevictable(page);
2449 if (page_evictable(page, NULL)) {
2450 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2451
2452 __dec_zone_state(zone, NR_UNEVICTABLE);
2453 list_move(&page->lru, &zone->lru[l].list);
2454 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
2455 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2456 __count_vm_event(UNEVICTABLE_PGRESCUED);
2457 } else {
2458
2459
2460
2461 SetPageUnevictable(page);
2462 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2463 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
2464 if (page_evictable(page, NULL))
2465 goto retry;
2466 }
2467}
2468
2469
2470
2471
2472
2473
2474
2475
2476void scan_mapping_unevictable_pages(struct address_space *mapping)
2477{
2478 pgoff_t next = 0;
2479 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2480 PAGE_CACHE_SHIFT;
2481 struct zone *zone;
2482 struct pagevec pvec;
2483
2484 if (mapping->nrpages == 0)
2485 return;
2486
2487 pagevec_init(&pvec, 0);
2488 while (next < end &&
2489 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2490 int i;
2491 int pg_scanned = 0;
2492
2493 zone = NULL;
2494
2495 for (i = 0; i < pagevec_count(&pvec); i++) {
2496 struct page *page = pvec.pages[i];
2497 pgoff_t page_index = page->index;
2498 struct zone *pagezone = page_zone(page);
2499
2500 pg_scanned++;
2501 if (page_index > next)
2502 next = page_index;
2503 next++;
2504
2505 if (pagezone != zone) {
2506 if (zone)
2507 spin_unlock_irq(&zone->lru_lock);
2508 zone = pagezone;
2509 spin_lock_irq(&zone->lru_lock);
2510 }
2511
2512 if (PageLRU(page) && PageUnevictable(page))
2513 check_move_unevictable_page(page, zone);
2514 }
2515 if (zone)
2516 spin_unlock_irq(&zone->lru_lock);
2517 pagevec_release(&pvec);
2518
2519 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2520 }
2521
2522}
2523
2524
/*
 * scan_zone_unevictable_pages - check unevictable list for evictable pages
 * @zone: zone of which to scan the unevictable list
 *
 * Scan @zone's unevictable LRU lists to check for pages that have become
 * evictable.  Move those that have to @zone's inactive list where they
 * become candidates for reclaim, unless shrink_inactive_list() decides
 * to reactivate them.  Pages that are still unevictable are rotated
 * back onto @zone's unevictable list.
 */
2534#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL
2535static void scan_zone_unevictable_pages(struct zone *zone)
2536{
2537 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2538 unsigned long scan;
2539 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2540
2541 while (nr_to_scan > 0) {
2542 unsigned long batch_size = min(nr_to_scan,
2543 SCAN_UNEVICTABLE_BATCH_SIZE);
2544
2545 spin_lock_irq(&zone->lru_lock);
2546 for (scan = 0; scan < batch_size; scan++) {
2547 struct page *page = lru_to_page(l_unevictable);
2548
2549 if (!trylock_page(page))
2550 continue;
2551
2552 prefetchw_prev_lru_page(page, l_unevictable, flags);
2553
2554 if (likely(PageLRU(page) && PageUnevictable(page)))
2555 check_move_unevictable_page(page, zone);
2556
2557 unlock_page(page);
2558 }
2559 spin_unlock_irq(&zone->lru_lock);
2560
2561 nr_to_scan -= batch_size;
2562 }
2563}
2564
/*
 * scan_all_zones_unevictable_pages - scan all unevictable lists for
 * evictable pages
 *
 * A really big hammer: scan all zones' unevictable LRU lists to check for
 * pages that have become evictable.  Move those back to the zones'
 * inactive list where they become candidates for reclaim.
 * This occurs when, e.g., we have unswappable pages on the unevictable
 * lists and then add swap to the system.  As such, it runs in the context
 * of a task that has possibly/probably made some previously unevictable
 * pages evictable.
 */
2577static void scan_all_zones_unevictable_pages(void)
2578{
2579 struct zone *zone;
2580
2581 for_each_zone(zone) {
2582 scan_zone_unevictable_pages(zone);
2583 }
2584}
2585
2586
2587
2588
2589
2590unsigned long scan_unevictable_pages;
2591
2592int scan_unevictable_handler(struct ctl_table *table, int write,
2593 struct file *file, void __user *buffer,
2594 size_t *length, loff_t *ppos)
2595{
2596 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2597
2598 if (write && *(unsigned long *)table->data)
2599 scan_all_zones_unevictable_pages();
2600
2601 scan_unevictable_pages = 0;
2602 return 0;
2603}
2604
2605
2606
2607
2608
2609
2610static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2611 struct sysdev_attribute *attr,
2612 char *buf)
2613{
2614 return sprintf(buf, "0\n");
2615}
2616
2617static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2618 struct sysdev_attribute *attr,
2619 const char *buf, size_t count)
2620{
2621 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2622 struct zone *zone;
	unsigned long req;

	if (strict_strtoul(buf, 10, &req))
		return -EINVAL;
	if (!req)
		return 1;	/* zero is no-op */
2628
2629 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2630 if (!populated_zone(zone))
2631 continue;
2632 scan_zone_unevictable_pages(zone);
2633 }
2634 return 1;
2635}
2636
2637
2638static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
2639 read_scan_unevictable_node,
2640 write_scan_unevictable_node);
2641
2642int scan_unevictable_register_node(struct node *node)
2643{
2644 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2645}
2646
2647void scan_unevictable_unregister_node(struct node *node)
2648{
2649 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2650}
2651
2652#endif
2653