/*
 *  linux/mm/vmscan.c
 *
 *  Page reclaim: LRU scanning, slab cache shrinking, kswapd, zone reclaim
 *  and unevictable-page handling.
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;

        /* This context's GFP mask */
        gfp_t gfp_mask;

        int may_writepage;

        /* Can pages be swapped as part of reclaim? */
        int may_swap;

        /*
         * This context's SWAP_CLUSTER_MAX.  If freeing memory for
         * suspend, we effectively ignore SWAP_CLUSTER_MAX and it does
         * not matter that the whole list is scanned at once.
         */
        int swap_cluster_max;

        /* Swappiness (0 .. 100) used for this reclaim context */
        int swappiness;

        int all_unreclaimable;

        /* Allocation order that triggered this reclaim */
        int order;

        /* Which cgroup do we reclaim from (NULL for global reclaim) */
        struct mem_cgroup *mem_cgroup;

        /* Pluggable isolate-pages callback: memory cgroup vs. global LRU */
        unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
                        unsigned long *scanned, int order, int mode,
                        struct zone *z, struct mem_cgroup *mem_cont,
                        int active, int file);
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)                    \
        do {                                                            \
                if ((_page)->lru.prev != _base) {                       \
                        struct page *prev;                              \
                                                                        \
                        prev = lru_to_page(&(_page->lru));              \
                        prefetch(&prev->_field);                        \
                }                                                       \
        } while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)                   \
        do {                                                            \
                if ((_page)->lru.prev != _base) {                       \
                        struct page *prev;                              \
                                                                        \
                        prev = lru_to_page(&(_page->lru));              \
                        prefetchw(&prev->_field);                       \
                }                                                       \
        } while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;    /* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scan_global_lru(sc)     (!(sc)->mem_cgroup)
#else
#define scan_global_lru(sc)     (1)
#endif

/*
 * Add a shrinker callback to be called from the vm.
 */
void register_shrinker(struct shrinker *shrinker)
{
        shrinker->nr = 0;
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
        up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove a shrinker callback.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
        up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);
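
/*
 * Typical usage (a sketch, not code from this file): a cache embeds a
 * struct shrinker, for example
 *
 *      static struct shrinker foo_shrinker = {
 *              .shrink = foo_shrink,   (hypothetical callback)
 *              .seeks  = DEFAULT_SEEKS,
 *      };
 *
 * registers it with register_shrinker(&foo_shrinker) at init time and
 * calls unregister_shrinker(&foo_shrinker) on teardown.  The callback is
 * invoked from shrink_slab() below, first with a nr_to_scan of zero to
 * query the cache size, then with SHRINK_BATCH sized scan requests.
 */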

#define SHRINK_BATCH 128

/*
 * Call the shrink functions to age shrinkable caches.
 *
 * The assumption is that it costs one seek to replace an LRU page and one
 * seek to recreate a cache object, so equal percentages of the LRU and of
 * the ageable caches are aged.
 *
 * @scanned is the number of LRU pages just scanned; @lru_pages is the
 * number of on-LRU pages eligible for the caller's allocation attempt.
 * The ratio of the two decides how much work each registered shrinker is
 * asked to do.  If shrinker_rwsem cannot be taken without blocking, the
 * call simply returns and the caches are aged on a later attempt.
 *
 * Returns the number of slab objects which were reclaimed.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                        unsigned long lru_pages)
{
        struct shrinker *shrinker;
        unsigned long ret = 0;

        if (scanned == 0)
                scanned = SWAP_CLUSTER_MAX;

        if (!down_read_trylock(&shrinker_rwsem))
                return 1;       /* Assume we'll be able to shrink next time */

        list_for_each_entry(shrinker, &shrinker_list, list) {
                unsigned long long delta;
                unsigned long total_scan;
                unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);

                delta = (4 * scanned) / shrinker->seeks;
                delta *= max_pass;
                do_div(delta, lru_pages + 1);
                shrinker->nr += delta;
                if (shrinker->nr < 0) {
                        printk(KERN_ERR "%s: nr=%ld\n",
                                        __func__, shrinker->nr);
                        shrinker->nr = max_pass;
                }

                /*
                 * Avoid risking looping forever due to too large a nr value:
                 * never try to free more than twice the estimated number of
                 * freeable entries.
                 */
                if (shrinker->nr > max_pass * 2)
                        shrinker->nr = max_pass * 2;

                total_scan = shrinker->nr;
                shrinker->nr = 0;

                while (total_scan >= SHRINK_BATCH) {
                        long this_scan = SHRINK_BATCH;
                        int shrink_ret;
                        int nr_before;

                        nr_before = (*shrinker->shrink)(0, gfp_mask);
                        shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
                        if (shrink_ret == -1)
                                break;
                        if (shrink_ret < nr_before)
                                ret += nr_before - shrink_ret;
                        count_vm_events(SLABS_SCANNED, this_scan);
                        total_scan -= this_scan;

                        cond_resched();
                }

                shrinker->nr += total_scan;
        }
        up_read(&shrinker_rwsem);
        return ret;
}

/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
        struct address_space *mapping;

        /* Page is in somebody's page tables. */
        if (page_mapped(page))
                return 1;

        /* Be more reluctant to reclaim swapcache than pagecache */
        if (PageSwapCache(page))
                return 1;

        mapping = page_mapping(page);
        if (!mapping)
                return 0;

        /* File is mmap'd by somebody? */
        return mapping_mapped(mapping);
}

static inline int is_page_cache_freeable(struct page *page)
{
        /*
         * Freeable means the only references are the isolating caller,
         * the page cache, and (optionally) the buffer heads counted via
         * PagePrivate.
         */
        return page_count(page) - !!PagePrivate(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi)
{
        if (current->flags & PF_SWAPWRITE)
                return 1;
        if (!bdi_write_congested(bdi))
                return 1;
        if (bdi == current->backing_dev_info)
                return 1;
        return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space so that a
 * subsequent fsync()/fdatasync() can find out about the error.
 *
 * The page is locked again before the error is recorded so that a page
 * which has been truncated and reused in the meantime is not affected.
 */
static void handle_write_error(struct address_space *mapping,
                                struct page *page, int error)
{
        lock_page(page);
        if (page_mapping(page) == mapping)
                mapping_set_error(mapping, error);
        unlock_page(page);
}

/* Request for synchronous or asynchronous pageout. */
enum pageout_io {
        PAGEOUT_IO_ASYNC,
        PAGEOUT_IO_SYNC,
};

/* possible outcome of pageout() */
typedef enum {
        /* failed to write page out, page is locked */
        PAGE_KEEP,
        /* move page to the active list, page is locked */
        PAGE_ACTIVATE,
        /* page has been sent to the disk successfully, page is unlocked */
        PAGE_SUCCESS,
        /* page is clean and locked */
        PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
                                                enum pageout_io sync_writeback)
{
        /*
         * If the page is dirty, only perform writeback if that write
         * will be non-blocking, to prevent this allocation from being
         * stalled by pagecache activity; may_write_to_queue() decides
         * whether this page's backing queue may be written to at all.
         */
        if (!is_page_cache_freeable(page))
                return PAGE_KEEP;
        if (!mapping) {
                /*
                 * Some data journaling orphaned pages can have
                 * page->mapping == NULL while being dirty with clean buffers.
                 */
                if (PagePrivate(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
                                printk("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
                return PAGE_KEEP;
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
        if (!may_write_to_queue(mapping->backing_dev_info))
                return PAGE_KEEP;

        if (clear_page_dirty_for_io(page)) {
                int res;
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .nr_to_write = SWAP_CLUSTER_MAX,
                        .range_start = 0,
                        .range_end = LLONG_MAX,
                        .nonblocking = 1,
                        .for_reclaim = 1,
                };

                SetPageReclaim(page);
                res = mapping->a_ops->writepage(page, &wbc);
                if (res < 0)
                        handle_write_error(mapping, page, res);
                if (res == AOP_WRITEPAGE_ACTIVATE) {
                        ClearPageReclaim(page);
                        return PAGE_ACTIVATE;
                }

                /*
                 * Wait on writeback if requested to.  This happens when
                 * direct reclaiming a large contiguous area and the first
                 * attempt to free a range of pages fails.
                 */
                if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
                        wait_on_page_writeback(page);

                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
                }
                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }

        return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));

        spin_lock_irq(&mapping->tree_lock);

        /*
         * The non-racy check for a busy page: freeze the reference count
         * at the expected value (one for the isolating caller, one for
         * the page cache) before testing PageDirty.  Testing PageDirty
         * first would allow a concurrent user to dirty the page and drop
         * its reference between the two tests, and that dirty data would
         * be lost when the page is freed.
         */
        if (!page_freeze_refs(page, 2))
                goto cannot_free;

        if (unlikely(PageDirty(page))) {
                page_unfreeze_refs(page, 2);
                goto cannot_free;
        }

        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
                __delete_from_swap_cache(page);
                spin_unlock_irq(&mapping->tree_lock);
                swap_free(swap);
        } else {
                __remove_from_page_cache(page);
                spin_unlock_irq(&mapping->tree_lock);
        }

        return 1;

cannot_free:
        spin_unlock_irq(&mapping->tree_lock);
        return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or
 * if someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref
 * on this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
        if (__remove_mapping(mapping, page)) {
                /*
                 * Unfreezing the refcount with 1 rather than 2 effectively
                 * drops the page cache reference and leaves the page with
                 * just the caller's reference.
                 */
                page_unfreeze_refs(page, 1);
                return 1;
        }
        return 0;
}

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
#ifdef CONFIG_UNEVICTABLE_LRU
void putback_lru_page(struct page *page)
{
        int lru;
        int active = !!TestClearPageActive(page);
        int was_unevictable = PageUnevictable(page);

        VM_BUG_ON(PageLRU(page));

redo:
        ClearPageUnevictable(page);

        if (page_evictable(page, NULL)) {
                /*
                 * For evictable pages, we can use the per-cpu LRU cache.
                 * In the event of a race the worst case is an unevictable
                 * page ending up on an [in]active list, which the scanner
                 * knows how to handle.
                 */
                lru = active + page_is_file_cache(page);
                lru_cache_add_lru(page, lru);
        } else {
                /*
                 * Put unevictable pages directly on the zone's
                 * unevictable list.
                 */
                lru = LRU_UNEVICTABLE;
                add_page_to_unevictable_list(page);
        }
        mem_cgroup_move_lists(page, lru);

        /*
         * The page's status can change while it is being moved.  If a page
         * that has become evictable were left on the unevictable list it
         * would never be freed, so re-check after adding it to the list.
         */
        if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
                if (!isolate_lru_page(page)) {
                        put_page(page);
                        goto redo;
                }
                /*
                 * Someone else dropped this page from the LRU in the
                 * meantime, so it will be freed or put back by them;
                 * nothing to do here.
                 */
        }

        if (was_unevictable && lru != LRU_UNEVICTABLE)
                count_vm_event(UNEVICTABLE_PGRESCUED);
        else if (!was_unevictable && lru == LRU_UNEVICTABLE)
                count_vm_event(UNEVICTABLE_PGCULLED);

        put_page(page);         /* drop ref from isolation */
}

#else /* !CONFIG_UNEVICTABLE_LRU */

void putback_lru_page(struct page *page)
{
        int lru;
        VM_BUG_ON(PageLRU(page));

        lru = !!TestClearPageActive(page) + page_is_file_cache(page);
        lru_cache_add_lru(page, lru);
        mem_cgroup_move_lists(page, lru);
        put_page(page);
}
#endif /* CONFIG_UNEVICTABLE_LRU */

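/*
 * shrink_page_list() returns the number of pages it managed to reclaim.
 * Pages that could not be freed are put back on @page_list and will be
 * returned to the LRU by the caller.
 */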
556static unsigned long shrink_page_list(struct list_head *page_list,
557 struct scan_control *sc,
558 enum pageout_io sync_writeback)
559{
560 LIST_HEAD(ret_pages);
561 struct pagevec freed_pvec;
562 int pgactivate = 0;
563 unsigned long nr_reclaimed = 0;
564
565 cond_resched();
566
567 pagevec_init(&freed_pvec, 1);
568 while (!list_empty(page_list)) {
569 struct address_space *mapping;
570 struct page *page;
571 int may_enter_fs;
572 int referenced;
573
574 cond_resched();
575
576 page = lru_to_page(page_list);
577 list_del(&page->lru);
578
579 if (!trylock_page(page))
580 goto keep;
581
582 VM_BUG_ON(PageActive(page));
583
584 sc->nr_scanned++;
585
586 if (unlikely(!page_evictable(page, NULL)))
587 goto cull_mlocked;
588
589 if (!sc->may_swap && page_mapped(page))
590 goto keep_locked;
591
592
593 if (page_mapped(page) || PageSwapCache(page))
594 sc->nr_scanned++;
595
596 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
597 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
598
599 if (PageWriteback(page)) {
600
601
602
603
604
605
606
607
608 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
609 wait_on_page_writeback(page);
610 else
611 goto keep_locked;
612 }
613
614 referenced = page_referenced(page, 1, sc->mem_cgroup);
615
616 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
617 referenced && page_mapping_inuse(page))
618 goto activate_locked;
619
620#ifdef CONFIG_SWAP
621
622
623
624
625 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked;
628 switch (try_to_munlock(page)) {
629 case SWAP_FAIL:
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ;
636 }
637 if (!add_to_swap(page, GFP_ATOMIC))
638 goto activate_locked;
639 may_enter_fs = 1;
640 }
641#endif
642
643 mapping = page_mapping(page);
644
645
646
647
648
649 if (page_mapped(page) && mapping) {
650 switch (try_to_unmap(page, 0)) {
651 case SWAP_FAIL:
652 goto activate_locked;
653 case SWAP_AGAIN:
654 goto keep_locked;
655 case SWAP_MLOCK:
656 goto cull_mlocked;
657 case SWAP_SUCCESS:
658 ;
659 }
660 }
661
662 if (PageDirty(page)) {
663 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
664 goto keep_locked;
665 if (!may_enter_fs)
666 goto keep_locked;
667 if (!sc->may_writepage)
668 goto keep_locked;
669
670
671 switch (pageout(page, mapping, sync_writeback)) {
672 case PAGE_KEEP:
673 goto keep_locked;
674 case PAGE_ACTIVATE:
675 goto activate_locked;
676 case PAGE_SUCCESS:
677 if (PageWriteback(page) || PageDirty(page))
678 goto keep;
679
680
681
682
683 if (!trylock_page(page))
684 goto keep;
685 if (PageDirty(page) || PageWriteback(page))
686 goto keep_locked;
687 mapping = page_mapping(page);
688 case PAGE_CLEAN:
689 ;
690 }
691 }
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714 if (PagePrivate(page)) {
715 if (!try_to_release_page(page, sc->gfp_mask))
716 goto activate_locked;
717 if (!mapping && page_count(page) == 1) {
718 unlock_page(page);
719 if (put_page_testzero(page))
720 goto free_it;
721 else {
722
723
724
725
726
727
728
729 nr_reclaimed++;
730 continue;
731 }
732 }
733 }
734
735 if (!mapping || !__remove_mapping(mapping, page))
736 goto keep_locked;
737
738
739
740
741
742
743
744
745 __clear_page_locked(page);
746free_it:
747 nr_reclaimed++;
748 if (!pagevec_add(&freed_pvec, page)) {
749 __pagevec_free(&freed_pvec);
750 pagevec_reinit(&freed_pvec);
751 }
752 continue;
753
754cull_mlocked:
755 unlock_page(page);
756 putback_lru_page(page);
757 continue;
758
759activate_locked:
760
761 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page);
763 VM_BUG_ON(PageActive(page));
764 SetPageActive(page);
765 pgactivate++;
766keep_locked:
767 unlock_page(page);
768keep:
769 list_add(&page->lru, &ret_pages);
770 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
771 }
772 list_splice(&ret_pages, page_list);
773 if (pagevec_count(&freed_pvec))
774 __pagevec_free(&freed_pvec);
775 count_vm_events(PGACTIVATE, pgactivate);
776 return nr_reclaimed;
777}
778

/* LRU isolation modes. */
#define ISOLATE_INACTIVE 0      /* Isolate inactive pages. */
#define ISOLATE_ACTIVE 1        /* Isolate active pages. */
#define ISOLATE_BOTH 2          /* Isolate both active and inactive pages. */

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the correct PageActive status.  Pages which are being freed
 * elsewhere are also ignored.
 *
 * page:        page to consider
 * mode:        one of the LRU isolation modes above
 *
 * returns 0 on success, -ve errno on failure.
 */
794int __isolate_lru_page(struct page *page, int mode, int file)
795{
796 int ret = -EINVAL;
797
798
799 if (!PageLRU(page))
800 return ret;
801
802
803
804
805
806
807 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
808 return ret;
809
810 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
811 return ret;
812
813
814
815
816
817
818 if (PageUnevictable(page))
819 return ret;
820
821 ret = -EBUSY;
822 if (likely(get_page_unless_zero(page))) {
823
824
825
826
827
828 ClearPageLRU(page);
829 ret = 0;
830 }
831
832 return ret;
833}
834
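/*
 * isolate_lru_pages - isolate up to @nr_to_scan pages from the LRU list
 * @src onto @dst, using __isolate_lru_page() in the given isolation
 * @mode.  @scanned returns how many pages were examined.  When @order is
 * non-zero, the other pages in the order-aligned block around each taken
 * page are isolated as well, which helps higher-order (lumpy) reclaim.
 *
 * Called with zone->lru_lock held.  Returns the number of pages moved
 * onto @dst.
 */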
855static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
856 struct list_head *src, struct list_head *dst,
857 unsigned long *scanned, int order, int mode, int file)
858{
859 unsigned long nr_taken = 0;
860 unsigned long scan;
861
862 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
863 struct page *page;
864 unsigned long pfn;
865 unsigned long end_pfn;
866 unsigned long page_pfn;
867 int zone_id;
868
869 page = lru_to_page(src);
870 prefetchw_prev_lru_page(page, src, flags);
871
872 VM_BUG_ON(!PageLRU(page));
873
874 switch (__isolate_lru_page(page, mode, file)) {
875 case 0:
876 list_move(&page->lru, dst);
877 nr_taken++;
878 break;
879
880 case -EBUSY:
881
882 list_move(&page->lru, src);
883 continue;
884
885 default:
886 BUG();
887 }
888
889 if (!order)
890 continue;
891
892
893
894
895
896
897
898
899
900
901 zone_id = page_zone_id(page);
902 page_pfn = page_to_pfn(page);
903 pfn = page_pfn & ~((1 << order) - 1);
904 end_pfn = pfn + (1 << order);
905 for (; pfn < end_pfn; pfn++) {
906 struct page *cursor_page;
907
908
909 if (unlikely(pfn == page_pfn))
910 continue;
911
912
913 if (unlikely(!pfn_valid_within(pfn)))
914 break;
915
916 cursor_page = pfn_to_page(pfn);
917
918
919 if (unlikely(page_zone_id(cursor_page) != zone_id))
920 continue;
921 switch (__isolate_lru_page(cursor_page, mode, file)) {
922 case 0:
923 list_move(&cursor_page->lru, dst);
924 nr_taken++;
925 scan++;
926 break;
927
928 case -EBUSY:
929
930 list_move(&cursor_page->lru, src);
931 default:
932 break;
933 }
934 }
935 }
936
937 *scanned = scan;
938 return nr_taken;
939}
940
941static unsigned long isolate_pages_global(unsigned long nr,
942 struct list_head *dst,
943 unsigned long *scanned, int order,
944 int mode, struct zone *z,
945 struct mem_cgroup *mem_cont,
946 int active, int file)
947{
948 int lru = LRU_BASE;
949 if (active)
950 lru += LRU_ACTIVE;
951 if (file)
952 lru += LRU_FILE;
953 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
954 mode, !!file);
955}
956
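/*
 * clear_active_flags() is a helper for shrink_inactive_list(): it clears
 * PageActive on the isolated pages and fills @count with the number of
 * pages taken from each LRU type.  Returns the number of active pages
 * that were cleared.
 */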
961static unsigned long clear_active_flags(struct list_head *page_list,
962 unsigned int *count)
963{
964 int nr_active = 0;
965 int lru;
966 struct page *page;
967
968 list_for_each_entry(page, page_list, lru) {
969 lru = page_is_file_cache(page);
970 if (PageActive(page)) {
971 lru += LRU_ACTIVE;
972 ClearPageActive(page);
973 nr_active++;
974 }
975 count[lru]++;
976 }
977
978 return nr_active;
979}
980
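/*
 * isolate_lru_page - try to isolate a page from its LRU list.
 *
 * Returns 0 if the page was removed from an LRU list with its refcount
 * raised, -EBUSY if the page was not on an LRU list.  The page keeps its
 * PageActive/PageUnevictable flags so the caller can put it back on the
 * right list.  Must be called with interrupts enabled and without
 * zone->lru_lock held (it is taken here).
 */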
1006int isolate_lru_page(struct page *page)
1007{
1008 int ret = -EBUSY;
1009
1010 if (PageLRU(page)) {
1011 struct zone *zone = page_zone(page);
1012
1013 spin_lock_irq(&zone->lru_lock);
1014 if (PageLRU(page) && get_page_unless_zero(page)) {
1015 int lru = page_lru(page);
1016 ret = 0;
1017 ClearPageLRU(page);
1018
1019 del_page_from_lru_list(zone, page, lru);
1020 }
1021 spin_unlock_irq(&zone->lru_lock);
1022 }
1023 return ret;
1024}
1025
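/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the
 * number of reclaimed pages.
 */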
1030static unsigned long shrink_inactive_list(unsigned long max_scan,
1031 struct zone *zone, struct scan_control *sc,
1032 int priority, int file)
1033{
1034 LIST_HEAD(page_list);
1035 struct pagevec pvec;
1036 unsigned long nr_scanned = 0;
1037 unsigned long nr_reclaimed = 0;
1038
1039 pagevec_init(&pvec, 1);
1040
1041 lru_add_drain();
1042 spin_lock_irq(&zone->lru_lock);
1043 do {
1044 struct page *page;
1045 unsigned long nr_taken;
1046 unsigned long nr_scan;
1047 unsigned long nr_freed;
1048 unsigned long nr_active;
1049 unsigned int count[NR_LRU_LISTS] = { 0, };
1050 int mode = ISOLATE_INACTIVE;
1051
1052
1053
1054
1055
1056
1057
1058
1059 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1060 mode = ISOLATE_BOTH;
1061 else if (sc->order && priority < DEF_PRIORITY - 2)
1062 mode = ISOLATE_BOTH;
1063
1064 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1065 &page_list, &nr_scan, sc->order, mode,
1066 zone, sc->mem_cgroup, 0, file);
1067 nr_active = clear_active_flags(&page_list, count);
1068 __count_vm_events(PGDEACTIVATE, nr_active);
1069
1070 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1071 -count[LRU_ACTIVE_FILE]);
1072 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1073 -count[LRU_INACTIVE_FILE]);
1074 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1075 -count[LRU_ACTIVE_ANON]);
1076 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1077 -count[LRU_INACTIVE_ANON]);
1078
1079 if (scan_global_lru(sc)) {
1080 zone->pages_scanned += nr_scan;
1081 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1082 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1083 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1084 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1085 }
1086 spin_unlock_irq(&zone->lru_lock);
1087
1088 nr_scanned += nr_scan;
1089 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
1090
1091
1092
1093
1094
1095
1096
1097 if (nr_freed < nr_taken && !current_is_kswapd() &&
1098 sc->order > PAGE_ALLOC_COSTLY_ORDER) {
1099 congestion_wait(WRITE, HZ/10);
1100
1101
1102
1103
1104
1105 nr_active = clear_active_flags(&page_list, count);
1106 count_vm_events(PGDEACTIVATE, nr_active);
1107
1108 nr_freed += shrink_page_list(&page_list, sc,
1109 PAGEOUT_IO_SYNC);
1110 }
1111
1112 nr_reclaimed += nr_freed;
1113 local_irq_disable();
1114 if (current_is_kswapd()) {
1115 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1116 __count_vm_events(KSWAPD_STEAL, nr_freed);
1117 } else if (scan_global_lru(sc))
1118 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1119
1120 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
1121
1122 if (nr_taken == 0)
1123 goto done;
1124
1125 spin_lock(&zone->lru_lock);
1126
1127
1128
1129 while (!list_empty(&page_list)) {
1130 int lru;
1131 page = lru_to_page(&page_list);
1132 VM_BUG_ON(PageLRU(page));
1133 list_del(&page->lru);
1134 if (unlikely(!page_evictable(page, NULL))) {
1135 spin_unlock_irq(&zone->lru_lock);
1136 putback_lru_page(page);
1137 spin_lock_irq(&zone->lru_lock);
1138 continue;
1139 }
1140 SetPageLRU(page);
1141 lru = page_lru(page);
1142 add_page_to_lru_list(zone, page, lru);
1143 mem_cgroup_move_lists(page, lru);
1144 if (PageActive(page) && scan_global_lru(sc)) {
1145 int file = !!page_is_file_cache(page);
1146 zone->recent_rotated[file]++;
1147 }
1148 if (!pagevec_add(&pvec, page)) {
1149 spin_unlock_irq(&zone->lru_lock);
1150 __pagevec_release(&pvec);
1151 spin_lock_irq(&zone->lru_lock);
1152 }
1153 }
1154 } while (nr_scanned < max_scan);
1155 spin_unlock(&zone->lru_lock);
1156done:
1157 local_irq_enable();
1158 pagevec_release(&pvec);
1159 return nr_reclaimed;
1160}
1161
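/*
 * We are about to scan this zone at a certain priority level.  If that
 * priority is more urgent (numerically lower) than the zone's previous
 * priority, record it so that other reclaimers can see how hard this
 * zone is being scanned.
 */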
1170static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1171{
1172 if (priority < zone->prev_priority)
1173 zone->prev_priority = priority;
1174}
1175
1176static inline int zone_is_near_oom(struct zone *zone)
1177{
1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1179}
1180
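/*
 * shrink_active_list() moves pages from the active list to the inactive
 * list (or to the unevictable list if they are no longer evictable).
 * Mapped pages that were recently referenced are counted into
 * zone->recent_rotated[], which feeds back into get_scan_ratio() below.
 *
 * zone->lru_lock is only held across the list manipulation, not across
 * the page_referenced() checks.
 */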
1200static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1201 struct scan_control *sc, int priority, int file)
1202{
1203 unsigned long pgmoved;
1204 int pgdeactivate = 0;
1205 unsigned long pgscanned;
1206 LIST_HEAD(l_hold);
1207 LIST_HEAD(l_inactive);
1208 struct page *page;
1209 struct pagevec pvec;
1210 enum lru_list lru;
1211
1212 lru_add_drain();
1213 spin_lock_irq(&zone->lru_lock);
1214 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1215 ISOLATE_ACTIVE, zone,
1216 sc->mem_cgroup, 1, file);
1217
1218
1219
1220
1221 if (scan_global_lru(sc)) {
1222 zone->pages_scanned += pgscanned;
1223 zone->recent_scanned[!!file] += pgmoved;
1224 }
1225
1226 if (file)
1227 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1228 else
1229 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1230 spin_unlock_irq(&zone->lru_lock);
1231
1232 pgmoved = 0;
1233 while (!list_empty(&l_hold)) {
1234 cond_resched();
1235 page = lru_to_page(&l_hold);
1236 list_del(&page->lru);
1237
1238 if (unlikely(!page_evictable(page, NULL))) {
1239 putback_lru_page(page);
1240 continue;
1241 }
1242
1243
1244 if (page_mapping_inuse(page) &&
1245 page_referenced(page, 0, sc->mem_cgroup))
1246 pgmoved++;
1247
1248 list_add(&page->lru, &l_inactive);
1249 }
1250
1251 spin_lock_irq(&zone->lru_lock);
1252
1253
1254
1255
1256
1257
1258 zone->recent_rotated[!!file] += pgmoved;
1259
1260
1261
1262
1263 pagevec_init(&pvec, 1);
1264
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267 while (!list_empty(&l_inactive)) {
1268 page = lru_to_page(&l_inactive);
1269 prefetchw_prev_lru_page(page, &l_inactive, flags);
1270 VM_BUG_ON(PageLRU(page));
1271 SetPageLRU(page);
1272 VM_BUG_ON(!PageActive(page));
1273 ClearPageActive(page);
1274
1275 list_move(&page->lru, &zone->lru[lru].list);
1276 mem_cgroup_move_lists(page, lru);
1277 pgmoved++;
1278 if (!pagevec_add(&pvec, page)) {
1279 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1280 spin_unlock_irq(&zone->lru_lock);
1281 pgdeactivate += pgmoved;
1282 pgmoved = 0;
1283 if (buffer_heads_over_limit)
1284 pagevec_strip(&pvec);
1285 __pagevec_release(&pvec);
1286 spin_lock_irq(&zone->lru_lock);
1287 }
1288 }
1289 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1290 pgdeactivate += pgmoved;
1291 if (buffer_heads_over_limit) {
1292 spin_unlock_irq(&zone->lru_lock);
1293 pagevec_strip(&pvec);
1294 spin_lock_irq(&zone->lru_lock);
1295 }
1296 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1297 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1298 spin_unlock_irq(&zone->lru_lock);
1299 if (vm_swap_full())
1300 pagevec_swap_free(&pvec);
1301
1302 pagevec_release(&pvec);
1303}
1304
1305static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1306 struct zone *zone, struct scan_control *sc, int priority)
1307{
1308 int file = is_file_lru(lru);
1309
1310 if (lru == LRU_ACTIVE_FILE) {
1311 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1312 return 0;
1313 }
1314
1315 if (lru == LRU_ACTIVE_ANON &&
1316 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1317 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1318 return 0;
1319 }
1320 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1321}
1322
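/*
 * get_scan_ratio() determines how aggressively the anon and file LRU
 * lists should be scanned.  percent[0] is the percentage of pressure to
 * put on anonymous (swap-backed) memory, percent[1] the percentage for
 * file-backed memory.  The split is based on each list's size and on the
 * recent_scanned/recent_rotated ratios maintained by the reclaim code,
 * weighted by sc->swappiness.
 */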
1332static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1333 unsigned long *percent)
1334{
1335 unsigned long anon, file, free;
1336 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp;
1338
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344
1345
1346 if (nr_swap_pages <= 0) {
1347 percent[0] = 0;
1348 percent[1] = 100;
1349 return;
1350 }
1351
1352
1353 if (unlikely(file + free <= zone->pages_high)) {
1354 percent[0] = 100;
1355 percent[1] = 0;
1356 return;
1357 }
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370 if (unlikely(zone->recent_scanned[0] > anon / 4)) {
1371 spin_lock_irq(&zone->lru_lock);
1372 zone->recent_scanned[0] /= 2;
1373 zone->recent_rotated[0] /= 2;
1374 spin_unlock_irq(&zone->lru_lock);
1375 }
1376
1377 if (unlikely(zone->recent_scanned[1] > file / 4)) {
1378 spin_lock_irq(&zone->lru_lock);
1379 zone->recent_scanned[1] /= 2;
1380 zone->recent_rotated[1] /= 2;
1381 spin_unlock_irq(&zone->lru_lock);
1382 }
1383
1384
1385
1386
1387
1388 anon_prio = sc->swappiness;
1389 file_prio = 200 - sc->swappiness;
1390
1391
1392
1393
1394
1395
1396 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
1397 ap /= zone->recent_rotated[0] + 1;
1398
1399 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
1400 fp /= zone->recent_rotated[1] + 1;
1401
1402
1403 percent[0] = 100 * ap / (ap + fp + 1);
1404 percent[1] = 100 - percent[0];
1405}
1406
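/*
 * shrink_zone() is a basic per-zone page freer.  It is used by both
 * kswapd and direct reclaim.
 */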
1411static unsigned long shrink_zone(int priority, struct zone *zone,
1412 struct scan_control *sc)
1413{
1414 unsigned long nr[NR_LRU_LISTS];
1415 unsigned long nr_to_scan;
1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2];
1418 enum lru_list l;
1419
1420 get_scan_ratio(zone, sc, percent);
1421
1422 for_each_evictable_lru(l) {
1423 if (scan_global_lru(sc)) {
1424 int file = is_file_lru(l);
1425 int scan;
1426
1427 scan = zone_page_state(zone, NR_LRU_BASE + l);
1428 if (priority) {
1429 scan >>= priority;
1430 scan = (scan * percent[file]) / 100;
1431 }
1432 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max)
1435 zone->lru[l].nr_scan = 0;
1436 else
1437 nr[l] = 0;
1438 } else {
1439
1440
1441
1442
1443
1444 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1445 priority, l);
1446 }
1447 }
1448
1449 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1450 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) {
1452 if (nr[l]) {
1453 nr_to_scan = min(nr[l],
1454 (unsigned long)sc->swap_cluster_max);
1455 nr[l] -= nr_to_scan;
1456
1457 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority);
1459 }
1460 }
1461 }
1462
1463
1464
1465
1466
1467 if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
1468 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1469 else if (!scan_global_lru(sc))
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471
1472 throttle_vm_writeout(sc->gfp_mask);
1473 return nr_reclaimed;
1474}
1475
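/*
 * shrink_zones() is the direct reclaim path, for page-allocating
 * processes.  Only zones that can satisfy the caller's allocation request
 * (the zonelist) are scanned; zones the current cpuset does not allow are
 * skipped, and zones already marked all-unreclaimable are skipped except
 * at DEF_PRIORITY.  sc->all_unreclaimable is cleared as soon as one
 * scannable zone is found.
 */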
1492static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1493 struct scan_control *sc)
1494{
1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1496 unsigned long nr_reclaimed = 0;
1497 struct zoneref *z;
1498 struct zone *zone;
1499
1500 sc->all_unreclaimable = 1;
1501 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1502 if (!populated_zone(zone))
1503 continue;
1504
1505
1506
1507
1508 if (scan_global_lru(sc)) {
1509 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1510 continue;
1511 note_zone_scanning_priority(zone, priority);
1512
1513 if (zone_is_all_unreclaimable(zone) &&
1514 priority != DEF_PRIORITY)
1515 continue;
1516 sc->all_unreclaimable = 0;
1517 } else {
1518
1519
1520
1521
1522 sc->all_unreclaimable = 0;
1523 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1524 priority);
1525 }
1526
1527 nr_reclaimed += shrink_zone(priority, zone, sc);
1528 }
1529
1530 return nr_reclaimed;
1531}
1532
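/*
 * do_try_to_free_pages() is the main entry point into page reclaim for
 * both global and memory-cgroup reclaim.  It walks the priority levels
 * from DEF_PRIORITY down to 0, shrinking zones and (for global reclaim)
 * the slab caches, until sc->swap_cluster_max pages have been reclaimed
 * or the priorities are exhausted.  Returns the number of reclaimed pages
 * when the target was met or progress was made on the global LRUs, 0
 * otherwise.
 */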
1549static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1550 struct scan_control *sc)
1551{
1552 int priority;
1553 unsigned long ret = 0;
1554 unsigned long total_scanned = 0;
1555 unsigned long nr_reclaimed = 0;
1556 struct reclaim_state *reclaim_state = current->reclaim_state;
1557 unsigned long lru_pages = 0;
1558 struct zoneref *z;
1559 struct zone *zone;
1560 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1561
1562 delayacct_freepages_start();
1563
1564 if (scan_global_lru(sc))
1565 count_vm_event(ALLOCSTALL);
1566
1567
1568
1569 if (scan_global_lru(sc)) {
1570 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1571
1572 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1573 continue;
1574
1575 lru_pages += zone_lru_pages(zone);
1576 }
1577 }
1578
1579 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1580 sc->nr_scanned = 0;
1581 if (!priority)
1582 disable_swap_token();
1583 nr_reclaimed += shrink_zones(priority, zonelist, sc);
1584
1585
1586
1587
1588 if (scan_global_lru(sc)) {
1589 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1590 if (reclaim_state) {
1591 nr_reclaimed += reclaim_state->reclaimed_slab;
1592 reclaim_state->reclaimed_slab = 0;
1593 }
1594 }
1595 total_scanned += sc->nr_scanned;
1596 if (nr_reclaimed >= sc->swap_cluster_max) {
1597 ret = nr_reclaimed;
1598 goto out;
1599 }
1600
1601
1602
1603
1604
1605
1606
1607
1608 if (total_scanned > sc->swap_cluster_max +
1609 sc->swap_cluster_max / 2) {
1610 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1611 sc->may_writepage = 1;
1612 }
1613
1614
1615 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1616 congestion_wait(WRITE, HZ/10);
1617 }
1618
1619 if (!sc->all_unreclaimable && scan_global_lru(sc))
1620 ret = nr_reclaimed;
1621out:
1622
1623
1624
1625
1626
1627
1628
1629 if (priority < 0)
1630 priority = 0;
1631
1632 if (scan_global_lru(sc)) {
1633 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1634
1635 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1636 continue;
1637
1638 zone->prev_priority = priority;
1639 }
1640 } else
1641 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1642
1643 delayacct_freepages_end();
1644
1645 return ret;
1646}
1647
1648unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1649 gfp_t gfp_mask)
1650{
1651 struct scan_control sc = {
1652 .gfp_mask = gfp_mask,
1653 .may_writepage = !laptop_mode,
1654 .swap_cluster_max = SWAP_CLUSTER_MAX,
1655 .may_swap = 1,
1656 .swappiness = vm_swappiness,
1657 .order = order,
1658 .mem_cgroup = NULL,
1659 .isolate_pages = isolate_pages_global,
1660 };
1661
1662 return do_try_to_free_pages(zonelist, &sc);
1663}
1664
1665#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1666
1667unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1668 gfp_t gfp_mask)
1669{
1670 struct scan_control sc = {
1671 .may_writepage = !laptop_mode,
1672 .may_swap = 1,
1673 .swap_cluster_max = SWAP_CLUSTER_MAX,
1674 .swappiness = vm_swappiness,
1675 .order = 0,
1676 .mem_cgroup = mem_cont,
1677 .isolate_pages = mem_cgroup_isolate_pages,
1678 };
1679 struct zonelist *zonelist;
1680
1681 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1682 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1683 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1684 return do_try_to_free_pages(zonelist, &sc);
1685}
1686#endif
1687
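/*
 * For kswapd, balance_pgdat() reclaims pages across this node's zones
 * until they are all at their pages_high watermark.  The highest zone
 * below its watermark is located first, then all zones up to it are
 * scanned, and the whole pass is restarted (loop_again) until every zone
 * is balanced.  Returns the number of pages which were actually freed.
 */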
1709static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1710{
1711 int all_zones_ok;
1712 int priority;
1713 int i;
1714 unsigned long total_scanned;
1715 unsigned long nr_reclaimed;
1716 struct reclaim_state *reclaim_state = current->reclaim_state;
1717 struct scan_control sc = {
1718 .gfp_mask = GFP_KERNEL,
1719 .may_swap = 1,
1720 .swap_cluster_max = SWAP_CLUSTER_MAX,
1721 .swappiness = vm_swappiness,
1722 .order = order,
1723 .mem_cgroup = NULL,
1724 .isolate_pages = isolate_pages_global,
1725 };
1726
1727
1728
1729
1730 int temp_priority[MAX_NR_ZONES];
1731
1732loop_again:
1733 total_scanned = 0;
1734 nr_reclaimed = 0;
1735 sc.may_writepage = !laptop_mode;
1736 count_vm_event(PAGEOUTRUN);
1737
1738 for (i = 0; i < pgdat->nr_zones; i++)
1739 temp_priority[i] = DEF_PRIORITY;
1740
1741 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1742 int end_zone = 0;
1743 unsigned long lru_pages = 0;
1744
1745
1746 if (!priority)
1747 disable_swap_token();
1748
1749 all_zones_ok = 1;
1750
1751
1752
1753
1754
1755 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1756 struct zone *zone = pgdat->node_zones + i;
1757
1758 if (!populated_zone(zone))
1759 continue;
1760
1761 if (zone_is_all_unreclaimable(zone) &&
1762 priority != DEF_PRIORITY)
1763 continue;
1764
1765
1766
1767
1768
1769 if (inactive_anon_is_low(zone))
1770 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1771 &sc, priority, 0);
1772
1773 if (!zone_watermark_ok(zone, order, zone->pages_high,
1774 0, 0)) {
1775 end_zone = i;
1776 break;
1777 }
1778 }
1779 if (i < 0)
1780 goto out;
1781
1782 for (i = 0; i <= end_zone; i++) {
1783 struct zone *zone = pgdat->node_zones + i;
1784
1785 lru_pages += zone_lru_pages(zone);
1786 }
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797 for (i = 0; i <= end_zone; i++) {
1798 struct zone *zone = pgdat->node_zones + i;
1799 int nr_slab;
1800
1801 if (!populated_zone(zone))
1802 continue;
1803
1804 if (zone_is_all_unreclaimable(zone) &&
1805 priority != DEF_PRIORITY)
1806 continue;
1807
1808 if (!zone_watermark_ok(zone, order, zone->pages_high,
1809 end_zone, 0))
1810 all_zones_ok = 0;
1811 temp_priority[i] = priority;
1812 sc.nr_scanned = 0;
1813 note_zone_scanning_priority(zone, priority);
1814
1815
1816
1817
1818 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1819 end_zone, 0))
1820 nr_reclaimed += shrink_zone(priority, zone, &sc);
1821 reclaim_state->reclaimed_slab = 0;
1822 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1823 lru_pages);
1824 nr_reclaimed += reclaim_state->reclaimed_slab;
1825 total_scanned += sc.nr_scanned;
1826 if (zone_is_all_unreclaimable(zone))
1827 continue;
1828 if (nr_slab == 0 && zone->pages_scanned >=
1829 (zone_lru_pages(zone) * 6))
1830 zone_set_flag(zone,
1831 ZONE_ALL_UNRECLAIMABLE);
1832
1833
1834
1835
1836
1837 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1838 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1839 sc.may_writepage = 1;
1840 }
1841 if (all_zones_ok)
1842 break;
1843
1844
1845
1846
1847 if (total_scanned && priority < DEF_PRIORITY - 2)
1848 congestion_wait(WRITE, HZ/10);
1849
1850
1851
1852
1853
1854
1855
1856 if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1857 break;
1858 }
1859out:
1860
1861
1862
1863
1864
1865 for (i = 0; i < pgdat->nr_zones; i++) {
1866 struct zone *zone = pgdat->node_zones + i;
1867
1868 zone->prev_priority = temp_priority[i];
1869 }
1870 if (!all_zones_ok) {
1871 cond_resched();
1872
1873 try_to_freeze();
1874
1875 goto loop_again;
1876 }
1877
1878 return nr_reclaimed;
1879}
1880
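/*
 * kswapd is the background pageout daemon, started as a kernel thread
 * per NUMA node by kswapd_run().  It sleeps on pgdat->kswapd_wait and,
 * when woken by wakeup_kswapd(), calls balance_pgdat() to bring the
 * node's zones back above their watermarks.
 */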
1894static int kswapd(void *p)
1895{
1896 unsigned long order;
1897 pg_data_t *pgdat = (pg_data_t*)p;
1898 struct task_struct *tsk = current;
1899 DEFINE_WAIT(wait);
1900 struct reclaim_state reclaim_state = {
1901 .reclaimed_slab = 0,
1902 };
1903 node_to_cpumask_ptr(cpumask, pgdat->node_id);
1904
1905 if (!cpus_empty(*cpumask))
1906 set_cpus_allowed_ptr(tsk, cpumask);
1907 current->reclaim_state = &reclaim_state;
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1922 set_freezable();
1923
1924 order = 0;
1925 for ( ; ; ) {
1926 unsigned long new_order;
1927
1928 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1929 new_order = pgdat->kswapd_max_order;
1930 pgdat->kswapd_max_order = 0;
1931 if (order < new_order) {
1932
1933
1934
1935
1936 order = new_order;
1937 } else {
1938 if (!freezing(current))
1939 schedule();
1940
1941 order = pgdat->kswapd_max_order;
1942 }
1943 finish_wait(&pgdat->kswapd_wait, &wait);
1944
1945 if (!try_to_freeze()) {
1946
1947
1948
1949 balance_pgdat(pgdat, order);
1950 }
1951 }
1952 return 0;
1953}
1954
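/*
 * A zone is low on free memory: wake its node's kswapd, unless the zone
 * is already above its pages_low watermark or the calling task is not
 * allowed to allocate from it.
 */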
1958void wakeup_kswapd(struct zone *zone, int order)
1959{
1960 pg_data_t *pgdat;
1961
1962 if (!populated_zone(zone))
1963 return;
1964
1965 pgdat = zone->zone_pgdat;
1966 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1967 return;
1968 if (pgdat->kswapd_max_order < order)
1969 pgdat->kswapd_max_order = order;
1970 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1971 return;
1972 if (!waitqueue_active(&pgdat->kswapd_wait))
1973 return;
1974 wake_up_interruptible(&pgdat->kswapd_wait);
1975}
1976
1977unsigned long global_lru_pages(void)
1978{
1979 return global_page_state(NR_ACTIVE_ANON)
1980 + global_page_state(NR_ACTIVE_FILE)
1981 + global_page_state(NR_INACTIVE_ANON)
1982 + global_page_state(NR_INACTIVE_FILE);
1983}
1984
1985#ifdef CONFIG_PM
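
/*
 * Helper for shrink_all_memory(): scan the evictable LRU lists of every
 * populated zone at priority @prio, trying to reclaim up to @nr_pages
 * pages.  On pass 0 the active lists are skipped so that easily
 * reclaimable inactive pages go first.  Returns the number of pages
 * reclaimed.
 */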
1993static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1994 int pass, struct scan_control *sc)
1995{
1996 struct zone *zone;
1997 unsigned long nr_to_scan, ret = 0;
1998 enum lru_list l;
1999
2000 for_each_zone(zone) {
2001
2002 if (!populated_zone(zone))
2003 continue;
2004
2005 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2006 continue;
2007
2008 for_each_evictable_lru(l) {
2009
2010 if (pass == 0 &&
2011 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
2012 continue;
2013
2014 zone->lru[l].nr_scan +=
2015 (zone_page_state(zone, NR_LRU_BASE + l)
2016 >> prio) + 1;
2017 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2018 zone->lru[l].nr_scan = 0;
2019 nr_to_scan = min(nr_pages,
2020 zone_page_state(zone,
2021 NR_LRU_BASE + l));
2022 ret += shrink_list(l, nr_to_scan, zone,
2023 sc, prio);
2024 if (ret >= nr_pages)
2025 return ret;
2026 }
2027 }
2028 }
2029
2030 return ret;
2031}
2032
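/*
 * shrink_all_memory() tries to free `nr_pages' of memory, system wide,
 * and returns the number of freed pages.  It is built only under
 * CONFIG_PM for the suspend/hibernate path and, unlike normal reclaim,
 * attacks reclaimable slab first and only then the LRU lists, with
 * increasingly aggressive passes.
 */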
2041unsigned long shrink_all_memory(unsigned long nr_pages)
2042{
2043 unsigned long lru_pages, nr_slab;
2044 unsigned long ret = 0;
2045 int pass;
2046 struct reclaim_state reclaim_state;
2047 struct scan_control sc = {
2048 .gfp_mask = GFP_KERNEL,
2049 .may_swap = 0,
2050 .swap_cluster_max = nr_pages,
2051 .may_writepage = 1,
2052 .swappiness = vm_swappiness,
2053 .isolate_pages = isolate_pages_global,
2054 };
2055
2056 current->reclaim_state = &reclaim_state;
2057
2058 lru_pages = global_lru_pages();
2059 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2060
2061 while (nr_slab >= lru_pages) {
2062 reclaim_state.reclaimed_slab = 0;
2063 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
2064 if (!reclaim_state.reclaimed_slab)
2065 break;
2066
2067 ret += reclaim_state.reclaimed_slab;
2068 if (ret >= nr_pages)
2069 goto out;
2070
2071 nr_slab -= reclaim_state.reclaimed_slab;
2072 }
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082 for (pass = 0; pass < 5; pass++) {
2083 int prio;
2084
2085
2086 if (pass > 2) {
2087 sc.may_swap = 1;
2088 sc.swappiness = 100;
2089 }
2090
2091 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2092 unsigned long nr_to_scan = nr_pages - ret;
2093
2094 sc.nr_scanned = 0;
2095 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
2096 if (ret >= nr_pages)
2097 goto out;
2098
2099 reclaim_state.reclaimed_slab = 0;
2100 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2101 global_lru_pages());
2102 ret += reclaim_state.reclaimed_slab;
2103 if (ret >= nr_pages)
2104 goto out;
2105
2106 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2107 congestion_wait(WRITE, HZ / 10);
2108 }
2109 }
2110
2111
2112
2113
2114
2115 if (!ret) {
2116 do {
2117 reclaim_state.reclaimed_slab = 0;
2118 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
2119 ret += reclaim_state.reclaimed_slab;
2120 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
2121 }
2122
2123out:
2124 current->reclaim_state = NULL;
2125
2126 return ret;
2127}
2128#endif
2129
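/*
 * It is preferable to keep each node's kswapd on the CPUs of that node,
 * but it is not required for correctness.  When CPUs come online, this
 * hotplug callback restores the cpumask binding of every node's kswapd.
 */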
2134static int __devinit cpu_callback(struct notifier_block *nfb,
2135 unsigned long action, void *hcpu)
2136{
2137 int nid;
2138
2139 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2140 for_each_node_state(nid, N_HIGH_MEMORY) {
2141 pg_data_t *pgdat = NODE_DATA(nid);
2142 node_to_cpumask_ptr(mask, pgdat->node_id);
2143
2144 if (any_online_cpu(*mask) < nr_cpu_ids)
2145
2146 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2147 }
2148 }
2149 return NOTIFY_OK;
2150}
2151
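/*
 * kswapd_run() starts a kswapd thread for node @nid.  It is called at
 * boot from kswapd_init() and, on memory-hotplug kernels, when a node
 * gains memory.
 */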
2156int kswapd_run(int nid)
2157{
2158 pg_data_t *pgdat = NODE_DATA(nid);
2159 int ret = 0;
2160
2161 if (pgdat->kswapd)
2162 return 0;
2163
2164 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
2165 if (IS_ERR(pgdat->kswapd)) {
2166
2167 BUG_ON(system_state == SYSTEM_BOOTING);
2168 printk("Failed to start kswapd on node %d\n",nid);
2169 ret = -1;
2170 }
2171 return ret;
2172}
2173
2174static int __init kswapd_init(void)
2175{
2176 int nid;
2177
2178 swap_setup();
2179 for_each_node_state(nid, N_HIGH_MEMORY)
2180 kswapd_run(nid);
2181 hotcpu_notifier(cpu_callback, 0);
2182 return 0;
2183}
2184
2185module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode.
 *
 * If non-zero, call zone_reclaim() when the number of free pages falls
 * below the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)     /* Run page reclaim in the zone */
#define RECLAIM_WRITE (1<<1)    /* Allow writing out dirty pages */
#define RECLAIM_SWAP (1<<2)     /* Allow unmapping/swapping pages */

/*
 * Priority for ZONE_RECLAIM.  This determines the fraction of pages of a
 * zone that is scanned per attempt: 4 scans 1/16th of the zone.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * If the number of file pages that are not mapped exceeds this fraction
 * of the zone, zone_reclaim() will try to reclaim them.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If reclaimable slab pages exceed this fraction of the zone, slab
 * reclaim is performed by zone_reclaim().
 */
int sysctl_min_slab_ratio = 5;
2219
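/*
 * __zone_reclaim() does the actual work of zone_reclaim(): try to free
 * some pages from this zone through page reclaim and, if allowed, slab
 * reclaim, without going through the full direct reclaim path.
 */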
2223static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2224{
2225
2226 const unsigned long nr_pages = 1 << order;
2227 struct task_struct *p = current;
2228 struct reclaim_state reclaim_state;
2229 int priority;
2230 unsigned long nr_reclaimed = 0;
2231 struct scan_control sc = {
2232 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2233 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2234 .swap_cluster_max = max_t(unsigned long, nr_pages,
2235 SWAP_CLUSTER_MAX),
2236 .gfp_mask = gfp_mask,
2237 .swappiness = vm_swappiness,
2238 .isolate_pages = isolate_pages_global,
2239 };
2240 unsigned long slab_reclaimable;
2241
2242 disable_swap_token();
2243 cond_resched();
2244
2245
2246
2247
2248
2249 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2250 reclaim_state.reclaimed_slab = 0;
2251 p->reclaim_state = &reclaim_state;
2252
2253 if (zone_page_state(zone, NR_FILE_PAGES) -
2254 zone_page_state(zone, NR_FILE_MAPPED) >
2255 zone->min_unmapped_pages) {
2256
2257
2258
2259
2260 priority = ZONE_RECLAIM_PRIORITY;
2261 do {
2262 note_zone_scanning_priority(zone, priority);
2263 nr_reclaimed += shrink_zone(priority, zone, &sc);
2264 priority--;
2265 } while (priority >= 0 && nr_reclaimed < nr_pages);
2266 }
2267
2268 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2269 if (slab_reclaimable > zone->min_slab_pages) {
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280 while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
2281 zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
2282 slab_reclaimable - nr_pages)
2283 ;
2284
2285
2286
2287
2288
2289 nr_reclaimed += slab_reclaimable -
2290 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2291 }
2292
2293 p->reclaim_state = NULL;
2294 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2295 return nr_reclaimed >= nr_pages;
2296}
2297
2298int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2299{
2300 int node_id;
2301 int ret;
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313 if (zone_page_state(zone, NR_FILE_PAGES) -
2314 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
2315 && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
2316 <= zone->min_slab_pages)
2317 return 0;
2318
2319 if (zone_is_all_unreclaimable(zone))
2320 return 0;
2321
2322
2323
2324
2325 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
2326 return 0;
2327
2328
2329
2330
2331
2332
2333
2334 node_id = zone_to_nid(zone);
2335 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
2336 return 0;
2337
2338 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
2339 return 0;
2340 ret = __zone_reclaim(zone, gfp_mask, order);
2341 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
2342
2343 return ret;
2344}
2345#endif
2346
2347#ifdef CONFIG_UNEVICTABLE_LRU
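
/*
 * page_evictable - test whether a page is evictable, i.e. whether it
 * should live on the active/inactive LRU lists rather than on the
 * unevictable list.
 *
 * @vma may be NULL; when non-NULL it is the VMA the page is (or will be)
 * mapped into, and is used to detect mlocked VMAs in the fault path.
 *
 * A page is unevictable if its mapping is marked unevictable or if it is
 * part of an mlocked VMA.
 */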
2362int page_evictable(struct page *page, struct vm_area_struct *vma)
2363{
2364
2365 if (mapping_unevictable(page_mapping(page)))
2366 return 0;
2367
2368 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
2369 return 0;
2370
2371 return 1;
2372}
2373
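/*
 * check_move_unevictable_page() rechecks an unevictable page and, if it
 * has become evictable, moves it from the zone's unevictable list to the
 * appropriate inactive LRU list.  Called with zone->lru_lock held.
 */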
2385static void check_move_unevictable_page(struct page *page, struct zone *zone)
2386{
2387 VM_BUG_ON(PageActive(page));
2388
2389retry:
2390 ClearPageUnevictable(page);
2391 if (page_evictable(page, NULL)) {
2392 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2393
2394 __dec_zone_state(zone, NR_UNEVICTABLE);
2395 list_move(&page->lru, &zone->lru[l].list);
2396 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2397 __count_vm_event(UNEVICTABLE_PGRESCUED);
2398 } else {
2399
2400
2401
2402 SetPageUnevictable(page);
2403 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2404 if (page_evictable(page, NULL))
2405 goto retry;
2406 }
2407}
2408
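/*
 * scan_mapping_unevictable_pages() scans all pages in @mapping and moves
 * any that sit on an unevictable list but have become evictable back to
 * the appropriate LRU list, typically after the mapping has stopped
 * being unevictable.
 */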
2416void scan_mapping_unevictable_pages(struct address_space *mapping)
2417{
2418 pgoff_t next = 0;
2419 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2420 PAGE_CACHE_SHIFT;
2421 struct zone *zone;
2422 struct pagevec pvec;
2423
2424 if (mapping->nrpages == 0)
2425 return;
2426
2427 pagevec_init(&pvec, 0);
2428 while (next < end &&
2429 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2430 int i;
2431 int pg_scanned = 0;
2432
2433 zone = NULL;
2434
2435 for (i = 0; i < pagevec_count(&pvec); i++) {
2436 struct page *page = pvec.pages[i];
2437 pgoff_t page_index = page->index;
2438 struct zone *pagezone = page_zone(page);
2439
2440 pg_scanned++;
2441 if (page_index > next)
2442 next = page_index;
2443 next++;
2444
2445 if (pagezone != zone) {
2446 if (zone)
2447 spin_unlock_irq(&zone->lru_lock);
2448 zone = pagezone;
2449 spin_lock_irq(&zone->lru_lock);
2450 }
2451
2452 if (PageLRU(page) && PageUnevictable(page))
2453 check_move_unevictable_page(page, zone);
2454 }
2455 if (zone)
2456 spin_unlock_irq(&zone->lru_lock);
2457 pagevec_release(&pvec);
2458
2459 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2460 }
2461
2462}
2463
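/*
 * scan_zone_unevictable_pages() walks a zone's unevictable LRU list in
 * batches of SCAN_UNEVICTABLE_BATCH_SIZE pages and moves any page that
 * has become evictable onto the appropriate LRU list.
 */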
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL
2475void scan_zone_unevictable_pages(struct zone *zone)
2476{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan;
2479 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2480
2481 while (nr_to_scan > 0) {
2482 unsigned long batch_size = min(nr_to_scan,
2483 SCAN_UNEVICTABLE_BATCH_SIZE);
2484
2485 spin_lock_irq(&zone->lru_lock);
2486 for (scan = 0; scan < batch_size; scan++) {
2487 struct page *page = lru_to_page(l_unevictable);
2488
2489 if (!trylock_page(page))
2490 continue;
2491
2492 prefetchw_prev_lru_page(page, l_unevictable, flags);
2493
2494 if (likely(PageLRU(page) && PageUnevictable(page)))
2495 check_move_unevictable_page(page, zone);
2496
2497 unlock_page(page);
2498 }
2499 spin_unlock_irq(&zone->lru_lock);
2500
2501 nr_to_scan -= batch_size;
2502 }
2503}
2504

/*
 * scan_all_zones_unevictable_pages: re-check every zone's unevictable
 * list and move pages that have become evictable onto the appropriate
 * LRU list.  This is useful when previously unevictable pages become
 * evictable system wide, e.g. after swap has been added.
 */
void scan_all_zones_unevictable_pages(void)
{
        struct zone *zone;

        for_each_zone(zone) {
                scan_zone_unevictable_pages(zone);
        }
}

/*
 * scan_unevictable_pages [vm] sysctl handler: on-demand re-scan of all
 * zones' unevictable lists for evictable pages.
 */
unsigned long scan_unevictable_pages;
2531
2532int scan_unevictable_handler(struct ctl_table *table, int write,
2533 struct file *file, void __user *buffer,
2534 size_t *length, loff_t *ppos)
2535{
2536 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2537
2538 if (write && *(unsigned long *)table->data)
2539 scan_all_zones_unevictable_pages();
2540
2541 scan_unevictable_pages = 0;
2542 return 0;
2543}
2544
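/*
 * Per-node 'scan_unevictable_pages' sysdev attribute: writing a non-zero
 * value triggers a re-scan of the node's unevictable lists for pages
 * that have become evictable.
 */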
2550static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2551 struct sysdev_attribute *attr,
2552 char *buf)
2553{
2554 return sprintf(buf, "0\n");
2555}
2556
2557static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2558 struct sysdev_attribute *attr,
2559 const char *buf, size_t count)
2560{
2561 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2562 struct zone *zone;
2563 unsigned long res;
2564 unsigned long req = strict_strtoul(buf, 10, &res);
2565
2566 if (!req)
2567 return 1;
2568
2569 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2570 if (!populated_zone(zone))
2571 continue;
2572 scan_zone_unevictable_pages(zone);
2573 }
2574 return 1;
2575}
2576
2577
2578static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
2579 read_scan_unevictable_node,
2580 write_scan_unevictable_node);
2581
2582int scan_unevictable_register_node(struct node *node)
2583{
2584 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2585}
2586
2587void scan_unevictable_unregister_node(struct node *node)
2588{
2589 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2590}
2591
2592#endif
2593