1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/kernel_stat.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/init.h>
20#include <linux/highmem.h>
21#include <linux/file.h>
22#include <linux/writeback.h>
23#include <linux/suspend.h>
24#include <linux/buffer_head.h>
25#include <linux/mm_inline.h>
26#include <linux/pagevec.h>
27#include <linux/rmap-locking.h>
28
29#include <asm/pgalloc.h>
30#include <asm/tlbflush.h>
31#include <linux/swapops.h>
32
33
34
35
36
37
38
/*
 * Starting scan priority.  try_to_free_pages() begins at this value and
 * counts down towards 1; shrink_zone() limits a pass over the inactive list
 * to nr_inactive / priority pages, so a lower priority scans more.
 */
#define DEF_PRIORITY (6)
40
/*
 * prefetch_prev_lru_page - read-prefetch a field of the previous LRU page.
 * @_page:  page currently being looked at
 * @_base:  list head that terminates the walk
 * @_field: member of struct page to prefetch
 *
 * If @_page->lru.prev is not @_base, issue prefetch() on @_field of the
 * page that owns that list entry.  Expands to an empty statement when
 * ARCH_HAS_PREFETCH is not defined (the arguments are then not evaluated).
 *
 * @_page is evaluated exactly once and every argument is parenthesised, so
 * arbitrary expressions may be passed safely.
 */
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		struct page *_cur = (_page);				\
									\
		if (_cur->lru.prev != (_base)) {			\
			struct page *_prev;				\
									\
			_prev = list_entry(_cur->lru.prev,		\
					struct page, lru);		\
			prefetch(&_prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
55
/*
 * prefetchw_prev_lru_page - write-prefetch a field of the previous LRU page.
 * @_page:  page currently being looked at
 * @_base:  list head that terminates the walk
 * @_field: member of struct page to prefetch
 *
 * If @_page->lru.prev is not @_base, issue prefetchw() on @_field of the
 * page that owns that list entry.  Expands to an empty statement when
 * ARCH_HAS_PREFETCHW is not defined (the arguments are then not evaluated).
 *
 * @_page is evaluated exactly once and every argument is parenthesised, so
 * arbitrary expressions may be passed safely.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		struct page *_cur = (_page);				\
									\
		if (_cur->lru.prev != (_base)) {			\
			struct page *_prev;				\
									\
			_prev = list_entry(_cur->lru.prev,		\
					struct page, lru);		\
			prefetchw(&_prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
70
71
72static inline int page_mapping_inuse(struct page * page)
73{
74 struct address_space *mapping = page->mapping;
75
76
77 if (page_mapped(page))
78 return 1;
79
80
81 if (!mapping)
82 return 0;
83
84
85 if (!list_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_shared))
86 return 1;
87
88 return 0;
89}
90
/*
 * is_page_cache_freeable - does the page carry only the expected references?
 *
 * Returns nonzero when page_count() is exactly 2, or exactly 3 for a page
 * with PagePrivate set.
 * NOTE(review): the two baseline references are presumably the page cache's
 * and the one taken by shrink_cache() when isolating the page -- confirm.
 */
static inline int is_page_cache_freeable(struct page *page)
{
	int expected = 2;

	if (PagePrivate(page))
		expected++;

	return page_count(page) == expected;
}
95
96static int
97shrink_list(struct list_head *page_list, int nr_pages,
98 unsigned int gfp_mask, int priority, int *max_scan)
99{
100 struct address_space *mapping;
101 LIST_HEAD(ret_pages);
102 struct pagevec freed_pvec;
103 const int nr_pages_in = nr_pages;
104 int pgactivate = 0;
105
106 pagevec_init(&freed_pvec);
107 while (!list_empty(page_list)) {
108 struct page *page;
109 int may_enter_fs;
110
111 page = list_entry(page_list->prev, struct page, lru);
112 list_del(&page->lru);
113
114 if (TestSetPageLocked(page))
115 goto keep;
116
117 BUG_ON(PageActive(page));
118 may_enter_fs = (gfp_mask & __GFP_FS) ||
119 (PageSwapCache(page) && (gfp_mask & __GFP_IO));
120 if (PageWriteback(page)) {
121 if (may_enter_fs)
122 wait_on_page_writeback(page);
123 else
124 goto keep_locked;
125 }
126
127 pte_chain_lock(page);
128 if (page_referenced(page) && page_mapping_inuse(page)) {
129
130 pte_chain_unlock(page);
131 goto activate_locked;
132 }
133
134 mapping = page->mapping;
135
136
137
138
139
140
141
142 if (page_mapped(page) && !mapping && !PagePrivate(page)) {
143 pte_chain_unlock(page);
144 if (!add_to_swap(page))
145 goto activate_locked;
146 pte_chain_lock(page);
147 mapping = page->mapping;
148 }
149
150
151
152
153
154 if (page_mapped(page) && mapping) {
155 switch (try_to_unmap(page)) {
156 case SWAP_ERROR:
157 case SWAP_FAIL:
158 pte_chain_unlock(page);
159 goto activate_locked;
160 case SWAP_AGAIN:
161 pte_chain_unlock(page);
162 goto keep_locked;
163 case SWAP_SUCCESS:
164 ;
165 }
166 }
167 pte_chain_unlock(page);
168
169
170
171
172
173
174
175 if (PageDirty(page) && is_page_cache_freeable(page) &&
176 mapping && may_enter_fs) {
177 int (*writeback)(struct page *,
178 struct writeback_control *);
179 const int cluster_size = SWAP_CLUSTER_MAX;
180 struct writeback_control wbc = {
181 .nr_to_write = cluster_size,
182 };
183
184 writeback = mapping->a_ops->vm_writeback;
185 if (writeback == NULL)
186 writeback = generic_vm_writeback;
187 (*writeback)(page, &wbc);
188 *max_scan -= (cluster_size - wbc.nr_to_write);
189 goto keep;
190 }
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213 if (PagePrivate(page)) {
214 if (!try_to_release_page(page, gfp_mask))
215 goto keep_locked;
216 if (!mapping && page_count(page) == 1)
217 goto free_it;
218 }
219
220 if (!mapping)
221 goto keep_locked;
222
223 write_lock(&mapping->page_lock);
224
225
226
227
228
229
230 if (page_count(page) != 2 || PageDirty(page)) {
231 write_unlock(&mapping->page_lock);
232 goto keep_locked;
233 }
234
235 if (PageSwapCache(page)) {
236 swp_entry_t swap = { .val = page->index };
237 __delete_from_swap_cache(page);
238 write_unlock(&mapping->page_lock);
239 swap_free(swap);
240 } else {
241 __remove_from_page_cache(page);
242 write_unlock(&mapping->page_lock);
243 }
244 __put_page(page);
245free_it:
246 unlock_page(page);
247 nr_pages--;
248 if (!pagevec_add(&freed_pvec, page))
249 __pagevec_release_nonlru(&freed_pvec);
250 continue;
251
252activate_locked:
253 SetPageActive(page);
254 pgactivate++;
255keep_locked:
256 unlock_page(page);
257keep:
258 list_add(&page->lru, &ret_pages);
259 BUG_ON(PageLRU(page));
260 }
261 list_splice(&ret_pages, page_list);
262 if (pagevec_count(&freed_pvec))
263 __pagevec_release_nonlru(&freed_pvec);
264 KERNEL_STAT_ADD(pgsteal, nr_pages_in - nr_pages);
265 KERNEL_STAT_ADD(pgactivate, pgactivate);
266 return nr_pages;
267}
268
269
270
271
272
273
274
275
276
277
278
279
280static int
281shrink_cache(int nr_pages, struct zone *zone,
282 unsigned int gfp_mask, int priority, int max_scan)
283{
284 LIST_HEAD(page_list);
285 struct pagevec pvec;
286 int nr_to_process;
287
288
289
290
291 nr_to_process = nr_pages;
292 if (nr_to_process < SWAP_CLUSTER_MAX)
293 nr_to_process = SWAP_CLUSTER_MAX;
294
295 pagevec_init(&pvec);
296
297 lru_add_drain();
298 spin_lock_irq(&zone->lru_lock);
299 while (max_scan > 0 && nr_pages > 0) {
300 struct page *page;
301 int n = 0;
302
303 while (n < nr_to_process && !list_empty(&zone->inactive_list)) {
304 page = list_entry(zone->inactive_list.prev,
305 struct page, lru);
306
307 prefetchw_prev_lru_page(page,
308 &zone->inactive_list, flags);
309
310 if (!TestClearPageLRU(page))
311 BUG();
312 list_del(&page->lru);
313 if (page_count(page) == 0) {
314
315 SetPageLRU(page);
316 list_add(&page->lru, &zone->inactive_list);
317 continue;
318 }
319 list_add(&page->lru, &page_list);
320 page_cache_get(page);
321 n++;
322 }
323 zone->nr_inactive -= n;
324 spin_unlock_irq(&zone->lru_lock);
325
326 if (list_empty(&page_list))
327 goto done;
328
329 max_scan -= n;
330 KERNEL_STAT_ADD(pgscan, n);
331 nr_pages = shrink_list(&page_list, nr_pages,
332 gfp_mask, priority, &max_scan);
333
334 if (nr_pages <= 0 && list_empty(&page_list))
335 goto done;
336
337 spin_lock_irq(&zone->lru_lock);
338
339
340
341 while (!list_empty(&page_list)) {
342 page = list_entry(page_list.prev, struct page, lru);
343 if (TestSetPageLRU(page))
344 BUG();
345 list_del(&page->lru);
346 if (PageActive(page))
347 add_page_to_active_list(zone, page);
348 else
349 add_page_to_inactive_list(zone, page);
350 if (!pagevec_add(&pvec, page)) {
351 spin_unlock_irq(&zone->lru_lock);
352 __pagevec_release(&pvec);
353 spin_lock_irq(&zone->lru_lock);
354 }
355 }
356 }
357 spin_unlock_irq(&zone->lru_lock);
358done:
359 pagevec_release(&pvec);
360 return nr_pages;
361}
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380static void
381refill_inactive_zone(struct zone *zone, const int nr_pages_in)
382{
383 int pgdeactivate = 0;
384 int nr_pages = nr_pages_in;
385 LIST_HEAD(l_hold);
386 LIST_HEAD(l_inactive);
387 LIST_HEAD(l_active);
388 struct page *page;
389 struct pagevec pvec;
390
391 lru_add_drain();
392 spin_lock_irq(&zone->lru_lock);
393 while (nr_pages && !list_empty(&zone->active_list)) {
394 page = list_entry(zone->active_list.prev, struct page, lru);
395 prefetchw_prev_lru_page(page, &zone->active_list, flags);
396 if (!TestClearPageLRU(page))
397 BUG();
398 list_del(&page->lru);
399 if (page_count(page) == 0) {
400
401 SetPageLRU(page);
402 list_add(&page->lru, &zone->active_list);
403 continue;
404 }
405 page_cache_get(page);
406 list_add(&page->lru, &l_hold);
407 nr_pages--;
408 }
409 spin_unlock_irq(&zone->lru_lock);
410
411 while (!list_empty(&l_hold)) {
412 page = list_entry(l_hold.prev, struct page, lru);
413 list_del(&page->lru);
414 if (page_mapped(page)) {
415 pte_chain_lock(page);
416 if (page_mapped(page) && page_referenced(page)) {
417 pte_chain_unlock(page);
418 list_add(&page->lru, &l_active);
419 continue;
420 }
421 pte_chain_unlock(page);
422 }
423 list_add(&page->lru, &l_inactive);
424 pgdeactivate++;
425 }
426
427 pagevec_init(&pvec);
428 spin_lock_irq(&zone->lru_lock);
429 while (!list_empty(&l_inactive)) {
430 page = list_entry(l_inactive.prev, struct page, lru);
431 prefetchw_prev_lru_page(page, &l_inactive, flags);
432 if (TestSetPageLRU(page))
433 BUG();
434 if (!TestClearPageActive(page))
435 BUG();
436 list_move(&page->lru, &zone->inactive_list);
437 if (!pagevec_add(&pvec, page)) {
438 spin_unlock_irq(&zone->lru_lock);
439 if (buffer_heads_over_limit)
440 pagevec_strip(&pvec);
441 __pagevec_release(&pvec);
442 spin_lock_irq(&zone->lru_lock);
443 }
444 }
445 if (buffer_heads_over_limit) {
446 spin_unlock_irq(&zone->lru_lock);
447 pagevec_strip(&pvec);
448 spin_lock_irq(&zone->lru_lock);
449 }
450 while (!list_empty(&l_active)) {
451 page = list_entry(l_active.prev, struct page, lru);
452 prefetchw_prev_lru_page(page, &l_active, flags);
453 if (TestSetPageLRU(page))
454 BUG();
455 BUG_ON(!PageActive(page));
456 list_move(&page->lru, &zone->active_list);
457 if (!pagevec_add(&pvec, page)) {
458 spin_unlock_irq(&zone->lru_lock);
459 __pagevec_release(&pvec);
460 spin_lock_irq(&zone->lru_lock);
461 }
462 }
463 zone->nr_active -= pgdeactivate;
464 zone->nr_inactive += pgdeactivate;
465 spin_unlock_irq(&zone->lru_lock);
466 pagevec_release(&pvec);
467
468 KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages);
469 KERNEL_STAT_ADD(pgdeactivate, pgdeactivate);
470}
471
472static int
473shrink_zone(struct zone *zone, int priority,
474 unsigned int gfp_mask, int nr_pages)
475{
476 unsigned long ratio;
477 int max_scan;
478
479
480 if (kmem_cache_reap(gfp_mask) >= nr_pages)
481 return 0;
482
483
484
485
486
487
488
489
490
491
492
493 ratio = (unsigned long)nr_pages * zone->nr_active /
494 ((zone->nr_inactive | 1) * 2);
495 atomic_add(ratio+1, &zone->refill_counter);
496 while (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
497 atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
498 refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
499 }
500
501 max_scan = zone->nr_inactive / priority;
502 nr_pages = shrink_cache(nr_pages, zone,
503 gfp_mask, priority, max_scan);
504
505 if (nr_pages <= 0)
506 return 0;
507
508 wakeup_bdflush();
509
510 shrink_dcache_memory(priority, gfp_mask);
511
512
513 shrink_icache_memory(1, gfp_mask);
514#ifdef CONFIG_QUOTA
515 shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
516#endif
517
518 return nr_pages;
519}
520
521static int
522shrink_caches(struct zone *classzone, int priority,
523 int gfp_mask, int nr_pages)
524{
525 struct zone *first_classzone;
526 struct zone *zone;
527
528 first_classzone = classzone->zone_pgdat->node_zones;
529 zone = classzone;
530 while (zone >= first_classzone && nr_pages > 0) {
531 if (zone->free_pages <= zone->pages_high) {
532 nr_pages = shrink_zone(zone, priority,
533 gfp_mask, nr_pages);
534 }
535 zone--;
536 }
537 return nr_pages;
538}
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557int
558try_to_free_pages(struct zone *classzone,
559 unsigned int gfp_mask, unsigned int order)
560{
561 int priority = DEF_PRIORITY;
562 int nr_pages = SWAP_CLUSTER_MAX;
563
564 KERNEL_STAT_INC(pageoutrun);
565
566 for (priority = DEF_PRIORITY; priority; priority--) {
567 nr_pages = shrink_caches(classzone, priority,
568 gfp_mask, nr_pages);
569 if (nr_pages <= 0)
570 return 1;
571 if (!(gfp_mask & __GFP_FS))
572 break;
573 }
574 if (gfp_mask & __GFP_FS)
575 out_of_memory();
576 return 0;
577}
578
/*
 * Wait queue that kswapd() sleeps on while no zone has need_balance set.
 * NOTE(review): not static, so the waker presumably lives in another file.
 */
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
580
581static int check_classzone_need_balance(struct zone *classzone)
582{
583 struct zone *first_classzone;
584
585 first_classzone = classzone->zone_pgdat->node_zones;
586 while (classzone >= first_classzone) {
587 if (classzone->free_pages > classzone->pages_high)
588 return 0;
589 classzone--;
590 }
591 return 1;
592}
593
594static int kswapd_balance_pgdat(pg_data_t * pgdat)
595{
596 int need_more_balance = 0, i;
597 struct zone *zone;
598
599 for (i = pgdat->nr_zones-1; i >= 0; i--) {
600 zone = pgdat->node_zones + i;
601 cond_resched();
602 if (!zone->need_balance)
603 continue;
604 if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
605 zone->need_balance = 0;
606 __set_current_state(TASK_INTERRUPTIBLE);
607 schedule_timeout(HZ);
608 continue;
609 }
610 if (check_classzone_need_balance(zone))
611 need_more_balance = 1;
612 else
613 zone->need_balance = 0;
614 }
615
616 return need_more_balance;
617}
618
619static void kswapd_balance(void)
620{
621 int need_more_balance;
622 pg_data_t * pgdat;
623
624 do {
625 need_more_balance = 0;
626 pgdat = pgdat_list;
627 do
628 need_more_balance |= kswapd_balance_pgdat(pgdat);
629 while ((pgdat = pgdat->pgdat_next));
630 } while (need_more_balance);
631}
632
633static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
634{
635 struct zone *zone;
636 int i;
637
638 for (i = pgdat->nr_zones-1; i >= 0; i--) {
639 zone = pgdat->node_zones + i;
640 if (!zone->need_balance)
641 continue;
642 return 0;
643 }
644
645 return 1;
646}
647
648static int kswapd_can_sleep(void)
649{
650 pg_data_t * pgdat;
651
652 pgdat = pgdat_list;
653 do {
654 if (kswapd_can_sleep_pgdat(pgdat))
655 continue;
656 return 0;
657 } while ((pgdat = pgdat->pgdat_next));
658
659 return 1;
660}
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675int kswapd(void *unused)
676{
677 struct task_struct *tsk = current;
678 DECLARE_WAITQUEUE(wait, tsk);
679
680 daemonize();
681 strcpy(tsk->comm, "kswapd");
682 sigfillset(&tsk->blocked);
683
684
685
686
687
688
689
690
691
692
693
694
695
696 tsk->flags |= PF_MEMALLOC;
697
698
699
700
701 for (;;) {
702 if (current->flags & PF_FREEZE)
703 refrigerator(PF_IOTHREAD);
704 __set_current_state(TASK_INTERRUPTIBLE);
705 add_wait_queue(&kswapd_wait, &wait);
706
707 mb();
708 if (kswapd_can_sleep())
709 schedule();
710
711 __set_current_state(TASK_RUNNING);
712 remove_wait_queue(&kswapd_wait, &wait);
713
714
715
716
717
718
719 kswapd_balance();
720 blk_run_queues();
721 }
722}
723
724static int __init kswapd_init(void)
725{
726 printk("Starting kswapd\n");
727 swap_setup();
728 kernel_thread(kswapd, NULL, CLONE_KERNEL);
729 return 0;
730}
731
732module_init(kswapd_init)
733