1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include <linux/slab.h>
18#include <linux/kernel_stat.h>
19#include <linux/swap.h>
20#include <linux/swapctl.h>
21#include <linux/smp_lock.h>
22#include <linux/pagemap.h>
23#include <linux/init.h>
24#include <linux/highmem.h>
25#include <linux/file.h>
26
27#include <asm/pgalloc.h>
28
29
30
31
32
33
34
/*
 * Number of reclaim passes try_to_free_pages_zone() makes over
 * shrink_caches()/swap_out() before it gives up on the current attempt.
 */
int vm_passes = 60;
36
37
38
39
40
41
/*
 * shrink_cache() examines at most
 * (nr_inactive_pages + nr_active_pages) / vm_cache_scan_ratio
 * pages of the classzone per invocation.
 */
int vm_cache_scan_ratio = 6;
43
44
45
46
47
/*
 * shrink_cache() tolerates vm_mapped_ratio * nr_pages pages that it could
 * not free (still mapped or otherwise in use) before it falls back to
 * reaping slab/VFS caches and calling swap_out().
 */
int vm_mapped_ratio = 100;
49
50
51
52
53
54
55
56
/*
 * Weight applied to the inactive list size in refill_inactive(): the
 * number of pages deactivated is
 * nr_pages * nr_active_pages / (nr_inactive_pages * vm_lru_balance_ratio + 1).
 */
int vm_lru_balance_ratio = 2;
58
59
60
61
62
63
64
/*
 * Passed as the first argument to shrink_dcache_memory(),
 * shrink_icache_memory() and (with CONFIG_QUOTA) shrink_dqcache_memory()
 * whenever this file asks the VFS caches to shrink.
 */
int vm_vfs_scan_ratio = 6;
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/*
 * Tunable that is not referenced anywhere in the visible part of this file.
 * NOTE(review): presumably consulted by the anonymous-page LRU handling
 * elsewhere in mm/ -- confirm against its users before relying on it.
 */
int vm_anon_lru = 0;
87
88
89
90
91
92
93
94
95
96
97
98static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
99{
100 pte_t pte;
101 swp_entry_t entry;
102
103
104 if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
105 mark_page_accessed(page);
106 return 0;
107 }
108
109
110 if (PageActive(page))
111 return 0;
112
113
114 if (!memclass(page_zone(page), classzone))
115 return 0;
116
117 if (TryLockPage(page))
118 return 0;
119
120
121
122
123
124
125 flush_cache_page(vma, address);
126 pte = ptep_get_and_clear(page_table);
127 flush_tlb_page(vma, address);
128
129 if (pte_dirty(pte))
130 set_page_dirty(page);
131
132
133
134
135
136
137 if (PageSwapCache(page)) {
138 entry.val = page->index;
139 swap_duplicate(entry);
140set_swap_pte:
141 set_pte(page_table, swp_entry_to_pte(entry));
142drop_pte:
143 mm->rss--;
144 UnlockPage(page);
145 {
146 int freeable = page_count(page) - !!page->buffers <= 2;
147 page_cache_release(page);
148 return freeable;
149 }
150 }
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166 if (page->mapping)
167 goto drop_pte;
168 if (!PageDirty(page))
169 goto drop_pte;
170
171
172
173
174
175 if (page->buffers)
176 goto preserve;
177
178
179
180
181
182
183
184 for (;;) {
185 entry = get_swap_page();
186 if (!entry.val)
187 break;
188
189
190
191
192 if (add_to_swap_cache(page, entry) == 0) {
193 SetPageUptodate(page);
194 set_page_dirty(page);
195 goto set_swap_pte;
196 }
197
198 swap_free(entry);
199 }
200
201
202preserve:
203 set_pte(page_table, pte);
204 UnlockPage(page);
205 return 0;
206}
207
208
209static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
210{
211 pte_t * pte;
212 unsigned long pmd_end;
213
214 if (pmd_none(*dir))
215 return count;
216 if (pmd_bad(*dir)) {
217 pmd_ERROR(*dir);
218 pmd_clear(dir);
219 return count;
220 }
221
222 pte = pte_offset(dir, address);
223
224 pmd_end = (address + PMD_SIZE) & PMD_MASK;
225 if (end > pmd_end)
226 end = pmd_end;
227
228 do {
229 if (pte_present(*pte)) {
230 struct page *page = pte_page(*pte);
231
232 if (VALID_PAGE(page) && !PageReserved(page)) {
233 count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
234 if (!count) {
235 address += PAGE_SIZE;
236 break;
237 }
238 }
239 }
240 address += PAGE_SIZE;
241 pte++;
242 } while (address && (address < end));
243 mm->swap_address = address;
244 return count;
245}
246
247
248static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
249{
250 pmd_t * pmd;
251 unsigned long pgd_end;
252
253 if (pgd_none(*dir))
254 return count;
255 if (pgd_bad(*dir)) {
256 pgd_ERROR(*dir);
257 pgd_clear(dir);
258 return count;
259 }
260
261 pmd = pmd_offset(dir, address);
262
263 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
264 if (pgd_end && (end > pgd_end))
265 end = pgd_end;
266
267 do {
268 count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
269 if (!count)
270 break;
271 address = (address + PMD_SIZE) & PMD_MASK;
272 pmd++;
273 } while (address && (address < end));
274 return count;
275}
276
277
278static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
279{
280 pgd_t *pgdir;
281 unsigned long end;
282
283
284 if (vma->vm_flags & VM_RESERVED)
285 return count;
286
287 pgdir = pgd_offset(mm, address);
288
289 end = vma->vm_end;
290 BUG_ON(address >= end);
291 do {
292 count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
293 if (!count)
294 break;
295 address = (address + PGDIR_SIZE) & PGDIR_MASK;
296 pgdir++;
297 } while (address && (address < end));
298 return count;
299}
300
301
302struct mm_struct *swap_mm = &init_mm;
303
304
305
306
307static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
308{
309 unsigned long address;
310 struct vm_area_struct* vma;
311
312
313
314
315
316 spin_lock(&mm->page_table_lock);
317 address = mm->swap_address;
318 if (address == TASK_SIZE || swap_mm != mm) {
319
320 ++*mmcounter;
321 goto out_unlock;
322 }
323 vma = find_vma(mm, address);
324 if (vma) {
325 if (address < vma->vm_start)
326 address = vma->vm_start;
327
328 for (;;) {
329 count = swap_out_vma(mm, vma, address, count, classzone);
330 vma = vma->vm_next;
331 if (!vma)
332 break;
333 if (!count)
334 goto out_unlock;
335 address = vma->vm_start;
336 }
337 }
338
339 mm->swap_address = TASK_SIZE;
340
341out_unlock:
342 spin_unlock(&mm->page_table_lock);
343 return count;
344}
345
346static int FASTCALL(swap_out(zone_t * classzone));
347static int fastcall swap_out(zone_t * classzone)
348{
349 int counter, nr_pages = SWAP_CLUSTER_MAX;
350 struct mm_struct *mm;
351
352 counter = mmlist_nr << 1;
353 do {
354 if (unlikely(current->need_resched)) {
355 __set_current_state(TASK_RUNNING);
356 schedule();
357 }
358
359 spin_lock(&mmlist_lock);
360 mm = swap_mm;
361 while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
362 mm->swap_address = 0;
363 mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
364 if (mm == swap_mm)
365 goto empty;
366 swap_mm = mm;
367 }
368
369
370 atomic_inc(&mm->mm_users);
371 spin_unlock(&mmlist_lock);
372
373 nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
374
375 mmput(mm);
376
377 if (!nr_pages)
378 return 1;
379 } while (--counter >= 0);
380
381 return 0;
382
383empty:
384 spin_unlock(&mmlist_lock);
385 return 0;
386}
387
388static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
389static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
390static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
391{
392 struct list_head * entry;
393 int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
394 int max_mapped = vm_mapped_ratio * nr_pages;
395
396 while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
397 struct page * page;
398
399 if (unlikely(current->need_resched)) {
400 spin_unlock(&pagemap_lru_lock);
401 __set_current_state(TASK_RUNNING);
402 schedule();
403 spin_lock(&pagemap_lru_lock);
404 continue;
405 }
406
407 page = list_entry(entry, struct page, lru);
408
409 BUG_ON(!PageLRU(page));
410 BUG_ON(PageActive(page));
411
412 list_del(entry);
413 list_add(entry, &inactive_list);
414
415
416
417
418
419 if (unlikely(!page_count(page)))
420 continue;
421
422 if (!memclass(page_zone(page), classzone))
423 continue;
424
425 max_scan--;
426
427
428 if (!page->buffers && (page_count(page) != 1 || !page->mapping))
429 goto page_mapped;
430
431
432
433
434
435 if (unlikely(TryLockPage(page))) {
436 if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
437 page_cache_get(page);
438 spin_unlock(&pagemap_lru_lock);
439 wait_on_page(page);
440 page_cache_release(page);
441 spin_lock(&pagemap_lru_lock);
442 }
443 continue;
444 }
445
446 if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
447
448
449
450
451
452
453
454
455 int (*writepage)(struct page *);
456
457 writepage = page->mapping->a_ops->writepage;
458 if ((gfp_mask & __GFP_FS) && writepage) {
459 ClearPageDirty(page);
460 SetPageLaunder(page);
461 page_cache_get(page);
462 spin_unlock(&pagemap_lru_lock);
463
464 writepage(page);
465 page_cache_release(page);
466
467 spin_lock(&pagemap_lru_lock);
468 continue;
469 }
470 }
471
472
473
474
475
476
477 if (page->buffers) {
478 spin_unlock(&pagemap_lru_lock);
479
480
481 page_cache_get(page);
482
483 if (try_to_release_page(page, gfp_mask)) {
484 if (!page->mapping) {
485
486
487
488
489
490
491 spin_lock(&pagemap_lru_lock);
492 UnlockPage(page);
493 __lru_cache_del(page);
494
495
496 page_cache_release(page);
497
498 if (--nr_pages)
499 continue;
500 break;
501 } else {
502
503
504
505
506
507 page_cache_release(page);
508
509 spin_lock(&pagemap_lru_lock);
510 }
511 } else {
512
513 UnlockPage(page);
514 page_cache_release(page);
515
516 spin_lock(&pagemap_lru_lock);
517 continue;
518 }
519 }
520
521 spin_lock(&pagecache_lock);
522
523
524
525
526
527
528
529
530
531 if (!page->mapping || page_count(page) > 1) {
532 spin_unlock(&pagecache_lock);
533 UnlockPage(page);
534page_mapped:
535 if (--max_mapped < 0) {
536 spin_unlock(&pagemap_lru_lock);
537
538 nr_pages -= kmem_cache_reap(gfp_mask);
539 if (nr_pages <= 0)
540 goto out;
541
542 shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
543 shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
544#ifdef CONFIG_QUOTA
545 shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
546#endif
547
548 if (!*failed_swapout)
549 *failed_swapout = !swap_out(classzone);
550
551 max_mapped = nr_pages * vm_mapped_ratio;
552
553 spin_lock(&pagemap_lru_lock);
554 refill_inactive(nr_pages, classzone);
555 }
556 continue;
557
558 }
559 if (PageDirty(page)) {
560 spin_unlock(&pagecache_lock);
561 UnlockPage(page);
562 continue;
563 }
564
565 __lru_cache_del(page);
566
567
568 if (likely(!PageSwapCache(page))) {
569 __remove_inode_page(page);
570 spin_unlock(&pagecache_lock);
571 } else {
572 swp_entry_t swap;
573 swap.val = page->index;
574 __delete_from_swap_cache(page);
575 spin_unlock(&pagecache_lock);
576 swap_free(swap);
577 }
578
579 UnlockPage(page);
580
581
582 page_cache_release(page);
583
584 if (--nr_pages)
585 continue;
586 break;
587 }
588 spin_unlock(&pagemap_lru_lock);
589
590 out:
591 return nr_pages;
592}
593
594
595
596
597
598
599
600
601static void fastcall refill_inactive(int nr_pages, zone_t * classzone)
602{
603 struct list_head * entry;
604 unsigned long ratio;
605
606 ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);
607
608 entry = active_list.prev;
609 while (ratio && entry != &active_list) {
610 struct page * page;
611
612 page = list_entry(entry, struct page, lru);
613 entry = entry->prev;
614 if (PageTestandClearReferenced(page)) {
615 list_del(&page->lru);
616 list_add(&page->lru, &active_list);
617 continue;
618 }
619
620 ratio--;
621
622 del_page_from_active_list(page);
623 add_page_to_inactive_list(page);
624 SetPageReferenced(page);
625 }
626
627 if (entry != &active_list) {
628 list_del(&active_list);
629 list_add(&active_list, entry);
630 }
631}
632
633static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
634static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
635{
636 nr_pages -= kmem_cache_reap(gfp_mask);
637 if (nr_pages <= 0)
638 goto out;
639
640 spin_lock(&pagemap_lru_lock);
641 refill_inactive(nr_pages, classzone);
642
643 nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);
644
645out:
646 return nr_pages;
647}
648
649static int check_classzone_need_balance(zone_t * classzone);
650
651int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
652{
653 gfp_mask = pf_gfp_mask(gfp_mask);
654
655 for (;;) {
656 int tries = vm_passes;
657 int failed_swapout = !(gfp_mask & __GFP_IO);
658 int nr_pages = SWAP_CLUSTER_MAX;
659
660 do {
661 nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
662 if (nr_pages <= 0)
663 return 1;
664 shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
665 shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
666#ifdef CONFIG_QUOTA
667 shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
668#endif
669 if (!failed_swapout)
670 failed_swapout = !swap_out(classzone);
671 } while (--tries);
672
673#ifdef CONFIG_OOM_KILLER
674 out_of_memory();
675#else
676 if (likely(current->pid != 1))
677 break;
678 if (!check_classzone_need_balance(classzone))
679 break;
680
681 __set_current_state(TASK_RUNNING);
682 yield();
683#endif
684 }
685
686 return 0;
687}
688
689int fastcall try_to_free_pages(unsigned int gfp_mask)
690{
691 pg_data_t *pgdat;
692 zonelist_t *zonelist;
693 unsigned long pf_free_pages;
694 int error = 0;
695
696 pf_free_pages = current->flags & PF_FREE_PAGES;
697 current->flags &= ~PF_FREE_PAGES;
698
699 for_each_pgdat(pgdat) {
700 zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
701 error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
702 }
703
704 current->flags |= pf_free_pages;
705 return error;
706}
707
/*
 * Wait queue the kswapd thread sleeps on while no zone needs balancing.
 * Nothing in the visible part of this file wakes it up.
 */
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
709
710static int check_classzone_need_balance(zone_t * classzone)
711{
712 zone_t * first_zone;
713 int class_idx = zone_idx(classzone);
714
715 first_zone = classzone->zone_pgdat->node_zones;
716 while (classzone >= first_zone) {
717 if (classzone->free_pages > classzone->watermarks[class_idx].high)
718 return 0;
719 classzone--;
720 }
721 return 1;
722}
723
724static int kswapd_balance_pgdat(pg_data_t * pgdat)
725{
726 int need_more_balance = 0, i;
727 zone_t * zone;
728
729 for (i = pgdat->nr_zones-1; i >= 0; i--) {
730 zone = pgdat->node_zones + i;
731 if (unlikely(current->need_resched))
732 schedule();
733 if (!zone->need_balance || !zone->size)
734 continue;
735 if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
736 zone->need_balance = 0;
737 __set_current_state(TASK_INTERRUPTIBLE);
738 schedule_timeout(HZ*5);
739 continue;
740 }
741 if (check_classzone_need_balance(zone))
742 need_more_balance = 1;
743 else
744 zone->need_balance = 0;
745 }
746
747 return need_more_balance;
748}
749
750static void kswapd_balance(void)
751{
752 int need_more_balance;
753 pg_data_t * pgdat;
754
755 do {
756 need_more_balance = 0;
757
758 for_each_pgdat(pgdat)
759 need_more_balance |= kswapd_balance_pgdat(pgdat);
760 } while (need_more_balance);
761}
762
763static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
764{
765 zone_t * zone;
766 int i;
767
768 for (i = pgdat->nr_zones-1; i >= 0; i--) {
769 zone = pgdat->node_zones + i;
770 if (!zone->need_balance || !zone->size)
771 continue;
772 return 0;
773 }
774
775 return 1;
776}
777
778static int kswapd_can_sleep(void)
779{
780 pg_data_t * pgdat;
781
782 for_each_pgdat(pgdat) {
783 if (!kswapd_can_sleep_pgdat(pgdat))
784 return 0;
785 }
786
787 return 1;
788}
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803int kswapd(void *unused)
804{
805 struct task_struct *tsk = current;
806 DECLARE_WAITQUEUE(wait, tsk);
807
808 daemonize();
809 strcpy(tsk->comm, "kswapd");
810 sigfillset(&tsk->blocked);
811
812
813
814
815
816
817
818
819
820
821
822
823
824 tsk->flags |= PF_MEMALLOC;
825
826
827
828
829 for (;;) {
830 __set_current_state(TASK_INTERRUPTIBLE);
831 add_wait_queue(&kswapd_wait, &wait);
832
833 mb();
834 if (kswapd_can_sleep())
835 schedule();
836
837 __set_current_state(TASK_RUNNING);
838 remove_wait_queue(&kswapd_wait, &wait);
839
840
841
842
843
844
845 kswapd_balance();
846 run_task_queue(&tq_disk);
847 }
848}
849
850static int __init kswapd_init(void)
851{
852 printk("Starting kswapd\n");
853 swap_setup();
854 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
855 return 0;
856}
857
858module_init(kswapd_init)
859