1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include <linux/slab.h>
18#include <linux/kernel_stat.h>
19#include <linux/swap.h>
20#include <linux/swapctl.h>
21#include <linux/smp_lock.h>
22#include <linux/pagemap.h>
23#include <linux/init.h>
24#include <linux/highmem.h>
25#include <linux/file.h>
26
27#include <asm/pgalloc.h>
28
29
30
31
32
33
34
/*
 * Number of reclaim passes try_to_free_pages_zone() runs per round
 * (shrink_caches, VFS cache shrinking, swap_out) before it stops retrying.
 */
int vm_passes = 60;
36
37
38
39
40
41
/*
 * shrink_cache() bounds one scan to
 * (nr_inactive_pages + nr_active_pages) / vm_cache_scan_ratio
 * pages of the classzone, so a larger value means a shorter scan.
 */
int vm_cache_scan_ratio = 6;
43
44
45
46
47
/*
 * shrink_cache() tolerates vm_mapped_ratio * nr_pages pages that it cannot
 * free directly (its page_mapped path) before it falls back to reaping
 * slab, shrinking the VFS caches and calling swap_out().
 */
int vm_mapped_ratio = 100;
49
50
51
52
53
54
55
56
/*
 * Multiplies the inactive list size in the divisor that refill_inactive()
 * uses to decide how many pages to move from the active to the inactive
 * list; a larger value therefore moves fewer pages per call.
 */
int vm_lru_balance_ratio = 2;
58
59
60
61
62
63
64
/*
 * Passed as the first argument to shrink_dcache_memory(),
 * shrink_icache_memory() and (with CONFIG_QUOTA) shrink_dqcache_memory().
 * NOTE(review): presumably the fraction of each cache scanned per call;
 * confirm against those functions, which are defined outside this file.
 */
int vm_vfs_scan_ratio = 6;
66
67
68
69
70
71
72
73
74
75
76
77static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
78{
79 pte_t pte;
80 swp_entry_t entry;
81
82
83 if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
84 mark_page_accessed(page);
85 return 0;
86 }
87
88
89 if (PageActive(page))
90 return 0;
91
92
93 if (!memclass(page_zone(page), classzone))
94 return 0;
95
96 if (TryLockPage(page))
97 return 0;
98
99
100
101
102
103
104 flush_cache_page(vma, address);
105 pte = ptep_get_and_clear(page_table);
106 flush_tlb_page(vma, address);
107
108 if (pte_dirty(pte))
109 set_page_dirty(page);
110
111
112
113
114
115
116 if (PageSwapCache(page)) {
117 entry.val = page->index;
118 swap_duplicate(entry);
119set_swap_pte:
120 set_pte(page_table, swp_entry_to_pte(entry));
121drop_pte:
122 mm->rss--;
123 UnlockPage(page);
124 {
125 int freeable = page_count(page) - !!page->buffers <= 2;
126 page_cache_release(page);
127 return freeable;
128 }
129 }
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145 if (page->mapping)
146 goto drop_pte;
147 if (!PageDirty(page))
148 goto drop_pte;
149
150
151
152
153
154 if (page->buffers)
155 goto preserve;
156
157
158
159
160
161
162
163 for (;;) {
164 entry = get_swap_page();
165 if (!entry.val)
166 break;
167
168
169
170
171 if (add_to_swap_cache(page, entry) == 0) {
172 SetPageUptodate(page);
173 set_page_dirty(page);
174 goto set_swap_pte;
175 }
176
177 swap_free(entry);
178 }
179
180
181preserve:
182 set_pte(page_table, pte);
183 UnlockPage(page);
184 return 0;
185}
186
187
188static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
189{
190 pte_t * pte;
191 unsigned long pmd_end;
192
193 if (pmd_none(*dir))
194 return count;
195 if (pmd_bad(*dir)) {
196 pmd_ERROR(*dir);
197 pmd_clear(dir);
198 return count;
199 }
200
201 pte = pte_offset(dir, address);
202
203 pmd_end = (address + PMD_SIZE) & PMD_MASK;
204 if (end > pmd_end)
205 end = pmd_end;
206
207 do {
208 if (pte_present(*pte)) {
209 struct page *page = pte_page(*pte);
210
211 if (VALID_PAGE(page) && !PageReserved(page)) {
212 count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
213 if (!count) {
214 address += PAGE_SIZE;
215 break;
216 }
217 }
218 }
219 address += PAGE_SIZE;
220 pte++;
221 } while (address && (address < end));
222 mm->swap_address = address;
223 return count;
224}
225
226
227static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
228{
229 pmd_t * pmd;
230 unsigned long pgd_end;
231
232 if (pgd_none(*dir))
233 return count;
234 if (pgd_bad(*dir)) {
235 pgd_ERROR(*dir);
236 pgd_clear(dir);
237 return count;
238 }
239
240 pmd = pmd_offset(dir, address);
241
242 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
243 if (pgd_end && (end > pgd_end))
244 end = pgd_end;
245
246 do {
247 count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
248 if (!count)
249 break;
250 address = (address + PMD_SIZE) & PMD_MASK;
251 pmd++;
252 } while (address && (address < end));
253 return count;
254}
255
256
257static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
258{
259 pgd_t *pgdir;
260 unsigned long end;
261
262
263 if (vma->vm_flags & VM_RESERVED)
264 return count;
265
266 pgdir = pgd_offset(mm, address);
267
268 end = vma->vm_end;
269 BUG_ON(address >= end);
270 do {
271 count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
272 if (!count)
273 break;
274 address = (address + PGDIR_SIZE) & PGDIR_MASK;
275 pgdir++;
276 } while (address && (address < end));
277 return count;
278}
279
280
/*
 * The mm that swap_out() scans next.  swap_out() advances it along the
 * mmlist under mmlist_lock; it starts at init_mm, which swap_out() always
 * steps past.  swap_out_mm() refuses to scan an mm that is no longer
 * swap_mm.
 */
struct mm_struct *swap_mm = &init_mm;
282
283
284
285
286static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
287{
288 unsigned long address;
289 struct vm_area_struct* vma;
290
291
292
293
294
295 spin_lock(&mm->page_table_lock);
296 address = mm->swap_address;
297 if (address == TASK_SIZE || swap_mm != mm) {
298
299 ++*mmcounter;
300 goto out_unlock;
301 }
302 vma = find_vma(mm, address);
303 if (vma) {
304 if (address < vma->vm_start)
305 address = vma->vm_start;
306
307 for (;;) {
308 count = swap_out_vma(mm, vma, address, count, classzone);
309 vma = vma->vm_next;
310 if (!vma)
311 break;
312 if (!count)
313 goto out_unlock;
314 address = vma->vm_start;
315 }
316 }
317
318 mm->swap_address = TASK_SIZE;
319
320out_unlock:
321 spin_unlock(&mm->page_table_lock);
322 return count;
323}
324
325static int FASTCALL(swap_out(zone_t * classzone));
326static int swap_out(zone_t * classzone)
327{
328 int counter, nr_pages = SWAP_CLUSTER_MAX;
329 struct mm_struct *mm;
330
331 counter = mmlist_nr << 1;
332 do {
333 if (unlikely(current->need_resched)) {
334 __set_current_state(TASK_RUNNING);
335 schedule();
336 }
337
338 spin_lock(&mmlist_lock);
339 mm = swap_mm;
340 while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
341 mm->swap_address = 0;
342 mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
343 if (mm == swap_mm)
344 goto empty;
345 swap_mm = mm;
346 }
347
348
349 atomic_inc(&mm->mm_users);
350 spin_unlock(&mmlist_lock);
351
352 nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
353
354 mmput(mm);
355
356 if (!nr_pages)
357 return 1;
358 } while (--counter >= 0);
359
360 return 0;
361
362empty:
363 spin_unlock(&mmlist_lock);
364 return 0;
365}
366
367static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
368static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
369static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
370{
371 struct list_head * entry;
372 int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
373 int max_mapped = vm_mapped_ratio * nr_pages;
374
375 while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
376 struct page * page;
377
378 if (unlikely(current->need_resched)) {
379 spin_unlock(&pagemap_lru_lock);
380 __set_current_state(TASK_RUNNING);
381 schedule();
382 spin_lock(&pagemap_lru_lock);
383 continue;
384 }
385
386 page = list_entry(entry, struct page, lru);
387
388 BUG_ON(!PageLRU(page));
389 BUG_ON(PageActive(page));
390
391 list_del(entry);
392 list_add(entry, &inactive_list);
393
394
395
396
397
398 if (unlikely(!page_count(page)))
399 continue;
400
401 if (!memclass(page_zone(page), classzone))
402 continue;
403
404 max_scan--;
405
406
407 if (!page->buffers && (page_count(page) != 1 || !page->mapping))
408 goto page_mapped;
409
410
411
412
413
414 if (unlikely(TryLockPage(page))) {
415 if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
416 page_cache_get(page);
417 spin_unlock(&pagemap_lru_lock);
418 wait_on_page(page);
419 page_cache_release(page);
420 spin_lock(&pagemap_lru_lock);
421 }
422 continue;
423 }
424
425 if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
426
427
428
429
430
431
432
433
434 int (*writepage)(struct page *);
435
436 writepage = page->mapping->a_ops->writepage;
437 if ((gfp_mask & __GFP_FS) && writepage) {
438 ClearPageDirty(page);
439 SetPageLaunder(page);
440 page_cache_get(page);
441 spin_unlock(&pagemap_lru_lock);
442
443 writepage(page);
444 page_cache_release(page);
445
446 spin_lock(&pagemap_lru_lock);
447 continue;
448 }
449 }
450
451
452
453
454
455
456 if (page->buffers) {
457 spin_unlock(&pagemap_lru_lock);
458
459
460 page_cache_get(page);
461
462 if (try_to_release_page(page, gfp_mask)) {
463 if (!page->mapping) {
464
465
466
467
468
469
470 spin_lock(&pagemap_lru_lock);
471 UnlockPage(page);
472 __lru_cache_del(page);
473
474
475 page_cache_release(page);
476
477 if (--nr_pages)
478 continue;
479 break;
480 } else {
481
482
483
484
485
486 page_cache_release(page);
487
488 spin_lock(&pagemap_lru_lock);
489 }
490 } else {
491
492 UnlockPage(page);
493 page_cache_release(page);
494
495 spin_lock(&pagemap_lru_lock);
496 continue;
497 }
498 }
499
500 spin_lock(&pagecache_lock);
501
502
503
504
505
506
507
508
509
510 if (!page->mapping || page_count(page) > 1) {
511 spin_unlock(&pagecache_lock);
512 UnlockPage(page);
513page_mapped:
514 if (--max_mapped < 0) {
515 spin_unlock(&pagemap_lru_lock);
516
517 nr_pages -= kmem_cache_reap(gfp_mask);
518 if (nr_pages <= 0)
519 goto out;
520
521 shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
522 shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
523#ifdef CONFIG_QUOTA
524 shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
525#endif
526
527 if (!*failed_swapout)
528 *failed_swapout = !swap_out(classzone);
529
530 max_mapped = nr_pages * vm_mapped_ratio;
531
532 spin_lock(&pagemap_lru_lock);
533 refill_inactive(nr_pages, classzone);
534 }
535 continue;
536
537 }
538 if (PageDirty(page)) {
539 spin_unlock(&pagecache_lock);
540 UnlockPage(page);
541 continue;
542 }
543
544 __lru_cache_del(page);
545
546
547 if (likely(!PageSwapCache(page))) {
548 __remove_inode_page(page);
549 spin_unlock(&pagecache_lock);
550 } else {
551 swp_entry_t swap;
552 swap.val = page->index;
553 __delete_from_swap_cache(page);
554 spin_unlock(&pagecache_lock);
555 swap_free(swap);
556 }
557
558 UnlockPage(page);
559
560
561 page_cache_release(page);
562
563 if (--nr_pages)
564 continue;
565 break;
566 }
567 spin_unlock(&pagemap_lru_lock);
568
569 out:
570 return nr_pages;
571}
572
573
574
575
576
577
578
579
580static void refill_inactive(int nr_pages, zone_t * classzone)
581{
582 struct list_head * entry;
583 unsigned long ratio;
584
585 ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);
586
587 entry = active_list.prev;
588 while (ratio && entry != &active_list) {
589 struct page * page;
590
591 page = list_entry(entry, struct page, lru);
592 entry = entry->prev;
593 if (PageTestandClearReferenced(page)) {
594 list_del(&page->lru);
595 list_add(&page->lru, &active_list);
596 continue;
597 }
598
599 ratio--;
600
601 del_page_from_active_list(page);
602 add_page_to_inactive_list(page);
603 SetPageReferenced(page);
604 }
605
606 if (entry != &active_list) {
607 list_del(&active_list);
608 list_add(&active_list, entry);
609 }
610}
611
612static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
613static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
614{
615 nr_pages -= kmem_cache_reap(gfp_mask);
616 if (nr_pages <= 0)
617 goto out;
618
619 spin_lock(&pagemap_lru_lock);
620 refill_inactive(nr_pages, classzone);
621
622 nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);
623
624out:
625 return nr_pages;
626}
627
628static int check_classzone_need_balance(zone_t * classzone);
629
630int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
631{
632 gfp_mask = pf_gfp_mask(gfp_mask);
633
634 for (;;) {
635 int tries = vm_passes;
636 int failed_swapout = !(gfp_mask & __GFP_IO);
637 int nr_pages = SWAP_CLUSTER_MAX;
638
639 do {
640 nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
641 if (nr_pages <= 0)
642 return 1;
643 shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
644 shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
645#ifdef CONFIG_QUOTA
646 shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
647#endif
648 if (!failed_swapout)
649 failed_swapout = !swap_out(classzone);
650 } while (--tries);
651
652#ifdef CONFIG_OOM_KILLER
653 out_of_memory();
654#else
655 if (likely(current->pid != 1))
656 break;
657 if (!check_classzone_need_balance(classzone))
658 break;
659
660 __set_current_state(TASK_RUNNING);
661 yield();
662#endif
663 }
664
665 return 0;
666}
667
668int try_to_free_pages(unsigned int gfp_mask)
669{
670 pg_data_t *pgdat;
671 zonelist_t *zonelist;
672 unsigned long pf_free_pages;
673 int error = 0;
674
675 pf_free_pages = current->flags & PF_FREE_PAGES;
676 current->flags &= ~PF_FREE_PAGES;
677
678 for_each_pgdat(pgdat) {
679 zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
680 error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
681 }
682
683 current->flags |= pf_free_pages;
684 return error;
685}
686
/*
 * Wait queue that the kswapd thread puts itself on before sleeping (see
 * kswapd()).  NOTE(review): the wakers are outside this file - presumably
 * the page allocator; confirm against the callers of wake_up on this queue.
 */
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
688
689static int check_classzone_need_balance(zone_t * classzone)
690{
691 zone_t * first_zone;
692 int class_idx = zone_idx(classzone);
693
694 first_zone = classzone->zone_pgdat->node_zones;
695 while (classzone >= first_zone) {
696 if (classzone->free_pages > classzone->watermarks[class_idx].high)
697 return 0;
698 classzone--;
699 }
700 return 1;
701}
702
703static int kswapd_balance_pgdat(pg_data_t * pgdat)
704{
705 int need_more_balance = 0, i;
706 zone_t * zone;
707
708 for (i = pgdat->nr_zones-1; i >= 0; i--) {
709 zone = pgdat->node_zones + i;
710 if (unlikely(current->need_resched))
711 schedule();
712 if (!zone->need_balance || !zone->size)
713 continue;
714 if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
715 zone->need_balance = 0;
716 __set_current_state(TASK_INTERRUPTIBLE);
717 schedule_timeout(HZ*5);
718 continue;
719 }
720 if (check_classzone_need_balance(zone))
721 need_more_balance = 1;
722 else
723 zone->need_balance = 0;
724 }
725
726 return need_more_balance;
727}
728
729static void kswapd_balance(void)
730{
731 int need_more_balance;
732 pg_data_t * pgdat;
733
734 do {
735 need_more_balance = 0;
736
737 for_each_pgdat(pgdat)
738 need_more_balance |= kswapd_balance_pgdat(pgdat);
739 } while (need_more_balance);
740}
741
742static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
743{
744 zone_t * zone;
745 int i;
746
747 for (i = pgdat->nr_zones-1; i >= 0; i--) {
748 zone = pgdat->node_zones + i;
749 if (!zone->need_balance || !zone->size)
750 continue;
751 return 0;
752 }
753
754 return 1;
755}
756
757static int kswapd_can_sleep(void)
758{
759 pg_data_t * pgdat;
760
761 for_each_pgdat(pgdat) {
762 if (!kswapd_can_sleep_pgdat(pgdat))
763 return 0;
764 }
765
766 return 1;
767}
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782int kswapd(void *unused)
783{
784 struct task_struct *tsk = current;
785 DECLARE_WAITQUEUE(wait, tsk);
786
787 daemonize();
788 strcpy(tsk->comm, "kswapd");
789 sigfillset(&tsk->blocked);
790
791
792
793
794
795
796
797
798
799
800
801
802
803 tsk->flags |= PF_MEMALLOC;
804
805
806
807
808 for (;;) {
809 __set_current_state(TASK_INTERRUPTIBLE);
810 add_wait_queue(&kswapd_wait, &wait);
811
812 mb();
813 if (kswapd_can_sleep())
814 schedule();
815
816 __set_current_state(TASK_RUNNING);
817 remove_wait_queue(&kswapd_wait, &wait);
818
819
820
821
822
823
824 kswapd_balance();
825 run_task_queue(&tq_disk);
826 }
827}
828
829static int __init kswapd_init(void)
830{
831 printk("Starting kswapd\n");
832 swap_setup();
833 kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
834 return 0;
835}
836
837module_init(kswapd_init)
838