1
2
3
4
5
6
7
8
9
10
11
12
13#include <linux/mm.h>
14#include <linux/sched.h>
15#include <linux/head.h>
16#include <linux/kernel.h>
17#include <linux/kernel_stat.h>
18#include <linux/errno.h>
19#include <linux/string.h>
20#include <linux/stat.h>
21#include <linux/swap.h>
22#include <linux/fs.h>
23#include <linux/swapctl.h>
24#include <linux/smp_lock.h>
25#include <linux/slab.h>
26
27#include <asm/dma.h>
28#include <asm/system.h>
29#include <asm/uaccess.h>
30#include <asm/bitops.h>
31#include <asm/pgtable.h>
32
33
34
35
36static int next_swap_jiffies = 0;
37
38
39
40
41
42int swapout_interval = HZ / 4;
43
44
45
46
47static struct wait_queue * kswapd_wait = NULL;
48
49
50
51
52static int kswapd_awake = 0;
53
54static void init_swap_timer(void);
55
56
57
58
59
60
61
62
63
64
65
66
67static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
68 unsigned long address, pte_t * page_table, int dma, int wait)
69{
70 pte_t pte;
71 unsigned long entry;
72 unsigned long page;
73 struct page * page_map;
74
75 pte = *page_table;
76 if (!pte_present(pte))
77 return 0;
78 page = pte_page(pte);
79 if (MAP_NR(page) >= max_mapnr)
80 return 0;
81
82 page_map = mem_map + MAP_NR(page);
83 if (PageReserved(page_map)
84 || PageLocked(page_map)
85 || (dma && !PageDMA(page_map)))
86 return 0;
87
88
89
90 if ((pte_dirty(pte) && delete_from_swap_cache(page_map))
91 || pte_young(pte)) {
92 set_pte(page_table, pte_mkold(pte));
93 touch_page(page_map);
94 return 0;
95 }
96 age_page(page_map);
97 if (page_map->age)
98 return 0;
99 if (pte_dirty(pte)) {
100 if (vma->vm_ops && vma->vm_ops->swapout) {
101 pid_t pid = tsk->pid;
102 vma->vm_mm->rss--;
103 if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
104 kill_proc(pid, SIGBUS, 1);
105 } else {
106 if (atomic_read(&page_map->count) != 1)
107 return 0;
108 if (!(entry = get_swap_page()))
109 return 0;
110 vma->vm_mm->rss--;
111 flush_cache_page(vma, address);
112 set_pte(page_table, __pte(entry));
113 flush_tlb_page(vma, address);
114 tsk->nswap++;
115 rw_swap_page(WRITE, entry, (char *) page, wait);
116 }
117 free_page(page);
118 return 1;
119 }
120 if ((entry = find_in_swap_cache(page_map))) {
121 if (atomic_read(&page_map->count) != 1) {
122 set_pte(page_table, pte_mkdirty(pte));
123 printk("Aiee.. duplicated cached swap-cache entry\n");
124 return 0;
125 }
126 vma->vm_mm->rss--;
127 flush_cache_page(vma, address);
128 set_pte(page_table, __pte(entry));
129 flush_tlb_page(vma, address);
130 free_page(page);
131 return 1;
132 }
133 vma->vm_mm->rss--;
134 flush_cache_page(vma, address);
135 pte_clear(page_table);
136 flush_tlb_page(vma, address);
137 entry = page_unuse(page);
138 free_page(page);
139 return entry;
140}
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
157 pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
158{
159 pte_t * pte;
160 unsigned long pmd_end;
161
162 if (pmd_none(*dir))
163 return 0;
164 if (pmd_bad(*dir)) {
165 printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
166 pmd_clear(dir);
167 return 0;
168 }
169
170 pte = pte_offset(dir, address);
171
172 pmd_end = (address + PMD_SIZE) & PMD_MASK;
173 if (end > pmd_end)
174 end = pmd_end;
175
176 do {
177 int result;
178 tsk->swap_address = address + PAGE_SIZE;
179 result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
180 if (result)
181 return result;
182 address += PAGE_SIZE;
183 pte++;
184 } while (address < end);
185 return 0;
186}
187
188static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
189 pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
190{
191 pmd_t * pmd;
192 unsigned long pgd_end;
193
194 if (pgd_none(*dir))
195 return 0;
196 if (pgd_bad(*dir)) {
197 printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
198 pgd_clear(dir);
199 return 0;
200 }
201
202 pmd = pmd_offset(dir, address);
203
204 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
205 if (end > pgd_end)
206 end = pgd_end;
207
208 do {
209 int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
210 if (result)
211 return result;
212 address = (address + PMD_SIZE) & PMD_MASK;
213 pmd++;
214 } while (address < end);
215 return 0;
216}
217
218static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
219 pgd_t *pgdir, unsigned long start, int dma, int wait)
220{
221 unsigned long end;
222
223
224
225 if (vma->vm_flags & (VM_SHM | VM_LOCKED))
226 return 0;
227
228 end = vma->vm_end;
229 while (start < end) {
230 int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
231 if (result)
232 return result;
233 start = (start + PGDIR_SIZE) & PGDIR_MASK;
234 pgdir++;
235 }
236 return 0;
237}
238
239static int swap_out_process(struct task_struct * p, int dma, int wait)
240{
241 unsigned long address;
242 struct vm_area_struct* vma;
243
244
245
246
247 address = p->swap_address;
248 p->swap_address = 0;
249
250
251
252
253 vma = find_vma(p->mm, address);
254 if (!vma)
255 return 0;
256 if (address < vma->vm_start)
257 address = vma->vm_start;
258
259 for (;;) {
260 int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
261 if (result)
262 return result;
263 vma = vma->vm_next;
264 if (!vma)
265 break;
266 address = vma->vm_start;
267 }
268 p->swap_address = 0;
269 return 0;
270}
271
272
273
274
275
276
277static int swap_out(unsigned int priority, int dma, int wait)
278{
279 struct task_struct * p, * pbest;
280 int counter, assign, max_cnt;
281
282
283
284
285
286
287
288
289
290
291 counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
292 for (; counter >= 0; counter--) {
293 assign = 0;
294 max_cnt = 0;
295 pbest = NULL;
296 select:
297 read_lock(&tasklist_lock);
298 p = init_task.next_task;
299 for (; p != &init_task; p = p->next_task) {
300 if (!p->swappable)
301 continue;
302 if (p->mm->rss <= 0)
303 continue;
304 if (assign) {
305
306
307
308
309
310
311 p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
312 }
313 if (p->swap_cnt > max_cnt) {
314 max_cnt = p->swap_cnt;
315 pbest = p;
316 }
317 }
318 read_unlock(&tasklist_lock);
319 if (!pbest) {
320 if (!assign) {
321 assign = 1;
322 goto select;
323 }
324 goto out;
325 }
326 pbest->swap_cnt--;
327
328 switch (swap_out_process(pbest, dma, wait)) {
329 case 0:
330
331
332
333
334
335 pbest->swap_cnt = 0;
336 break;
337 case 1:
338 return 1;
339 default:
340 break;
341 };
342 }
343out:
344 return 0;
345}
346
347
348
349
350
351
/*
 * Try the three page sources in rotation -- shrink_mmap(),
 * shm_swap(), swap_out() -- until one of them reports success.
 *
 * The static `state` remembers which source to try first on the next
 * call (0, 1, or anything else for swap_out()).  It only advances
 * past a source that failed, so a source that just succeeded is
 * tried first again next time.
 *
 * The scan level passed to the sources starts at 6 and drops by one
 * after each full rotation, down to 3, or down to 0 when `wait` is
 * non-zero.  The `priority` argument is not used by this function.
 *
 * Returns 1 if some source succeeded, 0 otherwise.
 */
static inline int do_try_to_free_page(int priority, int dma, int wait)
{
	static int state = 0;
	int level = 6;
	int lowest = wait ? 0 : 3;

	/* Result deliberately ignored. */
	(void) kmem_cache_reap(0, dma, wait);

	for (;;) {
		if (state == 0) {
			if (shrink_mmap(level, dma))
				return 1;
			state = 1;
		}
		if (state == 1) {
			if (shm_swap(level, dma))
				return 1;
			state = 2;
		}
		/* state is 2 (or any other value) here. */
		if (swap_out(level, dma, wait))
			return 1;
		state = 0;
		level--;
		if (level - lowest < 0)
			break;
	}
	return 0;
}
384
385
386
387
388
389
390
391
/*
 * Entry point for freeing one page: runs do_try_to_free_page() with
 * the same arguments between lock_kernel() and unlock_kernel(), and
 * returns its result unchanged.
 */
int try_to_free_page(int priority, int dma, int wait)
{
	int freed;

	lock_kernel();
	freed = do_try_to_free_page(priority, dma, wait);
	unlock_kernel();

	return freed;
}
401
402
403
404
405
406
407
/*
 * Print the kswapd start-up banner.
 *
 * The version shown is the text between the ':' and the following
 * '$' of the revision keyword string.  If either delimiter is
 * missing, the whole string is printed (precision -1).
 */
void kswapd_setup(void)
{
	char *revision = "$Revision: 1.23 $";
	char *text;
	char *stop = NULL;
	int len;

	text = strchr(revision, ':');
	if (text)
		stop = strchr(text, '$');

	if (text && stop) {
		/* Skip the ':' itself, then measure up to the closing '$'. */
		text++;
		len = stop - text;
	} else {
		text = revision;
		len = -1;
	}
	printk ("Starting kswapd v%.*s\n", len, text);
}
420
421
422
423
424
425int kswapd(void *unused)
426{
427 current->session = 1;
428 current->pgrp = 1;
429 sprintf(current->comm, "kswapd");
430 current->blocked = ~0UL;
431
432
433
434
435
436
437 lock_kernel();
438
439
440 current->policy = SCHED_FIFO;
441 current->priority = 32;
442
443
444
445 init_swap_timer();
446
447 while (1) {
448 kswapd_awake = 0;
449 current->signal = 0;
450 run_task_queue(&tq_disk);
451 interruptible_sleep_on(&kswapd_wait);
452 kswapd_awake = 1;
453 swapstats.wakeups++;
454
455
456
457
458
459 while(nr_free_pages < min_free_pages)
460 try_to_free_page(GFP_KERNEL, 0, 1);
461 while((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_low)
462 try_to_free_page(GFP_KERNEL, 0, 1);
463 while((nr_free_pages + atomic_read(&nr_async_pages)) < free_pages_high)
464 try_to_free_page(GFP_KERNEL, 0, 0);
465 }
466}
467
468
469
470
471
472void swap_tick(void)
473{
474 int want_wakeup = 0, memory_low = 0;
475 int pages = nr_free_pages + atomic_read(&nr_async_pages);
476
477 if (pages < free_pages_low)
478 memory_low = want_wakeup = 1;
479 else if (pages < free_pages_high && jiffies >= next_swap_jiffies)
480 want_wakeup = 1;
481
482 if (want_wakeup) {
483 if (!kswapd_awake) {
484 wake_up(&kswapd_wait);
485 need_resched = 1;
486 }
487
488 next_swap_jiffies = jiffies;
489 if (!memory_low)
490 next_swap_jiffies += swapout_interval;
491 }
492 timer_active |= (1<<SWAP_TIMER);
493}
494
495
496
497
498
499void init_swap_timer(void)
500{
501 timer_table[SWAP_TIMER].expires = 0;
502 timer_table[SWAP_TIMER].fn = swap_tick;
503 timer_active |= (1<<SWAP_TIMER);
504}
505