1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
37
38#include <asm/pgtable.h>
39#include <asm/tlbflush.h>
40#include <linux/swapops.h>
41#include <linux/page_cgroup.h>
42
/* Forward declarations for helpers that are used before they are defined. */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

/*
 * Held by the allocation, free and accounting paths in this file around
 * accesses to swap_list, the page counters and the per-area swap maps.
 */
DEFINE_SPINLOCK(swap_lock);
/* Bound used by this file when validating a type index into swap_info[]. */
static unsigned int nr_swapfiles;
/* Adjusted when slots are allocated/freed and when areas come and go. */
long nr_swap_pages;
/* Adjusted when an area is enabled or removed. */
long total_swap_pages;
/* Pre-decremented to hand out a priority when a negative one is requested. */
static int least_priority;

/* Message fragments printed by swap_info_get() when an entry is rejected. */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/* List of enabled areas; both indices start at -1, i.e. the list is empty. */
struct swap_list_t swap_list = {-1, -1};

/* Per-area descriptors, indexed by swap type. */
struct swap_info_struct *swap_info[MAX_SWAPFILES];

/* Taken by swapoff while tearing an area down and by the /proc iterator. */
static DEFINE_MUTEX(swapon_mutex);

/* Pollers of the swaps proc file sleep here; woken after a swapoff. */
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Event counter compared against seq->poll_event in swaps_poll(). */
static atomic_t proc_poll_event = ATOMIC_INIT(0);
68
69static inline unsigned char swap_count(unsigned char ent)
70{
71 return ent & ~SWAP_HAS_CACHE;
72}
73
74
75static int
76__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
77{
78 swp_entry_t entry = swp_entry(si->type, offset);
79 struct page *page;
80 int ret = 0;
81
82 page = find_get_page(&swapper_space, entry.val);
83 if (!page)
84 return 0;
85
86
87
88
89
90
91
92 if (trylock_page(page)) {
93 ret = try_to_free_swap(page);
94 unlock_page(page);
95 }
96 page_cache_release(page);
97 return ret;
98}
99
100
101
102
103
104static int discard_swap(struct swap_info_struct *si)
105{
106 struct swap_extent *se;
107 sector_t start_block;
108 sector_t nr_blocks;
109 int err = 0;
110
111
112 se = &si->first_swap_extent;
113 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
114 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
115 if (nr_blocks) {
116 err = blkdev_issue_discard(si->bdev, start_block,
117 nr_blocks, GFP_KERNEL, 0);
118 if (err)
119 return err;
120 cond_resched();
121 }
122
123 list_for_each_entry(se, &si->first_swap_extent.list, list) {
124 start_block = se->start_block << (PAGE_SHIFT - 9);
125 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
126
127 err = blkdev_issue_discard(si->bdev, start_block,
128 nr_blocks, GFP_KERNEL, 0);
129 if (err)
130 break;
131
132 cond_resched();
133 }
134 return err;
135}
136
137
138
139
140
141static void discard_swap_cluster(struct swap_info_struct *si,
142 pgoff_t start_page, pgoff_t nr_pages)
143{
144 struct swap_extent *se = si->curr_swap_extent;
145 int found_extent = 0;
146
147 while (nr_pages) {
148 struct list_head *lh;
149
150 if (se->start_page <= start_page &&
151 start_page < se->start_page + se->nr_pages) {
152 pgoff_t offset = start_page - se->start_page;
153 sector_t start_block = se->start_block + offset;
154 sector_t nr_blocks = se->nr_pages - offset;
155
156 if (nr_blocks > nr_pages)
157 nr_blocks = nr_pages;
158 start_page += nr_blocks;
159 nr_pages -= nr_blocks;
160
161 if (!found_extent++)
162 si->curr_swap_extent = se;
163
164 start_block <<= PAGE_SHIFT - 9;
165 nr_blocks <<= PAGE_SHIFT - 9;
166 if (blkdev_issue_discard(si->bdev, start_block,
167 nr_blocks, GFP_NOIO, 0))
168 break;
169 }
170
171 lh = se->list.next;
172 se = list_entry(lh, struct swap_extent, list);
173 }
174}
175
/*
 * Action callback handed to wait_on_bit() in scan_swap_map(): just give
 * up the CPU.  @word is unused.  Always returns 0.
 */
static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}
181
/* Number of consecutive free slots that make up a "cluster". */
#define SWAPFILE_CLUSTER	256
/* Number of scan steps between cond_resched() calls. */
#define LATENCY_LIMIT		256

/*
 * Find a free slot in swap area @si and mark it with @usage.
 *
 * Callers in this file invoke it with swap_lock held; the lock is
 * dropped and retaken internally around the long scans, the reclaim
 * attempt and the discard, and is held again on return.
 *
 * Returns the allocated slot offset, or 0 when no slot could be found
 * (or the area is not writable / has no free range).
 */
static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/*
	 * SWP_SCANNING is added and subtracted rather than OR-ed, so the
	 * flags word counts how many scanners are active; swapoff waits
	 * for flags to drop below SWP_SCANNING before tearing down.
	 */
	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* Current cluster used up: try to find a new, completely free one. */
	if (unlikely(!si->cluster_nr--)) {
		/* Fewer free pages than a cluster: don't bother searching. */
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}
		if (si->flags & SWP_DISCARDABLE) {
			/*
			 * A non-zero lowest_alloc means a cluster search
			 * with discard tracking is already under way, so
			 * skip starting another one.  Otherwise reset the
			 * [lowest_alloc, highest_alloc] tracking window.
			 */
			if (si->lowest_alloc)
				goto checks;
			si->lowest_alloc = si->max;
			si->highest_alloc = 0;
		}
		/* The cluster search below runs without swap_lock. */
		spin_unlock(&swap_lock);

		/*
		 * Unless the device is flagged SWP_SOLIDSTATE, restart the
		 * search from the lowest possibly-free slot.
		 */
		if (!(si->flags & SWP_SOLIDSTATE))
			scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Look for SWAPFILE_CLUSTER consecutive free slots upwards. */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Same search again, over the range below scan_base. */
		for (; last_in_cluster < scan_base; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* No free cluster: fall back to a slot-by-slot search. */
		offset = scan_base;
		spin_lock(&swap_lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		si->lowest_alloc = 0;
	}

checks:
	/* swap_lock is held here. */
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	/* Slot is held only by the swap cache: try to reclaim and reuse it. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		spin_unlock(&swap_lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&swap_lock);
		/* Freed: re-run the checks on the same slot. */
		if (swap_was_freed)
			goto checks;
		goto scan; /* still busy, look at the next slot */
	}

	if (si->swap_map[offset])
		goto scan;

	/* Slot is free: claim it and update the free-range bookkeeping. */
	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		/* Area is now full: mark the free range as empty. */
		si->lowest_bit = si->max;
		si->highest_bit = 0;
	}
	si->swap_map[offset] = usage;
	si->cluster_next = offset + 1;
	si->flags -= SWP_SCANNING;

	if (si->lowest_alloc) {
		/* Discard tracking is active for this area. */
		if (found_free_cluster) {
			/*
			 * We found the free cluster, so we issue the
			 * discard.  If slots were allocated inside the
			 * cluster meanwhile, stop the discard just below
			 * the lowest of them.
			 */
			if (offset < si->highest_alloc &&
			    si->lowest_alloc <= last_in_cluster)
				last_in_cluster = si->lowest_alloc - 1;
			si->flags |= SWP_DISCARDING;
			spin_unlock(&swap_lock);

			if (offset < last_in_cluster)
				discard_swap_cluster(si, offset,
					last_in_cluster - offset + 1);

			spin_lock(&swap_lock);
			si->lowest_alloc = 0;
			si->flags &= ~SWP_DISCARDING;

			/* Barrier before waking waiters on the flag bit. */
			smp_mb();
			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));

		} else if (si->flags & SWP_DISCARDING) {
			/*
			 * Another task is discarding: wait for it to finish
			 * before returning the slot to the caller.
			 */
			spin_unlock(&swap_lock);
			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
				wait_for_discard, TASK_UNINTERRUPTIBLE);
			spin_lock(&swap_lock);
		} else {
			/*
			 * Record this allocation in the tracking window so a
			 * later discard can avoid it.
			 */
			if (offset < si->lowest_alloc)
				si->lowest_alloc = offset;
			if (offset > si->highest_alloc)
				si->highest_alloc = offset;
		}
	}
	return offset;

scan:
	/* Slot-by-slot search, performed without swap_lock. */
	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	/* Wrap around and search the range below scan_base. */
	offset = si->lowest_bit;
	while (++offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}
413
414swp_entry_t get_swap_page(void)
415{
416 struct swap_info_struct *si;
417 pgoff_t offset;
418 int type, next;
419 int wrapped = 0;
420
421 spin_lock(&swap_lock);
422 if (nr_swap_pages <= 0)
423 goto noswap;
424 nr_swap_pages--;
425
426 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
427 si = swap_info[type];
428 next = si->next;
429 if (next < 0 ||
430 (!wrapped && si->prio != swap_info[next]->prio)) {
431 next = swap_list.head;
432 wrapped++;
433 }
434
435 if (!si->highest_bit)
436 continue;
437 if (!(si->flags & SWP_WRITEOK))
438 continue;
439
440 swap_list.next = next;
441
442 offset = scan_swap_map(si, SWAP_HAS_CACHE);
443 if (offset) {
444 spin_unlock(&swap_lock);
445 return swp_entry(type, offset);
446 }
447 next = swap_list.next;
448 }
449
450 nr_swap_pages++;
451noswap:
452 spin_unlock(&swap_lock);
453 return (swp_entry_t) {0};
454}
455
456
457swp_entry_t get_swap_page_of_type(int type)
458{
459 struct swap_info_struct *si;
460 pgoff_t offset;
461
462 spin_lock(&swap_lock);
463 si = swap_info[type];
464 if (si && (si->flags & SWP_WRITEOK)) {
465 nr_swap_pages--;
466
467 offset = scan_swap_map(si, 1);
468 if (offset) {
469 spin_unlock(&swap_lock);
470 return swp_entry(type, offset);
471 }
472 nr_swap_pages++;
473 }
474 spin_unlock(&swap_lock);
475 return (swp_entry_t) {0};
476}
477
478static struct swap_info_struct *swap_info_get(swp_entry_t entry)
479{
480 struct swap_info_struct *p;
481 unsigned long offset, type;
482
483 if (!entry.val)
484 goto out;
485 type = swp_type(entry);
486 if (type >= nr_swapfiles)
487 goto bad_nofile;
488 p = swap_info[type];
489 if (!(p->flags & SWP_USED))
490 goto bad_device;
491 offset = swp_offset(entry);
492 if (offset >= p->max)
493 goto bad_offset;
494 if (!p->swap_map[offset])
495 goto bad_free;
496 spin_lock(&swap_lock);
497 return p;
498
499bad_free:
500 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
501 goto out;
502bad_offset:
503 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
504 goto out;
505bad_device:
506 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
507 goto out;
508bad_nofile:
509 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
510out:
511 return NULL;
512}
513
514static unsigned char swap_entry_free(struct swap_info_struct *p,
515 swp_entry_t entry, unsigned char usage)
516{
517 unsigned long offset = swp_offset(entry);
518 unsigned char count;
519 unsigned char has_cache;
520
521 count = p->swap_map[offset];
522 has_cache = count & SWAP_HAS_CACHE;
523 count &= ~SWAP_HAS_CACHE;
524
525 if (usage == SWAP_HAS_CACHE) {
526 VM_BUG_ON(!has_cache);
527 has_cache = 0;
528 } else if (count == SWAP_MAP_SHMEM) {
529
530
531
532
533 count = 0;
534 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
535 if (count == COUNT_CONTINUED) {
536 if (swap_count_continued(p, offset, count))
537 count = SWAP_MAP_MAX | COUNT_CONTINUED;
538 else
539 count = SWAP_MAP_MAX;
540 } else
541 count--;
542 }
543
544 if (!count)
545 mem_cgroup_uncharge_swap(entry);
546
547 usage = count | has_cache;
548 p->swap_map[offset] = usage;
549
550
551 if (!usage) {
552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset;
554 if (offset > p->highest_bit)
555 p->highest_bit = offset;
556 if (swap_list.next >= 0 &&
557 p->prio > swap_info[swap_list.next]->prio)
558 swap_list.next = p->type;
559 nr_swap_pages++;
560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
562 if (p->flags & SWP_BLKDEV) {
563 struct gendisk *disk = p->bdev->bd_disk;
564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
568 }
569
570 return usage;
571}
572
573
574
575
576
577void swap_free(swp_entry_t entry)
578{
579 struct swap_info_struct *p;
580
581 p = swap_info_get(entry);
582 if (p) {
583 swap_entry_free(p, entry, 1);
584 spin_unlock(&swap_lock);
585 }
586}
587
588
589
590
591void swapcache_free(swp_entry_t entry, struct page *page)
592{
593 struct swap_info_struct *p;
594 unsigned char count;
595
596 p = swap_info_get(entry);
597 if (p) {
598 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
599 if (page)
600 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
601 spin_unlock(&swap_lock);
602 }
603}
604
605
606
607
608
609
610int page_swapcount(struct page *page)
611{
612 int count = 0;
613 struct swap_info_struct *p;
614 swp_entry_t entry;
615
616 entry.val = page_private(page);
617 p = swap_info_get(entry);
618 if (p) {
619 count = swap_count(p->swap_map[swp_offset(entry)]);
620 spin_unlock(&swap_lock);
621 }
622 return count;
623}
624
625
626
627
628
629
630
/*
 * Report whether the locked @page has at most one user, counting both
 * its mappings and (for swap-cache pages) its swap references.  When
 * exactly one user remains and no writeback is in flight, the page is
 * also removed from the swap cache and marked dirty.
 *
 * Returns 1 if the page has at most one user, 0 otherwise (always 0
 * for KSM pages).
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
648
649
650
651
652
/*
 * If the locked @page is in the swap cache, is not under writeback and
 * its swap slot has no remaining count references, remove it from the
 * swap cache and mark it dirty.
 *
 * Returns 1 if the page was removed from the swap cache, 0 otherwise.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	if (!PageSwapCache(page) || PageWriteback(page) ||
	    page_swapcount(page))
		return 0;

	/*
	 * Leave swap alone while pm_suspended_storage() reports true
	 * (presumably storage is suspended, e.g. around hibernation -
	 * confirm against its definition).
	 */
	if (pm_suspended_storage())
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
686
687
688
689
690
691int free_swap_and_cache(swp_entry_t entry)
692{
693 struct swap_info_struct *p;
694 struct page *page = NULL;
695
696 if (non_swap_entry(entry))
697 return 1;
698
699 p = swap_info_get(entry);
700 if (p) {
701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702 page = find_get_page(&swapper_space, entry.val);
703 if (page && !trylock_page(page)) {
704 page_cache_release(page);
705 page = NULL;
706 }
707 }
708 spin_unlock(&swap_lock);
709 }
710 if (page) {
711
712
713
714
715 if (PageSwapCache(page) && !PageWriteback(page) &&
716 (!page_mapped(page) || vm_swap_full())) {
717 delete_from_swap_cache(page);
718 SetPageDirty(page);
719 }
720 unlock_page(page);
721 page_cache_release(page);
722 }
723 return p != NULL;
724}
725
726#ifdef CONFIG_HIBERNATION
727
728
729
730
731
732
733
734
735int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
736{
737 struct block_device *bdev = NULL;
738 int type;
739
740 if (device)
741 bdev = bdget(device);
742
743 spin_lock(&swap_lock);
744 for (type = 0; type < nr_swapfiles; type++) {
745 struct swap_info_struct *sis = swap_info[type];
746
747 if (!(sis->flags & SWP_WRITEOK))
748 continue;
749
750 if (!bdev) {
751 if (bdev_p)
752 *bdev_p = bdgrab(sis->bdev);
753
754 spin_unlock(&swap_lock);
755 return type;
756 }
757 if (bdev == sis->bdev) {
758 struct swap_extent *se = &sis->first_swap_extent;
759
760 if (se->start_block == offset) {
761 if (bdev_p)
762 *bdev_p = bdgrab(sis->bdev);
763
764 spin_unlock(&swap_lock);
765 bdput(bdev);
766 return type;
767 }
768 }
769 }
770 spin_unlock(&swap_lock);
771 if (bdev)
772 bdput(bdev);
773
774 return -ENODEV;
775}
776
777
778
779
780
781sector_t swapdev_block(int type, pgoff_t offset)
782{
783 struct block_device *bdev;
784
785 if ((unsigned int)type >= nr_swapfiles)
786 return 0;
787 if (!(swap_info[type]->flags & SWP_WRITEOK))
788 return 0;
789 return map_swap_entry(swp_entry(type, offset), &bdev);
790}
791
792
793
794
795
796
797
798unsigned int count_swap_pages(int type, int free)
799{
800 unsigned int n = 0;
801
802 spin_lock(&swap_lock);
803 if ((unsigned int)type < nr_swapfiles) {
804 struct swap_info_struct *sis = swap_info[type];
805
806 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages;
808 if (free)
809 n -= sis->inuse_pages;
810 }
811 }
812 spin_unlock(&swap_lock);
813 return n;
814}
815#endif
816
817
818
819
820
821
822static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
823 unsigned long addr, swp_entry_t entry, struct page *page)
824{
825 struct mem_cgroup *memcg;
826 spinlock_t *ptl;
827 pte_t *pte;
828 int ret = 1;
829
830 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
831 GFP_KERNEL, &memcg)) {
832 ret = -ENOMEM;
833 goto out_nolock;
834 }
835
836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
838 mem_cgroup_cancel_charge_swapin(memcg);
839 ret = 0;
840 goto out;
841 }
842
843 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
844 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
845 get_page(page);
846 set_pte_at(vma->vm_mm, addr, pte,
847 pte_mkold(mk_pte(page, vma->vm_page_prot)));
848 page_add_anon_rmap(page, vma, addr);
849 mem_cgroup_commit_charge_swapin(page, memcg);
850 swap_free(entry);
851
852
853
854
855 activate_page(page);
856out:
857 pte_unmap_unlock(pte, ptl);
858out_nolock:
859 return ret;
860}
861
862static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
863 unsigned long addr, unsigned long end,
864 swp_entry_t entry, struct page *page)
865{
866 pte_t swp_pte = swp_entry_to_pte(entry);
867 pte_t *pte;
868 int ret = 0;
869
870
871
872
873
874
875
876
877
878
879 pte = pte_offset_map(pmd, addr);
880 do {
881
882
883
884
885 if (unlikely(pte_same(*pte, swp_pte))) {
886 pte_unmap(pte);
887 ret = unuse_pte(vma, pmd, addr, entry, page);
888 if (ret)
889 goto out;
890 pte = pte_offset_map(pmd, addr);
891 }
892 } while (pte++, addr += PAGE_SIZE, addr != end);
893 pte_unmap(pte - 1);
894out:
895 return ret;
896}
897
898static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
899 unsigned long addr, unsigned long end,
900 swp_entry_t entry, struct page *page)
901{
902 pmd_t *pmd;
903 unsigned long next;
904 int ret;
905
906 pmd = pmd_offset(pud, addr);
907 do {
908 next = pmd_addr_end(addr, end);
909 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
910 continue;
911 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
912 if (ret)
913 return ret;
914 } while (pmd++, addr = next, addr != end);
915 return 0;
916}
917
918static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
919 unsigned long addr, unsigned long end,
920 swp_entry_t entry, struct page *page)
921{
922 pud_t *pud;
923 unsigned long next;
924 int ret;
925
926 pud = pud_offset(pgd, addr);
927 do {
928 next = pud_addr_end(addr, end);
929 if (pud_none_or_clear_bad(pud))
930 continue;
931 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
932 if (ret)
933 return ret;
934 } while (pud++, addr = next, addr != end);
935 return 0;
936}
937
938static int unuse_vma(struct vm_area_struct *vma,
939 swp_entry_t entry, struct page *page)
940{
941 pgd_t *pgd;
942 unsigned long addr, end, next;
943 int ret;
944
945 if (page_anon_vma(page)) {
946 addr = page_address_in_vma(page, vma);
947 if (addr == -EFAULT)
948 return 0;
949 else
950 end = addr + PAGE_SIZE;
951 } else {
952 addr = vma->vm_start;
953 end = vma->vm_end;
954 }
955
956 pgd = pgd_offset(vma->vm_mm, addr);
957 do {
958 next = pgd_addr_end(addr, end);
959 if (pgd_none_or_clear_bad(pgd))
960 continue;
961 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
962 if (ret)
963 return ret;
964 } while (pgd++, addr = next, addr != end);
965 return 0;
966}
967
968static int unuse_mm(struct mm_struct *mm,
969 swp_entry_t entry, struct page *page)
970{
971 struct vm_area_struct *vma;
972 int ret = 0;
973
974 if (!down_read_trylock(&mm->mmap_sem)) {
975
976
977
978
979 activate_page(page);
980 unlock_page(page);
981 down_read(&mm->mmap_sem);
982 lock_page(page);
983 }
984 for (vma = mm->mmap; vma; vma = vma->vm_next) {
985 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
986 break;
987 }
988 up_read(&mm->mmap_sem);
989 return (ret < 0)? ret: 0;
990}
991
992
993
994
995
996
997static unsigned int find_next_to_unuse(struct swap_info_struct *si,
998 unsigned int prev, bool frontswap)
999{
1000 unsigned int max = si->max;
1001 unsigned int i = prev;
1002 unsigned char count;
1003
1004
1005
1006
1007
1008
1009
1010 for (;;) {
1011 if (++i >= max) {
1012 if (!prev) {
1013 i = 0;
1014 break;
1015 }
1016
1017
1018
1019
1020 max = prev + 1;
1021 prev = 0;
1022 i = 1;
1023 }
1024 if (frontswap) {
1025 if (frontswap_test(si, i))
1026 break;
1027 else
1028 continue;
1029 }
1030 count = si->swap_map[i];
1031 if (count && swap_count(count) != SWAP_MAP_BAD)
1032 break;
1033 }
1034 return i;
1035}
1036
1037
1038
1039
1040
1041
1042
1043
1044
/*
 * Bring the in-use slots of swap area @type back into memory and remove
 * the references to them from process page tables and shmem.
 *
 * @frontswap:      only consider slots selected by frontswap_test()
 *                  (see find_next_to_unuse()).
 * @pages_to_unuse: with @frontswap set and a non-zero value, stop after
 *                  that many slots have been processed; 0 means no limit.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -ENOMEM if a
 * page could not be read in, or an error from shmem_unuse()/unuse_mm().
 */
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * start_mm is the mm the search begins with for each slot.  It
	 * starts as init_mm and is later moved to an mm where a reference
	 * was actually found (see new_start_mm below).  A reference on
	 * mm_users is held on whatever start_mm points at.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep asking for the next in-use slot; find_next_to_unuse() wraps
	 * round and returns 0 once nothing is left to visit.
	 */
	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* Read the slot's contents into the swap cache. */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * No page came back.  If the slot was freed in the
			 * meantime that is fine, move on; otherwise treat
			 * it as an allocation failure.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * If ours is the only remaining reference on start_mm, drop
		 * it and fall back to init_mm.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for any read or writeback in flight, then take the
		 * page lock and wait once more for writeback that may have
		 * started before the lock was obtained.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/* Remove the references to this slot. */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/*
			 * NOTE(review): no unlock/release on this path, so
			 * shmem_unuse() presumably consumes the page lock
			 * and reference - confirm.
			 */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			/* References remain: walk the mmlist from start_mm. */
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			/* Extra references released by the mmput()s below. */
			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip an mm whose users have all gone. */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				/*
				 * The reference on the current mm keeps our
				 * list position valid, so the previous one
				 * can be released now.
				 */
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				/*
				 * The count dropped while handling this mm:
				 * remember it as the next starting point.
				 */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * The slot is still referenced and the page is a dirty
		 * swap-cache page: write it back to swap and wait for the
		 * write to finish before going on.
		 * NOTE(review): lock_page() is retaken after
		 * swap_writepage(), which suggests the latter unlocks the
		 * page - confirm.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * Remove the page from the swap cache, but only if it is
		 * still the cache page for this very entry.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * Mark the page dirty before dropping it.
		 * NOTE(review): presumably so its contents are written out
		 * again rather than discarded - confirm.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/* Be nice to other tasks between slots. */
		cond_resched();
		if (frontswap && pages_to_unuse > 0) {
			if (!--pages_to_unuse)
				break;
		}
	}

	mmput(start_mm);
	return retval;
}
1255
1256
1257
1258
1259
1260
1261
1262static void drain_mmlist(void)
1263{
1264 struct list_head *p, *next;
1265 unsigned int type;
1266
1267 for (type = 0; type < nr_swapfiles; type++)
1268 if (swap_info[type]->inuse_pages)
1269 return;
1270 spin_lock(&mmlist_lock);
1271 list_for_each_safe(p, next, &init_mm.mmlist)
1272 list_del_init(p);
1273 spin_unlock(&mmlist_lock);
1274}
1275
1276
1277
1278
1279
1280
1281
1282static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1283{
1284 struct swap_info_struct *sis;
1285 struct swap_extent *start_se;
1286 struct swap_extent *se;
1287 pgoff_t offset;
1288
1289 sis = swap_info[swp_type(entry)];
1290 *bdev = sis->bdev;
1291
1292 offset = swp_offset(entry);
1293 start_se = sis->curr_swap_extent;
1294 se = start_se;
1295
1296 for ( ; ; ) {
1297 struct list_head *lh;
1298
1299 if (se->start_page <= offset &&
1300 offset < (se->start_page + se->nr_pages)) {
1301 return se->start_block + (offset - se->start_page);
1302 }
1303 lh = se->list.next;
1304 se = list_entry(lh, struct swap_extent, list);
1305 sis->curr_swap_extent = se;
1306 BUG_ON(se == start_se);
1307 }
1308}
1309
1310
1311
1312
1313sector_t map_swap_page(struct page *page, struct block_device **bdev)
1314{
1315 swp_entry_t entry;
1316 entry.val = page_private(page);
1317 return map_swap_entry(entry, bdev);
1318}
1319
1320
1321
1322
1323static void destroy_swap_extents(struct swap_info_struct *sis)
1324{
1325 while (!list_empty(&sis->first_swap_extent.list)) {
1326 struct swap_extent *se;
1327
1328 se = list_entry(sis->first_swap_extent.list.next,
1329 struct swap_extent, list);
1330 list_del(&se->list);
1331 kfree(se);
1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1341}
1342
1343
1344
1345
1346
1347
1348
1349int
1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1351 unsigned long nr_pages, sector_t start_block)
1352{
1353 struct swap_extent *se;
1354 struct swap_extent *new_se;
1355 struct list_head *lh;
1356
1357 if (start_page == 0) {
1358 se = &sis->first_swap_extent;
1359 sis->curr_swap_extent = se;
1360 se->start_page = 0;
1361 se->nr_pages = nr_pages;
1362 se->start_block = start_block;
1363 return 1;
1364 } else {
1365 lh = sis->first_swap_extent.list.prev;
1366 se = list_entry(lh, struct swap_extent, list);
1367 BUG_ON(se->start_page + se->nr_pages != start_page);
1368 if (se->start_block + se->nr_pages == start_block) {
1369
1370 se->nr_pages += nr_pages;
1371 return 0;
1372 }
1373 }
1374
1375
1376
1377
1378 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1379 if (new_se == NULL)
1380 return -ENOMEM;
1381 new_se->start_page = start_page;
1382 new_se->nr_pages = nr_pages;
1383 new_se->start_block = start_block;
1384
1385 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1386 return 1;
1387}
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1421{
1422 struct file *swap_file = sis->swap_file;
1423 struct address_space *mapping = swap_file->f_mapping;
1424 struct inode *inode = mapping->host;
1425 int ret;
1426
1427 if (S_ISBLK(inode->i_mode)) {
1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1429 *span = sis->pages;
1430 return ret;
1431 }
1432
1433 if (mapping->a_ops->swap_activate) {
1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1435 if (!ret) {
1436 sis->flags |= SWP_FILE;
1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1438 *span = sis->pages;
1439 }
1440 return ret;
1441 }
1442
1443 return generic_swapfile_activate(sis, swap_file, span);
1444}
1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map)
1449{
1450 int i, prev;
1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0)
1454 p->prio = prio;
1455 else
1456 p->prio = --least_priority;
1457 p->swap_map = swap_map;
1458 frontswap_map_set(p, frontswap_map);
1459 p->flags |= SWP_WRITEOK;
1460 nr_swap_pages += p->pages;
1461 total_swap_pages += p->pages;
1462
1463
1464 prev = -1;
1465 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1466 if (p->prio >= swap_info[i]->prio)
1467 break;
1468 prev = i;
1469 }
1470 p->next = i;
1471 if (prev < 0)
1472 swap_list.head = swap_list.next = p->type;
1473 else
1474 swap_info[prev]->next = p->type;
1475 frontswap_init(p->type);
1476 spin_unlock(&swap_lock);
1477}
1478
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{
1481 struct swap_info_struct *p = NULL;
1482 unsigned char *swap_map;
1483 struct file *swap_file, *victim;
1484 struct address_space *mapping;
1485 struct inode *inode;
1486 struct filename *pathname;
1487 int oom_score_adj;
1488 int i, type, prev;
1489 int err;
1490
1491 if (!capable(CAP_SYS_ADMIN))
1492 return -EPERM;
1493
1494 BUG_ON(!current->mm);
1495
1496 pathname = getname(specialfile);
1497 if (IS_ERR(pathname))
1498 return PTR_ERR(pathname);
1499
1500 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1501 err = PTR_ERR(victim);
1502 if (IS_ERR(victim))
1503 goto out;
1504
1505 mapping = victim->f_mapping;
1506 prev = -1;
1507 spin_lock(&swap_lock);
1508 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1509 p = swap_info[type];
1510 if (p->flags & SWP_WRITEOK) {
1511 if (p->swap_file->f_mapping == mapping)
1512 break;
1513 }
1514 prev = type;
1515 }
1516 if (type < 0) {
1517 err = -EINVAL;
1518 spin_unlock(&swap_lock);
1519 goto out_dput;
1520 }
1521 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1522 vm_unacct_memory(p->pages);
1523 else {
1524 err = -ENOMEM;
1525 spin_unlock(&swap_lock);
1526 goto out_dput;
1527 }
1528 if (prev < 0)
1529 swap_list.head = p->next;
1530 else
1531 swap_info[prev]->next = p->next;
1532 if (type == swap_list.next) {
1533
1534 swap_list.next = swap_list.head;
1535 }
1536 if (p->prio < 0) {
1537 for (i = p->next; i >= 0; i = swap_info[i]->next)
1538 swap_info[i]->prio = p->prio--;
1539 least_priority++;
1540 }
1541 nr_swap_pages -= p->pages;
1542 total_swap_pages -= p->pages;
1543 p->flags &= ~SWP_WRITEOK;
1544 spin_unlock(&swap_lock);
1545
1546 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1547 err = try_to_unuse(type, false, 0);
1548 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1549
1550 if (err) {
1551
1552
1553
1554
1555
1556
1557
1558 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1559 goto out_dput;
1560 }
1561
1562 destroy_swap_extents(p);
1563 if (p->flags & SWP_CONTINUED)
1564 free_swap_count_continuations(p);
1565
1566 mutex_lock(&swapon_mutex);
1567 spin_lock(&swap_lock);
1568 drain_mmlist();
1569
1570
1571 p->highest_bit = 0;
1572 while (p->flags >= SWP_SCANNING) {
1573 spin_unlock(&swap_lock);
1574 schedule_timeout_uninterruptible(1);
1575 spin_lock(&swap_lock);
1576 }
1577
1578 swap_file = p->swap_file;
1579 p->swap_file = NULL;
1580 p->max = 0;
1581 swap_map = p->swap_map;
1582 p->swap_map = NULL;
1583 p->flags = 0;
1584 frontswap_invalidate_area(type);
1585 spin_unlock(&swap_lock);
1586 mutex_unlock(&swapon_mutex);
1587 vfree(swap_map);
1588 vfree(frontswap_map_get(p));
1589
1590 swap_cgroup_swapoff(type);
1591
1592 inode = mapping->host;
1593 if (S_ISBLK(inode->i_mode)) {
1594 struct block_device *bdev = I_BDEV(inode);
1595 set_blocksize(bdev, p->old_block_size);
1596 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1597 } else {
1598 mutex_lock(&inode->i_mutex);
1599 inode->i_flags &= ~S_SWAPFILE;
1600 mutex_unlock(&inode->i_mutex);
1601 }
1602 filp_close(swap_file, NULL);
1603 err = 0;
1604 atomic_inc(&proc_poll_event);
1605 wake_up_interruptible(&proc_poll_wait);
1606
1607out_dput:
1608 filp_close(victim, NULL);
1609out:
1610 putname(pathname);
1611 return err;
1612}
1613
1614#ifdef CONFIG_PROC_FS
1615static unsigned swaps_poll(struct file *file, poll_table *wait)
1616{
1617 struct seq_file *seq = file->private_data;
1618
1619 poll_wait(file, &proc_poll_wait, wait);
1620
1621 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1622 seq->poll_event = atomic_read(&proc_poll_event);
1623 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1624 }
1625
1626 return POLLIN | POLLRDNORM;
1627}
1628
1629
1630static void *swap_start(struct seq_file *swap, loff_t *pos)
1631{
1632 struct swap_info_struct *si;
1633 int type;
1634 loff_t l = *pos;
1635
1636 mutex_lock(&swapon_mutex);
1637
1638 if (!l)
1639 return SEQ_START_TOKEN;
1640
1641 for (type = 0; type < nr_swapfiles; type++) {
1642 smp_rmb();
1643 si = swap_info[type];
1644 if (!(si->flags & SWP_USED) || !si->swap_map)
1645 continue;
1646 if (!--l)
1647 return si;
1648 }
1649
1650 return NULL;
1651}
1652
1653static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1654{
1655 struct swap_info_struct *si = v;
1656 int type;
1657
1658 if (v == SEQ_START_TOKEN)
1659 type = 0;
1660 else
1661 type = si->type + 1;
1662
1663 for (; type < nr_swapfiles; type++) {
1664 smp_rmb();
1665 si = swap_info[type];
1666 if (!(si->flags & SWP_USED) || !si->swap_map)
1667 continue;
1668 ++*pos;
1669 return si;
1670 }
1671
1672 return NULL;
1673}
1674
/*
 * seq_file ->stop: drop swapon_mutex, which swap_start() acquired.
 * Neither argument is used.
 */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
1679
1680static int swap_show(struct seq_file *swap, void *v)
1681{
1682 struct swap_info_struct *si = v;
1683 struct file *file;
1684 int len;
1685
1686 if (si == SEQ_START_TOKEN) {
1687 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1688 return 0;
1689 }
1690
1691 file = si->swap_file;
1692 len = seq_path(swap, &file->f_path, " \t\n\\");
1693 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1694 len < 40 ? 40 - len : 1, " ",
1695 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1696 "partition" : "file\t",
1697 si->pages << (PAGE_SHIFT - 10),
1698 si->inuse_pages << (PAGE_SHIFT - 10),
1699 si->prio);
1700 return 0;
1701}
1702
/*
 * seq_file iterator for the swaps listing: swap_start() takes
 * swapon_mutex, swap_next() walks swap_info[], swap_show() prints one
 * line, and swap_stop() releases the mutex.
 */
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
1709
1710static int swaps_open(struct inode *inode, struct file *file)
1711{
1712 struct seq_file *seq;
1713 int ret;
1714
1715 ret = seq_open(file, &swaps_op);
1716 if (ret)
1717 return ret;
1718
1719 seq = file->private_data;
1720 seq->poll_event = atomic_read(&proc_poll_event);
1721 return 0;
1722}
1723
/*
 * File operations for the "swaps" proc entry registered by
 * procswaps_init(): a seq_file listing that additionally supports
 * poll() through swaps_poll().
 */
static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};
1731
/*
 * Register the "swaps" proc entry (mode 0, no parent directory given)
 * at initcall time.  The result of proc_create() is not checked; this
 * always returns 0.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
1738#endif
1739
1740#ifdef MAX_SWAPFILES_CHECK
/*
 * Run the MAX_SWAPFILES_CHECK() macro once as a late initcall.  Only
 * built when that macro is defined; always returns 0.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
1747#endif
1748
1749static struct swap_info_struct *alloc_swap_info(void)
1750{
1751 struct swap_info_struct *p;
1752 unsigned int type;
1753
1754 p = kzalloc(sizeof(*p), GFP_KERNEL);
1755 if (!p)
1756 return ERR_PTR(-ENOMEM);
1757
1758 spin_lock(&swap_lock);
1759 for (type = 0; type < nr_swapfiles; type++) {
1760 if (!(swap_info[type]->flags & SWP_USED))
1761 break;
1762 }
1763 if (type >= MAX_SWAPFILES) {
1764 spin_unlock(&swap_lock);
1765 kfree(p);
1766 return ERR_PTR(-EPERM);
1767 }
1768 if (type >= nr_swapfiles) {
1769 p->type = type;
1770 swap_info[type] = p;
1771
1772
1773
1774
1775
1776 smp_wmb();
1777 nr_swapfiles++;
1778 } else {
1779 kfree(p);
1780 p = swap_info[type];
1781
1782
1783
1784
1785 }
1786 INIT_LIST_HEAD(&p->first_swap_extent.list);
1787 p->flags = SWP_USED;
1788 p->next = -1;
1789 spin_unlock(&swap_lock);
1790
1791 return p;
1792}
1793
1794static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1795{
1796 int error;
1797
1798 if (S_ISBLK(inode->i_mode)) {
1799 p->bdev = bdgrab(I_BDEV(inode));
1800 error = blkdev_get(p->bdev,
1801 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1802 sys_swapon);
1803 if (error < 0) {
1804 p->bdev = NULL;
1805 return -EINVAL;
1806 }
1807 p->old_block_size = block_size(p->bdev);
1808 error = set_blocksize(p->bdev, PAGE_SIZE);
1809 if (error < 0)
1810 return error;
1811 p->flags |= SWP_BLKDEV;
1812 } else if (S_ISREG(inode->i_mode)) {
1813 p->bdev = inode->i_sb->s_bdev;
1814 mutex_lock(&inode->i_mutex);
1815 if (IS_SWAPFILE(inode))
1816 return -EBUSY;
1817 } else
1818 return -EINVAL;
1819
1820 return 0;
1821}
1822
1823static unsigned long read_swap_header(struct swap_info_struct *p,
1824 union swap_header *swap_header,
1825 struct inode *inode)
1826{
1827 int i;
1828 unsigned long maxpages;
1829 unsigned long swapfilepages;
1830
1831 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1832 printk(KERN_ERR "Unable to find swap-space signature\n");
1833 return 0;
1834 }
1835
1836
1837 if (swab32(swap_header->info.version) == 1) {
1838 swab32s(&swap_header->info.version);
1839 swab32s(&swap_header->info.last_page);
1840 swab32s(&swap_header->info.nr_badpages);
1841 for (i = 0; i < swap_header->info.nr_badpages; i++)
1842 swab32s(&swap_header->info.badpages[i]);
1843 }
1844
1845 if (swap_header->info.version != 1) {
1846 printk(KERN_WARNING
1847 "Unable to handle swap header version %d\n",
1848 swap_header->info.version);
1849 return 0;
1850 }
1851
1852 p->lowest_bit = 1;
1853 p->cluster_next = 1;
1854 p->cluster_nr = 0;
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870 maxpages = swp_offset(pte_to_swp_entry(
1871 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1872 if (maxpages > swap_header->info.last_page) {
1873 maxpages = swap_header->info.last_page + 1;
1874
1875 if ((unsigned int)maxpages == 0)
1876 maxpages = UINT_MAX;
1877 }
1878 p->highest_bit = maxpages - 1;
1879
1880 if (!maxpages)
1881 return 0;
1882 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1883 if (swapfilepages && maxpages > swapfilepages) {
1884 printk(KERN_WARNING
1885 "Swap area shorter than signature indicates\n");
1886 return 0;
1887 }
1888 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1889 return 0;
1890 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1891 return 0;
1892
1893 return maxpages;
1894}
1895
1896static int setup_swap_map_and_extents(struct swap_info_struct *p,
1897 union swap_header *swap_header,
1898 unsigned char *swap_map,
1899 unsigned long maxpages,
1900 sector_t *span)
1901{
1902 int i;
1903 unsigned int nr_good_pages;
1904 int nr_extents;
1905
1906 nr_good_pages = maxpages - 1;
1907
1908 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1909 unsigned int page_nr = swap_header->info.badpages[i];
1910 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1911 return -EINVAL;
1912 if (page_nr < maxpages) {
1913 swap_map[page_nr] = SWAP_MAP_BAD;
1914 nr_good_pages--;
1915 }
1916 }
1917
1918 if (nr_good_pages) {
1919 swap_map[0] = SWAP_MAP_BAD;
1920 p->max = maxpages;
1921 p->pages = nr_good_pages;
1922 nr_extents = setup_swap_extents(p, span);
1923 if (nr_extents < 0)
1924 return nr_extents;
1925 nr_good_pages = p->pages;
1926 }
1927 if (!nr_good_pages) {
1928 printk(KERN_WARNING "Empty swap-file\n");
1929 return -EINVAL;
1930 }
1931
1932 return nr_extents;
1933}
1934
1935SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1936{
1937 struct swap_info_struct *p;
1938 struct filename *name;
1939 struct file *swap_file = NULL;
1940 struct address_space *mapping;
1941 int i;
1942 int prio;
1943 int error;
1944 union swap_header *swap_header;
1945 int nr_extents;
1946 sector_t span;
1947 unsigned long maxpages;
1948 unsigned char *swap_map = NULL;
1949 unsigned long *frontswap_map = NULL;
1950 struct page *page = NULL;
1951 struct inode *inode = NULL;
1952
1953 if (swap_flags & ~SWAP_FLAGS_VALID)
1954 return -EINVAL;
1955
1956 if (!capable(CAP_SYS_ADMIN))
1957 return -EPERM;
1958
1959 p = alloc_swap_info();
1960 if (IS_ERR(p))
1961 return PTR_ERR(p);
1962
1963 name = getname(specialfile);
1964 if (IS_ERR(name)) {
1965 error = PTR_ERR(name);
1966 name = NULL;
1967 goto bad_swap;
1968 }
1969 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
1970 if (IS_ERR(swap_file)) {
1971 error = PTR_ERR(swap_file);
1972 swap_file = NULL;
1973 goto bad_swap;
1974 }
1975
1976 p->swap_file = swap_file;
1977 mapping = swap_file->f_mapping;
1978
1979 for (i = 0; i < nr_swapfiles; i++) {
1980 struct swap_info_struct *q = swap_info[i];
1981
1982 if (q == p || !q->swap_file)
1983 continue;
1984 if (mapping == q->swap_file->f_mapping) {
1985 error = -EBUSY;
1986 goto bad_swap;
1987 }
1988 }
1989
1990 inode = mapping->host;
1991
1992 error = claim_swapfile(p, inode);
1993 if (unlikely(error))
1994 goto bad_swap;
1995
1996
1997
1998
1999 if (!mapping->a_ops->readpage) {
2000 error = -EINVAL;
2001 goto bad_swap;
2002 }
2003 page = read_mapping_page(mapping, 0, swap_file);
2004 if (IS_ERR(page)) {
2005 error = PTR_ERR(page);
2006 goto bad_swap;
2007 }
2008 swap_header = kmap(page);
2009
2010 maxpages = read_swap_header(p, swap_header, inode);
2011 if (unlikely(!maxpages)) {
2012 error = -EINVAL;
2013 goto bad_swap;
2014 }
2015
2016
2017 swap_map = vzalloc(maxpages);
2018 if (!swap_map) {
2019 error = -ENOMEM;
2020 goto bad_swap;
2021 }
2022
2023 error = swap_cgroup_swapon(p->type, maxpages);
2024 if (error)
2025 goto bad_swap;
2026
2027 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2028 maxpages, &span);
2029 if (unlikely(nr_extents < 0)) {
2030 error = nr_extents;
2031 goto bad_swap;
2032 }
2033
2034 if (frontswap_enabled)
2035 frontswap_map = vzalloc(maxpages / sizeof(long));
2036
2037 if (p->bdev) {
2038 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2039 p->flags |= SWP_SOLIDSTATE;
2040 p->cluster_next = 1 + (random32() % p->highest_bit);
2041 }
2042 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2043 p->flags |= SWP_DISCARDABLE;
2044 }
2045
2046 mutex_lock(&swapon_mutex);
2047 prio = -1;
2048 if (swap_flags & SWAP_FLAG_PREFER)
2049 prio =
2050 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2051 enable_swap_info(p, prio, swap_map, frontswap_map);
2052
2053 printk(KERN_INFO "Adding %uk swap on %s. "
2054 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2055 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2056 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2057 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2058 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2059 (frontswap_map) ? "FS" : "");
2060
2061 mutex_unlock(&swapon_mutex);
2062 atomic_inc(&proc_poll_event);
2063 wake_up_interruptible(&proc_poll_wait);
2064
2065 if (S_ISREG(inode->i_mode))
2066 inode->i_flags |= S_SWAPFILE;
2067 error = 0;
2068 goto out;
2069bad_swap:
2070 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2071 set_blocksize(p->bdev, p->old_block_size);
2072 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2073 }
2074 destroy_swap_extents(p);
2075 swap_cgroup_swapoff(p->type);
2076 spin_lock(&swap_lock);
2077 p->swap_file = NULL;
2078 p->flags = 0;
2079 spin_unlock(&swap_lock);
2080 vfree(swap_map);
2081 if (swap_file) {
2082 if (inode && S_ISREG(inode->i_mode)) {
2083 mutex_unlock(&inode->i_mutex);
2084 inode = NULL;
2085 }
2086 filp_close(swap_file, NULL);
2087 }
2088out:
2089 if (page && !IS_ERR(page)) {
2090 kunmap(page);
2091 page_cache_release(page);
2092 }
2093 if (name)
2094 putname(name);
2095 if (inode && S_ISREG(inode->i_mode))
2096 mutex_unlock(&inode->i_mutex);
2097 return error;
2098}
2099
2100void si_swapinfo(struct sysinfo *val)
2101{
2102 unsigned int type;
2103 unsigned long nr_to_be_unused = 0;
2104
2105 spin_lock(&swap_lock);
2106 for (type = 0; type < nr_swapfiles; type++) {
2107 struct swap_info_struct *si = swap_info[type];
2108
2109 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2110 nr_to_be_unused += si->inuse_pages;
2111 }
2112 val->freeswap = nr_swap_pages + nr_to_be_unused;
2113 val->totalswap = total_swap_pages + nr_to_be_unused;
2114 spin_unlock(&swap_lock);
2115}
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2129{
2130 struct swap_info_struct *p;
2131 unsigned long offset, type;
2132 unsigned char count;
2133 unsigned char has_cache;
2134 int err = -EINVAL;
2135
2136 if (non_swap_entry(entry))
2137 goto out;
2138
2139 type = swp_type(entry);
2140 if (type >= nr_swapfiles)
2141 goto bad_file;
2142 p = swap_info[type];
2143 offset = swp_offset(entry);
2144
2145 spin_lock(&swap_lock);
2146 if (unlikely(offset >= p->max))
2147 goto unlock_out;
2148
2149 count = p->swap_map[offset];
2150 has_cache = count & SWAP_HAS_CACHE;
2151 count &= ~SWAP_HAS_CACHE;
2152 err = 0;
2153
2154 if (usage == SWAP_HAS_CACHE) {
2155
2156
2157 if (!has_cache && count)
2158 has_cache = SWAP_HAS_CACHE;
2159 else if (has_cache)
2160 err = -EEXIST;
2161 else
2162 err = -ENOENT;
2163
2164 } else if (count || has_cache) {
2165
2166 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2167 count += usage;
2168 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2169 err = -EINVAL;
2170 else if (swap_count_continued(p, offset, count))
2171 count = COUNT_CONTINUED;
2172 else
2173 err = -ENOMEM;
2174 } else
2175 err = -ENOENT;
2176
2177 p->swap_map[offset] = count | has_cache;
2178
2179unlock_out:
2180 spin_unlock(&swap_lock);
2181out:
2182 return err;
2183
2184bad_file:
2185 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2186 goto out;
2187}
2188
2189
2190
2191
2192
/*
 * Add SWAP_MAP_SHMEM to the count of @entry through __swap_duplicate().
 * The result of __swap_duplicate() is discarded.
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
2197
2198
2199
2200
2201
2202
2203
2204
2205int swap_duplicate(swp_entry_t entry)
2206{
2207 int err = 0;
2208
2209 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2210 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2211 return err;
2212}
2213
2214
2215
2216
2217
2218
2219
2220
2221
/*
 * Try to set SWAP_HAS_CACHE on @entry through __swap_duplicate().
 *
 * Returns 0 on success, -EEXIST if the flag is already set, -ENOENT if
 * the entry has no count, or -EINVAL for an invalid entry (see
 * __swap_duplicate()).
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
2226
/*
 * Return the swap_info_struct for @page, using page_private(page) as
 * the swap entry value and its swap type as the index into swap_info[].
 * BUGs if the page is not PageSwapCache.
 */
struct swap_info_struct *page_swap_info(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };
	BUG_ON(!PageSwapCache(page));
	return swap_info[swp_type(swap)];
}
2233
2234
2235
2236
/*
 * Return the address_space (f_mapping) of the swap file backing @page.
 * VM_BUG_ON if the page is not PageSwapCache.
 */
struct address_space *__page_file_mapping(struct page *page)
{
	VM_BUG_ON(!PageSwapCache(page));
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
2243
/*
 * Return the swap offset encoded in page_private(page).
 * VM_BUG_ON if the page is not PageSwapCache.
 */
pgoff_t __page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };
	VM_BUG_ON(!PageSwapCache(page));
	return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
/*
 * Make sure a continuation page is available to extend the count of
 * swap entry @entry beyond SWAP_MAP_MAX.
 *
 * @entry:    swap entry whose count may need to overflow.
 * @gfp_mask: allocation flags for the new page (__GFP_HIGHMEM is added).
 *
 * Returns -ENOMEM only when a new page turned out to be needed and the
 * allocation had failed; returns 0 in every other case, including when
 * the entry is no longer valid or no new page was required.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * Allocate before swap_info_get(); a NULL result is only acted on
	 * further down, once it is known that the page is needed.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * The entry could not be looked up.  This is not reported
		 * as an error: free the spare page and return 0.
		 * NOTE(review): presumably the entry was freed meanwhile.
		 */
		goto outer;
	}

	/*
	 * From here on swap_lock is evidently held by swap_info_get():
	 * every remaining exit path releases it.
	 */
	offset = swp_offset(entry);
	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The count is not sitting at SWAP_MAP_MAX, so nothing
		 * needs to be extended here.
		 */
		goto out;
	}

	if (!page) {
		spin_unlock(&swap_lock);
		return -ENOMEM;
	}

	/*
	 * head is the page of the vmalloc'ed swap_map that holds this
	 * entry's byte; its lru list links the continuation pages, and
	 * offset becomes the byte position inside each of those pages.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	/*
	 * First continuation for this swap_map page: set up the list
	 * head and mark both the page and the swap area as continued.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * The previous level does not carry COUNT_CONTINUED, so
		 * this existing page is not yet used for the entry and no
		 * further page is needed.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * This continuation byte is not at SWAP_CONT_MAX, so it
		 * still has room.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out;
	}

	/* Every existing level is full: append the new page. */
	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now it's attached, don't free it */
out:
	spin_unlock(&swap_lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
2359
2360
2361
2362
2363
2364
2365
2366
2367
/*
 * Carry an increment or borrow a decrement through the continuation
 * pages of the swap_map byte at @offset.
 *
 * @si:     swap area owning the map.
 * @offset: index of the entry in si->swap_map.
 * @count:  current count part of the map byte.  SWAP_MAP_MAX, with or
 *          without COUNT_CONTINUED, selects the increment path;
 *          otherwise it must equal COUNT_CONTINUED (decrement path).
 *
 * Returns false if the map page has no continuation list, or if the
 * increment ran out of continuation pages.  On the decrement path the
 * result is false only when the first continuation byte dropped to 0;
 * otherwise true.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	/* Byte position of this entry inside each continuation page. */
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Skip continuation bytes that are full and themselves
		 * continued, to reach the first one that can take the carry.
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			/* Full and last in use: move on to the next page. */
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head)
				return false;	/* add count continuation */
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map);
		/* Reset every lower level to "zero, continued". */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Skip continuation bytes that hold zero, to reach the
		 * first one that can be borrowed from.
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		/* This level is now empty: the one below stops being continued. */
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		/* Refill lower levels to SWAP_CONT_MAX on the way back. */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return count == COUNT_CONTINUED;
	}
}
2444
2445
2446
2447
2448
2449static void free_swap_count_continuations(struct swap_info_struct *si)
2450{
2451 pgoff_t offset;
2452
2453 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2454 struct page *head;
2455 head = vmalloc_to_page(si->swap_map + offset);
2456 if (page_private(head)) {
2457 struct list_head *this, *next;
2458 list_for_each_safe(this, next, &head->lru) {
2459 struct page *page;
2460 page = list_entry(this, struct page, lru);
2461 list_del(this);
2462 __free_page(page);
2463 }
2464 }
2465 }
2466}
2467