1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
37
38#include <asm/pgtable.h>
39#include <asm/tlbflush.h>
40#include <linux/swapops.h>
41#include <linux/page_cgroup.h>
42
43static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
44 unsigned char);
45static void free_swap_count_continuations(struct swap_info_struct *);
46static sector_t map_swap_entry(swp_entry_t, struct block_device**);
47
48DEFINE_SPINLOCK(swap_lock);
49static unsigned int nr_swapfiles;
50long nr_swap_pages;
51long total_swap_pages;
52static int least_priority;
53
54static const char Bad_file[] = "Bad swap file entry ";
55static const char Unused_file[] = "Unused swap file entry ";
56static const char Bad_offset[] = "Bad swap offset entry ";
57static const char Unused_offset[] = "Unused swap offset entry ";
58
59struct swap_list_t swap_list = {-1, -1};
60
61struct swap_info_struct *swap_info[MAX_SWAPFILES];
62
63static DEFINE_MUTEX(swapon_mutex);
64
65static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
66
67static atomic_t proc_poll_event = ATOMIC_INIT(0);
68
69static inline unsigned char swap_count(unsigned char ent)
70{
71 return ent & ~SWAP_HAS_CACHE;
72}
73
74
75static int
76__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
77{
78 swp_entry_t entry = swp_entry(si->type, offset);
79 struct page *page;
80 int ret = 0;
81
82 page = find_get_page(&swapper_space, entry.val);
83 if (!page)
84 return 0;
85
86
87
88
89
90
91
92 if (trylock_page(page)) {
93 ret = try_to_free_swap(page);
94 unlock_page(page);
95 }
96 page_cache_release(page);
97 return ret;
98}
99
100
101
102
103
104static int discard_swap(struct swap_info_struct *si)
105{
106 struct swap_extent *se;
107 sector_t start_block;
108 sector_t nr_blocks;
109 int err = 0;
110
111
112 se = &si->first_swap_extent;
113 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
114 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
115 if (nr_blocks) {
116 err = blkdev_issue_discard(si->bdev, start_block,
117 nr_blocks, GFP_KERNEL, 0);
118 if (err)
119 return err;
120 cond_resched();
121 }
122
123 list_for_each_entry(se, &si->first_swap_extent.list, list) {
124 start_block = se->start_block << (PAGE_SHIFT - 9);
125 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
126
127 err = blkdev_issue_discard(si->bdev, start_block,
128 nr_blocks, GFP_KERNEL, 0);
129 if (err)
130 break;
131
132 cond_resched();
133 }
134 return err;
135}
136
137
138
139
140
141static void discard_swap_cluster(struct swap_info_struct *si,
142 pgoff_t start_page, pgoff_t nr_pages)
143{
144 struct swap_extent *se = si->curr_swap_extent;
145 int found_extent = 0;
146
147 while (nr_pages) {
148 struct list_head *lh;
149
150 if (se->start_page <= start_page &&
151 start_page < se->start_page + se->nr_pages) {
152 pgoff_t offset = start_page - se->start_page;
153 sector_t start_block = se->start_block + offset;
154 sector_t nr_blocks = se->nr_pages - offset;
155
156 if (nr_blocks > nr_pages)
157 nr_blocks = nr_pages;
158 start_page += nr_blocks;
159 nr_pages -= nr_blocks;
160
161 if (!found_extent++)
162 si->curr_swap_extent = se;
163
164 start_block <<= PAGE_SHIFT - 9;
165 nr_blocks <<= PAGE_SHIFT - 9;
166 if (blkdev_issue_discard(si->bdev, start_block,
167 nr_blocks, GFP_NOIO, 0))
168 break;
169 }
170
171 lh = se->list.next;
172 se = list_entry(lh, struct swap_extent, list);
173 }
174}
175
/*
 * Action callback handed to wait_on_bit() by scan_swap_map(): yields the
 * CPU once and always reports 0.  @word is not used.
 */
static int wait_for_discard(void *word)
{
	schedule();

	return 0;
}
181
182#define SWAPFILE_CLUSTER 256
183#define LATENCY_LIMIT 256
184
185static unsigned long scan_swap_map(struct swap_info_struct *si,
186 unsigned char usage)
187{
188 unsigned long offset;
189 unsigned long scan_base;
190 unsigned long last_in_cluster = 0;
191 int latency_ration = LATENCY_LIMIT;
192 int found_free_cluster = 0;
193
194
195
196
197
198
199
200
201
202
203
204
205 si->flags += SWP_SCANNING;
206 scan_base = offset = si->cluster_next;
207
208 if (unlikely(!si->cluster_nr--)) {
209 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
210 si->cluster_nr = SWAPFILE_CLUSTER - 1;
211 goto checks;
212 }
213 if (si->flags & SWP_DISCARDABLE) {
214
215
216
217
218
219
220
221 if (si->lowest_alloc)
222 goto checks;
223 si->lowest_alloc = si->max;
224 si->highest_alloc = 0;
225 }
226 spin_unlock(&swap_lock);
227
228
229
230
231
232
233
234
235
236 if (!(si->flags & SWP_SOLIDSTATE))
237 scan_base = offset = si->lowest_bit;
238 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
239
240
241 for (; last_in_cluster <= si->highest_bit; offset++) {
242 if (si->swap_map[offset])
243 last_in_cluster = offset + SWAPFILE_CLUSTER;
244 else if (offset == last_in_cluster) {
245 spin_lock(&swap_lock);
246 offset -= SWAPFILE_CLUSTER - 1;
247 si->cluster_next = offset;
248 si->cluster_nr = SWAPFILE_CLUSTER - 1;
249 found_free_cluster = 1;
250 goto checks;
251 }
252 if (unlikely(--latency_ration < 0)) {
253 cond_resched();
254 latency_ration = LATENCY_LIMIT;
255 }
256 }
257
258 offset = si->lowest_bit;
259 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
260
261
262 for (; last_in_cluster < scan_base; offset++) {
263 if (si->swap_map[offset])
264 last_in_cluster = offset + SWAPFILE_CLUSTER;
265 else if (offset == last_in_cluster) {
266 spin_lock(&swap_lock);
267 offset -= SWAPFILE_CLUSTER - 1;
268 si->cluster_next = offset;
269 si->cluster_nr = SWAPFILE_CLUSTER - 1;
270 found_free_cluster = 1;
271 goto checks;
272 }
273 if (unlikely(--latency_ration < 0)) {
274 cond_resched();
275 latency_ration = LATENCY_LIMIT;
276 }
277 }
278
279 offset = scan_base;
280 spin_lock(&swap_lock);
281 si->cluster_nr = SWAPFILE_CLUSTER - 1;
282 si->lowest_alloc = 0;
283 }
284
285checks:
286 if (!(si->flags & SWP_WRITEOK))
287 goto no_page;
288 if (!si->highest_bit)
289 goto no_page;
290 if (offset > si->highest_bit)
291 scan_base = offset = si->lowest_bit;
292
293
294 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
295 int swap_was_freed;
296 spin_unlock(&swap_lock);
297 swap_was_freed = __try_to_reclaim_swap(si, offset);
298 spin_lock(&swap_lock);
299
300 if (swap_was_freed)
301 goto checks;
302 goto scan;
303 }
304
305 if (si->swap_map[offset])
306 goto scan;
307
308 if (offset == si->lowest_bit)
309 si->lowest_bit++;
310 if (offset == si->highest_bit)
311 si->highest_bit--;
312 si->inuse_pages++;
313 if (si->inuse_pages == si->pages) {
314 si->lowest_bit = si->max;
315 si->highest_bit = 0;
316 }
317 si->swap_map[offset] = usage;
318 si->cluster_next = offset + 1;
319 si->flags -= SWP_SCANNING;
320
321 if (si->lowest_alloc) {
322
323
324
325
326 if (found_free_cluster) {
327
328
329
330
331
332
333
334 if (offset < si->highest_alloc &&
335 si->lowest_alloc <= last_in_cluster)
336 last_in_cluster = si->lowest_alloc - 1;
337 si->flags |= SWP_DISCARDING;
338 spin_unlock(&swap_lock);
339
340 if (offset < last_in_cluster)
341 discard_swap_cluster(si, offset,
342 last_in_cluster - offset + 1);
343
344 spin_lock(&swap_lock);
345 si->lowest_alloc = 0;
346 si->flags &= ~SWP_DISCARDING;
347
348 smp_mb();
349 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
350
351 } else if (si->flags & SWP_DISCARDING) {
352
353
354
355
356
357
358 spin_unlock(&swap_lock);
359 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
360 wait_for_discard, TASK_UNINTERRUPTIBLE);
361 spin_lock(&swap_lock);
362 } else {
363
364
365
366
367
368 if (offset < si->lowest_alloc)
369 si->lowest_alloc = offset;
370 if (offset > si->highest_alloc)
371 si->highest_alloc = offset;
372 }
373 }
374 return offset;
375
376scan:
377 spin_unlock(&swap_lock);
378 while (++offset <= si->highest_bit) {
379 if (!si->swap_map[offset]) {
380 spin_lock(&swap_lock);
381 goto checks;
382 }
383 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
384 spin_lock(&swap_lock);
385 goto checks;
386 }
387 if (unlikely(--latency_ration < 0)) {
388 cond_resched();
389 latency_ration = LATENCY_LIMIT;
390 }
391 }
392 offset = si->lowest_bit;
393 while (++offset < scan_base) {
394 if (!si->swap_map[offset]) {
395 spin_lock(&swap_lock);
396 goto checks;
397 }
398 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
399 spin_lock(&swap_lock);
400 goto checks;
401 }
402 if (unlikely(--latency_ration < 0)) {
403 cond_resched();
404 latency_ration = LATENCY_LIMIT;
405 }
406 }
407 spin_lock(&swap_lock);
408
409no_page:
410 si->flags -= SWP_SCANNING;
411 return 0;
412}
413
414swp_entry_t get_swap_page(void)
415{
416 struct swap_info_struct *si;
417 pgoff_t offset;
418 int type, next;
419 int wrapped = 0;
420
421 spin_lock(&swap_lock);
422 if (nr_swap_pages <= 0)
423 goto noswap;
424 nr_swap_pages--;
425
426 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
427 si = swap_info[type];
428 next = si->next;
429 if (next < 0 ||
430 (!wrapped && si->prio != swap_info[next]->prio)) {
431 next = swap_list.head;
432 wrapped++;
433 }
434
435 if (!si->highest_bit)
436 continue;
437 if (!(si->flags & SWP_WRITEOK))
438 continue;
439
440 swap_list.next = next;
441
442 offset = scan_swap_map(si, SWAP_HAS_CACHE);
443 if (offset) {
444 spin_unlock(&swap_lock);
445 return swp_entry(type, offset);
446 }
447 next = swap_list.next;
448 }
449
450 nr_swap_pages++;
451noswap:
452 spin_unlock(&swap_lock);
453 return (swp_entry_t) {0};
454}
455
456
457swp_entry_t get_swap_page_of_type(int type)
458{
459 struct swap_info_struct *si;
460 pgoff_t offset;
461
462 spin_lock(&swap_lock);
463 si = swap_info[type];
464 if (si && (si->flags & SWP_WRITEOK)) {
465 nr_swap_pages--;
466
467 offset = scan_swap_map(si, 1);
468 if (offset) {
469 spin_unlock(&swap_lock);
470 return swp_entry(type, offset);
471 }
472 nr_swap_pages++;
473 }
474 spin_unlock(&swap_lock);
475 return (swp_entry_t) {0};
476}
477
478static struct swap_info_struct *swap_info_get(swp_entry_t entry)
479{
480 struct swap_info_struct *p;
481 unsigned long offset, type;
482
483 if (!entry.val)
484 goto out;
485 type = swp_type(entry);
486 if (type >= nr_swapfiles)
487 goto bad_nofile;
488 p = swap_info[type];
489 if (!(p->flags & SWP_USED))
490 goto bad_device;
491 offset = swp_offset(entry);
492 if (offset >= p->max)
493 goto bad_offset;
494 if (!p->swap_map[offset])
495 goto bad_free;
496 spin_lock(&swap_lock);
497 return p;
498
499bad_free:
500 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
501 goto out;
502bad_offset:
503 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
504 goto out;
505bad_device:
506 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
507 goto out;
508bad_nofile:
509 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
510out:
511 return NULL;
512}
513
514static unsigned char swap_entry_free(struct swap_info_struct *p,
515 swp_entry_t entry, unsigned char usage)
516{
517 unsigned long offset = swp_offset(entry);
518 unsigned char count;
519 unsigned char has_cache;
520
521 count = p->swap_map[offset];
522 has_cache = count & SWAP_HAS_CACHE;
523 count &= ~SWAP_HAS_CACHE;
524
525 if (usage == SWAP_HAS_CACHE) {
526 VM_BUG_ON(!has_cache);
527 has_cache = 0;
528 } else if (count == SWAP_MAP_SHMEM) {
529
530
531
532
533 count = 0;
534 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
535 if (count == COUNT_CONTINUED) {
536 if (swap_count_continued(p, offset, count))
537 count = SWAP_MAP_MAX | COUNT_CONTINUED;
538 else
539 count = SWAP_MAP_MAX;
540 } else
541 count--;
542 }
543
544 if (!count)
545 mem_cgroup_uncharge_swap(entry);
546
547 usage = count | has_cache;
548 p->swap_map[offset] = usage;
549
550
551 if (!usage) {
552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset;
554 if (offset > p->highest_bit)
555 p->highest_bit = offset;
556 if (swap_list.next >= 0 &&
557 p->prio > swap_info[swap_list.next]->prio)
558 swap_list.next = p->type;
559 nr_swap_pages++;
560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
562 if (p->flags & SWP_BLKDEV) {
563 struct gendisk *disk = p->bdev->bd_disk;
564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
568 }
569
570 return usage;
571}
572
573
574
575
576
577void swap_free(swp_entry_t entry)
578{
579 struct swap_info_struct *p;
580
581 p = swap_info_get(entry);
582 if (p) {
583 swap_entry_free(p, entry, 1);
584 spin_unlock(&swap_lock);
585 }
586}
587
588
589
590
591void swapcache_free(swp_entry_t entry, struct page *page)
592{
593 struct swap_info_struct *p;
594 unsigned char count;
595
596 p = swap_info_get(entry);
597 if (p) {
598 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
599 if (page)
600 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
601 spin_unlock(&swap_lock);
602 }
603}
604
605
606
607
608
609
610int page_swapcount(struct page *page)
611{
612 int count = 0;
613 struct swap_info_struct *p;
614 swp_entry_t entry;
615
616 entry.val = page_private(page);
617 p = swap_info_get(entry);
618 if (p) {
619 count = swap_count(p->swap_map[swp_offset(entry)]);
620 spin_unlock(&swap_lock);
621 }
622 return count;
623}
624
625
626
627
628
629
630
/*
 * Report whether the locked @page has at most one user, counting both its
 * mappings and, for a swap-cache page, its swap references.  KSM pages
 * always yield 0.
 *
 * Side effect: when the total is exactly 1 and the page is not under
 * writeback, it is removed from the swap cache and marked dirty.
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
648
649
650
651
652
/*
 * Remove the locked @page from the swap cache if nothing else needs its
 * slot.  Returns 1 when the page was removed (and marked dirty), else 0.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	/* Not in swap cache, still being written, or slot still referenced. */
	if (!PageSwapCache(page) || PageWriteback(page) ||
	    page_swapcount(page))
		return 0;

	/* Leave the slot alone while pm_suspended_storage() reports true. */
	if (pm_suspended_storage())
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
686
687
688
689
690
691int free_swap_and_cache(swp_entry_t entry)
692{
693 struct swap_info_struct *p;
694 struct page *page = NULL;
695
696 if (non_swap_entry(entry))
697 return 1;
698
699 p = swap_info_get(entry);
700 if (p) {
701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702 page = find_get_page(&swapper_space, entry.val);
703 if (page && !trylock_page(page)) {
704 page_cache_release(page);
705 page = NULL;
706 }
707 }
708 spin_unlock(&swap_lock);
709 }
710 if (page) {
711
712
713
714
715 if (PageSwapCache(page) && !PageWriteback(page) &&
716 (!page_mapped(page) || vm_swap_full())) {
717 delete_from_swap_cache(page);
718 SetPageDirty(page);
719 }
720 unlock_page(page);
721 page_cache_release(page);
722 }
723 return p != NULL;
724}
725
726#ifdef CONFIG_HIBERNATION
727
728
729
730
731
732
733
734
735int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
736{
737 struct block_device *bdev = NULL;
738 int type;
739
740 if (device)
741 bdev = bdget(device);
742
743 spin_lock(&swap_lock);
744 for (type = 0; type < nr_swapfiles; type++) {
745 struct swap_info_struct *sis = swap_info[type];
746
747 if (!(sis->flags & SWP_WRITEOK))
748 continue;
749
750 if (!bdev) {
751 if (bdev_p)
752 *bdev_p = bdgrab(sis->bdev);
753
754 spin_unlock(&swap_lock);
755 return type;
756 }
757 if (bdev == sis->bdev) {
758 struct swap_extent *se = &sis->first_swap_extent;
759
760 if (se->start_block == offset) {
761 if (bdev_p)
762 *bdev_p = bdgrab(sis->bdev);
763
764 spin_unlock(&swap_lock);
765 bdput(bdev);
766 return type;
767 }
768 }
769 }
770 spin_unlock(&swap_lock);
771 if (bdev)
772 bdput(bdev);
773
774 return -ENODEV;
775}
776
777
778
779
780
781sector_t swapdev_block(int type, pgoff_t offset)
782{
783 struct block_device *bdev;
784
785 if ((unsigned int)type >= nr_swapfiles)
786 return 0;
787 if (!(swap_info[type]->flags & SWP_WRITEOK))
788 return 0;
789 return map_swap_entry(swp_entry(type, offset), &bdev);
790}
791
792
793
794
795
796
797
798unsigned int count_swap_pages(int type, int free)
799{
800 unsigned int n = 0;
801
802 spin_lock(&swap_lock);
803 if ((unsigned int)type < nr_swapfiles) {
804 struct swap_info_struct *sis = swap_info[type];
805
806 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages;
808 if (free)
809 n -= sis->inuse_pages;
810 }
811 }
812 spin_unlock(&swap_lock);
813 return n;
814}
815#endif
816
817
818
819
820
821
822static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
823 unsigned long addr, swp_entry_t entry, struct page *page)
824{
825 struct mem_cgroup *memcg;
826 spinlock_t *ptl;
827 pte_t *pte;
828 int ret = 1;
829
830 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
831 GFP_KERNEL, &memcg)) {
832 ret = -ENOMEM;
833 goto out_nolock;
834 }
835
836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
838 mem_cgroup_cancel_charge_swapin(memcg);
839 ret = 0;
840 goto out;
841 }
842
843 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
844 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
845 get_page(page);
846 set_pte_at(vma->vm_mm, addr, pte,
847 pte_mkold(mk_pte(page, vma->vm_page_prot)));
848 page_add_anon_rmap(page, vma, addr);
849 mem_cgroup_commit_charge_swapin(page, memcg);
850 swap_free(entry);
851
852
853
854
855 activate_page(page);
856out:
857 pte_unmap_unlock(pte, ptl);
858out_nolock:
859 return ret;
860}
861
862static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
863 unsigned long addr, unsigned long end,
864 swp_entry_t entry, struct page *page)
865{
866 pte_t swp_pte = swp_entry_to_pte(entry);
867 pte_t *pte;
868 int ret = 0;
869
870
871
872
873
874
875
876
877
878
879 pte = pte_offset_map(pmd, addr);
880 do {
881
882
883
884
885 if (unlikely(pte_same(*pte, swp_pte))) {
886 pte_unmap(pte);
887 ret = unuse_pte(vma, pmd, addr, entry, page);
888 if (ret)
889 goto out;
890 pte = pte_offset_map(pmd, addr);
891 }
892 } while (pte++, addr += PAGE_SIZE, addr != end);
893 pte_unmap(pte - 1);
894out:
895 return ret;
896}
897
898static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
899 unsigned long addr, unsigned long end,
900 swp_entry_t entry, struct page *page)
901{
902 pmd_t *pmd;
903 unsigned long next;
904 int ret;
905
906 pmd = pmd_offset(pud, addr);
907 do {
908 next = pmd_addr_end(addr, end);
909 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
910 continue;
911 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
912 if (ret)
913 return ret;
914 } while (pmd++, addr = next, addr != end);
915 return 0;
916}
917
918static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
919 unsigned long addr, unsigned long end,
920 swp_entry_t entry, struct page *page)
921{
922 pud_t *pud;
923 unsigned long next;
924 int ret;
925
926 pud = pud_offset(pgd, addr);
927 do {
928 next = pud_addr_end(addr, end);
929 if (pud_none_or_clear_bad(pud))
930 continue;
931 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
932 if (ret)
933 return ret;
934 } while (pud++, addr = next, addr != end);
935 return 0;
936}
937
938static int unuse_vma(struct vm_area_struct *vma,
939 swp_entry_t entry, struct page *page)
940{
941 pgd_t *pgd;
942 unsigned long addr, end, next;
943 int ret;
944
945 if (page_anon_vma(page)) {
946 addr = page_address_in_vma(page, vma);
947 if (addr == -EFAULT)
948 return 0;
949 else
950 end = addr + PAGE_SIZE;
951 } else {
952 addr = vma->vm_start;
953 end = vma->vm_end;
954 }
955
956 pgd = pgd_offset(vma->vm_mm, addr);
957 do {
958 next = pgd_addr_end(addr, end);
959 if (pgd_none_or_clear_bad(pgd))
960 continue;
961 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
962 if (ret)
963 return ret;
964 } while (pgd++, addr = next, addr != end);
965 return 0;
966}
967
968static int unuse_mm(struct mm_struct *mm,
969 swp_entry_t entry, struct page *page)
970{
971 struct vm_area_struct *vma;
972 int ret = 0;
973
974 if (!down_read_trylock(&mm->mmap_sem)) {
975
976
977
978
979 activate_page(page);
980 unlock_page(page);
981 down_read(&mm->mmap_sem);
982 lock_page(page);
983 }
984 for (vma = mm->mmap; vma; vma = vma->vm_next) {
985 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
986 break;
987 }
988 up_read(&mm->mmap_sem);
989 return (ret < 0)? ret: 0;
990}
991
992
993
994
995
996
997static unsigned int find_next_to_unuse(struct swap_info_struct *si,
998 unsigned int prev, bool frontswap)
999{
1000 unsigned int max = si->max;
1001 unsigned int i = prev;
1002 unsigned char count;
1003
1004
1005
1006
1007
1008
1009
1010 for (;;) {
1011 if (++i >= max) {
1012 if (!prev) {
1013 i = 0;
1014 break;
1015 }
1016
1017
1018
1019
1020 max = prev + 1;
1021 prev = 0;
1022 i = 1;
1023 }
1024 if (frontswap) {
1025 if (frontswap_test(si, i))
1026 break;
1027 else
1028 continue;
1029 }
1030 count = si->swap_map[i];
1031 if (count && swap_count(count) != SWAP_MAP_BAD)
1032 break;
1033 }
1034 return i;
1035}
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045int try_to_unuse(unsigned int type, bool frontswap,
1046 unsigned long pages_to_unuse)
1047{
1048 struct swap_info_struct *si = swap_info[type];
1049 struct mm_struct *start_mm;
1050 unsigned char *swap_map;
1051 unsigned char swcount;
1052 struct page *page;
1053 swp_entry_t entry;
1054 unsigned int i = 0;
1055 int retval = 0;
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071 start_mm = &init_mm;
1072 atomic_inc(&init_mm.mm_users);
1073
1074
1075
1076
1077
1078
1079 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1080 if (signal_pending(current)) {
1081 retval = -EINTR;
1082 break;
1083 }
1084
1085
1086
1087
1088
1089
1090 swap_map = &si->swap_map[i];
1091 entry = swp_entry(type, i);
1092 page = read_swap_cache_async(entry,
1093 GFP_HIGHUSER_MOVABLE, NULL, 0);
1094 if (!page) {
1095
1096
1097
1098
1099
1100
1101 if (!*swap_map)
1102 continue;
1103 retval = -ENOMEM;
1104 break;
1105 }
1106
1107
1108
1109
1110 if (atomic_read(&start_mm->mm_users) == 1) {
1111 mmput(start_mm);
1112 start_mm = &init_mm;
1113 atomic_inc(&init_mm.mm_users);
1114 }
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124 wait_on_page_locked(page);
1125 wait_on_page_writeback(page);
1126 lock_page(page);
1127 wait_on_page_writeback(page);
1128
1129
1130
1131
1132 swcount = *swap_map;
1133 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1134 retval = shmem_unuse(entry, page);
1135
1136 if (retval < 0)
1137 break;
1138 continue;
1139 }
1140 if (swap_count(swcount) && start_mm != &init_mm)
1141 retval = unuse_mm(start_mm, entry, page);
1142
1143 if (swap_count(*swap_map)) {
1144 int set_start_mm = (*swap_map >= swcount);
1145 struct list_head *p = &start_mm->mmlist;
1146 struct mm_struct *new_start_mm = start_mm;
1147 struct mm_struct *prev_mm = start_mm;
1148 struct mm_struct *mm;
1149
1150 atomic_inc(&new_start_mm->mm_users);
1151 atomic_inc(&prev_mm->mm_users);
1152 spin_lock(&mmlist_lock);
1153 while (swap_count(*swap_map) && !retval &&
1154 (p = p->next) != &start_mm->mmlist) {
1155 mm = list_entry(p, struct mm_struct, mmlist);
1156 if (!atomic_inc_not_zero(&mm->mm_users))
1157 continue;
1158 spin_unlock(&mmlist_lock);
1159 mmput(prev_mm);
1160 prev_mm = mm;
1161
1162 cond_resched();
1163
1164 swcount = *swap_map;
1165 if (!swap_count(swcount))
1166 ;
1167 else if (mm == &init_mm)
1168 set_start_mm = 1;
1169 else
1170 retval = unuse_mm(mm, entry, page);
1171
1172 if (set_start_mm && *swap_map < swcount) {
1173 mmput(new_start_mm);
1174 atomic_inc(&mm->mm_users);
1175 new_start_mm = mm;
1176 set_start_mm = 0;
1177 }
1178 spin_lock(&mmlist_lock);
1179 }
1180 spin_unlock(&mmlist_lock);
1181 mmput(prev_mm);
1182 mmput(start_mm);
1183 start_mm = new_start_mm;
1184 }
1185 if (retval) {
1186 unlock_page(page);
1187 page_cache_release(page);
1188 break;
1189 }
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210 if (swap_count(*swap_map) &&
1211 PageDirty(page) && PageSwapCache(page)) {
1212 struct writeback_control wbc = {
1213 .sync_mode = WB_SYNC_NONE,
1214 };
1215
1216 swap_writepage(page, &wbc);
1217 lock_page(page);
1218 wait_on_page_writeback(page);
1219 }
1220
1221
1222
1223
1224
1225
1226
1227
1228 if (PageSwapCache(page) &&
1229 likely(page_private(page) == entry.val))
1230 delete_from_swap_cache(page);
1231
1232
1233
1234
1235
1236
1237 SetPageDirty(page);
1238 unlock_page(page);
1239 page_cache_release(page);
1240
1241
1242
1243
1244
1245 cond_resched();
1246 if (frontswap && pages_to_unuse > 0) {
1247 if (!--pages_to_unuse)
1248 break;
1249 }
1250 }
1251
1252 mmput(start_mm);
1253 return retval;
1254}
1255
1256
1257
1258
1259
1260
1261
1262static void drain_mmlist(void)
1263{
1264 struct list_head *p, *next;
1265 unsigned int type;
1266
1267 for (type = 0; type < nr_swapfiles; type++)
1268 if (swap_info[type]->inuse_pages)
1269 return;
1270 spin_lock(&mmlist_lock);
1271 list_for_each_safe(p, next, &init_mm.mmlist)
1272 list_del_init(p);
1273 spin_unlock(&mmlist_lock);
1274}
1275
1276
1277
1278
1279
1280
1281
1282static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1283{
1284 struct swap_info_struct *sis;
1285 struct swap_extent *start_se;
1286 struct swap_extent *se;
1287 pgoff_t offset;
1288
1289 sis = swap_info[swp_type(entry)];
1290 *bdev = sis->bdev;
1291
1292 offset = swp_offset(entry);
1293 start_se = sis->curr_swap_extent;
1294 se = start_se;
1295
1296 for ( ; ; ) {
1297 struct list_head *lh;
1298
1299 if (se->start_page <= offset &&
1300 offset < (se->start_page + se->nr_pages)) {
1301 return se->start_block + (offset - se->start_page);
1302 }
1303 lh = se->list.next;
1304 se = list_entry(lh, struct swap_extent, list);
1305 sis->curr_swap_extent = se;
1306 BUG_ON(se == start_se);
1307 }
1308}
1309
1310
1311
1312
1313sector_t map_swap_page(struct page *page, struct block_device **bdev)
1314{
1315 swp_entry_t entry;
1316 entry.val = page_private(page);
1317 return map_swap_entry(entry, bdev);
1318}
1319
1320
1321
1322
1323static void destroy_swap_extents(struct swap_info_struct *sis)
1324{
1325 while (!list_empty(&sis->first_swap_extent.list)) {
1326 struct swap_extent *se;
1327
1328 se = list_entry(sis->first_swap_extent.list.next,
1329 struct swap_extent, list);
1330 list_del(&se->list);
1331 kfree(se);
1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1341}
1342
1343
1344
1345
1346
1347
1348
1349int
1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1351 unsigned long nr_pages, sector_t start_block)
1352{
1353 struct swap_extent *se;
1354 struct swap_extent *new_se;
1355 struct list_head *lh;
1356
1357 if (start_page == 0) {
1358 se = &sis->first_swap_extent;
1359 sis->curr_swap_extent = se;
1360 se->start_page = 0;
1361 se->nr_pages = nr_pages;
1362 se->start_block = start_block;
1363 return 1;
1364 } else {
1365 lh = sis->first_swap_extent.list.prev;
1366 se = list_entry(lh, struct swap_extent, list);
1367 BUG_ON(se->start_page + se->nr_pages != start_page);
1368 if (se->start_block + se->nr_pages == start_block) {
1369
1370 se->nr_pages += nr_pages;
1371 return 0;
1372 }
1373 }
1374
1375
1376
1377
1378 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1379 if (new_se == NULL)
1380 return -ENOMEM;
1381 new_se->start_page = start_page;
1382 new_se->nr_pages = nr_pages;
1383 new_se->start_block = start_block;
1384
1385 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1386 return 1;
1387}
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1421{
1422 struct file *swap_file = sis->swap_file;
1423 struct address_space *mapping = swap_file->f_mapping;
1424 struct inode *inode = mapping->host;
1425 int ret;
1426
1427 if (S_ISBLK(inode->i_mode)) {
1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1429 *span = sis->pages;
1430 return ret;
1431 }
1432
1433 if (mapping->a_ops->swap_activate) {
1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1435 if (!ret) {
1436 sis->flags |= SWP_FILE;
1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1438 *span = sis->pages;
1439 }
1440 return ret;
1441 }
1442
1443 return generic_swapfile_activate(sis, swap_file, span);
1444}
1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map)
1449{
1450 int i, prev;
1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0)
1454 p->prio = prio;
1455 else
1456 p->prio = --least_priority;
1457 p->swap_map = swap_map;
1458 frontswap_map_set(p, frontswap_map);
1459 p->flags |= SWP_WRITEOK;
1460 nr_swap_pages += p->pages;
1461 total_swap_pages += p->pages;
1462
1463
1464 prev = -1;
1465 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1466 if (p->prio >= swap_info[i]->prio)
1467 break;
1468 prev = i;
1469 }
1470 p->next = i;
1471 if (prev < 0)
1472 swap_list.head = swap_list.next = p->type;
1473 else
1474 swap_info[prev]->next = p->type;
1475 frontswap_init(p->type);
1476 spin_unlock(&swap_lock);
1477}
1478
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{
1481 struct swap_info_struct *p = NULL;
1482 unsigned char *swap_map;
1483 struct file *swap_file, *victim;
1484 struct address_space *mapping;
1485 struct inode *inode;
1486 char *pathname;
1487 int oom_score_adj;
1488 int i, type, prev;
1489 int err;
1490
1491 if (!capable(CAP_SYS_ADMIN))
1492 return -EPERM;
1493
1494 BUG_ON(!current->mm);
1495
1496 pathname = getname(specialfile);
1497 err = PTR_ERR(pathname);
1498 if (IS_ERR(pathname))
1499 goto out;
1500
1501 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1502 putname(pathname);
1503 err = PTR_ERR(victim);
1504 if (IS_ERR(victim))
1505 goto out;
1506
1507 mapping = victim->f_mapping;
1508 prev = -1;
1509 spin_lock(&swap_lock);
1510 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1511 p = swap_info[type];
1512 if (p->flags & SWP_WRITEOK) {
1513 if (p->swap_file->f_mapping == mapping)
1514 break;
1515 }
1516 prev = type;
1517 }
1518 if (type < 0) {
1519 err = -EINVAL;
1520 spin_unlock(&swap_lock);
1521 goto out_dput;
1522 }
1523 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1524 vm_unacct_memory(p->pages);
1525 else {
1526 err = -ENOMEM;
1527 spin_unlock(&swap_lock);
1528 goto out_dput;
1529 }
1530 if (prev < 0)
1531 swap_list.head = p->next;
1532 else
1533 swap_info[prev]->next = p->next;
1534 if (type == swap_list.next) {
1535
1536 swap_list.next = swap_list.head;
1537 }
1538 if (p->prio < 0) {
1539 for (i = p->next; i >= 0; i = swap_info[i]->next)
1540 swap_info[i]->prio = p->prio--;
1541 least_priority++;
1542 }
1543 nr_swap_pages -= p->pages;
1544 total_swap_pages -= p->pages;
1545 p->flags &= ~SWP_WRITEOK;
1546 spin_unlock(&swap_lock);
1547
1548 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1549 err = try_to_unuse(type, false, 0);
1550 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1551
1552 if (err) {
1553
1554
1555
1556
1557
1558
1559
1560 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1561 goto out_dput;
1562 }
1563
1564 destroy_swap_extents(p);
1565 if (p->flags & SWP_CONTINUED)
1566 free_swap_count_continuations(p);
1567
1568 mutex_lock(&swapon_mutex);
1569 spin_lock(&swap_lock);
1570 drain_mmlist();
1571
1572
1573 p->highest_bit = 0;
1574 while (p->flags >= SWP_SCANNING) {
1575 spin_unlock(&swap_lock);
1576 schedule_timeout_uninterruptible(1);
1577 spin_lock(&swap_lock);
1578 }
1579
1580 swap_file = p->swap_file;
1581 p->swap_file = NULL;
1582 p->max = 0;
1583 swap_map = p->swap_map;
1584 p->swap_map = NULL;
1585 p->flags = 0;
1586 frontswap_invalidate_area(type);
1587 spin_unlock(&swap_lock);
1588 mutex_unlock(&swapon_mutex);
1589 vfree(swap_map);
1590 vfree(frontswap_map_get(p));
1591
1592 swap_cgroup_swapoff(type);
1593
1594 inode = mapping->host;
1595 if (S_ISBLK(inode->i_mode)) {
1596 struct block_device *bdev = I_BDEV(inode);
1597 set_blocksize(bdev, p->old_block_size);
1598 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1599 } else {
1600 mutex_lock(&inode->i_mutex);
1601 inode->i_flags &= ~S_SWAPFILE;
1602 mutex_unlock(&inode->i_mutex);
1603 }
1604 filp_close(swap_file, NULL);
1605 err = 0;
1606 atomic_inc(&proc_poll_event);
1607 wake_up_interruptible(&proc_poll_wait);
1608
1609out_dput:
1610 filp_close(victim, NULL);
1611out:
1612 return err;
1613}
1614
1615#ifdef CONFIG_PROC_FS
1616static unsigned swaps_poll(struct file *file, poll_table *wait)
1617{
1618 struct seq_file *seq = file->private_data;
1619
1620 poll_wait(file, &proc_poll_wait, wait);
1621
1622 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1623 seq->poll_event = atomic_read(&proc_poll_event);
1624 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1625 }
1626
1627 return POLLIN | POLLRDNORM;
1628}
1629
1630
1631static void *swap_start(struct seq_file *swap, loff_t *pos)
1632{
1633 struct swap_info_struct *si;
1634 int type;
1635 loff_t l = *pos;
1636
1637 mutex_lock(&swapon_mutex);
1638
1639 if (!l)
1640 return SEQ_START_TOKEN;
1641
1642 for (type = 0; type < nr_swapfiles; type++) {
1643 smp_rmb();
1644 si = swap_info[type];
1645 if (!(si->flags & SWP_USED) || !si->swap_map)
1646 continue;
1647 if (!--l)
1648 return si;
1649 }
1650
1651 return NULL;
1652}
1653
1654static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1655{
1656 struct swap_info_struct *si = v;
1657 int type;
1658
1659 if (v == SEQ_START_TOKEN)
1660 type = 0;
1661 else
1662 type = si->type + 1;
1663
1664 for (; type < nr_swapfiles; type++) {
1665 smp_rmb();
1666 si = swap_info[type];
1667 if (!(si->flags & SWP_USED) || !si->swap_map)
1668 continue;
1669 ++*pos;
1670 return si;
1671 }
1672
1673 return NULL;
1674}
1675
/*
 * seq_file ->stop: drop swapon_mutex, which swap_start() acquired.
 * Neither argument is used.
 */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
1680
1681static int swap_show(struct seq_file *swap, void *v)
1682{
1683 struct swap_info_struct *si = v;
1684 struct file *file;
1685 int len;
1686
1687 if (si == SEQ_START_TOKEN) {
1688 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1689 return 0;
1690 }
1691
1692 file = si->swap_file;
1693 len = seq_path(swap, &file->f_path, " \t\n\\");
1694 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1695 len < 40 ? 40 - len : 1, " ",
1696 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1697 "partition" : "file\t",
1698 si->pages << (PAGE_SHIFT - 10),
1699 si->inuse_pages << (PAGE_SHIFT - 10),
1700 si->prio);
1701 return 0;
1702}
1703
/*
 * seq_file iterator callbacks for the swaps listing; installed on the
 * open file by swaps_open().
 */
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
1710
1711static int swaps_open(struct inode *inode, struct file *file)
1712{
1713 struct seq_file *seq;
1714 int ret;
1715
1716 ret = seq_open(file, &swaps_op);
1717 if (ret)
1718 return ret;
1719
1720 seq = file->private_data;
1721 seq->poll_event = atomic_read(&proc_poll_event);
1722 return 0;
1723}
1724
/*
 * File operations for the "swaps" proc entry: standard seq_file
 * read/llseek/release plus a custom open and poll.
 */
static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};
1732
/*
 * Register the "swaps" entry at the top level of procfs at init time.
 * The result of proc_create() is not checked; 0 is always returned.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
1739#endif
1740
1741#ifdef MAX_SWAPFILES_CHECK
/*
 * Late initcall that expands MAX_SWAPFILES_CHECK(); only built when that
 * macro is defined.  Always returns 0.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
1748#endif
1749
1750static struct swap_info_struct *alloc_swap_info(void)
1751{
1752 struct swap_info_struct *p;
1753 unsigned int type;
1754
1755 p = kzalloc(sizeof(*p), GFP_KERNEL);
1756 if (!p)
1757 return ERR_PTR(-ENOMEM);
1758
1759 spin_lock(&swap_lock);
1760 for (type = 0; type < nr_swapfiles; type++) {
1761 if (!(swap_info[type]->flags & SWP_USED))
1762 break;
1763 }
1764 if (type >= MAX_SWAPFILES) {
1765 spin_unlock(&swap_lock);
1766 kfree(p);
1767 return ERR_PTR(-EPERM);
1768 }
1769 if (type >= nr_swapfiles) {
1770 p->type = type;
1771 swap_info[type] = p;
1772
1773
1774
1775
1776
1777 smp_wmb();
1778 nr_swapfiles++;
1779 } else {
1780 kfree(p);
1781 p = swap_info[type];
1782
1783
1784
1785
1786 }
1787 INIT_LIST_HEAD(&p->first_swap_extent.list);
1788 p->flags = SWP_USED;
1789 p->next = -1;
1790 spin_unlock(&swap_lock);
1791
1792 return p;
1793}
1794
1795static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1796{
1797 int error;
1798
1799 if (S_ISBLK(inode->i_mode)) {
1800 p->bdev = bdgrab(I_BDEV(inode));
1801 error = blkdev_get(p->bdev,
1802 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1803 sys_swapon);
1804 if (error < 0) {
1805 p->bdev = NULL;
1806 return -EINVAL;
1807 }
1808 p->old_block_size = block_size(p->bdev);
1809 error = set_blocksize(p->bdev, PAGE_SIZE);
1810 if (error < 0)
1811 return error;
1812 p->flags |= SWP_BLKDEV;
1813 } else if (S_ISREG(inode->i_mode)) {
1814 p->bdev = inode->i_sb->s_bdev;
1815 mutex_lock(&inode->i_mutex);
1816 if (IS_SWAPFILE(inode))
1817 return -EBUSY;
1818 } else
1819 return -EINVAL;
1820
1821 return 0;
1822}
1823
1824static unsigned long read_swap_header(struct swap_info_struct *p,
1825 union swap_header *swap_header,
1826 struct inode *inode)
1827{
1828 int i;
1829 unsigned long maxpages;
1830 unsigned long swapfilepages;
1831
1832 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1833 printk(KERN_ERR "Unable to find swap-space signature\n");
1834 return 0;
1835 }
1836
1837
1838 if (swab32(swap_header->info.version) == 1) {
1839 swab32s(&swap_header->info.version);
1840 swab32s(&swap_header->info.last_page);
1841 swab32s(&swap_header->info.nr_badpages);
1842 for (i = 0; i < swap_header->info.nr_badpages; i++)
1843 swab32s(&swap_header->info.badpages[i]);
1844 }
1845
1846 if (swap_header->info.version != 1) {
1847 printk(KERN_WARNING
1848 "Unable to handle swap header version %d\n",
1849 swap_header->info.version);
1850 return 0;
1851 }
1852
1853 p->lowest_bit = 1;
1854 p->cluster_next = 1;
1855 p->cluster_nr = 0;
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871 maxpages = swp_offset(pte_to_swp_entry(
1872 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1873 if (maxpages > swap_header->info.last_page) {
1874 maxpages = swap_header->info.last_page + 1;
1875
1876 if ((unsigned int)maxpages == 0)
1877 maxpages = UINT_MAX;
1878 }
1879 p->highest_bit = maxpages - 1;
1880
1881 if (!maxpages)
1882 return 0;
1883 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1884 if (swapfilepages && maxpages > swapfilepages) {
1885 printk(KERN_WARNING
1886 "Swap area shorter than signature indicates\n");
1887 return 0;
1888 }
1889 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1890 return 0;
1891 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1892 return 0;
1893
1894 return maxpages;
1895}
1896
1897static int setup_swap_map_and_extents(struct swap_info_struct *p,
1898 union swap_header *swap_header,
1899 unsigned char *swap_map,
1900 unsigned long maxpages,
1901 sector_t *span)
1902{
1903 int i;
1904 unsigned int nr_good_pages;
1905 int nr_extents;
1906
1907 nr_good_pages = maxpages - 1;
1908
1909 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1910 unsigned int page_nr = swap_header->info.badpages[i];
1911 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1912 return -EINVAL;
1913 if (page_nr < maxpages) {
1914 swap_map[page_nr] = SWAP_MAP_BAD;
1915 nr_good_pages--;
1916 }
1917 }
1918
1919 if (nr_good_pages) {
1920 swap_map[0] = SWAP_MAP_BAD;
1921 p->max = maxpages;
1922 p->pages = nr_good_pages;
1923 nr_extents = setup_swap_extents(p, span);
1924 if (nr_extents < 0)
1925 return nr_extents;
1926 nr_good_pages = p->pages;
1927 }
1928 if (!nr_good_pages) {
1929 printk(KERN_WARNING "Empty swap-file\n");
1930 return -EINVAL;
1931 }
1932
1933 return nr_extents;
1934}
1935
1936SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1937{
1938 struct swap_info_struct *p;
1939 char *name;
1940 struct file *swap_file = NULL;
1941 struct address_space *mapping;
1942 int i;
1943 int prio;
1944 int error;
1945 union swap_header *swap_header;
1946 int nr_extents;
1947 sector_t span;
1948 unsigned long maxpages;
1949 unsigned char *swap_map = NULL;
1950 unsigned long *frontswap_map = NULL;
1951 struct page *page = NULL;
1952 struct inode *inode = NULL;
1953
1954 if (swap_flags & ~SWAP_FLAGS_VALID)
1955 return -EINVAL;
1956
1957 if (!capable(CAP_SYS_ADMIN))
1958 return -EPERM;
1959
1960 p = alloc_swap_info();
1961 if (IS_ERR(p))
1962 return PTR_ERR(p);
1963
1964 name = getname(specialfile);
1965 if (IS_ERR(name)) {
1966 error = PTR_ERR(name);
1967 name = NULL;
1968 goto bad_swap;
1969 }
1970 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1971 if (IS_ERR(swap_file)) {
1972 error = PTR_ERR(swap_file);
1973 swap_file = NULL;
1974 goto bad_swap;
1975 }
1976
1977 p->swap_file = swap_file;
1978 mapping = swap_file->f_mapping;
1979
1980 for (i = 0; i < nr_swapfiles; i++) {
1981 struct swap_info_struct *q = swap_info[i];
1982
1983 if (q == p || !q->swap_file)
1984 continue;
1985 if (mapping == q->swap_file->f_mapping) {
1986 error = -EBUSY;
1987 goto bad_swap;
1988 }
1989 }
1990
1991 inode = mapping->host;
1992
1993 error = claim_swapfile(p, inode);
1994 if (unlikely(error))
1995 goto bad_swap;
1996
1997
1998
1999
2000 if (!mapping->a_ops->readpage) {
2001 error = -EINVAL;
2002 goto bad_swap;
2003 }
2004 page = read_mapping_page(mapping, 0, swap_file);
2005 if (IS_ERR(page)) {
2006 error = PTR_ERR(page);
2007 goto bad_swap;
2008 }
2009 swap_header = kmap(page);
2010
2011 maxpages = read_swap_header(p, swap_header, inode);
2012 if (unlikely(!maxpages)) {
2013 error = -EINVAL;
2014 goto bad_swap;
2015 }
2016
2017
2018 swap_map = vzalloc(maxpages);
2019 if (!swap_map) {
2020 error = -ENOMEM;
2021 goto bad_swap;
2022 }
2023
2024 error = swap_cgroup_swapon(p->type, maxpages);
2025 if (error)
2026 goto bad_swap;
2027
2028 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2029 maxpages, &span);
2030 if (unlikely(nr_extents < 0)) {
2031 error = nr_extents;
2032 goto bad_swap;
2033 }
2034
2035 if (frontswap_enabled)
2036 frontswap_map = vzalloc(maxpages / sizeof(long));
2037
2038 if (p->bdev) {
2039 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2040 p->flags |= SWP_SOLIDSTATE;
2041 p->cluster_next = 1 + (random32() % p->highest_bit);
2042 }
2043 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2044 p->flags |= SWP_DISCARDABLE;
2045 }
2046
2047 mutex_lock(&swapon_mutex);
2048 prio = -1;
2049 if (swap_flags & SWAP_FLAG_PREFER)
2050 prio =
2051 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2052 enable_swap_info(p, prio, swap_map, frontswap_map);
2053
2054 printk(KERN_INFO "Adding %uk swap on %s. "
2055 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2056 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2057 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2058 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2059 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2060 (frontswap_map) ? "FS" : "");
2061
2062 mutex_unlock(&swapon_mutex);
2063 atomic_inc(&proc_poll_event);
2064 wake_up_interruptible(&proc_poll_wait);
2065
2066 if (S_ISREG(inode->i_mode))
2067 inode->i_flags |= S_SWAPFILE;
2068 error = 0;
2069 goto out;
2070bad_swap:
2071 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2072 set_blocksize(p->bdev, p->old_block_size);
2073 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2074 }
2075 destroy_swap_extents(p);
2076 swap_cgroup_swapoff(p->type);
2077 spin_lock(&swap_lock);
2078 p->swap_file = NULL;
2079 p->flags = 0;
2080 spin_unlock(&swap_lock);
2081 vfree(swap_map);
2082 if (swap_file) {
2083 if (inode && S_ISREG(inode->i_mode)) {
2084 mutex_unlock(&inode->i_mutex);
2085 inode = NULL;
2086 }
2087 filp_close(swap_file, NULL);
2088 }
2089out:
2090 if (page && !IS_ERR(page)) {
2091 kunmap(page);
2092 page_cache_release(page);
2093 }
2094 if (name)
2095 putname(name);
2096 if (inode && S_ISREG(inode->i_mode))
2097 mutex_unlock(&inode->i_mutex);
2098 return error;
2099}
2100
2101void si_swapinfo(struct sysinfo *val)
2102{
2103 unsigned int type;
2104 unsigned long nr_to_be_unused = 0;
2105
2106 spin_lock(&swap_lock);
2107 for (type = 0; type < nr_swapfiles; type++) {
2108 struct swap_info_struct *si = swap_info[type];
2109
2110 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2111 nr_to_be_unused += si->inuse_pages;
2112 }
2113 val->freeswap = nr_swap_pages + nr_to_be_unused;
2114 val->totalswap = total_swap_pages + nr_to_be_unused;
2115 spin_unlock(&swap_lock);
2116}
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2130{
2131 struct swap_info_struct *p;
2132 unsigned long offset, type;
2133 unsigned char count;
2134 unsigned char has_cache;
2135 int err = -EINVAL;
2136
2137 if (non_swap_entry(entry))
2138 goto out;
2139
2140 type = swp_type(entry);
2141 if (type >= nr_swapfiles)
2142 goto bad_file;
2143 p = swap_info[type];
2144 offset = swp_offset(entry);
2145
2146 spin_lock(&swap_lock);
2147 if (unlikely(offset >= p->max))
2148 goto unlock_out;
2149
2150 count = p->swap_map[offset];
2151 has_cache = count & SWAP_HAS_CACHE;
2152 count &= ~SWAP_HAS_CACHE;
2153 err = 0;
2154
2155 if (usage == SWAP_HAS_CACHE) {
2156
2157
2158 if (!has_cache && count)
2159 has_cache = SWAP_HAS_CACHE;
2160 else if (has_cache)
2161 err = -EEXIST;
2162 else
2163 err = -ENOENT;
2164
2165 } else if (count || has_cache) {
2166
2167 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2168 count += usage;
2169 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2170 err = -EINVAL;
2171 else if (swap_count_continued(p, offset, count))
2172 count = COUNT_CONTINUED;
2173 else
2174 err = -ENOMEM;
2175 } else
2176 err = -ENOENT;
2177
2178 p->swap_map[offset] = count | has_cache;
2179
2180unlock_out:
2181 spin_unlock(&swap_lock);
2182out:
2183 return err;
2184
2185bad_file:
2186 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2187 goto out;
2188}
2189
2190
2191
2192
2193
/*
 * Add SWAP_MAP_SHMEM to the swap map count of @entry via
 * __swap_duplicate().  The result of that call is discarded.
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
2198
2199
2200
2201
2202
2203
2204
2205
2206int swap_duplicate(swp_entry_t entry)
2207{
2208 int err = 0;
2209
2210 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2211 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2212 return err;
2213}
2214
2215
2216
2217
2218
2219
2220
2221
2222
/*
 * Try to set the SWAP_HAS_CACHE flag on the swap map slot of @entry.
 *
 * Returns the result of __swap_duplicate(entry, SWAP_HAS_CACHE): 0 on
 * success, -EEXIST if the flag was already set, -ENOENT if the slot has
 * no users, -EINVAL for an invalid entry.
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235
2236
2237
/*
 * Return the address_space of the swap file backing a swap-cache page,
 * looked up through page_swap_info().  The page must be in the swap
 * cache (checked with VM_BUG_ON).
 */
struct address_space *__page_file_mapping(struct page *page)
{
	VM_BUG_ON(!PageSwapCache(page));
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
/*
 * Make sure a continuation page is available so that the count of swap
 * entry @entry can grow beyond what its swap_map byte can hold.
 *
 * @entry:    swap entry whose count is about to overflow
 * @gfp_mask: allocation flags for the continuation page (__GFP_HIGHMEM
 *            is added)
 *
 * Returns 0 when no continuation is needed, when one already has room,
 * when the entry is not valid, or when the new page was attached.
 * Returns -ENOMEM only when a page is actually required and the
 * allocation failed.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * Allocate before swap_info_get(): the paths below that follow a
	 * successful swap_info_get() all release swap_lock, so evidently
	 * it returns with that spinlock held.  The result is only checked
	 * once we know the page is really needed.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * No valid swap area for this entry: nothing to extend and
		 * no lock to drop, so just free the page and report success.
		 */
		goto outer;
	}

	offset = swp_offset(entry);
	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The swap_map byte is not at its maximum, so the count can
		 * be raised without a continuation page.
		 */
		goto out;
	}

	if (!page) {
		spin_unlock(&swap_lock);
		return -ENOMEM;
	}

	/*
	 * The vmalloc page holding this swap_map byte serves as the list
	 * head for its continuation pages; from here on offset is the
	 * byte position within a page.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	/*
	 * First continuation for this swap_map page: initialise the list
	 * head and mark both the page and the swap area as continued.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * The previous level does not have COUNT_CONTINUED set, so
		 * this existing page is not yet used for this offset and
		 * can take the overflow: no new page needed.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * This continuation byte is not at SWAP_CONT_MAX, so it
		 * still has room: no new page needed.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out;
	}

	/* every existing level is full: append the new page */
	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now attached, must not be freed */
out:
	spin_unlock(&swap_lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
2360
2361
2362
2363
2364
2365
2366
2367
2368
/*
 * Carry an increment or borrow a decrement through the continuation
 * pages chained on the swap_map page that holds @offset.
 *
 * @si:     swap area
 * @offset: swap offset of the entry
 * @count:  current swap_map count of the entry without SWAP_HAS_CACHE
 *
 * @count selects the operation: SWAP_MAP_MAX or
 * (SWAP_MAP_MAX | COUNT_CONTINUED) means increment, anything else must
 * be exactly COUNT_CONTINUED and means decrement.
 *
 * Increment returns true when the carry was stored, false when a
 * further continuation page has to be added first.  Decrement returns
 * true when the value left in `count` at the end is COUNT_CONTINUED,
 * false when the first continuation byte was decremented to zero.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* no continuation pages on this head */
	}

	/* byte position inside each continuation page */
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* first use of the continuation */
		goto init_map;		/* skip the SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Like adding 1 to 999: step past every level that is
		 * already full and marked as continued.
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			/* full and last used level: carry into the next */
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head)
				return false;	/* ran out of pages */
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* byte may hold stale data */
		}
		*map += 1;
		kunmap_atomic(map);
		/* walk back towards head, resetting the lower levels */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Like subtracting 1 from 1000: step past every level
		 * that holds zero but is marked as continued.
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		/* a level that dropped to zero is no longer continued into */
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		/* walk back towards head, refilling the lower levels */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return count == COUNT_CONTINUED;
	}
}
2445
2446
2447
2448
2449
2450static void free_swap_count_continuations(struct swap_info_struct *si)
2451{
2452 pgoff_t offset;
2453
2454 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2455 struct page *head;
2456 head = vmalloc_to_page(si->swap_map + offset);
2457 if (page_private(head)) {
2458 struct list_head *this, *next;
2459 list_for_each_safe(this, next, &head->lru) {
2460 struct page *page;
2461 page = list_entry(this, struct page, lru);
2462 list_del(this);
2463 __free_page(page);
2464 }
2465 }
2466 }
2467}
2468