1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36
37#include <asm/pgtable.h>
38#include <asm/tlbflush.h>
39#include <linux/swapops.h>
40#include <linux/page_cgroup.h>
41
/* Forward declarations for helpers that are used before they are defined. */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

/*
 * Global lock.  In this file it is held around updates to swap_list,
 * nr_swap_pages and the per-area swap_map / lowest_bit / highest_bit state.
 */
DEFINE_SPINLOCK(swap_lock);
/* Upper bound used when validating a swap type index. */
static unsigned int nr_swapfiles;
/* Decremented when a slot is allocated, incremented when one is freed. */
long nr_swap_pages;
/* Grows/shrinks by an area's ->pages when it is enabled/disabled. */
long total_swap_pages;
/* Pre-decremented to pick a priority for areas enabled with prio < 0. */
static int least_priority;

/* Message prefixes printed by swap_info_get() when an entry is rejected. */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * Both members start at -1; the list walkers in this file treat a negative
 * index as "no swap area".
 */
struct swap_list_t swap_list = {-1, -1};

/* Swap area descriptors, indexed by swap type. */
struct swap_info_struct *swap_info[MAX_SWAPFILES];

/* Taken by swapoff around the final teardown of an area. */
static DEFINE_MUTEX(swapon_mutex);

/*
 * NOTE(review): the wait queue and event counter below are not referenced in
 * the visible part of the file; presumably they back poll() on /proc/swaps -
 * confirm.
 */
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);

static atomic_t proc_poll_event = ATOMIC_INIT(0);
67
68static inline unsigned char swap_count(unsigned char ent)
69{
70 return ent & ~SWAP_HAS_CACHE;
71}
72
73
74static int
75__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
76{
77 swp_entry_t entry = swp_entry(si->type, offset);
78 struct page *page;
79 int ret = 0;
80
81 page = find_get_page(&swapper_space, entry.val);
82 if (!page)
83 return 0;
84
85
86
87
88
89
90
91 if (trylock_page(page)) {
92 ret = try_to_free_swap(page);
93 unlock_page(page);
94 }
95 page_cache_release(page);
96 return ret;
97}
98
99
100
101
102
103static int discard_swap(struct swap_info_struct *si)
104{
105 struct swap_extent *se;
106 sector_t start_block;
107 sector_t nr_blocks;
108 int err = 0;
109
110
111 se = &si->first_swap_extent;
112 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
113 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
114 if (nr_blocks) {
115 err = blkdev_issue_discard(si->bdev, start_block,
116 nr_blocks, GFP_KERNEL, 0);
117 if (err)
118 return err;
119 cond_resched();
120 }
121
122 list_for_each_entry(se, &si->first_swap_extent.list, list) {
123 start_block = se->start_block << (PAGE_SHIFT - 9);
124 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
125
126 err = blkdev_issue_discard(si->bdev, start_block,
127 nr_blocks, GFP_KERNEL, 0);
128 if (err)
129 break;
130
131 cond_resched();
132 }
133 return err;
134}
135
136
137
138
139
140static void discard_swap_cluster(struct swap_info_struct *si,
141 pgoff_t start_page, pgoff_t nr_pages)
142{
143 struct swap_extent *se = si->curr_swap_extent;
144 int found_extent = 0;
145
146 while (nr_pages) {
147 struct list_head *lh;
148
149 if (se->start_page <= start_page &&
150 start_page < se->start_page + se->nr_pages) {
151 pgoff_t offset = start_page - se->start_page;
152 sector_t start_block = se->start_block + offset;
153 sector_t nr_blocks = se->nr_pages - offset;
154
155 if (nr_blocks > nr_pages)
156 nr_blocks = nr_pages;
157 start_page += nr_blocks;
158 nr_pages -= nr_blocks;
159
160 if (!found_extent++)
161 si->curr_swap_extent = se;
162
163 start_block <<= PAGE_SHIFT - 9;
164 nr_blocks <<= PAGE_SHIFT - 9;
165 if (blkdev_issue_discard(si->bdev, start_block,
166 nr_blocks, GFP_NOIO, 0))
167 break;
168 }
169
170 lh = se->list.next;
171 se = list_entry(lh, struct swap_extent, list);
172 }
173}
174
/*
 * Action callback handed to wait_on_bit() by scan_swap_map(): gives up the
 * CPU via schedule() and returns 0.  @word is not used.
 */
static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}
180
/* Number of consecutive free slots scan_swap_map() looks for as a cluster. */
#define SWAPFILE_CLUSTER 256
/* Scan iterations between cond_resched() calls in scan_swap_map(). */
#define LATENCY_LIMIT 256
183
/*
 * Find a free slot in swap area @si, store @usage in its swap_map byte and
 * return the slot offset.  Returns 0 when no slot could be allocated.
 *
 * The callers in this file invoke it with swap_lock held; the lock is
 * dropped and retaken while scanning, and is held again on return.
 */
static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/*
	 * SWP_SCANNING is added to (not or-ed into) flags, so the flags word
	 * counts in-flight scanners; swapoff waits until flags drops below
	 * SWP_SCANNING before tearing the area down.
	 */
	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* Current cluster exhausted: try to find a fresh empty cluster. */
	if (unlikely(!si->cluster_nr--)) {
		/* Too few free pages for a whole cluster: allocate piecemeal. */
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}
		if (si->flags & SWP_DISCARDABLE) {
			/*
			 * A non-zero lowest_alloc means a cluster search (and
			 * possible discard) is already being tracked; do not
			 * start another one.
			 */
			if (si->lowest_alloc)
				goto checks;
			/* Start tracking the range allocated meanwhile. */
			si->lowest_alloc = si->max;
			si->highest_alloc = 0;
		}
		spin_unlock(&swap_lock);

		/*
		 * Scan without the lock.  Unless the area is flagged
		 * SWP_SOLIDSTATE, restart the search from lowest_bit.
		 */
		if (!(si->flags & SWP_SOLIDSTATE))
			scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Look for SWAPFILE_CLUSTER consecutive free slots, up to highest_bit. */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				/* Rewind to the first slot of the free run. */
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Same search again, now from lowest_bit up to scan_base. */
		for (; last_in_cluster < scan_base; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* No empty cluster found: fall back to scanning from scan_base. */
		offset = scan_base;
		spin_lock(&swap_lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		si->lowest_alloc = 0;
	}

checks:
	/* Reached with swap_lock held. */
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	/* Slot held only by the swap cache: try to reclaim and reuse it. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		spin_unlock(&swap_lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&swap_lock);
		/* Freed: re-run the checks on the same offset. */
		if (swap_was_freed)
			goto checks;
		/* Not freed: move on to the next slot. */
		goto scan;
	}

	if (si->swap_map[offset])
		goto scan;

	/* Slot is free: claim it and update the bookkeeping. */
	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		/* Area is now full: make the [lowest_bit, highest_bit] range empty. */
		si->lowest_bit = si->max;
		si->highest_bit = 0;
	}
	si->swap_map[offset] = usage;
	si->cluster_next = offset + 1;
	si->flags -= SWP_SCANNING;

	/* Non-zero lowest_alloc: a discard-capable cluster search is in progress. */
	if (si->lowest_alloc) {
		if (found_free_cluster) {
			/*
			 * This caller found the free cluster and discards it.
			 * If slots inside the cluster were handed out while
			 * the lock was dropped, stop the discard just below
			 * the lowest of them.
			 */
			if (offset < si->highest_alloc &&
			    si->lowest_alloc <= last_in_cluster)
				last_in_cluster = si->lowest_alloc - 1;
			si->flags |= SWP_DISCARDING;
			spin_unlock(&swap_lock);

			if (offset < last_in_cluster)
				discard_swap_cluster(si, offset,
					last_in_cluster - offset + 1);

			spin_lock(&swap_lock);
			si->lowest_alloc = 0;
			si->flags &= ~SWP_DISCARDING;

			/* Barrier between clearing the bit and waking waiters. */
			smp_mb();
			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));

		} else if (si->flags & SWP_DISCARDING) {
			/*
			 * Another caller is discarding: wait, with the lock
			 * dropped, until SWP_DISCARDING is cleared.
			 */
			spin_unlock(&swap_lock);
			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
				wait_for_discard, TASK_UNINTERRUPTIBLE);
			spin_lock(&swap_lock);
		} else {
			/*
			 * Record the range allocated while a cluster search is
			 * in progress, for the discarding caller to consult.
			 */
			if (offset < si->lowest_alloc)
				si->lowest_alloc = offset;
			if (offset > si->highest_alloc)
				si->highest_alloc = offset;
		}
	}
	return offset;

scan:
	/* Linear search for a usable slot, done without the lock. */
	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	/* Wrap around: continue from lowest_bit up to where the scan began. */
	offset = si->lowest_bit;
	while (++offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}
412
413swp_entry_t get_swap_page(void)
414{
415 struct swap_info_struct *si;
416 pgoff_t offset;
417 int type, next;
418 int wrapped = 0;
419
420 spin_lock(&swap_lock);
421 if (nr_swap_pages <= 0)
422 goto noswap;
423 nr_swap_pages--;
424
425 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
426 si = swap_info[type];
427 next = si->next;
428 if (next < 0 ||
429 (!wrapped && si->prio != swap_info[next]->prio)) {
430 next = swap_list.head;
431 wrapped++;
432 }
433
434 if (!si->highest_bit)
435 continue;
436 if (!(si->flags & SWP_WRITEOK))
437 continue;
438
439 swap_list.next = next;
440
441 offset = scan_swap_map(si, SWAP_HAS_CACHE);
442 if (offset) {
443 spin_unlock(&swap_lock);
444 return swp_entry(type, offset);
445 }
446 next = swap_list.next;
447 }
448
449 nr_swap_pages++;
450noswap:
451 spin_unlock(&swap_lock);
452 return (swp_entry_t) {0};
453}
454
455
456swp_entry_t get_swap_page_of_type(int type)
457{
458 struct swap_info_struct *si;
459 pgoff_t offset;
460
461 spin_lock(&swap_lock);
462 si = swap_info[type];
463 if (si && (si->flags & SWP_WRITEOK)) {
464 nr_swap_pages--;
465
466 offset = scan_swap_map(si, 1);
467 if (offset) {
468 spin_unlock(&swap_lock);
469 return swp_entry(type, offset);
470 }
471 nr_swap_pages++;
472 }
473 spin_unlock(&swap_lock);
474 return (swp_entry_t) {0};
475}
476
477static struct swap_info_struct *swap_info_get(swp_entry_t entry)
478{
479 struct swap_info_struct *p;
480 unsigned long offset, type;
481
482 if (!entry.val)
483 goto out;
484 type = swp_type(entry);
485 if (type >= nr_swapfiles)
486 goto bad_nofile;
487 p = swap_info[type];
488 if (!(p->flags & SWP_USED))
489 goto bad_device;
490 offset = swp_offset(entry);
491 if (offset >= p->max)
492 goto bad_offset;
493 if (!p->swap_map[offset])
494 goto bad_free;
495 spin_lock(&swap_lock);
496 return p;
497
498bad_free:
499 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
500 goto out;
501bad_offset:
502 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
503 goto out;
504bad_device:
505 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
506 goto out;
507bad_nofile:
508 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
509out:
510 return NULL;
511}
512
513static unsigned char swap_entry_free(struct swap_info_struct *p,
514 swp_entry_t entry, unsigned char usage)
515{
516 unsigned long offset = swp_offset(entry);
517 unsigned char count;
518 unsigned char has_cache;
519
520 count = p->swap_map[offset];
521 has_cache = count & SWAP_HAS_CACHE;
522 count &= ~SWAP_HAS_CACHE;
523
524 if (usage == SWAP_HAS_CACHE) {
525 VM_BUG_ON(!has_cache);
526 has_cache = 0;
527 } else if (count == SWAP_MAP_SHMEM) {
528
529
530
531
532 count = 0;
533 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
534 if (count == COUNT_CONTINUED) {
535 if (swap_count_continued(p, offset, count))
536 count = SWAP_MAP_MAX | COUNT_CONTINUED;
537 else
538 count = SWAP_MAP_MAX;
539 } else
540 count--;
541 }
542
543 if (!count)
544 mem_cgroup_uncharge_swap(entry);
545
546 usage = count | has_cache;
547 p->swap_map[offset] = usage;
548
549
550 if (!usage) {
551 struct gendisk *disk = p->bdev->bd_disk;
552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset;
554 if (offset > p->highest_bit)
555 p->highest_bit = offset;
556 if (swap_list.next >= 0 &&
557 p->prio > swap_info[swap_list.next]->prio)
558 swap_list.next = p->type;
559 nr_swap_pages++;
560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
562 if ((p->flags & SWP_BLKDEV) &&
563 disk->fops->swap_slot_free_notify)
564 disk->fops->swap_slot_free_notify(p->bdev, offset);
565 }
566
567 return usage;
568}
569
570
571
572
573
574void swap_free(swp_entry_t entry)
575{
576 struct swap_info_struct *p;
577
578 p = swap_info_get(entry);
579 if (p) {
580 swap_entry_free(p, entry, 1);
581 spin_unlock(&swap_lock);
582 }
583}
584
585
586
587
588void swapcache_free(swp_entry_t entry, struct page *page)
589{
590 struct swap_info_struct *p;
591 unsigned char count;
592
593 p = swap_info_get(entry);
594 if (p) {
595 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
596 if (page)
597 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
598 spin_unlock(&swap_lock);
599 }
600}
601
602
603
604
605
606
607int page_swapcount(struct page *page)
608{
609 int count = 0;
610 struct swap_info_struct *p;
611 swp_entry_t entry;
612
613 entry.val = page_private(page);
614 p = swap_info_get(entry);
615 if (p) {
616 count = swap_count(p->swap_map[swp_offset(entry)]);
617 spin_unlock(&swap_lock);
618 }
619 return count;
620}
621
622
623
624
625
626
627
/*
 * Decide whether the locked @page has at most one user (page mappings plus
 * swap references).
 *
 * When the total is exactly one and the page is not under writeback, the
 * page is also removed from the swap cache and marked dirty.
 * Returns 1 if the page has at most one user, 0 otherwise (always 0 for
 * KSM pages).
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1)
		return 0;
	if (!PageSwapCache(page))
		return 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
645
646
647
648
649
/*
 * Remove the locked @page from the swap cache if nothing else needs its
 * swap slot.
 *
 * Returns 1 when the page was removed (and marked dirty), 0 when it is not
 * in the swap cache, is under writeback, still has swap references, or
 * pm_suspended_storage() reports true.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	if (!PageSwapCache(page) || PageWriteback(page) ||
	    page_swapcount(page))
		return 0;

	/* Leave the swap cache alone while storage is suspended. */
	if (pm_suspended_storage())
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}
683
684
685
686
687
688int free_swap_and_cache(swp_entry_t entry)
689{
690 struct swap_info_struct *p;
691 struct page *page = NULL;
692
693 if (non_swap_entry(entry))
694 return 1;
695
696 p = swap_info_get(entry);
697 if (p) {
698 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
699 page = find_get_page(&swapper_space, entry.val);
700 if (page && !trylock_page(page)) {
701 page_cache_release(page);
702 page = NULL;
703 }
704 }
705 spin_unlock(&swap_lock);
706 }
707 if (page) {
708
709
710
711
712 if (PageSwapCache(page) && !PageWriteback(page) &&
713 (!page_mapped(page) || vm_swap_full())) {
714 delete_from_swap_cache(page);
715 SetPageDirty(page);
716 }
717 unlock_page(page);
718 page_cache_release(page);
719 }
720 return p != NULL;
721}
722
723#ifdef CONFIG_HIBERNATION
724
725
726
727
728
729
730
731
732int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
733{
734 struct block_device *bdev = NULL;
735 int type;
736
737 if (device)
738 bdev = bdget(device);
739
740 spin_lock(&swap_lock);
741 for (type = 0; type < nr_swapfiles; type++) {
742 struct swap_info_struct *sis = swap_info[type];
743
744 if (!(sis->flags & SWP_WRITEOK))
745 continue;
746
747 if (!bdev) {
748 if (bdev_p)
749 *bdev_p = bdgrab(sis->bdev);
750
751 spin_unlock(&swap_lock);
752 return type;
753 }
754 if (bdev == sis->bdev) {
755 struct swap_extent *se = &sis->first_swap_extent;
756
757 if (se->start_block == offset) {
758 if (bdev_p)
759 *bdev_p = bdgrab(sis->bdev);
760
761 spin_unlock(&swap_lock);
762 bdput(bdev);
763 return type;
764 }
765 }
766 }
767 spin_unlock(&swap_lock);
768 if (bdev)
769 bdput(bdev);
770
771 return -ENODEV;
772}
773
774
775
776
777
778sector_t swapdev_block(int type, pgoff_t offset)
779{
780 struct block_device *bdev;
781
782 if ((unsigned int)type >= nr_swapfiles)
783 return 0;
784 if (!(swap_info[type]->flags & SWP_WRITEOK))
785 return 0;
786 return map_swap_entry(swp_entry(type, offset), &bdev);
787}
788
789
790
791
792
793
794
795unsigned int count_swap_pages(int type, int free)
796{
797 unsigned int n = 0;
798
799 spin_lock(&swap_lock);
800 if ((unsigned int)type < nr_swapfiles) {
801 struct swap_info_struct *sis = swap_info[type];
802
803 if (sis->flags & SWP_WRITEOK) {
804 n = sis->pages;
805 if (free)
806 n -= sis->inuse_pages;
807 }
808 }
809 spin_unlock(&swap_lock);
810 return n;
811}
812#endif
813
814
815
816
817
818
819static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
820 unsigned long addr, swp_entry_t entry, struct page *page)
821{
822 struct mem_cgroup *memcg;
823 spinlock_t *ptl;
824 pte_t *pte;
825 int ret = 1;
826
827 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
828 GFP_KERNEL, &memcg)) {
829 ret = -ENOMEM;
830 goto out_nolock;
831 }
832
833 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
834 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
835 if (ret > 0)
836 mem_cgroup_cancel_charge_swapin(memcg);
837 ret = 0;
838 goto out;
839 }
840
841 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
842 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
843 get_page(page);
844 set_pte_at(vma->vm_mm, addr, pte,
845 pte_mkold(mk_pte(page, vma->vm_page_prot)));
846 page_add_anon_rmap(page, vma, addr);
847 mem_cgroup_commit_charge_swapin(page, memcg);
848 swap_free(entry);
849
850
851
852
853 activate_page(page);
854out:
855 pte_unmap_unlock(pte, ptl);
856out_nolock:
857 return ret;
858}
859
/*
 * Scan the ptes of @pmd covering [@addr, @end) for @entry and hand each
 * match to unuse_pte().
 *
 * Returns 0 if no pte was replaced, otherwise the first non-zero value
 * returned by unuse_pte() (1 on success, negative on error).
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	int ret = 0;

	/*
	 * The scan maps the page table without taking the pte lock;
	 * unuse_pte() re-checks the pte under the lock before changing it.
	 */
	pte = pte_offset_map(pmd, addr);
	do {
		if (unlikely(pte_same(*pte, swp_pte))) {
			/* Unmap first: unuse_pte() maps and locks the pte itself. */
			pte_unmap(pte);
			ret = unuse_pte(vma, pmd, addr, entry, page);
			if (ret)
				goto out;
			/* Not replaced after all: remap and keep scanning. */
			pte = pte_offset_map(pmd, addr);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	/* pte was advanced once past the last slot examined. */
	pte_unmap(pte - 1);
out:
	return ret;
}
895
896static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
897 unsigned long addr, unsigned long end,
898 swp_entry_t entry, struct page *page)
899{
900 pmd_t *pmd;
901 unsigned long next;
902 int ret;
903
904 pmd = pmd_offset(pud, addr);
905 do {
906 next = pmd_addr_end(addr, end);
907 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
908 continue;
909 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
910 if (ret)
911 return ret;
912 } while (pmd++, addr = next, addr != end);
913 return 0;
914}
915
916static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
917 unsigned long addr, unsigned long end,
918 swp_entry_t entry, struct page *page)
919{
920 pud_t *pud;
921 unsigned long next;
922 int ret;
923
924 pud = pud_offset(pgd, addr);
925 do {
926 next = pud_addr_end(addr, end);
927 if (pud_none_or_clear_bad(pud))
928 continue;
929 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
930 if (ret)
931 return ret;
932 } while (pud++, addr = next, addr != end);
933 return 0;
934}
935
936static int unuse_vma(struct vm_area_struct *vma,
937 swp_entry_t entry, struct page *page)
938{
939 pgd_t *pgd;
940 unsigned long addr, end, next;
941 int ret;
942
943 if (page_anon_vma(page)) {
944 addr = page_address_in_vma(page, vma);
945 if (addr == -EFAULT)
946 return 0;
947 else
948 end = addr + PAGE_SIZE;
949 } else {
950 addr = vma->vm_start;
951 end = vma->vm_end;
952 }
953
954 pgd = pgd_offset(vma->vm_mm, addr);
955 do {
956 next = pgd_addr_end(addr, end);
957 if (pgd_none_or_clear_bad(pgd))
958 continue;
959 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
960 if (ret)
961 return ret;
962 } while (pgd++, addr = next, addr != end);
963 return 0;
964}
965
966static int unuse_mm(struct mm_struct *mm,
967 swp_entry_t entry, struct page *page)
968{
969 struct vm_area_struct *vma;
970 int ret = 0;
971
972 if (!down_read_trylock(&mm->mmap_sem)) {
973
974
975
976
977 activate_page(page);
978 unlock_page(page);
979 down_read(&mm->mmap_sem);
980 lock_page(page);
981 }
982 for (vma = mm->mmap; vma; vma = vma->vm_next) {
983 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
984 break;
985 }
986 up_read(&mm->mmap_sem);
987 return (ret < 0)? ret: 0;
988}
989
990
991
992
993
994
995static unsigned int find_next_to_unuse(struct swap_info_struct *si,
996 unsigned int prev, bool frontswap)
997{
998 unsigned int max = si->max;
999 unsigned int i = prev;
1000 unsigned char count;
1001
1002
1003
1004
1005
1006
1007
1008 for (;;) {
1009 if (++i >= max) {
1010 if (!prev) {
1011 i = 0;
1012 break;
1013 }
1014
1015
1016
1017
1018 max = prev + 1;
1019 prev = 0;
1020 i = 1;
1021 }
1022 if (frontswap) {
1023 if (frontswap_test(si, i))
1024 break;
1025 else
1026 continue;
1027 }
1028 count = si->swap_map[i];
1029 if (count && swap_count(count) != SWAP_MAP_BAD)
1030 break;
1031 }
1032 return i;
1033}
1034
1035
1036
1037
1038
1039
1040
1041
1042
/*
 * Bring the in-use slots of swap area @type back into memory and drop
 * their swap references.
 *
 * @type:           swap area to empty
 * @frontswap:      only consider slots for which frontswap_test() is true
 * @pages_to_unuse: with @frontswap set and this non-zero, stop after that
 *                  many pages; 0 means no limit
 *
 * Returns 0 on success, -EINTR if a signal is pending, -ENOMEM if a page
 * could not be read in, or an error from shmem_unuse()/unuse_mm().
 */
int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * start_mm is where the mmlist walk for each entry begins.  It starts
	 * as init_mm with an extra mm_users reference, which is dropped by
	 * the mmput() at the end of the function.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/* Loop until find_next_to_unuse() reports no remaining slot (0). */
	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* Read the slot's page into the swap cache. */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * No page: if the slot was freed in the meantime there
			 * is nothing to do, otherwise report -ENOMEM.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/* We hold the only reference to start_mm: fall back to init_mm. */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/* Wait for pending I/O, then take the page lock. */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/* Remove the references to this entry, starting with start_mm. */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/*
			 * NOTE(review): the page is not unlocked or released
			 * on this path; presumably shmem_unuse() does both -
			 * confirm.
			 */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		/* References remain: walk the mmlist onwards from start_mm. */
		if (swap_count(*swap_map)) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			/* One reference each for new_start_mm and prev_mm. */
			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip mms whose user count already hit zero. */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				/*
				 * The count dropped while handling this mm:
				 * make it the starting point for next time.
				 */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * The entry is still referenced and the page is dirty in the
		 * swap cache: write it out, then relock the page and wait for
		 * writeback (the page is not locked after swap_writepage(),
		 * hence the lock_page()).
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * Drop the page from the swap cache, but only if it is still
		 * the swap-cache page for this very entry.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/* The in-memory copy must now be treated as dirty. */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		cond_resched();
		/* Optional page budget, honoured only in frontswap mode. */
		if (frontswap && pages_to_unuse > 0) {
			if (!--pages_to_unuse)
				break;
		}
	}

	mmput(start_mm);
	return retval;
}
1253
1254
1255
1256
1257
1258
1259
1260static void drain_mmlist(void)
1261{
1262 struct list_head *p, *next;
1263 unsigned int type;
1264
1265 for (type = 0; type < nr_swapfiles; type++)
1266 if (swap_info[type]->inuse_pages)
1267 return;
1268 spin_lock(&mmlist_lock);
1269 list_for_each_safe(p, next, &init_mm.mmlist)
1270 list_del_init(p);
1271 spin_unlock(&mmlist_lock);
1272}
1273
1274
1275
1276
1277
1278
1279
1280static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1281{
1282 struct swap_info_struct *sis;
1283 struct swap_extent *start_se;
1284 struct swap_extent *se;
1285 pgoff_t offset;
1286
1287 sis = swap_info[swp_type(entry)];
1288 *bdev = sis->bdev;
1289
1290 offset = swp_offset(entry);
1291 start_se = sis->curr_swap_extent;
1292 se = start_se;
1293
1294 for ( ; ; ) {
1295 struct list_head *lh;
1296
1297 if (se->start_page <= offset &&
1298 offset < (se->start_page + se->nr_pages)) {
1299 return se->start_block + (offset - se->start_page);
1300 }
1301 lh = se->list.next;
1302 se = list_entry(lh, struct swap_extent, list);
1303 sis->curr_swap_extent = se;
1304 BUG_ON(se == start_se);
1305 }
1306}
1307
1308
1309
1310
1311sector_t map_swap_page(struct page *page, struct block_device **bdev)
1312{
1313 swp_entry_t entry;
1314 entry.val = page_private(page);
1315 return map_swap_entry(entry, bdev);
1316}
1317
1318
1319
1320
1321static void destroy_swap_extents(struct swap_info_struct *sis)
1322{
1323 while (!list_empty(&sis->first_swap_extent.list)) {
1324 struct swap_extent *se;
1325
1326 se = list_entry(sis->first_swap_extent.list.next,
1327 struct swap_extent, list);
1328 list_del(&se->list);
1329 kfree(se);
1330 }
1331}
1332
1333
1334
1335
1336
1337
1338
1339static int
1340add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1341 unsigned long nr_pages, sector_t start_block)
1342{
1343 struct swap_extent *se;
1344 struct swap_extent *new_se;
1345 struct list_head *lh;
1346
1347 if (start_page == 0) {
1348 se = &sis->first_swap_extent;
1349 sis->curr_swap_extent = se;
1350 se->start_page = 0;
1351 se->nr_pages = nr_pages;
1352 se->start_block = start_block;
1353 return 1;
1354 } else {
1355 lh = sis->first_swap_extent.list.prev;
1356 se = list_entry(lh, struct swap_extent, list);
1357 BUG_ON(se->start_page + se->nr_pages != start_page);
1358 if (se->start_block + se->nr_pages == start_block) {
1359
1360 se->nr_pages += nr_pages;
1361 return 0;
1362 }
1363 }
1364
1365
1366
1367
1368 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1369 if (new_se == NULL)
1370 return -ENOMEM;
1371 new_se->start_page = start_page;
1372 new_se->nr_pages = nr_pages;
1373 new_se->start_block = start_block;
1374
1375 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1376 return 1;
1377}
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
/*
 * Build the extent list that maps swap page numbers of @sis to blocks.
 *
 * A block device gets a single extent covering sis->max pages starting at
 * block 0.  For any other file the function probes it with bmap() one page
 * at a time and keeps only pages whose blocks are page-aligned and
 * contiguous; runs of adjacent pages are merged by add_swap_extent().
 *
 * On success for a non-block file, sis->max, sis->pages and
 * sis->highest_bit are updated to the number of usable pages found.
 *
 * @span: receives sis->pages for a block device, otherwise
 *        1 + highest - lowest page-sized block seen (page 0 excluded).
 *
 * Returns the number of extents added, -EINVAL if bmap() reports a hole,
 * or a negative error from add_swap_extent().
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	inode = sis->swap_file->f_mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Block device: one extent, identity-mapped from block 0. */
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		goto out;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Walk the file in filesystem blocks, looking for page-sized,
	 * page-aligned runs of consecutive blocks.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		/* bmap() returning 0 is treated as a hole in the file. */
		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/* First block is not page-aligned: slide the probe by one block. */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		/* The remaining blocks of the page must follow consecutively. */
		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity: slide the probe by one block. */
				probe_block++;
				goto reprobe;
			}
		}

		/* Convert from filesystem blocks to page-sized blocks. */
		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* page 0 is left out of the span computation */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/* Found a usable page: record it, one page at a time. */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* keeps max at 1 and pages at 0 below */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}
1505
1506static void enable_swap_info(struct swap_info_struct *p, int prio,
1507 unsigned char *swap_map,
1508 unsigned long *frontswap_map)
1509{
1510 int i, prev;
1511
1512 spin_lock(&swap_lock);
1513 if (prio >= 0)
1514 p->prio = prio;
1515 else
1516 p->prio = --least_priority;
1517 p->swap_map = swap_map;
1518 frontswap_map_set(p, frontswap_map);
1519 p->flags |= SWP_WRITEOK;
1520 nr_swap_pages += p->pages;
1521 total_swap_pages += p->pages;
1522
1523
1524 prev = -1;
1525 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1526 if (p->prio >= swap_info[i]->prio)
1527 break;
1528 prev = i;
1529 }
1530 p->next = i;
1531 if (prev < 0)
1532 swap_list.head = swap_list.next = p->type;
1533 else
1534 swap_info[prev]->next = p->type;
1535 frontswap_init(p->type);
1536 spin_unlock(&swap_lock);
1537}
1538
1539SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1540{
1541 struct swap_info_struct *p = NULL;
1542 unsigned char *swap_map;
1543 struct file *swap_file, *victim;
1544 struct address_space *mapping;
1545 struct inode *inode;
1546 char *pathname;
1547 int oom_score_adj;
1548 int i, type, prev;
1549 int err;
1550
1551 if (!capable(CAP_SYS_ADMIN))
1552 return -EPERM;
1553
1554 BUG_ON(!current->mm);
1555
1556 pathname = getname(specialfile);
1557 err = PTR_ERR(pathname);
1558 if (IS_ERR(pathname))
1559 goto out;
1560
1561 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1562 putname(pathname);
1563 err = PTR_ERR(victim);
1564 if (IS_ERR(victim))
1565 goto out;
1566
1567 mapping = victim->f_mapping;
1568 prev = -1;
1569 spin_lock(&swap_lock);
1570 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1571 p = swap_info[type];
1572 if (p->flags & SWP_WRITEOK) {
1573 if (p->swap_file->f_mapping == mapping)
1574 break;
1575 }
1576 prev = type;
1577 }
1578 if (type < 0) {
1579 err = -EINVAL;
1580 spin_unlock(&swap_lock);
1581 goto out_dput;
1582 }
1583 if (!security_vm_enough_memory_mm(current->mm, p->pages))
1584 vm_unacct_memory(p->pages);
1585 else {
1586 err = -ENOMEM;
1587 spin_unlock(&swap_lock);
1588 goto out_dput;
1589 }
1590 if (prev < 0)
1591 swap_list.head = p->next;
1592 else
1593 swap_info[prev]->next = p->next;
1594 if (type == swap_list.next) {
1595
1596 swap_list.next = swap_list.head;
1597 }
1598 if (p->prio < 0) {
1599 for (i = p->next; i >= 0; i = swap_info[i]->next)
1600 swap_info[i]->prio = p->prio--;
1601 least_priority++;
1602 }
1603 nr_swap_pages -= p->pages;
1604 total_swap_pages -= p->pages;
1605 p->flags &= ~SWP_WRITEOK;
1606 spin_unlock(&swap_lock);
1607
1608 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1609 err = try_to_unuse(type, false, 0);
1610 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1611
1612 if (err) {
1613
1614
1615
1616
1617
1618
1619
1620 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1621 goto out_dput;
1622 }
1623
1624 destroy_swap_extents(p);
1625 if (p->flags & SWP_CONTINUED)
1626 free_swap_count_continuations(p);
1627
1628 mutex_lock(&swapon_mutex);
1629 spin_lock(&swap_lock);
1630 drain_mmlist();
1631
1632
1633 p->highest_bit = 0;
1634 while (p->flags >= SWP_SCANNING) {
1635 spin_unlock(&swap_lock);
1636 schedule_timeout_uninterruptible(1);
1637 spin_lock(&swap_lock);
1638 }
1639
1640 swap_file = p->swap_file;
1641 p->swap_file = NULL;
1642 p->max = 0;
1643 swap_map = p->swap_map;
1644 p->swap_map = NULL;
1645 p->flags = 0;
1646 frontswap_invalidate_area(type);
1647 spin_unlock(&swap_lock);
1648 mutex_unlock(&swapon_mutex);
1649 vfree(swap_map);
1650 vfree(frontswap_map_get(p));
1651
1652 swap_cgroup_swapoff(type);
1653
1654 inode = mapping->host;
1655 if (S_ISBLK(inode->i_mode)) {
1656 struct block_device *bdev = I_BDEV(inode);
1657 set_blocksize(bdev, p->old_block_size);
1658 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1659 } else {
1660 mutex_lock(&inode->i_mutex);
1661 inode->i_flags &= ~S_SWAPFILE;
1662 mutex_unlock(&inode->i_mutex);
1663 }
1664 filp_close(swap_file, NULL);
1665 err = 0;
1666 atomic_inc(&proc_poll_event);
1667 wake_up_interruptible(&proc_poll_wait);
1668
1669out_dput:
1670 filp_close(victim, NULL);
1671out:
1672 return err;
1673}
1674
1675#ifdef CONFIG_PROC_FS
1676static unsigned swaps_poll(struct file *file, poll_table *wait)
1677{
1678 struct seq_file *seq = file->private_data;
1679
1680 poll_wait(file, &proc_poll_wait, wait);
1681
1682 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1683 seq->poll_event = atomic_read(&proc_poll_event);
1684 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1685 }
1686
1687 return POLLIN | POLLRDNORM;
1688}
1689
1690
1691static void *swap_start(struct seq_file *swap, loff_t *pos)
1692{
1693 struct swap_info_struct *si;
1694 int type;
1695 loff_t l = *pos;
1696
1697 mutex_lock(&swapon_mutex);
1698
1699 if (!l)
1700 return SEQ_START_TOKEN;
1701
1702 for (type = 0; type < nr_swapfiles; type++) {
1703 smp_rmb();
1704 si = swap_info[type];
1705 if (!(si->flags & SWP_USED) || !si->swap_map)
1706 continue;
1707 if (!--l)
1708 return si;
1709 }
1710
1711 return NULL;
1712}
1713
1714static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1715{
1716 struct swap_info_struct *si = v;
1717 int type;
1718
1719 if (v == SEQ_START_TOKEN)
1720 type = 0;
1721 else
1722 type = si->type + 1;
1723
1724 for (; type < nr_swapfiles; type++) {
1725 smp_rmb();
1726 si = swap_info[type];
1727 if (!(si->flags & SWP_USED) || !si->swap_map)
1728 continue;
1729 ++*pos;
1730 return si;
1731 }
1732
1733 return NULL;
1734}
1735
/*
 * seq_file ->stop: release swapon_mutex, which swap_start() acquired.
 * Neither argument is used.
 */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
1740
1741static int swap_show(struct seq_file *swap, void *v)
1742{
1743 struct swap_info_struct *si = v;
1744 struct file *file;
1745 int len;
1746
1747 if (si == SEQ_START_TOKEN) {
1748 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1749 return 0;
1750 }
1751
1752 file = si->swap_file;
1753 len = seq_path(swap, &file->f_path, " \t\n\\");
1754 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1755 len < 40 ? 40 - len : 1, " ",
1756 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1757 "partition" : "file\t",
1758 si->pages << (PAGE_SHIFT - 10),
1759 si->inuse_pages << (PAGE_SHIFT - 10),
1760 si->prio);
1761 return 0;
1762}
1763
/*
 * seq_file iterator callbacks for the swap area listing; installed by
 * swaps_open() through seq_open().
 */
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
1770
1771static int swaps_open(struct inode *inode, struct file *file)
1772{
1773 struct seq_file *seq;
1774 int ret;
1775
1776 ret = seq_open(file, &swaps_op);
1777 if (ret)
1778 return ret;
1779
1780 seq = file->private_data;
1781 seq->poll_event = atomic_read(&proc_poll_event);
1782 return 0;
1783}
1784
/*
 * File operations for the "swaps" proc entry: seq_file based reading plus
 * a custom open (to snapshot the poll event counter) and poll handler.
 */
static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};
1792
/*
 * Initcall that creates the "swaps" proc entry backed by
 * proc_swaps_operations.  The return value of proc_create() is not
 * checked; the initcall always reports success.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
1799#endif
1800
1801#ifdef MAX_SWAPFILES_CHECK
/*
 * Late initcall that expands MAX_SWAPFILES_CHECK(); only built when that
 * macro is defined.  Always returns 0.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
1808#endif
1809
1810static struct swap_info_struct *alloc_swap_info(void)
1811{
1812 struct swap_info_struct *p;
1813 unsigned int type;
1814
1815 p = kzalloc(sizeof(*p), GFP_KERNEL);
1816 if (!p)
1817 return ERR_PTR(-ENOMEM);
1818
1819 spin_lock(&swap_lock);
1820 for (type = 0; type < nr_swapfiles; type++) {
1821 if (!(swap_info[type]->flags & SWP_USED))
1822 break;
1823 }
1824 if (type >= MAX_SWAPFILES) {
1825 spin_unlock(&swap_lock);
1826 kfree(p);
1827 return ERR_PTR(-EPERM);
1828 }
1829 if (type >= nr_swapfiles) {
1830 p->type = type;
1831 swap_info[type] = p;
1832
1833
1834
1835
1836
1837 smp_wmb();
1838 nr_swapfiles++;
1839 } else {
1840 kfree(p);
1841 p = swap_info[type];
1842
1843
1844
1845
1846 }
1847 INIT_LIST_HEAD(&p->first_swap_extent.list);
1848 p->flags = SWP_USED;
1849 p->next = -1;
1850 spin_unlock(&swap_lock);
1851
1852 return p;
1853}
1854
1855static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1856{
1857 int error;
1858
1859 if (S_ISBLK(inode->i_mode)) {
1860 p->bdev = bdgrab(I_BDEV(inode));
1861 error = blkdev_get(p->bdev,
1862 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1863 sys_swapon);
1864 if (error < 0) {
1865 p->bdev = NULL;
1866 return -EINVAL;
1867 }
1868 p->old_block_size = block_size(p->bdev);
1869 error = set_blocksize(p->bdev, PAGE_SIZE);
1870 if (error < 0)
1871 return error;
1872 p->flags |= SWP_BLKDEV;
1873 } else if (S_ISREG(inode->i_mode)) {
1874 p->bdev = inode->i_sb->s_bdev;
1875 mutex_lock(&inode->i_mutex);
1876 if (IS_SWAPFILE(inode))
1877 return -EBUSY;
1878 } else
1879 return -EINVAL;
1880
1881 return 0;
1882}
1883
1884static unsigned long read_swap_header(struct swap_info_struct *p,
1885 union swap_header *swap_header,
1886 struct inode *inode)
1887{
1888 int i;
1889 unsigned long maxpages;
1890 unsigned long swapfilepages;
1891
1892 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1893 printk(KERN_ERR "Unable to find swap-space signature\n");
1894 return 0;
1895 }
1896
1897
1898 if (swab32(swap_header->info.version) == 1) {
1899 swab32s(&swap_header->info.version);
1900 swab32s(&swap_header->info.last_page);
1901 swab32s(&swap_header->info.nr_badpages);
1902 for (i = 0; i < swap_header->info.nr_badpages; i++)
1903 swab32s(&swap_header->info.badpages[i]);
1904 }
1905
1906 if (swap_header->info.version != 1) {
1907 printk(KERN_WARNING
1908 "Unable to handle swap header version %d\n",
1909 swap_header->info.version);
1910 return 0;
1911 }
1912
1913 p->lowest_bit = 1;
1914 p->cluster_next = 1;
1915 p->cluster_nr = 0;
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931 maxpages = swp_offset(pte_to_swp_entry(
1932 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1933 if (maxpages > swap_header->info.last_page) {
1934 maxpages = swap_header->info.last_page + 1;
1935
1936 if ((unsigned int)maxpages == 0)
1937 maxpages = UINT_MAX;
1938 }
1939 p->highest_bit = maxpages - 1;
1940
1941 if (!maxpages)
1942 return 0;
1943 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1944 if (swapfilepages && maxpages > swapfilepages) {
1945 printk(KERN_WARNING
1946 "Swap area shorter than signature indicates\n");
1947 return 0;
1948 }
1949 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1950 return 0;
1951 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1952 return 0;
1953
1954 return maxpages;
1955}
1956
1957static int setup_swap_map_and_extents(struct swap_info_struct *p,
1958 union swap_header *swap_header,
1959 unsigned char *swap_map,
1960 unsigned long maxpages,
1961 sector_t *span)
1962{
1963 int i;
1964 unsigned int nr_good_pages;
1965 int nr_extents;
1966
1967 nr_good_pages = maxpages - 1;
1968
1969 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1970 unsigned int page_nr = swap_header->info.badpages[i];
1971 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1972 return -EINVAL;
1973 if (page_nr < maxpages) {
1974 swap_map[page_nr] = SWAP_MAP_BAD;
1975 nr_good_pages--;
1976 }
1977 }
1978
1979 if (nr_good_pages) {
1980 swap_map[0] = SWAP_MAP_BAD;
1981 p->max = maxpages;
1982 p->pages = nr_good_pages;
1983 nr_extents = setup_swap_extents(p, span);
1984 if (nr_extents < 0)
1985 return nr_extents;
1986 nr_good_pages = p->pages;
1987 }
1988 if (!nr_good_pages) {
1989 printk(KERN_WARNING "Empty swap-file\n");
1990 return -EINVAL;
1991 }
1992
1993 return nr_extents;
1994}
1995
1996SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1997{
1998 struct swap_info_struct *p;
1999 char *name;
2000 struct file *swap_file = NULL;
2001 struct address_space *mapping;
2002 int i;
2003 int prio;
2004 int error;
2005 union swap_header *swap_header;
2006 int nr_extents;
2007 sector_t span;
2008 unsigned long maxpages;
2009 unsigned char *swap_map = NULL;
2010 unsigned long *frontswap_map = NULL;
2011 struct page *page = NULL;
2012 struct inode *inode = NULL;
2013
2014 if (swap_flags & ~SWAP_FLAGS_VALID)
2015 return -EINVAL;
2016
2017 if (!capable(CAP_SYS_ADMIN))
2018 return -EPERM;
2019
2020 p = alloc_swap_info();
2021 if (IS_ERR(p))
2022 return PTR_ERR(p);
2023
2024 name = getname(specialfile);
2025 if (IS_ERR(name)) {
2026 error = PTR_ERR(name);
2027 name = NULL;
2028 goto bad_swap;
2029 }
2030 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2031 if (IS_ERR(swap_file)) {
2032 error = PTR_ERR(swap_file);
2033 swap_file = NULL;
2034 goto bad_swap;
2035 }
2036
2037 p->swap_file = swap_file;
2038 mapping = swap_file->f_mapping;
2039
2040 for (i = 0; i < nr_swapfiles; i++) {
2041 struct swap_info_struct *q = swap_info[i];
2042
2043 if (q == p || !q->swap_file)
2044 continue;
2045 if (mapping == q->swap_file->f_mapping) {
2046 error = -EBUSY;
2047 goto bad_swap;
2048 }
2049 }
2050
2051 inode = mapping->host;
2052
2053 error = claim_swapfile(p, inode);
2054 if (unlikely(error))
2055 goto bad_swap;
2056
2057
2058
2059
2060 if (!mapping->a_ops->readpage) {
2061 error = -EINVAL;
2062 goto bad_swap;
2063 }
2064 page = read_mapping_page(mapping, 0, swap_file);
2065 if (IS_ERR(page)) {
2066 error = PTR_ERR(page);
2067 goto bad_swap;
2068 }
2069 swap_header = kmap(page);
2070
2071 maxpages = read_swap_header(p, swap_header, inode);
2072 if (unlikely(!maxpages)) {
2073 error = -EINVAL;
2074 goto bad_swap;
2075 }
2076
2077
2078 swap_map = vzalloc(maxpages);
2079 if (!swap_map) {
2080 error = -ENOMEM;
2081 goto bad_swap;
2082 }
2083
2084 error = swap_cgroup_swapon(p->type, maxpages);
2085 if (error)
2086 goto bad_swap;
2087
2088 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2089 maxpages, &span);
2090 if (unlikely(nr_extents < 0)) {
2091 error = nr_extents;
2092 goto bad_swap;
2093 }
2094
2095 if (frontswap_enabled)
2096 frontswap_map = vzalloc(maxpages / sizeof(long));
2097
2098 if (p->bdev) {
2099 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2100 p->flags |= SWP_SOLIDSTATE;
2101 p->cluster_next = 1 + (random32() % p->highest_bit);
2102 }
2103 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2104 p->flags |= SWP_DISCARDABLE;
2105 }
2106
2107 mutex_lock(&swapon_mutex);
2108 prio = -1;
2109 if (swap_flags & SWAP_FLAG_PREFER)
2110 prio =
2111 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2112 enable_swap_info(p, prio, swap_map, frontswap_map);
2113
2114 printk(KERN_INFO "Adding %uk swap on %s. "
2115 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2116 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2117 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2118 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2119 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2120 (frontswap_map) ? "FS" : "");
2121
2122 mutex_unlock(&swapon_mutex);
2123 atomic_inc(&proc_poll_event);
2124 wake_up_interruptible(&proc_poll_wait);
2125
2126 if (S_ISREG(inode->i_mode))
2127 inode->i_flags |= S_SWAPFILE;
2128 error = 0;
2129 goto out;
2130bad_swap:
2131 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2132 set_blocksize(p->bdev, p->old_block_size);
2133 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2134 }
2135 destroy_swap_extents(p);
2136 swap_cgroup_swapoff(p->type);
2137 spin_lock(&swap_lock);
2138 p->swap_file = NULL;
2139 p->flags = 0;
2140 spin_unlock(&swap_lock);
2141 vfree(swap_map);
2142 if (swap_file) {
2143 if (inode && S_ISREG(inode->i_mode)) {
2144 mutex_unlock(&inode->i_mutex);
2145 inode = NULL;
2146 }
2147 filp_close(swap_file, NULL);
2148 }
2149out:
2150 if (page && !IS_ERR(page)) {
2151 kunmap(page);
2152 page_cache_release(page);
2153 }
2154 if (name)
2155 putname(name);
2156 if (inode && S_ISREG(inode->i_mode))
2157 mutex_unlock(&inode->i_mutex);
2158 return error;
2159}
2160
2161void si_swapinfo(struct sysinfo *val)
2162{
2163 unsigned int type;
2164 unsigned long nr_to_be_unused = 0;
2165
2166 spin_lock(&swap_lock);
2167 for (type = 0; type < nr_swapfiles; type++) {
2168 struct swap_info_struct *si = swap_info[type];
2169
2170 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2171 nr_to_be_unused += si->inuse_pages;
2172 }
2173 val->freeswap = nr_swap_pages + nr_to_be_unused;
2174 val->totalswap = total_swap_pages + nr_to_be_unused;
2175 spin_unlock(&swap_lock);
2176}
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2190{
2191 struct swap_info_struct *p;
2192 unsigned long offset, type;
2193 unsigned char count;
2194 unsigned char has_cache;
2195 int err = -EINVAL;
2196
2197 if (non_swap_entry(entry))
2198 goto out;
2199
2200 type = swp_type(entry);
2201 if (type >= nr_swapfiles)
2202 goto bad_file;
2203 p = swap_info[type];
2204 offset = swp_offset(entry);
2205
2206 spin_lock(&swap_lock);
2207 if (unlikely(offset >= p->max))
2208 goto unlock_out;
2209
2210 count = p->swap_map[offset];
2211 has_cache = count & SWAP_HAS_CACHE;
2212 count &= ~SWAP_HAS_CACHE;
2213 err = 0;
2214
2215 if (usage == SWAP_HAS_CACHE) {
2216
2217
2218 if (!has_cache && count)
2219 has_cache = SWAP_HAS_CACHE;
2220 else if (has_cache)
2221 err = -EEXIST;
2222 else
2223 err = -ENOENT;
2224
2225 } else if (count || has_cache) {
2226
2227 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2228 count += usage;
2229 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2230 err = -EINVAL;
2231 else if (swap_count_continued(p, offset, count))
2232 count = COUNT_CONTINUED;
2233 else
2234 err = -ENOMEM;
2235 } else
2236 err = -ENOENT;
2237
2238 p->swap_map[offset] = count | has_cache;
2239
2240unlock_out:
2241 spin_unlock(&swap_lock);
2242out:
2243 return err;
2244
2245bad_file:
2246 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2247 goto out;
2248}
2249
2250
2251
2252
2253
/*
 * Take a reference on @entry by calling __swap_duplicate() with
 * SWAP_MAP_SHMEM as the usage value.  The result of __swap_duplicate()
 * is discarded.
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
2258
2259
2260
2261
2262
2263
2264
2265
2266int swap_duplicate(swp_entry_t entry)
2267{
2268 int err = 0;
2269
2270 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2271 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2272 return err;
2273}
2274
2275
2276
2277
2278
2279
2280
2281
2282
/*
 * Try to set the SWAP_HAS_CACHE flag on @entry.
 *
 * Returns whatever __swap_duplicate() returns for a SWAP_HAS_CACHE
 * request: 0 on success, -EEXIST if the flag is already set, -ENOENT if
 * the entry has no references, or -EINVAL for an invalid entry.
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
/*
 * Make sure a continuation page is available so that the reference count
 * of @entry can grow beyond what its swap_map byte can hold.
 *
 * @entry:    swap entry whose count is about to overflow
 * @gfp_mask: allocation flags for the continuation page (__GFP_HIGHMEM
 *            is added)
 *
 * Returns 0 when a continuation page was added or turned out not to be
 * needed (including the case where @entry cannot be looked up), and
 * -ENOMEM when a page is needed but could not be allocated.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * Allocate before the lookup below; a failed allocation is only
	 * reported once it is known that the page is really needed.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * Lookup failed: nothing to extend.  swap_lock is not
		 * released on this path (NOTE(review): presumably
		 * swap_info_get() only returns with the lock held on
		 * success — confirm).
		 */
		goto outer;
	}

	offset = swp_offset(entry);
	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The count is not at SWAP_MAP_MAX, so no continuation is
		 * needed; drop the lock and free the spare page.
		 */
		goto out;
	}

	if (!page) {
		spin_unlock(&swap_lock);
		return -ENOMEM;
	}

	/*
	 * Locate the page that backs this part of the vmalloc'ed swap_map
	 * and reduce offset to the byte position within that page.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	/*
	 * First continuation for this swap_map page: initialise the list
	 * that hangs off head->lru and mark both the page and the area.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous byte in the chain does not carry
		 * COUNT_CONTINUED, the existing pages already suffice.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * A continuation byte that is not yet at SWAP_CONT_MAX still
		 * has room, so no new page is required.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out;
	}

	/* Every existing byte is full: append the new page to the chain. */
	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now in use, do not free */
out:
	spin_unlock(&swap_lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
2395
2396
2397
2398
2399
2400
2401
2402
2403
/*
 * Carry an increment or decrement of a swap count into the continuation
 * pages chained off the swap_map page that holds @offset.
 *
 * @si:     swap area
 * @offset: page offset of the entry within the area
 * @count:  current swap_map count byte (without SWAP_HAS_CACHE), which
 *          selects the operation:
 *            SWAP_MAP_MAX                     start the first continuation
 *            SWAP_MAP_MAX | COUNT_CONTINUED   increment the continuation
 *            COUNT_CONTINUED                  decrement the continuation
 *
 * Returns false when there is no continuation page to carry into (the
 * caller in __swap_duplicate() turns this into -ENOMEM).  For the
 * increment cases true means the carry succeeded; for the decrement case
 * the return value is whether a continuation is still in use afterwards.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* no continuation page set up yet */
	}

	/* Byte position within the page; the same in every continuation page. */
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jumps into the block below */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Skip continuation bytes that are full and themselves
		 * continued; there must be a further page after each.
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			/* Full but not continued: the next page is needed. */
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head)
				return false;	/* add count continuation */
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* fresh byte, reset before use */
		}
		*map += 1;
		kunmap_atomic(map);
		/* Walk back towards head, resetting the lower bytes passed over. */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Skip bytes that hold only the COUNT_CONTINUED flag (zero
		 * count) to find the byte to borrow from.
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;	/* this byte no longer continues the chain */
		kunmap_atomic(map);
		/*
		 * Walk back towards head, refilling each lower byte to
		 * SWAP_CONT_MAX.  Only the first one visited can lose its
		 * COUNT_CONTINUED flag (when count was cleared above).
		 */
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		return count == COUNT_CONTINUED;
	}
}
2480
2481
2482
2483
2484
2485static void free_swap_count_continuations(struct swap_info_struct *si)
2486{
2487 pgoff_t offset;
2488
2489 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2490 struct page *head;
2491 head = vmalloc_to_page(si->swap_map + offset);
2492 if (page_private(head)) {
2493 struct list_head *this, *next;
2494 list_for_each_safe(this, next, &head->lru) {
2495 struct page *page;
2496 page = list_entry(this, struct page, lru);
2497 list_del(this);
2498 __free_page(page);
2499 }
2500 }
2501 }
2502}
2503