1
2
3
4
5
6
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/security.h>
27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
30#include <linux/syscalls.h>
31#include <linux/memcontrol.h>
32#include <linux/poll.h>
33#include <linux/oom.h>
34
35#include <asm/pgtable.h>
36#include <asm/tlbflush.h>
37#include <linux/swapops.h>
38#include <linux/page_cgroup.h>
39
/* Forward declarations of static helpers defined elsewhere in this translation unit. */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

/*
 * Spinlock taken by the functions in this file around updates of the
 * swap list, the page counters below and the per-device swap_map bytes.
 */
static DEFINE_SPINLOCK(swap_lock);
/* Upper bound used when validating a swap type before indexing swap_info[]. */
static unsigned int nr_swapfiles;
/*
 * Free-slot counter: decremented when a slot is handed out, incremented
 * when one is freed, and adjusted by a device's ->pages when it is enabled.
 */
long nr_swap_pages;
/* Running sum of ->pages over the devices that have been enabled. */
long total_swap_pages;
/*
 * Pre-decremented each time a device is enabled without an explicit
 * (non-negative) priority, so each such device gets a lower priority.
 */
static int least_priority;

/* Message fragments printed by swap_info_get() when it rejects an entry. */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * Indices into swap_info[] (the code uses the .head and .next members);
 * a negative value terminates the list, so {-1, -1} is the empty list.
 */
static struct swap_list_t swap_list = {-1, -1};

/* Per-device descriptors, indexed by swap type. */
static struct swap_info_struct *swap_info[MAX_SWAPFILES];

/* Held by swapoff around the final teardown of a swap_info_struct. */
static DEFINE_MUTEX(swapon_mutex);

/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably the wait queue and event counter behind poll() support
 * for the swap list in procfs -- confirm against the rest of the file.
 */
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);

static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
66static inline unsigned char swap_count(unsigned char ent)
67{
68 return ent & ~SWAP_HAS_CACHE;
69}
70
71
72static int
73__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
74{
75 swp_entry_t entry = swp_entry(si->type, offset);
76 struct page *page;
77 int ret = 0;
78
79 page = find_get_page(&swapper_space, entry.val);
80 if (!page)
81 return 0;
82
83
84
85
86
87
88
89 if (trylock_page(page)) {
90 ret = try_to_free_swap(page);
91 unlock_page(page);
92 }
93 page_cache_release(page);
94 return ret;
95}
96
97
98
99
100
101static int discard_swap(struct swap_info_struct *si)
102{
103 struct swap_extent *se;
104 sector_t start_block;
105 sector_t nr_blocks;
106 int err = 0;
107
108
109 se = &si->first_swap_extent;
110 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
111 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
112 if (nr_blocks) {
113 err = blkdev_issue_discard(si->bdev, start_block,
114 nr_blocks, GFP_KERNEL, 0);
115 if (err)
116 return err;
117 cond_resched();
118 }
119
120 list_for_each_entry(se, &si->first_swap_extent.list, list) {
121 start_block = se->start_block << (PAGE_SHIFT - 9);
122 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
123
124 err = blkdev_issue_discard(si->bdev, start_block,
125 nr_blocks, GFP_KERNEL, 0);
126 if (err)
127 break;
128
129 cond_resched();
130 }
131 return err;
132}
133
134
135
136
137
138static void discard_swap_cluster(struct swap_info_struct *si,
139 pgoff_t start_page, pgoff_t nr_pages)
140{
141 struct swap_extent *se = si->curr_swap_extent;
142 int found_extent = 0;
143
144 while (nr_pages) {
145 struct list_head *lh;
146
147 if (se->start_page <= start_page &&
148 start_page < se->start_page + se->nr_pages) {
149 pgoff_t offset = start_page - se->start_page;
150 sector_t start_block = se->start_block + offset;
151 sector_t nr_blocks = se->nr_pages - offset;
152
153 if (nr_blocks > nr_pages)
154 nr_blocks = nr_pages;
155 start_page += nr_blocks;
156 nr_pages -= nr_blocks;
157
158 if (!found_extent++)
159 si->curr_swap_extent = se;
160
161 start_block <<= PAGE_SHIFT - 9;
162 nr_blocks <<= PAGE_SHIFT - 9;
163 if (blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_NOIO, 0))
165 break;
166 }
167
168 lh = se->list.next;
169 se = list_entry(lh, struct swap_extent, list);
170 }
171}
172
/*
 * Action callback passed to wait_on_bit() by scan_swap_map() while it
 * waits for SWP_DISCARDING to clear: just yields the CPU.  @word is
 * unused.  Always returns 0.
 */
static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}
178
/* Slots handed out sequentially from one cluster before a new cluster is looked for. */
#define SWAPFILE_CLUSTER 256
/* Loop iterations between cond_resched() calls in the unlocked scans below. */
#define LATENCY_LIMIT 256

/*
 * scan_swap_map - find a free slot in @si and mark it with @usage.
 *
 * Entered and left with swap_lock held (the lock is dropped and retaken
 * internally around the long scans, the reclaim attempt and the discard).
 *
 * Returns the offset of the slot that was allocated, or 0 if no slot
 * could be found or the device is not (or no longer) usable.
 */
static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/*
	 * Allocation strategy: slots are handed out sequentially starting
	 * at si->cluster_next.  After si->cluster_nr runs out, an attempt
	 * is made to find a run of SWAPFILE_CLUSTER free slots to start a
	 * new sequence from; failing that, the first free slot found by a
	 * linear scan is used.
	 *
	 * SWP_SCANNING is added to si->flags for the duration of the scan
	 * and subtracted again on every exit path; swapoff waits while
	 * si->flags >= SWP_SCANNING before tearing the device down.
	 */
	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	if (unlikely(!si->cluster_nr--)) {
		/* Fewer free slots than a cluster: no point searching for one. */
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}
		if (si->flags & SWP_DISCARDABLE) {
			/*
			 * A non-zero lowest_alloc means a discard tracking
			 * window is already open; skip the cluster search
			 * in that case.  Otherwise open a window with an
			 * empty range (lowest = max, highest = 0) that the
			 * allocations below will widen.
			 */
			if (si->lowest_alloc)
				goto checks;
			si->lowest_alloc = si->max;
			si->highest_alloc = 0;
		}
		spin_unlock(&swap_lock);

		/*
		 * The searches below read swap_map without swap_lock; the
		 * chosen offset is re-validated under the lock at "checks".
		 *
		 * Unless the device is flagged SWP_SOLIDSTATE, start the
		 * search from lowest_bit instead of the previous position.
		 */
		if (!(si->flags & SWP_SOLIDSTATE))
			scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Look for SWAPFILE_CLUSTER consecutive free slots up to highest_bit. */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Same search again, from lowest_bit up to where the first pass started. */
		for (; last_in_cluster < scan_base; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* No free cluster: fall back to scan_base and close the discard window. */
		offset = scan_base;
		spin_lock(&swap_lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		si->lowest_alloc = 0;
	}

checks:
	/* swap_lock is held here; validate the device and the candidate offset. */
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	/* When swap is nearly full, try to reclaim a slot held only by the swap cache. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;
		spin_unlock(&swap_lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&swap_lock);
		/* Lock was dropped: re-run the checks if freed, else keep scanning. */
		if (swap_was_freed)
			goto checks;
		goto scan;
	}

	if (si->swap_map[offset])
		goto scan;

	/* Slot is free: take it and update the device's free-range hints. */
	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		/* Device is now full: mark the free range as empty. */
		si->lowest_bit = si->max;
		si->highest_bit = 0;
	}
	si->swap_map[offset] = usage;
	si->cluster_next = offset + 1;
	si->flags -= SWP_SCANNING;

	if (si->lowest_alloc) {
		/*
		 * A discard window is open (only opened above for
		 * SWP_DISCARDABLE devices).
		 */
		if (found_free_cluster) {
			/*
			 * This call found the new free cluster, so it does
			 * the discard.  If slots inside the cluster were
			 * allocated meanwhile (tracked by lowest_alloc and
			 * highest_alloc), trim the range so it ends just
			 * below the lowest such allocation.
			 */
			if (offset < si->highest_alloc &&
			    si->lowest_alloc <= last_in_cluster)
				last_in_cluster = si->lowest_alloc - 1;
			si->flags |= SWP_DISCARDING;
			spin_unlock(&swap_lock);

			if (offset < last_in_cluster)
				discard_swap_cluster(si, offset,
					last_in_cluster - offset + 1);

			spin_lock(&swap_lock);
			si->lowest_alloc = 0;
			si->flags &= ~SWP_DISCARDING;

			/* Order the flag clear before waking waiters on the bit. */
			smp_mb();
			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));

		} else if (si->flags & SWP_DISCARDING) {
			/*
			 * Another caller is discarding right now: wait for
			 * SWP_DISCARDING to clear before returning the slot.
			 */
			spin_unlock(&swap_lock);
			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
				wait_for_discard, TASK_UNINTERRUPTIBLE);
			spin_lock(&swap_lock);
		} else {
			/*
			 * Window open but no discard running yet: record
			 * this allocation in the tracked range.
			 */
			if (offset < si->lowest_alloc)
				si->lowest_alloc = offset;
			if (offset > si->highest_alloc)
				si->highest_alloc = offset;
		}
	}
	return offset;

scan:
	/*
	 * Linear search without the lock: first from offset+1 up to
	 * highest_bit, then from lowest_bit+1 up to scan_base.  Any
	 * candidate is re-validated under the lock at "checks".
	 */
	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (++offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);

no_page:
	/* Failure exit, reached with swap_lock held. */
	si->flags -= SWP_SCANNING;
	return 0;
}
410
411swp_entry_t get_swap_page(void)
412{
413 struct swap_info_struct *si;
414 pgoff_t offset;
415 int type, next;
416 int wrapped = 0;
417
418 spin_lock(&swap_lock);
419 if (nr_swap_pages <= 0)
420 goto noswap;
421 nr_swap_pages--;
422
423 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
424 si = swap_info[type];
425 next = si->next;
426 if (next < 0 ||
427 (!wrapped && si->prio != swap_info[next]->prio)) {
428 next = swap_list.head;
429 wrapped++;
430 }
431
432 if (!si->highest_bit)
433 continue;
434 if (!(si->flags & SWP_WRITEOK))
435 continue;
436
437 swap_list.next = next;
438
439 offset = scan_swap_map(si, SWAP_HAS_CACHE);
440 if (offset) {
441 spin_unlock(&swap_lock);
442 return swp_entry(type, offset);
443 }
444 next = swap_list.next;
445 }
446
447 nr_swap_pages++;
448noswap:
449 spin_unlock(&swap_lock);
450 return (swp_entry_t) {0};
451}
452
453
454swp_entry_t get_swap_page_of_type(int type)
455{
456 struct swap_info_struct *si;
457 pgoff_t offset;
458
459 spin_lock(&swap_lock);
460 si = swap_info[type];
461 if (si && (si->flags & SWP_WRITEOK)) {
462 nr_swap_pages--;
463
464 offset = scan_swap_map(si, 1);
465 if (offset) {
466 spin_unlock(&swap_lock);
467 return swp_entry(type, offset);
468 }
469 nr_swap_pages++;
470 }
471 spin_unlock(&swap_lock);
472 return (swp_entry_t) {0};
473}
474
475static struct swap_info_struct *swap_info_get(swp_entry_t entry)
476{
477 struct swap_info_struct *p;
478 unsigned long offset, type;
479
480 if (!entry.val)
481 goto out;
482 type = swp_type(entry);
483 if (type >= nr_swapfiles)
484 goto bad_nofile;
485 p = swap_info[type];
486 if (!(p->flags & SWP_USED))
487 goto bad_device;
488 offset = swp_offset(entry);
489 if (offset >= p->max)
490 goto bad_offset;
491 if (!p->swap_map[offset])
492 goto bad_free;
493 spin_lock(&swap_lock);
494 return p;
495
496bad_free:
497 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
498 goto out;
499bad_offset:
500 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
501 goto out;
502bad_device:
503 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
504 goto out;
505bad_nofile:
506 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
507out:
508 return NULL;
509}
510
511static unsigned char swap_entry_free(struct swap_info_struct *p,
512 swp_entry_t entry, unsigned char usage)
513{
514 unsigned long offset = swp_offset(entry);
515 unsigned char count;
516 unsigned char has_cache;
517
518 count = p->swap_map[offset];
519 has_cache = count & SWAP_HAS_CACHE;
520 count &= ~SWAP_HAS_CACHE;
521
522 if (usage == SWAP_HAS_CACHE) {
523 VM_BUG_ON(!has_cache);
524 has_cache = 0;
525 } else if (count == SWAP_MAP_SHMEM) {
526
527
528
529
530 count = 0;
531 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
532 if (count == COUNT_CONTINUED) {
533 if (swap_count_continued(p, offset, count))
534 count = SWAP_MAP_MAX | COUNT_CONTINUED;
535 else
536 count = SWAP_MAP_MAX;
537 } else
538 count--;
539 }
540
541 if (!count)
542 mem_cgroup_uncharge_swap(entry);
543
544 usage = count | has_cache;
545 p->swap_map[offset] = usage;
546
547
548 if (!usage) {
549 struct gendisk *disk = p->bdev->bd_disk;
550 if (offset < p->lowest_bit)
551 p->lowest_bit = offset;
552 if (offset > p->highest_bit)
553 p->highest_bit = offset;
554 if (swap_list.next >= 0 &&
555 p->prio > swap_info[swap_list.next]->prio)
556 swap_list.next = p->type;
557 nr_swap_pages++;
558 p->inuse_pages--;
559 if ((p->flags & SWP_BLKDEV) &&
560 disk->fops->swap_slot_free_notify)
561 disk->fops->swap_slot_free_notify(p->bdev, offset);
562 }
563
564 return usage;
565}
566
567
568
569
570
571void swap_free(swp_entry_t entry)
572{
573 struct swap_info_struct *p;
574
575 p = swap_info_get(entry);
576 if (p) {
577 swap_entry_free(p, entry, 1);
578 spin_unlock(&swap_lock);
579 }
580}
581
582
583
584
585void swapcache_free(swp_entry_t entry, struct page *page)
586{
587 struct swap_info_struct *p;
588 unsigned char count;
589
590 p = swap_info_get(entry);
591 if (p) {
592 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
593 if (page)
594 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
595 spin_unlock(&swap_lock);
596 }
597}
598
599
600
601
602
603
604static inline int page_swapcount(struct page *page)
605{
606 int count = 0;
607 struct swap_info_struct *p;
608 swp_entry_t entry;
609
610 entry.val = page_private(page);
611 p = swap_info_get(entry);
612 if (p) {
613 count = swap_count(p->swap_map[swp_offset(entry)]);
614 spin_unlock(&swap_lock);
615 }
616 return count;
617}
618
619
620
621
622
623
624
/*
 * Decide whether the caller holds the only reference to @page, which
 * must be locked.
 *
 * KSM pages are never reusable.  Otherwise the map count is combined
 * with the swap count (for swap-cache pages); if the total is exactly
 * one and the page is not under writeback, the page is also removed
 * from the swap cache and marked dirty.
 *
 * Returns non-zero when the combined count is at most one.
 */
int reuse_swap_page(struct page *page)
{
	int users;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	users = page_mapcount(page);
	if (users > 1 || !PageSwapCache(page))
		return users <= 1;

	users += page_swapcount(page);
	if (users == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return users <= 1;
}
642
643
644
645
646
647int try_to_free_swap(struct page *page)
648{
649 VM_BUG_ON(!PageLocked(page));
650
651 if (!PageSwapCache(page))
652 return 0;
653 if (PageWriteback(page))
654 return 0;
655 if (page_swapcount(page))
656 return 0;
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673 if (!(gfp_allowed_mask & __GFP_IO))
674 return 0;
675
676 delete_from_swap_cache(page);
677 SetPageDirty(page);
678 return 1;
679}
680
681
682
683
684
685int free_swap_and_cache(swp_entry_t entry)
686{
687 struct swap_info_struct *p;
688 struct page *page = NULL;
689
690 if (non_swap_entry(entry))
691 return 1;
692
693 p = swap_info_get(entry);
694 if (p) {
695 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
696 page = find_get_page(&swapper_space, entry.val);
697 if (page && !trylock_page(page)) {
698 page_cache_release(page);
699 page = NULL;
700 }
701 }
702 spin_unlock(&swap_lock);
703 }
704 if (page) {
705
706
707
708
709 if (PageSwapCache(page) && !PageWriteback(page) &&
710 (!page_mapped(page) || vm_swap_full())) {
711 delete_from_swap_cache(page);
712 SetPageDirty(page);
713 }
714 unlock_page(page);
715 page_cache_release(page);
716 }
717 return p != NULL;
718}
719
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
 * Count the users of swap entry @ent: the map count of its swap-cache
 * page (if one exists) plus the slot's swap reference count.
 *
 * The swap-cache page, or NULL, is stored in *@pagep; when non-NULL it
 * carries the reference taken by find_get_page(), which this function
 * does not drop.
 */
int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
{
	struct page *cached = find_get_page(&swapper_space, ent.val);
	struct swap_info_struct *sis;
	int users = cached ? page_mapcount(cached) : 0;

	sis = swap_info_get(ent);
	if (sis) {
		users += swap_count(sis->swap_map[swp_offset(ent)]);
		spin_unlock(&swap_lock);
	}

	*pagep = cached;
	return users;
}
#endif
750
751#ifdef CONFIG_HIBERNATION
752
753
754
755
756
757
758
759
760int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
761{
762 struct block_device *bdev = NULL;
763 int type;
764
765 if (device)
766 bdev = bdget(device);
767
768 spin_lock(&swap_lock);
769 for (type = 0; type < nr_swapfiles; type++) {
770 struct swap_info_struct *sis = swap_info[type];
771
772 if (!(sis->flags & SWP_WRITEOK))
773 continue;
774
775 if (!bdev) {
776 if (bdev_p)
777 *bdev_p = bdgrab(sis->bdev);
778
779 spin_unlock(&swap_lock);
780 return type;
781 }
782 if (bdev == sis->bdev) {
783 struct swap_extent *se = &sis->first_swap_extent;
784
785 if (se->start_block == offset) {
786 if (bdev_p)
787 *bdev_p = bdgrab(sis->bdev);
788
789 spin_unlock(&swap_lock);
790 bdput(bdev);
791 return type;
792 }
793 }
794 }
795 spin_unlock(&swap_lock);
796 if (bdev)
797 bdput(bdev);
798
799 return -ENODEV;
800}
801
802
803
804
805
806sector_t swapdev_block(int type, pgoff_t offset)
807{
808 struct block_device *bdev;
809
810 if ((unsigned int)type >= nr_swapfiles)
811 return 0;
812 if (!(swap_info[type]->flags & SWP_WRITEOK))
813 return 0;
814 return map_swap_entry(swp_entry(type, offset), &bdev);
815}
816
817
818
819
820
821
822
823unsigned int count_swap_pages(int type, int free)
824{
825 unsigned int n = 0;
826
827 spin_lock(&swap_lock);
828 if ((unsigned int)type < nr_swapfiles) {
829 struct swap_info_struct *sis = swap_info[type];
830
831 if (sis->flags & SWP_WRITEOK) {
832 n = sis->pages;
833 if (free)
834 n -= sis->inuse_pages;
835 }
836 }
837 spin_unlock(&swap_lock);
838 return n;
839}
840#endif
841
842
843
844
845
846
847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
848 unsigned long addr, swp_entry_t entry, struct page *page)
849{
850 struct mem_cgroup *ptr;
851 spinlock_t *ptl;
852 pte_t *pte;
853 int ret = 1;
854
855 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
856 ret = -ENOMEM;
857 goto out_nolock;
858 }
859
860 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
861 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
862 if (ret > 0)
863 mem_cgroup_cancel_charge_swapin(ptr);
864 ret = 0;
865 goto out;
866 }
867
868 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
869 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
870 get_page(page);
871 set_pte_at(vma->vm_mm, addr, pte,
872 pte_mkold(mk_pte(page, vma->vm_page_prot)));
873 page_add_anon_rmap(page, vma, addr);
874 mem_cgroup_commit_charge_swapin(page, ptr);
875 swap_free(entry);
876
877
878
879
880 activate_page(page);
881out:
882 pte_unmap_unlock(pte, ptl);
883out_nolock:
884 return ret;
885}
886
887static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
888 unsigned long addr, unsigned long end,
889 swp_entry_t entry, struct page *page)
890{
891 pte_t swp_pte = swp_entry_to_pte(entry);
892 pte_t *pte;
893 int ret = 0;
894
895
896
897
898
899
900
901
902
903
904 pte = pte_offset_map(pmd, addr);
905 do {
906
907
908
909
910 if (unlikely(pte_same(*pte, swp_pte))) {
911 pte_unmap(pte);
912 ret = unuse_pte(vma, pmd, addr, entry, page);
913 if (ret)
914 goto out;
915 pte = pte_offset_map(pmd, addr);
916 }
917 } while (pte++, addr += PAGE_SIZE, addr != end);
918 pte_unmap(pte - 1);
919out:
920 return ret;
921}
922
923static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
924 unsigned long addr, unsigned long end,
925 swp_entry_t entry, struct page *page)
926{
927 pmd_t *pmd;
928 unsigned long next;
929 int ret;
930
931 pmd = pmd_offset(pud, addr);
932 do {
933 next = pmd_addr_end(addr, end);
934 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
935 continue;
936 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
937 if (ret)
938 return ret;
939 } while (pmd++, addr = next, addr != end);
940 return 0;
941}
942
943static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
944 unsigned long addr, unsigned long end,
945 swp_entry_t entry, struct page *page)
946{
947 pud_t *pud;
948 unsigned long next;
949 int ret;
950
951 pud = pud_offset(pgd, addr);
952 do {
953 next = pud_addr_end(addr, end);
954 if (pud_none_or_clear_bad(pud))
955 continue;
956 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
957 if (ret)
958 return ret;
959 } while (pud++, addr = next, addr != end);
960 return 0;
961}
962
963static int unuse_vma(struct vm_area_struct *vma,
964 swp_entry_t entry, struct page *page)
965{
966 pgd_t *pgd;
967 unsigned long addr, end, next;
968 int ret;
969
970 if (page_anon_vma(page)) {
971 addr = page_address_in_vma(page, vma);
972 if (addr == -EFAULT)
973 return 0;
974 else
975 end = addr + PAGE_SIZE;
976 } else {
977 addr = vma->vm_start;
978 end = vma->vm_end;
979 }
980
981 pgd = pgd_offset(vma->vm_mm, addr);
982 do {
983 next = pgd_addr_end(addr, end);
984 if (pgd_none_or_clear_bad(pgd))
985 continue;
986 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
987 if (ret)
988 return ret;
989 } while (pgd++, addr = next, addr != end);
990 return 0;
991}
992
993static int unuse_mm(struct mm_struct *mm,
994 swp_entry_t entry, struct page *page)
995{
996 struct vm_area_struct *vma;
997 int ret = 0;
998
999 if (!down_read_trylock(&mm->mmap_sem)) {
1000
1001
1002
1003
1004 activate_page(page);
1005 unlock_page(page);
1006 down_read(&mm->mmap_sem);
1007 lock_page(page);
1008 }
1009 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1010 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1011 break;
1012 }
1013 up_read(&mm->mmap_sem);
1014 return (ret < 0)? ret: 0;
1015}
1016
1017
1018
1019
1020
1021static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1022 unsigned int prev)
1023{
1024 unsigned int max = si->max;
1025 unsigned int i = prev;
1026 unsigned char count;
1027
1028
1029
1030
1031
1032
1033
1034 for (;;) {
1035 if (++i >= max) {
1036 if (!prev) {
1037 i = 0;
1038 break;
1039 }
1040
1041
1042
1043
1044 max = prev + 1;
1045 prev = 0;
1046 i = 1;
1047 }
1048 count = si->swap_map[i];
1049 if (count && swap_count(count) != SWAP_MAP_BAD)
1050 break;
1051 }
1052 return i;
1053}
1054
1055
1056
1057
1058
1059
/*
 * try_to_unuse - bring every in-use slot of swap device @type back
 * into memory and drop its swap references.
 *
 * For each in-use slot the page is read into the swap cache, every mm
 * on the mmlist is searched for ptes still pointing at the slot, and
 * finally the page is removed from the swap cache.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -ENOMEM if a
 * page could not be obtained, or a negative error passed up from
 * shmem_unuse() or unuse_mm().
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	unsigned char *swap_map;
	unsigned char swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;

	/*
	 * start_mm is the mm tried first for each slot and the point the
	 * mmlist walk starts from.  It begins as init_mm and is moved to
	 * whichever mm last yielded references (see new_start_mm below).
	 * A reference on mm_users is held on it for as long as it is
	 * start_mm; every switch drops the old one with mmput().
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * find_next_to_unuse() returns 0 once no in-use slot remains;
	 * because it wraps around, slots that are still in use are
	 * revisited.
	 */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/* Get the page for this slot through the swap cache. */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * No page.  If the slot's map byte has dropped to
			 * zero in the meantime there is nothing left to
			 * do for it; otherwise report -ENOMEM.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * If ours is the only remaining reference to start_mm,
		 * drop it and fall back to init_mm.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for any read or write in flight on the page, then
		 * lock it and wait once more for writeback that may have
		 * started before the lock was obtained.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/* Remove all references to the entry. */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/*
			 * NOTE(review): the page is neither unlocked nor
			 * released on this path, so shmem_unuse() is
			 * presumably responsible for both -- confirm.
			 */
			if (retval < 0)
				break;
			continue;
		}
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			/*
			 * Still referenced: walk the mmlist starting after
			 * start_mm.  set_start_mm is set when start_mm did
			 * not reduce the count; the first mm that then
			 * does reduce it becomes the new start_mm.
			 */
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			/* One extra reference each for the two aliases of start_mm. */
			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip an mm whose user count already reached zero. */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				/*
				 * The reference on mm keeps our list position
				 * valid while mmlist_lock is dropped; the
				 * previous position can now be released.
				 */
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount))
					;	/* no references left: nothing to do */
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * The slot is still referenced although every mm has been
		 * searched.  If the page is dirty and still in the swap
		 * cache, write it out to swap.
		 * NOTE(review): the page is re-locked right after
		 * swap_writepage(), so that call presumably returns with
		 * the page unlocked -- confirm.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * Remove the page from the swap cache, but only if it is
		 * still the swap-cache page for this very entry.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * Mark the page dirty before releasing it.
		 * NOTE(review): presumably so its contents are not
		 * considered clean now that the swap copy is going away
		 * -- confirm.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/* Give other tasks a chance between slots. */
		cond_resched();
	}

	mmput(start_mm);
	return retval;
}
1265
1266
1267
1268
1269
1270
1271
1272static void drain_mmlist(void)
1273{
1274 struct list_head *p, *next;
1275 unsigned int type;
1276
1277 for (type = 0; type < nr_swapfiles; type++)
1278 if (swap_info[type]->inuse_pages)
1279 return;
1280 spin_lock(&mmlist_lock);
1281 list_for_each_safe(p, next, &init_mm.mmlist)
1282 list_del_init(p);
1283 spin_unlock(&mmlist_lock);
1284}
1285
1286
1287
1288
1289
1290
1291
1292static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1293{
1294 struct swap_info_struct *sis;
1295 struct swap_extent *start_se;
1296 struct swap_extent *se;
1297 pgoff_t offset;
1298
1299 sis = swap_info[swp_type(entry)];
1300 *bdev = sis->bdev;
1301
1302 offset = swp_offset(entry);
1303 start_se = sis->curr_swap_extent;
1304 se = start_se;
1305
1306 for ( ; ; ) {
1307 struct list_head *lh;
1308
1309 if (se->start_page <= offset &&
1310 offset < (se->start_page + se->nr_pages)) {
1311 return se->start_block + (offset - se->start_page);
1312 }
1313 lh = se->list.next;
1314 se = list_entry(lh, struct swap_extent, list);
1315 sis->curr_swap_extent = se;
1316 BUG_ON(se == start_se);
1317 }
1318}
1319
1320
1321
1322
1323sector_t map_swap_page(struct page *page, struct block_device **bdev)
1324{
1325 swp_entry_t entry;
1326 entry.val = page_private(page);
1327 return map_swap_entry(entry, bdev);
1328}
1329
1330
1331
1332
1333static void destroy_swap_extents(struct swap_info_struct *sis)
1334{
1335 while (!list_empty(&sis->first_swap_extent.list)) {
1336 struct swap_extent *se;
1337
1338 se = list_entry(sis->first_swap_extent.list.next,
1339 struct swap_extent, list);
1340 list_del(&se->list);
1341 kfree(se);
1342 }
1343}
1344
1345
1346
1347
1348
1349
1350
1351static int
1352add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1353 unsigned long nr_pages, sector_t start_block)
1354{
1355 struct swap_extent *se;
1356 struct swap_extent *new_se;
1357 struct list_head *lh;
1358
1359 if (start_page == 0) {
1360 se = &sis->first_swap_extent;
1361 sis->curr_swap_extent = se;
1362 se->start_page = 0;
1363 se->nr_pages = nr_pages;
1364 se->start_block = start_block;
1365 return 1;
1366 } else {
1367 lh = sis->first_swap_extent.list.prev;
1368 se = list_entry(lh, struct swap_extent, list);
1369 BUG_ON(se->start_page + se->nr_pages != start_page);
1370 if (se->start_block + se->nr_pages == start_block) {
1371
1372 se->nr_pages += nr_pages;
1373 return 0;
1374 }
1375 }
1376
1377
1378
1379
1380 new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1381 if (new_se == NULL)
1382 return -ENOMEM;
1383 new_se->start_page = start_page;
1384 new_se->nr_pages = nr_pages;
1385 new_se->start_block = start_block;
1386
1387 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1388 return 1;
1389}
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
/*
 * setup_swap_extents - build the page -> disk block extent list for @sis.
 *
 * A block device gets a single extent mapping page N to block N.  A
 * regular file is probed with bmap(): only runs of blocks_per_page
 * file blocks that are contiguous on disk and start on a page-aligned
 * disk block are used as swap pages; other file blocks are skipped.
 * Any unmapped block (bmap() returning 0) makes the whole file
 * unusable.
 *
 * *@span receives sis->pages for a block device, otherwise
 * 1 + highest - lowest disk page over the pages found (page 0
 * excluded).  For a regular file sis->max, sis->pages and
 * sis->highest_bit are updated to the number of usable pages found.
 *
 * Returns the number of extents created, or a negative errno
 * (-EINVAL for a file with holes, or an add_swap_extent() failure).
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	inode = sis->swap_file->f_mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Block device: one identity-mapped extent covers everything. */
		ret = add_swap_extent(sis, 0, sis->max, 0);
		*span = sis->pages;
		goto out;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map the file one page at a time: probe_block is the file block
	 * being examined, page_no the next swap page number to assign.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/*
		 * The page's first disk block must be page-aligned;
		 * if it is not, retry one file block further on.
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		/* The rest of the page's blocks must follow contiguously on disk. */
		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity: retry one file block further on. */
				probe_block++;
				goto reprobe;
			}
		}

		/* Convert from filesystem blocks to page-sized disk units. */
		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* page 0 is left out of the span calculation */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * A full page of contiguous, aligned blocks was found:
		 * record it (add_swap_extent() merges adjacent pages).
		 */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* keeps the "page_no - 1" below from wrapping */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}
1517
1518static void enable_swap_info(struct swap_info_struct *p, int prio,
1519 unsigned char *swap_map)
1520{
1521 int i, prev;
1522
1523 spin_lock(&swap_lock);
1524 if (prio >= 0)
1525 p->prio = prio;
1526 else
1527 p->prio = --least_priority;
1528 p->swap_map = swap_map;
1529 p->flags |= SWP_WRITEOK;
1530 nr_swap_pages += p->pages;
1531 total_swap_pages += p->pages;
1532
1533
1534 prev = -1;
1535 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1536 if (p->prio >= swap_info[i]->prio)
1537 break;
1538 prev = i;
1539 }
1540 p->next = i;
1541 if (prev < 0)
1542 swap_list.head = swap_list.next = p->type;
1543 else
1544 swap_info[prev]->next = p->type;
1545 spin_unlock(&swap_lock);
1546}
1547
1548SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1549{
1550 struct swap_info_struct *p = NULL;
1551 unsigned char *swap_map;
1552 struct file *swap_file, *victim;
1553 struct address_space *mapping;
1554 struct inode *inode;
1555 char *pathname;
1556 int oom_score_adj;
1557 int i, type, prev;
1558 int err;
1559
1560 if (!capable(CAP_SYS_ADMIN))
1561 return -EPERM;
1562
1563 pathname = getname(specialfile);
1564 err = PTR_ERR(pathname);
1565 if (IS_ERR(pathname))
1566 goto out;
1567
1568 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1569 putname(pathname);
1570 err = PTR_ERR(victim);
1571 if (IS_ERR(victim))
1572 goto out;
1573
1574 mapping = victim->f_mapping;
1575 prev = -1;
1576 spin_lock(&swap_lock);
1577 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1578 p = swap_info[type];
1579 if (p->flags & SWP_WRITEOK) {
1580 if (p->swap_file->f_mapping == mapping)
1581 break;
1582 }
1583 prev = type;
1584 }
1585 if (type < 0) {
1586 err = -EINVAL;
1587 spin_unlock(&swap_lock);
1588 goto out_dput;
1589 }
1590 if (!security_vm_enough_memory(p->pages))
1591 vm_unacct_memory(p->pages);
1592 else {
1593 err = -ENOMEM;
1594 spin_unlock(&swap_lock);
1595 goto out_dput;
1596 }
1597 if (prev < 0)
1598 swap_list.head = p->next;
1599 else
1600 swap_info[prev]->next = p->next;
1601 if (type == swap_list.next) {
1602
1603 swap_list.next = swap_list.head;
1604 }
1605 if (p->prio < 0) {
1606 for (i = p->next; i >= 0; i = swap_info[i]->next)
1607 swap_info[i]->prio = p->prio--;
1608 least_priority++;
1609 }
1610 nr_swap_pages -= p->pages;
1611 total_swap_pages -= p->pages;
1612 p->flags &= ~SWP_WRITEOK;
1613 spin_unlock(&swap_lock);
1614
1615 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1616 err = try_to_unuse(type);
1617 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1618
1619 if (err) {
1620
1621
1622
1623
1624
1625
1626
1627 enable_swap_info(p, p->prio, p->swap_map);
1628 goto out_dput;
1629 }
1630
1631 destroy_swap_extents(p);
1632 if (p->flags & SWP_CONTINUED)
1633 free_swap_count_continuations(p);
1634
1635 mutex_lock(&swapon_mutex);
1636 spin_lock(&swap_lock);
1637 drain_mmlist();
1638
1639
1640 p->highest_bit = 0;
1641 while (p->flags >= SWP_SCANNING) {
1642 spin_unlock(&swap_lock);
1643 schedule_timeout_uninterruptible(1);
1644 spin_lock(&swap_lock);
1645 }
1646
1647 swap_file = p->swap_file;
1648 p->swap_file = NULL;
1649 p->max = 0;
1650 swap_map = p->swap_map;
1651 p->swap_map = NULL;
1652 p->flags = 0;
1653 spin_unlock(&swap_lock);
1654 mutex_unlock(&swapon_mutex);
1655 vfree(swap_map);
1656
1657 swap_cgroup_swapoff(type);
1658
1659 inode = mapping->host;
1660 if (S_ISBLK(inode->i_mode)) {
1661 struct block_device *bdev = I_BDEV(inode);
1662 set_blocksize(bdev, p->old_block_size);
1663 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1664 } else {
1665 mutex_lock(&inode->i_mutex);
1666 inode->i_flags &= ~S_SWAPFILE;
1667 mutex_unlock(&inode->i_mutex);
1668 }
1669 filp_close(swap_file, NULL);
1670 err = 0;
1671 atomic_inc(&proc_poll_event);
1672 wake_up_interruptible(&proc_poll_wait);
1673
1674out_dput:
1675 filp_close(victim, NULL);
1676out:
1677 return err;
1678}
1679
1680#ifdef CONFIG_PROC_FS
1681static unsigned swaps_poll(struct file *file, poll_table *wait)
1682{
1683 struct seq_file *seq = file->private_data;
1684
1685 poll_wait(file, &proc_poll_wait, wait);
1686
1687 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1688 seq->poll_event = atomic_read(&proc_poll_event);
1689 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1690 }
1691
1692 return POLLIN | POLLRDNORM;
1693}
1694
1695
1696static void *swap_start(struct seq_file *swap, loff_t *pos)
1697{
1698 struct swap_info_struct *si;
1699 int type;
1700 loff_t l = *pos;
1701
1702 mutex_lock(&swapon_mutex);
1703
1704 if (!l)
1705 return SEQ_START_TOKEN;
1706
1707 for (type = 0; type < nr_swapfiles; type++) {
1708 smp_rmb();
1709 si = swap_info[type];
1710 if (!(si->flags & SWP_USED) || !si->swap_map)
1711 continue;
1712 if (!--l)
1713 return si;
1714 }
1715
1716 return NULL;
1717}
1718
1719static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1720{
1721 struct swap_info_struct *si = v;
1722 int type;
1723
1724 if (v == SEQ_START_TOKEN)
1725 type = 0;
1726 else
1727 type = si->type + 1;
1728
1729 for (; type < nr_swapfiles; type++) {
1730 smp_rmb();
1731 si = swap_info[type];
1732 if (!(si->flags & SWP_USED) || !si->swap_map)
1733 continue;
1734 ++*pos;
1735 return si;
1736 }
1737
1738 return NULL;
1739}
1740
1741static void swap_stop(struct seq_file *swap, void *v)
1742{
1743 mutex_unlock(&swapon_mutex);
1744}
1745
1746static int swap_show(struct seq_file *swap, void *v)
1747{
1748 struct swap_info_struct *si = v;
1749 struct file *file;
1750 int len;
1751
1752 if (si == SEQ_START_TOKEN) {
1753 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1754 return 0;
1755 }
1756
1757 file = si->swap_file;
1758 len = seq_path(swap, &file->f_path, " \t\n\\");
1759 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1760 len < 40 ? 40 - len : 1, " ",
1761 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1762 "partition" : "file\t",
1763 si->pages << (PAGE_SHIFT - 10),
1764 si->inuse_pages << (PAGE_SHIFT - 10),
1765 si->prio);
1766 return 0;
1767}
1768
1769static const struct seq_operations swaps_op = {
1770 .start = swap_start,
1771 .next = swap_next,
1772 .stop = swap_stop,
1773 .show = swap_show
1774};
1775
1776static int swaps_open(struct inode *inode, struct file *file)
1777{
1778 struct seq_file *seq;
1779 int ret;
1780
1781 ret = seq_open(file, &swaps_op);
1782 if (ret)
1783 return ret;
1784
1785 seq = file->private_data;
1786 seq->poll_event = atomic_read(&proc_poll_event);
1787 return 0;
1788}
1789
1790static const struct file_operations proc_swaps_operations = {
1791 .open = swaps_open,
1792 .read = seq_read,
1793 .llseek = seq_lseek,
1794 .release = seq_release,
1795 .poll = swaps_poll,
1796};
1797
1798static int __init procswaps_init(void)
1799{
1800 proc_create("swaps", 0, NULL, &proc_swaps_operations);
1801 return 0;
1802}
1803__initcall(procswaps_init);
1804#endif
1805
1806#ifdef MAX_SWAPFILES_CHECK
1807static int __init max_swapfiles_check(void)
1808{
1809 MAX_SWAPFILES_CHECK();
1810 return 0;
1811}
1812late_initcall(max_swapfiles_check);
1813#endif
1814
1815static struct swap_info_struct *alloc_swap_info(void)
1816{
1817 struct swap_info_struct *p;
1818 unsigned int type;
1819
1820 p = kzalloc(sizeof(*p), GFP_KERNEL);
1821 if (!p)
1822 return ERR_PTR(-ENOMEM);
1823
1824 spin_lock(&swap_lock);
1825 for (type = 0; type < nr_swapfiles; type++) {
1826 if (!(swap_info[type]->flags & SWP_USED))
1827 break;
1828 }
1829 if (type >= MAX_SWAPFILES) {
1830 spin_unlock(&swap_lock);
1831 kfree(p);
1832 return ERR_PTR(-EPERM);
1833 }
1834 if (type >= nr_swapfiles) {
1835 p->type = type;
1836 swap_info[type] = p;
1837
1838
1839
1840
1841
1842 smp_wmb();
1843 nr_swapfiles++;
1844 } else {
1845 kfree(p);
1846 p = swap_info[type];
1847
1848
1849
1850
1851 }
1852 INIT_LIST_HEAD(&p->first_swap_extent.list);
1853 p->flags = SWP_USED;
1854 p->next = -1;
1855 spin_unlock(&swap_lock);
1856
1857 return p;
1858}
1859
1860static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1861{
1862 int error;
1863
1864 if (S_ISBLK(inode->i_mode)) {
1865 p->bdev = bdgrab(I_BDEV(inode));
1866 error = blkdev_get(p->bdev,
1867 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1868 sys_swapon);
1869 if (error < 0) {
1870 p->bdev = NULL;
1871 return -EINVAL;
1872 }
1873 p->old_block_size = block_size(p->bdev);
1874 error = set_blocksize(p->bdev, PAGE_SIZE);
1875 if (error < 0)
1876 return error;
1877 p->flags |= SWP_BLKDEV;
1878 } else if (S_ISREG(inode->i_mode)) {
1879 p->bdev = inode->i_sb->s_bdev;
1880 mutex_lock(&inode->i_mutex);
1881 if (IS_SWAPFILE(inode))
1882 return -EBUSY;
1883 } else
1884 return -EINVAL;
1885
1886 return 0;
1887}
1888
1889static unsigned long read_swap_header(struct swap_info_struct *p,
1890 union swap_header *swap_header,
1891 struct inode *inode)
1892{
1893 int i;
1894 unsigned long maxpages;
1895 unsigned long swapfilepages;
1896
1897 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1898 printk(KERN_ERR "Unable to find swap-space signature\n");
1899 return 0;
1900 }
1901
1902
1903 if (swab32(swap_header->info.version) == 1) {
1904 swab32s(&swap_header->info.version);
1905 swab32s(&swap_header->info.last_page);
1906 swab32s(&swap_header->info.nr_badpages);
1907 for (i = 0; i < swap_header->info.nr_badpages; i++)
1908 swab32s(&swap_header->info.badpages[i]);
1909 }
1910
1911 if (swap_header->info.version != 1) {
1912 printk(KERN_WARNING
1913 "Unable to handle swap header version %d\n",
1914 swap_header->info.version);
1915 return 0;
1916 }
1917
1918 p->lowest_bit = 1;
1919 p->cluster_next = 1;
1920 p->cluster_nr = 0;
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937 maxpages = swp_offset(pte_to_swp_entry(
1938 swp_entry_to_pte(swp_entry(0, ~0UL))));
1939 maxpages = swp_offset(radix_to_swp_entry(
1940 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1941
1942 if (maxpages > swap_header->info.last_page) {
1943 maxpages = swap_header->info.last_page + 1;
1944
1945 if ((unsigned int)maxpages == 0)
1946 maxpages = UINT_MAX;
1947 }
1948 p->highest_bit = maxpages - 1;
1949
1950 if (!maxpages)
1951 return 0;
1952 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1953 if (swapfilepages && maxpages > swapfilepages) {
1954 printk(KERN_WARNING
1955 "Swap area shorter than signature indicates\n");
1956 return 0;
1957 }
1958 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1959 return 0;
1960 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1961 return 0;
1962
1963 return maxpages;
1964}
1965
1966static int setup_swap_map_and_extents(struct swap_info_struct *p,
1967 union swap_header *swap_header,
1968 unsigned char *swap_map,
1969 unsigned long maxpages,
1970 sector_t *span)
1971{
1972 int i;
1973 unsigned int nr_good_pages;
1974 int nr_extents;
1975
1976 nr_good_pages = maxpages - 1;
1977
1978 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1979 unsigned int page_nr = swap_header->info.badpages[i];
1980 if (page_nr == 0 || page_nr > swap_header->info.last_page)
1981 return -EINVAL;
1982 if (page_nr < maxpages) {
1983 swap_map[page_nr] = SWAP_MAP_BAD;
1984 nr_good_pages--;
1985 }
1986 }
1987
1988 if (nr_good_pages) {
1989 swap_map[0] = SWAP_MAP_BAD;
1990 p->max = maxpages;
1991 p->pages = nr_good_pages;
1992 nr_extents = setup_swap_extents(p, span);
1993 if (nr_extents < 0)
1994 return nr_extents;
1995 nr_good_pages = p->pages;
1996 }
1997 if (!nr_good_pages) {
1998 printk(KERN_WARNING "Empty swap-file\n");
1999 return -EINVAL;
2000 }
2001
2002 return nr_extents;
2003}
2004
2005SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2006{
2007 struct swap_info_struct *p;
2008 char *name;
2009 struct file *swap_file = NULL;
2010 struct address_space *mapping;
2011 int i;
2012 int prio;
2013 int error;
2014 union swap_header *swap_header;
2015 int nr_extents;
2016 sector_t span;
2017 unsigned long maxpages;
2018 unsigned char *swap_map = NULL;
2019 struct page *page = NULL;
2020 struct inode *inode = NULL;
2021
2022 if (!capable(CAP_SYS_ADMIN))
2023 return -EPERM;
2024
2025 p = alloc_swap_info();
2026 if (IS_ERR(p))
2027 return PTR_ERR(p);
2028
2029 name = getname(specialfile);
2030 if (IS_ERR(name)) {
2031 error = PTR_ERR(name);
2032 name = NULL;
2033 goto bad_swap;
2034 }
2035 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2036 if (IS_ERR(swap_file)) {
2037 error = PTR_ERR(swap_file);
2038 swap_file = NULL;
2039 goto bad_swap;
2040 }
2041
2042 p->swap_file = swap_file;
2043 mapping = swap_file->f_mapping;
2044
2045 for (i = 0; i < nr_swapfiles; i++) {
2046 struct swap_info_struct *q = swap_info[i];
2047
2048 if (q == p || !q->swap_file)
2049 continue;
2050 if (mapping == q->swap_file->f_mapping) {
2051 error = -EBUSY;
2052 goto bad_swap;
2053 }
2054 }
2055
2056 inode = mapping->host;
2057
2058 error = claim_swapfile(p, inode);
2059 if (unlikely(error))
2060 goto bad_swap;
2061
2062
2063
2064
2065 if (!mapping->a_ops->readpage) {
2066 error = -EINVAL;
2067 goto bad_swap;
2068 }
2069 page = read_mapping_page(mapping, 0, swap_file);
2070 if (IS_ERR(page)) {
2071 error = PTR_ERR(page);
2072 goto bad_swap;
2073 }
2074 swap_header = kmap(page);
2075
2076 maxpages = read_swap_header(p, swap_header, inode);
2077 if (unlikely(!maxpages)) {
2078 error = -EINVAL;
2079 goto bad_swap;
2080 }
2081
2082
2083 swap_map = vzalloc(maxpages);
2084 if (!swap_map) {
2085 error = -ENOMEM;
2086 goto bad_swap;
2087 }
2088
2089 error = swap_cgroup_swapon(p->type, maxpages);
2090 if (error)
2091 goto bad_swap;
2092
2093 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2094 maxpages, &span);
2095 if (unlikely(nr_extents < 0)) {
2096 error = nr_extents;
2097 goto bad_swap;
2098 }
2099
2100 if (p->bdev) {
2101 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2102 p->flags |= SWP_SOLIDSTATE;
2103 p->cluster_next = 1 + (random32() % p->highest_bit);
2104 }
2105 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2106 p->flags |= SWP_DISCARDABLE;
2107 }
2108
2109 mutex_lock(&swapon_mutex);
2110 prio = -1;
2111 if (swap_flags & SWAP_FLAG_PREFER)
2112 prio =
2113 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2114 enable_swap_info(p, prio, swap_map);
2115
2116 printk(KERN_INFO "Adding %uk swap on %s. "
2117 "Priority:%d extents:%d across:%lluk %s%s\n",
2118 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2119 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2120 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2121 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2122
2123 mutex_unlock(&swapon_mutex);
2124 atomic_inc(&proc_poll_event);
2125 wake_up_interruptible(&proc_poll_wait);
2126
2127 if (S_ISREG(inode->i_mode))
2128 inode->i_flags |= S_SWAPFILE;
2129 error = 0;
2130 goto out;
2131bad_swap:
2132 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2133 set_blocksize(p->bdev, p->old_block_size);
2134 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2135 }
2136 destroy_swap_extents(p);
2137 swap_cgroup_swapoff(p->type);
2138 spin_lock(&swap_lock);
2139 p->swap_file = NULL;
2140 p->flags = 0;
2141 spin_unlock(&swap_lock);
2142 vfree(swap_map);
2143 if (swap_file) {
2144 if (inode && S_ISREG(inode->i_mode)) {
2145 mutex_unlock(&inode->i_mutex);
2146 inode = NULL;
2147 }
2148 filp_close(swap_file, NULL);
2149 }
2150out:
2151 if (page && !IS_ERR(page)) {
2152 kunmap(page);
2153 page_cache_release(page);
2154 }
2155 if (name)
2156 putname(name);
2157 if (inode && S_ISREG(inode->i_mode))
2158 mutex_unlock(&inode->i_mutex);
2159 return error;
2160}
2161
2162void si_swapinfo(struct sysinfo *val)
2163{
2164 unsigned int type;
2165 unsigned long nr_to_be_unused = 0;
2166
2167 spin_lock(&swap_lock);
2168 for (type = 0; type < nr_swapfiles; type++) {
2169 struct swap_info_struct *si = swap_info[type];
2170
2171 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2172 nr_to_be_unused += si->inuse_pages;
2173 }
2174 val->freeswap = nr_swap_pages + nr_to_be_unused;
2175 val->totalswap = total_swap_pages + nr_to_be_unused;
2176 spin_unlock(&swap_lock);
2177}
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2191{
2192 struct swap_info_struct *p;
2193 unsigned long offset, type;
2194 unsigned char count;
2195 unsigned char has_cache;
2196 int err = -EINVAL;
2197
2198 if (non_swap_entry(entry))
2199 goto out;
2200
2201 type = swp_type(entry);
2202 if (type >= nr_swapfiles)
2203 goto bad_file;
2204 p = swap_info[type];
2205 offset = swp_offset(entry);
2206
2207 spin_lock(&swap_lock);
2208 if (unlikely(offset >= p->max))
2209 goto unlock_out;
2210
2211 count = p->swap_map[offset];
2212 has_cache = count & SWAP_HAS_CACHE;
2213 count &= ~SWAP_HAS_CACHE;
2214 err = 0;
2215
2216 if (usage == SWAP_HAS_CACHE) {
2217
2218
2219 if (!has_cache && count)
2220 has_cache = SWAP_HAS_CACHE;
2221 else if (has_cache)
2222 err = -EEXIST;
2223 else
2224 err = -ENOENT;
2225
2226 } else if (count || has_cache) {
2227
2228 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2229 count += usage;
2230 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2231 err = -EINVAL;
2232 else if (swap_count_continued(p, offset, count))
2233 count = COUNT_CONTINUED;
2234 else
2235 err = -ENOMEM;
2236 } else
2237 err = -ENOENT;
2238
2239 p->swap_map[offset] = count | has_cache;
2240
2241unlock_out:
2242 spin_unlock(&swap_lock);
2243out:
2244 return err;
2245
2246bad_file:
2247 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2248 goto out;
2249}
2250
2251
2252
2253
2254
2255void swap_shmem_alloc(swp_entry_t entry)
2256{
2257 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2258}
2259
2260
2261
2262
2263
2264
2265
2266
2267int swap_duplicate(swp_entry_t entry)
2268{
2269 int err = 0;
2270
2271 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2272 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2273 return err;
2274}
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284int swapcache_prepare(swp_entry_t entry)
2285{
2286 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2287}
2288
2289
2290
2291
2292
2293int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2294{
2295 struct swap_info_struct *si;
2296 int our_page_cluster = page_cluster;
2297 pgoff_t target, toff;
2298 pgoff_t base, end;
2299 int nr_pages = 0;
2300
2301 if (!our_page_cluster)
2302 return 0;
2303
2304 si = swap_info[swp_type(entry)];
2305 target = swp_offset(entry);
2306 base = (target >> our_page_cluster) << our_page_cluster;
2307 end = base + (1 << our_page_cluster);
2308 if (!base)
2309 base++;
2310
2311 spin_lock(&swap_lock);
2312 if (end > si->max)
2313 end = si->max;
2314
2315
2316 for (toff = target; ++toff < end; nr_pages++) {
2317
2318 if (!si->swap_map[toff])
2319 break;
2320 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2321 break;
2322 }
2323
2324 for (toff = target; --toff >= base; nr_pages++) {
2325
2326 if (!si->swap_map[toff])
2327 break;
2328 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2329 break;
2330 }
2331 spin_unlock(&swap_lock);
2332
2333
2334
2335
2336
2337 *offset = ++toff;
2338 return nr_pages? ++nr_pages: 0;
2339}
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2357{
2358 struct swap_info_struct *si;
2359 struct page *head;
2360 struct page *page;
2361 struct page *list_page;
2362 pgoff_t offset;
2363 unsigned char count;
2364
2365
2366
2367
2368
2369 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2370
2371 si = swap_info_get(entry);
2372 if (!si) {
2373
2374
2375
2376
2377
2378 goto outer;
2379 }
2380
2381 offset = swp_offset(entry);
2382 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2383
2384 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2385
2386
2387
2388
2389
2390 goto out;
2391 }
2392
2393 if (!page) {
2394 spin_unlock(&swap_lock);
2395 return -ENOMEM;
2396 }
2397
2398
2399
2400
2401
2402
2403 head = vmalloc_to_page(si->swap_map + offset);
2404 offset &= ~PAGE_MASK;
2405
2406
2407
2408
2409
2410 if (!page_private(head)) {
2411 BUG_ON(count & COUNT_CONTINUED);
2412 INIT_LIST_HEAD(&head->lru);
2413 set_page_private(head, SWP_CONTINUED);
2414 si->flags |= SWP_CONTINUED;
2415 }
2416
2417 list_for_each_entry(list_page, &head->lru, lru) {
2418 unsigned char *map;
2419
2420
2421
2422
2423
2424 if (!(count & COUNT_CONTINUED))
2425 goto out;
2426
2427 map = kmap_atomic(list_page, KM_USER0) + offset;
2428 count = *map;
2429 kunmap_atomic(map, KM_USER0);
2430
2431
2432
2433
2434
2435 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2436 goto out;
2437 }
2438
2439 list_add_tail(&page->lru, &head->lru);
2440 page = NULL;
2441out:
2442 spin_unlock(&swap_lock);
2443outer:
2444 if (page)
2445 __free_page(page);
2446 return 0;
2447}
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457static bool swap_count_continued(struct swap_info_struct *si,
2458 pgoff_t offset, unsigned char count)
2459{
2460 struct page *head;
2461 struct page *page;
2462 unsigned char *map;
2463
2464 head = vmalloc_to_page(si->swap_map + offset);
2465 if (page_private(head) != SWP_CONTINUED) {
2466 BUG_ON(count & COUNT_CONTINUED);
2467 return false;
2468 }
2469
2470 offset &= ~PAGE_MASK;
2471 page = list_entry(head->lru.next, struct page, lru);
2472 map = kmap_atomic(page, KM_USER0) + offset;
2473
2474 if (count == SWAP_MAP_MAX)
2475 goto init_map;
2476
2477 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) {
2478
2479
2480
2481 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2482 kunmap_atomic(map, KM_USER0);
2483 page = list_entry(page->lru.next, struct page, lru);
2484 BUG_ON(page == head);
2485 map = kmap_atomic(page, KM_USER0) + offset;
2486 }
2487 if (*map == SWAP_CONT_MAX) {
2488 kunmap_atomic(map, KM_USER0);
2489 page = list_entry(page->lru.next, struct page, lru);
2490 if (page == head)
2491 return false;
2492 map = kmap_atomic(page, KM_USER0) + offset;
2493init_map: *map = 0;
2494 }
2495 *map += 1;
2496 kunmap_atomic(map, KM_USER0);
2497 page = list_entry(page->lru.prev, struct page, lru);
2498 while (page != head) {
2499 map = kmap_atomic(page, KM_USER0) + offset;
2500 *map = COUNT_CONTINUED;
2501 kunmap_atomic(map, KM_USER0);
2502 page = list_entry(page->lru.prev, struct page, lru);
2503 }
2504 return true;
2505
2506 } else {
2507
2508
2509
2510 BUG_ON(count != COUNT_CONTINUED);
2511 while (*map == COUNT_CONTINUED) {
2512 kunmap_atomic(map, KM_USER0);
2513 page = list_entry(page->lru.next, struct page, lru);
2514 BUG_ON(page == head);
2515 map = kmap_atomic(page, KM_USER0) + offset;
2516 }
2517 BUG_ON(*map == 0);
2518 *map -= 1;
2519 if (*map == 0)
2520 count = 0;
2521 kunmap_atomic(map, KM_USER0);
2522 page = list_entry(page->lru.prev, struct page, lru);
2523 while (page != head) {
2524 map = kmap_atomic(page, KM_USER0) + offset;
2525 *map = SWAP_CONT_MAX | count;
2526 count = COUNT_CONTINUED;
2527 kunmap_atomic(map, KM_USER0);
2528 page = list_entry(page->lru.prev, struct page, lru);
2529 }
2530 return count == COUNT_CONTINUED;
2531 }
2532}
2533
2534
2535
2536
2537
2538static void free_swap_count_continuations(struct swap_info_struct *si)
2539{
2540 pgoff_t offset;
2541
2542 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2543 struct page *head;
2544 head = vmalloc_to_page(si->swap_map + offset);
2545 if (page_private(head)) {
2546 struct list_head *this, *next;
2547 list_for_each_safe(this, next, &head->lru) {
2548 struct page *page;
2549 page = list_entry(this, struct page, lru);
2550 list_del(this);
2551 __free_page(page);
2552 }
2553 }
2554 }
2555}
2556