1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/poll.h>
24#include <linux/string.h>
25#include <linux/list.h>
26#include <linux/hash.h>
27#include <linux/spinlock.h>
28#include <linux/syscalls.h>
29#include <linux/rbtree.h>
30#include <linux/wait.h>
31#include <linux/eventpoll.h>
32#include <linux/mount.h>
33#include <linux/bitops.h>
34#include <linux/mutex.h>
35#include <linux/anon_inodes.h>
36#include <asm/uaccess.h>
37#include <asm/system.h>
38#include <asm/io.h>
39#include <asm/mman.h>
40#include <linux/atomic.h>
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/*
 * Bits of the user event mask that are epoll control flags, not poll events.
 * ep_poll_callback() ignores items whose mask has nothing outside these bits,
 * and ep_send_events_proc() strips a one-shot item down to them.
 */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Nesting limit handed to ep_call_nested() as max_nests by every caller here. */
#define EP_MAX_NESTS 4

/* Upper bound on maxevents accepted by epoll_wait(). */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/* Sentinel stored in ep->ovflist and epi->next meaning "not in use". */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Per-watch memory cost used by eventpoll_init() to size max_user_watches. */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
101
/*
 * Key identifying a watched target: the file pointer plus the fd number.
 * Ordered by ep_cmp_ffd() and used as the RB tree key of an epitem.
 */
struct epoll_filefd {
	struct file *file;
	int fd;
};
106
107
108
109
110
/*
 * One in-progress call tracked by ep_call_nested(). Instances live on the
 * caller's stack and are linked into nested_calls.tasks_call_list for the
 * duration of the call.
 */
struct nested_call_node {
	struct list_head llink;	/* link in nested_calls.tasks_call_list */
	void *cookie;		/* object being visited; a repeat means a loop */
	void *ctx;		/* calling context the entry belongs to */
};
116
117
118
119
120
/*
 * Head of a list of nested_call_node entries plus the spinlock that
 * protects it. Used by ep_call_nested() to bound recursion and detect loops.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};
125
126
127
128
129
/*
 * One watched (file, fd) pair inside an eventpoll. Allocated from epi_cache
 * by ep_insert() and released by ep_remove().
 */
struct epitem {
	/* Node linking this item into eventpoll.rbr, keyed by ffd */
	struct rb_node rbn;

	/* Link into eventpoll.rdllist (or a temporary transfer list) when ready */
	struct list_head rdllink;

	/*
	 * Singly linked chain used by eventpoll.ovflist;
	 * EP_UNACTIVE_PTR when the item is not on that chain.
	 */
	struct epitem *next;

	/* The file pointer / fd number this item refers to */
	struct epoll_filefd ffd;

	/* Number of wait queue entries attached; set to -1 on allocation failure */
	int nwait;

	/* List of struct eppoll_entry hooked to the target's wait queues */
	struct list_head pwqlist;

	/* The eventpoll that owns this item */
	struct eventpoll *ep;

	/* Link into the target file's f_ep_links list */
	struct list_head fllink;

	/* Interest mask and user data supplied through epoll_ctl() */
	struct epoll_event event;
};
161
162
163
164
165
166
/*
 * Per-epoll-instance state, stored in file->private_data of the epoll file
 * (see epoll_create1()).
 */
struct eventpoll {
	/* Guards rdllist and ovflist; taken with interrupts saved/disabled */
	spinlock_t lock;

	/*
	 * Serializes epoll_ctl() operations, ready-list scans and the
	 * removal done by eventpoll_release_file() on this instance.
	 */
	struct mutex mtx;

	/* Wait queue that ep_poll() sleeps on */
	wait_queue_head_t wq;

	/* Wait queue registered by ep_eventpoll_poll() for pollers of the epoll fd */
	wait_queue_head_t poll_wait;

	/* List of ready items, linked through epitem.rdllink */
	struct list_head rdllist;

	/* RB tree root of all items, linked through epitem.rbn */
	struct rb_root rbr;

	/*
	 * Chain of items that became ready while ep_scan_ready_list() had the
	 * ready list detached; EP_UNACTIVE_PTR when no scan is in progress.
	 */
	struct epitem *ovflist;

	/* User that created the instance; its epoll_watches counter is charged */
	struct user_struct *user;
};
201
202
/*
 * Wait queue hook connecting one epitem to one wait queue head of the
 * target file. Allocated from pwq_cache in ep_ptable_queue_proc().
 */
struct eppoll_entry {
	/* Link into epitem.pwqlist */
	struct list_head llink;

	/* The epitem this hook belongs to (read by ep_item_from_wait()) */
	struct epitem *base;

	/* Wait queue entry whose callback is ep_poll_callback() */
	wait_queue_t wait;

	/* The wait queue head that @wait was added to */
	wait_queue_head_t *whead;
};
219
220
/*
 * Wrapper passed to f_op->poll() so that ep_ptable_queue_proc() can recover
 * the epitem from the poll_table pointer via container_of().
 */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};
225
226
/* Arguments carried from ep_send_events() to ep_send_events_proc(). */
struct ep_send_events_data {
	int maxevents;				/* capacity of @events */
	struct epoll_event __user *events;	/* user buffer to fill */
};
231
232
233
234
235
/* Maximum number of epoll watches per user; computed in eventpoll_init(). */
static long max_user_watches __read_mostly;

/*
 * Global mutex taken by ep_free(), eventpoll_release_file() and by
 * epoll_ctl() when adding one epoll file to another.
 */
static DEFINE_MUTEX(epmutex);

/* Nesting tracker used by ep_loop_check() / ep_loop_check_proc() */
static struct nested_calls poll_loop_ncalls;

/* Nesting tracker used by ep_poll_safewake() */
static struct nested_calls poll_safewake_ncalls;

/* Nesting tracker used by ep_eventpoll_poll() */
static struct nested_calls poll_readywalk_ncalls;

/* Slab cache for struct epitem */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache for struct eppoll_entry */
static struct kmem_cache *pwq_cache __read_mostly;
257
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Lower and upper bounds handed to the sysctl handler via extra1/extra2. */
static long zero;
static long long_max = LONG_MAX;

/*
 * Sysctl table exposing max_user_watches (mode 0644), range-limited to
 * [zero, long_max] by proc_doulongvec_minmax. The empty entry terminates
 * the table.
 */
ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif
278
279
280
281static inline void ep_set_ffd(struct epoll_filefd *ffd,
282 struct file *file, int fd)
283{
284 ffd->file = file;
285 ffd->fd = fd;
286}
287
288
289static inline int ep_cmp_ffd(struct epoll_filefd *p1,
290 struct epoll_filefd *p2)
291{
292 return (p1->file > p2->file ? +1:
293 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
294}
295
296
/* Return non-zero when the list node @p is currently linked into a list. */
static inline int ep_is_linked(struct list_head *p)
{
	return list_empty(p) ? 0 : 1;
}
301
302
303static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
304{
305 return container_of(p, struct eppoll_entry, wait)->base;
306}
307
308
309static inline struct epitem *ep_item_from_epqueue(poll_table *p)
310{
311 return container_of(p, struct ep_pqueue, pt)->epi;
312}
313
314
/*
 * Tell whether an epoll_ctl() operation carries an event structure that has
 * to be copied from user space: every op except EPOLL_CTL_DEL does.
 */
static inline int ep_op_has_event(int op)
{
	return !(op == EPOLL_CTL_DEL);
}
319
320
321static void ep_nested_calls_init(struct nested_calls *ncalls)
322{
323 INIT_LIST_HEAD(&ncalls->tasks_call_list);
324 spin_lock_init(&ncalls->lock);
325}
326
327
328
329
330
331
332
333
334
335static inline int ep_events_available(struct eventpoll *ep)
336{
337 return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
338}
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
/**
 * ep_call_nested - Run a callback while tracking nesting depth and loops.
 *
 * @ncalls: Tracker holding the list of calls currently in progress.
 * @max_nests: Maximum number of same-context entries tolerated.
 * @nproc: Callback to invoke; receives @priv, @cookie and the nesting count.
 * @priv: Opaque argument forwarded to @nproc.
 * @cookie: Identifies the object being visited; seeing it twice in the same
 *          context is treated as a loop.
 * @ctx: Identifies the calling context.
 *
 * Returns: the value returned by @nproc, or -1 if a loop was detected or
 *          the nesting limit was exceeded (in which case @nproc is not run).
 */
static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Walk the calls in progress. Only entries with our context count;
	 * among those, a matching cookie means a loop and every other entry
	 * bumps the nesting count, which must stay within max_nests.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
			/* Loop detected or maximum nesting exceeded. */
			error = -1;
			goto out_unlock;
		}
	}

	/* Record this call (node lives on our stack) before invoking nproc */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	/* The callback runs without the tracker lock held */
	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current call from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}
403
/*
 * ep_wake_up_nested - Wake up pollers on @wqueue with the given @events.
 *
 * With CONFIG_DEBUG_LOCK_ALLOC the wait queue lock is taken by hand with
 * an explicit lock @subclass (derived from the nesting depth by the caller)
 * before doing the locked wakeup. Without it, @subclass is unused and a
 * plain wake_up_poll() is issued.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	unsigned long flags;

	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
	wake_up_locked_poll(wqueue, events);
	spin_unlock_irqrestore(&wqueue->lock, flags);
}
#else
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	wake_up_poll(wqueue, events);
}
#endif
421
422static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
423{
424 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
425 1 + call_nests);
426 return 0;
427}
428
429
430
431
432
433
434
435
436
437
438
439static void ep_poll_safewake(wait_queue_head_t *wq)
440{
441 int this_cpu = get_cpu();
442
443 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
444 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
445
446 put_cpu();
447}
448
449
450
451
452
453
454static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
455{
456 struct list_head *lsthead = &epi->pwqlist;
457 struct eppoll_entry *pwq;
458
459 while (!list_empty(lsthead)) {
460 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
461
462 list_del(&pwq->llink);
463 remove_wait_queue(pwq->whead, &pwq->wait);
464 kmem_cache_free(pwq_cache, pwq);
465 }
466}
467
468
469
470
471
472
473
474
475
476
477
478
479
/**
 * ep_scan_ready_list - Hand the ready list to a callback without holding
 *                      ep->lock while the callback runs.
 *
 * @ep: The eventpoll whose ready list is scanned.
 * @sproc: Callback invoked on the detached list of ready items.
 * @priv: Opaque argument forwarded to @sproc.
 * @depth: Lock subclass used when taking ep->mtx (nesting level).
 *
 * Returns: whatever @sproc returned.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv,
			      int depth)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/* ep->mtx is held for the whole scan, including the callback */
	mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Move the ready list to the private txlist and activate the
	 * overflow chain (ovflist = NULL instead of EP_UNACTIVE_PTR). From
	 * now on ep_poll_callback() queues newly ready items on ovflist
	 * rather than on rdllist.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/* Run the callback on the detached list, without ep->lock */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * Drain the items that were chained on ovflist while the callback
	 * ran, resetting each ->next to EP_UNACTIVE_PTR as we go.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * The item may already be on a list (e.g. left on txlist by
		 * the callback), so only add it if it is not linked.
		 */
		if (!ep_is_linked(&epi->rdllink))
			list_add_tail(&epi->rdllink, &ep->rdllist);
	}
	/* Deactivate the overflow chain: callbacks use rdllist again */
	ep->ovflist = EP_UNACTIVE_PTR;

	/* Put back whatever the callback left on txlist */
	list_splice(&txlist, &ep->rdllist);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Still (or newly) ready items: wake sleepers on ep->wq now,
		 * and remember to wake ep->poll_wait after dropping the locks.
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	mutex_unlock(&ep->mtx);

	/* The poll_wait wakeup is done outside ep->lock and ep->mtx */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
564
565
566
567
568
/*
 * ep_remove - Remove @epi from @ep and free it.
 *
 * Detaches the item from the target's wait queues, from the target file's
 * f_ep_links list, from the RB tree and from the ready list, frees it and
 * decrements the owner's epoll_watches counter. Callers in this file hold
 * ep->mtx (epoll_ctl(), eventpoll_release_file()) or epmutex (ep_free()).
 *
 * Always returns 0.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/*
	 * Unhook from the wait queues first, so ep_poll_callback() can no
	 * longer be invoked for this item through those entries.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the item from the target file's list of epoll links */
	spin_lock(&file->f_lock);
	if (ep_is_linked(&epi->fllink))
		list_del_init(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase(&epi->rbn, &ep->rbr);

	/* The ready list is protected by ep->lock */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	/* At this point nothing references the item any more */
	kmem_cache_free(epi_cache, epi);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}
604
/*
 * ep_free - Tear down an eventpoll instance and release its memory.
 *
 * Called from ep_eventpoll_release() and from the epoll_create1() error
 * path. All items are removed under the global epmutex.
 */
static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* Let anybody polling on the epoll fd itself know we are going away */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&ep->poll_wait);

	/*
	 * epmutex is also taken by eventpoll_release_file(), so the two
	 * cleanup paths cannot run concurrently.
	 */
	mutex_lock(&epmutex);

	/* First pass: unhook every item from the wait queues of its target */
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
	}

	/*
	 * Second pass: remove and free every item. ep_remove() erases the
	 * node from the tree, so rb_first() is re-evaluated until it is empty.
	 */
	while ((rbp = rb_first(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
	}

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	/* Drop the user reference taken in ep_alloc() */
	free_uid(ep->user);
	kfree(ep);
}
649
650static int ep_eventpoll_release(struct inode *inode, struct file *file)
651{
652 struct eventpoll *ep = file->private_data;
653
654 if (ep)
655 ep_free(ep);
656
657 return 0;
658}
659
660static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
661 void *priv)
662{
663 struct epitem *epi, *tmp;
664
665 list_for_each_entry_safe(epi, tmp, head, rdllink) {
666 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
667 epi->event.events)
668 return POLLIN | POLLRDNORM;
669 else {
670
671
672
673
674
675 list_del_init(&epi->rdllink);
676 }
677 }
678
679 return 0;
680}
681
682static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
683{
684 return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
685}
686
687static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
688{
689 int pollflags;
690 struct eventpoll *ep = file->private_data;
691
692
693 poll_wait(file, &ep->poll_wait, wait);
694
695
696
697
698
699
700
701 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
702 ep_poll_readyevents_proc, ep, ep, current);
703
704 return pollflags != -1 ? pollflags : 0;
705}
706
707
/*
 * File operations of the epoll file. Its address doubles as the type tag
 * checked by is_file_epoll().
 */
static const struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};
713
714
715static inline int is_file_epoll(struct file *f)
716{
717 return f->f_op == &eventpoll_fops;
718}
719
720
721
722
723
724
/*
 * eventpoll_release_file - Remove @file from every epoll set watching it.
 *
 * Walks file->f_ep_links and removes each epitem from its eventpoll.
 * NOTE(review): presumably invoked from the file cleanup path when a
 * watched file goes away; the caller is not visible here.
 */
void eventpoll_release_file(struct file *file)
{
	struct list_head *lsthead = &file->f_ep_links;
	struct eventpoll *ep;
	struct epitem *epi;

	/*
	 * epmutex is also held by ep_free(), so an eventpoll referenced by
	 * one of these items cannot be torn down by ep_free() while we are
	 * using it.
	 */
	mutex_lock(&epmutex);

	while (!list_empty(lsthead)) {
		epi = list_first_entry(lsthead, struct epitem, fllink);

		ep = epi->ep;
		/* Unlink here; ep_remove() then finds fllink already empty */
		list_del_init(&epi->fllink);
		mutex_lock_nested(&ep->mtx, 0);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}

	mutex_unlock(&epmutex);
}
758
759static int ep_alloc(struct eventpoll **pep)
760{
761 int error;
762 struct user_struct *user;
763 struct eventpoll *ep;
764
765 user = get_current_user();
766 error = -ENOMEM;
767 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
768 if (unlikely(!ep))
769 goto free_uid;
770
771 spin_lock_init(&ep->lock);
772 mutex_init(&ep->mtx);
773 init_waitqueue_head(&ep->wq);
774 init_waitqueue_head(&ep->poll_wait);
775 INIT_LIST_HEAD(&ep->rdllist);
776 ep->rbr = RB_ROOT;
777 ep->ovflist = EP_UNACTIVE_PTR;
778 ep->user = user;
779
780 *pep = ep;
781
782 return 0;
783
784free_uid:
785 free_uid(user);
786 return error;
787}
788
789
790
791
792
793
794static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
795{
796 int kcmp;
797 struct rb_node *rbp;
798 struct epitem *epi, *epir = NULL;
799 struct epoll_filefd ffd;
800
801 ep_set_ffd(&ffd, file, fd);
802 for (rbp = ep->rbr.rb_node; rbp; ) {
803 epi = rb_entry(rbp, struct epitem, rbn);
804 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
805 if (kcmp > 0)
806 rbp = rbp->rb_right;
807 else if (kcmp < 0)
808 rbp = rbp->rb_left;
809 else {
810 epir = epi;
811 break;
812 }
813 }
814
815 return epir;
816}
817
818
819
820
821
822
/*
 * ep_poll_callback - Wait queue callback installed on the watched file's
 * wait queues by ep_ptable_queue_proc().
 *
 * Marks the corresponding epitem ready (or chains it on the overflow list
 * if a ready-list scan is in progress) and wakes up waiters. @key, when
 * non-NULL, is interpreted as the mask of events that occurred. @mode and
 * @sync are unused. Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the mask contains nothing besides the private bits, the item
	 * is disabled (this is what ep_send_events_proc() leaves behind
	 * for EPOLLONESHOT), so nothing is queued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * If the waker supplied the events that occurred and none of them
	 * is of interest, skip the item. A NULL key gives no information,
	 * so the item is queued.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * A ready-list scan is in progress (see ep_scan_ready_list()): chain
	 * the item on ovflist instead of touching rdllist. An item whose
	 * ->next is not EP_UNACTIVE_PTR is already on the chain.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);

	/*
	 * Wake up the tasks sleeping in ep_poll() and remember to wake the
	 * pollers of the epoll fd once ep->lock has been dropped.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 1;
}
886
887
888
889
890
891static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
892 poll_table *pt)
893{
894 struct epitem *epi = ep_item_from_epqueue(pt);
895 struct eppoll_entry *pwq;
896
897 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
898 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
899 pwq->whead = whead;
900 pwq->base = epi;
901 add_wait_queue(whead, &pwq->wait);
902 list_add_tail(&pwq->llink, &epi->pwqlist);
903 epi->nwait++;
904 } else {
905
906 epi->nwait = -1;
907 }
908}
909
910static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
911{
912 int kcmp;
913 struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
914 struct epitem *epic;
915
916 while (*p) {
917 parent = *p;
918 epic = rb_entry(parent, struct epitem, rbn);
919 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
920 if (kcmp > 0)
921 p = &parent->rb_right;
922 else
923 p = &parent->rb_left;
924 }
925 rb_link_node(&epi->rbn, parent, p);
926 rb_insert_color(&epi->rbn, &ep->rbr);
927}
928
929
930
931
/*
 * ep_insert - Add a new watch for (@tfile, @fd) to @ep.
 *
 * @ep: The eventpoll receiving the item.
 * @event: Interest mask and user data (a kernel copy).
 * @tfile: The target file to watch.
 * @fd: The fd number, part of the item key.
 *
 * Called from epoll_ctl() with ep->mtx held.
 *
 * Returns 0 on success, -ENOSPC if the user already has max_user_watches
 * watches, or -ENOMEM on allocation failure.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Poll the target with our poll table: this both registers the
	 * wait queue hooks (through ep_ptable_queue_proc()) and returns the
	 * current event bits. From here on ep_poll_callback() may fire for
	 * this item.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * ep_ptable_queue_proc() sets nwait to -1 when it fails to allocate
	 * a wait queue entry.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/* Make the item findable by ep_find() */
	ep_rbtree_insert(ep, epi);

	/* The ready list is protected by ep->lock */
	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the file is already "ready" we drop it inside the ready list,
	 * unless the callback already put it there.
	 */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * Hooks registered before the failure may already have fired and
	 * queued the item on the ready list, so take it off under ep->lock
	 * before freeing it.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}
1032
1033
1034
1035
1036
/*
 * ep_modify - Change the interest mask and user data of an existing item.
 *
 * Called from epoll_ctl() with ep->mtx held. If the target already reports
 * one of the newly requested events, the item is queued on the ready list
 * and waiters are woken. Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;

	/*
	 * The new mask is stored before polling the target, so that
	 * ep_poll_callback() already tests against it for any event that
	 * arrives from now on.
	 */
	epi->event.events = event->events;
	epi->event.data = event->data;

	/*
	 * Get current event bits. A NULL poll table is passed, so no new
	 * wait queue hooks are registered.
	 */
	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

	/*
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (revents & event->events) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up_locked(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;
}
1080
/*
 * ep_send_events_proc - ep_scan_ready_list() callback that copies ready
 * events to user space.
 *
 * @ep: The eventpoll being scanned.
 * @head: Detached list of candidate items.
 * @priv: A struct ep_send_events_data (user buffer and its capacity).
 *
 * Returns the number of events stored, or -EFAULT if the very first copy
 * to user space failed.
 */
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	int eventcnt;
	unsigned int revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;

	/*
	 * Runs with ep->mtx held by ep_scan_ready_list() but without
	 * ep->lock; items left on @head are put back on the ready list by
	 * the caller.
	 */
	for (eventcnt = 0, uevent = esed->events;
	     !list_empty(head) && eventcnt < esed->maxevents;) {
		epi = list_first_entry(head, struct epitem, rdllink);

		list_del_init(&epi->rdllink);

		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
			epi->event.events;

		/*
		 * Only report the item if the re-poll still shows an event of
		 * interest; otherwise it is simply dropped from the list.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				/* Put the item back so the event is not lost */
				list_add(&epi->rdllink, head);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				/* One-shot: disable until re-armed by EPOLL_CTL_MOD */
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * Level-triggered item: re-insert it on the
				 * ready list so the next epoll_wait() checks
				 * it again. ep->rdllist is touched here
				 * without ep->lock; while the scan is active,
				 * ep_poll_callback() queues on ovflist
				 * instead of rdllist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
			}
		}
	}

	return eventcnt;
}
1139
1140static int ep_send_events(struct eventpoll *ep,
1141 struct epoll_event __user *events, int maxevents)
1142{
1143 struct ep_send_events_data esed;
1144
1145 esed.maxevents = maxevents;
1146 esed.events = events;
1147
1148 return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1149}
1150
1151static inline struct timespec ep_set_mstimeout(long ms)
1152{
1153 struct timespec now, ts = {
1154 .tv_sec = ms / MSEC_PER_SEC,
1155 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1156 };
1157
1158 ktime_get_ts(&now);
1159 return timespec_add_safe(now, ts);
1160}
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
/**
 * ep_poll - Fetch ready events, sleeping until some are available, the
 *           timeout expires or a signal arrives.
 *
 * @ep: The eventpoll to wait on.
 * @events: User buffer receiving the events.
 * @maxevents: Capacity of @events (must be greater than zero).
 * @timeout: Milliseconds to wait. Zero means do not block. A negative
 *           value leaves the expiry pointer NULL; presumably that makes
 *           schedule_hrtimeout_range() sleep without a deadline (confirm).
 *
 * Returns: the number of events stored, 0 on timeout, -EINTR if a signal
 *          interrupted the wait, or an error from ep_send_events().
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	long slack = 0;
	wait_queue_t wait;
	ktime_t expires, *to = NULL;

	if (timeout > 0) {
		struct timespec end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec_to_ktime(end_time);
	} else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation. The lock is
		 * taken here because check_events expects it held.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

fetch_events:
	spin_lock_irqsave(&ep->lock, flags);

	if (!ep_events_available(ep)) {
		/*
		 * Nothing ready: queue ourselves on ep->wq (exclusive, with
		 * ep->lock held) so ep_poll_callback() can wake us up.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * Set the task state before re-checking the
			 * condition, so a wakeup arriving in between is
			 * not lost.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (ep_events_available(ep) || timed_out)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
			/* A zero return is treated as "deadline reached" */
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}
check_events:
	/* Is it worth to try to dig for events ? */
	eavail = ep_events_available(ep);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. If ep_send_events() delivers
	 * nothing (the items turned out not to be ready) and we have not
	 * timed out yet, go back and wait again.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
/**
 * ep_loop_check_proc - ep_call_nested() callback that walks one epoll file
 *                      looking for nested epoll files.
 *
 * @priv: The epoll file (struct file *) whose items are examined.
 * @cookie: The eventpoll of that file, used by ep_call_nested() for loop
 *          detection (not dereferenced here).
 * @call_nests: Current nesting level; used as ep->mtx lock subclass.
 *
 * Returns: 0 if no problem was found below this file, or -1 as propagated
 *          from ep_call_nested() (loop or nesting limit exceeded).
 */
static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct eventpoll *ep = file->private_data;
	struct rb_node *rbp;
	struct epitem *epi;

	mutex_lock_nested(&ep->mtx, call_nests + 1);
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		/* Only watched files that are themselves epoll files recurse */
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
					       ep_loop_check_proc, epi->ffd.file,
					       epi->ffd.file->private_data, current);
			if (error != 0)
				break;
		}
	}
	mutex_unlock(&ep->mtx);

	return error;
}
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308static int ep_loop_check(struct eventpoll *ep, struct file *file)
1309{
1310 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1311 ep_loop_check_proc, file, ep, current);
1312}
1313
1314
1315
1316
1317SYSCALL_DEFINE1(epoll_create1, int, flags)
1318{
1319 int error;
1320 struct eventpoll *ep = NULL;
1321
1322
1323 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1324
1325 if (flags & ~EPOLL_CLOEXEC)
1326 return -EINVAL;
1327
1328
1329
1330 error = ep_alloc(&ep);
1331 if (error < 0)
1332 return error;
1333
1334
1335
1336
1337 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1338 O_RDWR | (flags & O_CLOEXEC));
1339 if (error < 0)
1340 ep_free(ep);
1341
1342 return error;
1343}
1344
1345SYSCALL_DEFINE1(epoll_create, int, size)
1346{
1347 if (size <= 0)
1348 return -EINVAL;
1349
1350 return sys_epoll_create1(0);
1351}
1352
1353
1354
1355
1356
1357
/*
 * epoll_ctl - Add, remove or modify a watch on an epoll instance.
 *
 * @epfd: The epoll file descriptor.
 * @op: EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * @fd: The target file descriptor.
 * @event: User pointer to the event description (not read for DEL).
 *
 * Returns 0 on success or a negative errno: -EFAULT, -EBADF, -EPERM (target
 * has no ->poll), -EINVAL, -ELOOP, -EEXIST, -ENOENT, or an error from
 * ep_insert().
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/*
	 * The epoll fd must really be an epoll file, and an epoll file
	 * cannot be added to itself.
	 */
	error = -EINVAL;
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/*
	 * Adding an epoll file to another epoll file: verify under the
	 * global epmutex that this neither creates a loop nor nests too
	 * deeply. epmutex stays held until the insertion is complete.
	 */
	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
		mutex_lock(&epmutex);
		did_lock_epmutex = 1;
		error = -ELOOP;
		if (ep_loop_check(ep, tfile) != 0)
			goto error_tgt_fput;
	}


	mutex_lock_nested(&ep->mtx, 0);

	/*
	 * Look the (file, fd) pair up in the RB tree. With ep->mtx held the
	 * item cannot be removed by another epoll_ctl() or by
	 * eventpoll_release_file() while we use it.
	 */
	epi = ep_find(ep, tfile, fd);

	/* An unknown @op falls through the switch and returns -EINVAL */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* POLLERR and POLLHUP are always reported */
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (unlikely(did_lock_epmutex))
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}
1468
1469
1470
1471
1472
1473SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1474 int, maxevents, int, timeout)
1475{
1476 int error;
1477 struct file *file;
1478 struct eventpoll *ep;
1479
1480
1481 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1482 return -EINVAL;
1483
1484
1485 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
1486 error = -EFAULT;
1487 goto error_return;
1488 }
1489
1490
1491 error = -EBADF;
1492 file = fget(epfd);
1493 if (!file)
1494 goto error_return;
1495
1496
1497
1498
1499
1500 error = -EINVAL;
1501 if (!is_file_epoll(file))
1502 goto error_fput;
1503
1504
1505
1506
1507
1508 ep = file->private_data;
1509
1510
1511 error = ep_poll(ep, events, maxevents, timeout);
1512
1513error_fput:
1514 fput(file);
1515error_return:
1516
1517 return error;
1518}
1519
1520#ifdef HAVE_SET_RESTORE_SIGMASK
1521
1522
1523
1524
1525
1526SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
1527 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
1528 size_t, sigsetsize)
1529{
1530 int error;
1531 sigset_t ksigmask, sigsaved;
1532
1533
1534
1535
1536
1537 if (sigmask) {
1538 if (sigsetsize != sizeof(sigset_t))
1539 return -EINVAL;
1540 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
1541 return -EFAULT;
1542 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
1543 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1544 }
1545
1546 error = sys_epoll_wait(epfd, events, maxevents, timeout);
1547
1548
1549
1550
1551
1552
1553
1554 if (sigmask) {
1555 if (error == -EINTR) {
1556 memcpy(¤t->saved_sigmask, &sigsaved,
1557 sizeof(sigsaved));
1558 set_restore_sigmask();
1559 } else
1560 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1561 }
1562
1563 return error;
1564}
1565
1566#endif
1567
1568static int __init eventpoll_init(void)
1569{
1570 struct sysinfo si;
1571
1572 si_meminfo(&si);
1573
1574
1575
1576 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1577 EP_ITEM_COST;
1578 BUG_ON(max_user_watches < 0);
1579
1580
1581
1582
1583
1584 ep_nested_calls_init(&poll_loop_ncalls);
1585
1586
1587 ep_nested_calls_init(&poll_safewake_ncalls);
1588
1589
1590 ep_nested_calls_init(&poll_readywalk_ncalls);
1591
1592
1593 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1594 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1595
1596
1597 pwq_cache = kmem_cache_create("eventpoll_pwq",
1598 sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1599
1600 return 0;
1601}
1602fs_initcall(eventpoll_init);
1603