1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/poll.h>
24#include <linux/string.h>
25#include <linux/list.h>
26#include <linux/hash.h>
27#include <linux/spinlock.h>
28#include <linux/syscalls.h>
29#include <linux/rbtree.h>
30#include <linux/wait.h>
31#include <linux/eventpoll.h>
32#include <linux/mount.h>
33#include <linux/bitops.h>
34#include <linux/mutex.h>
35#include <linux/anon_inodes.h>
36#include <asm/uaccess.h>
37#include <asm/system.h>
38#include <asm/io.h>
39#include <asm/mman.h>
40#include <asm/atomic.h>
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/*
 * Debug tracing switch for this file.
 *
 * With DEBUG_EPOLL at 0 both macros expand to a no-op expression. With a
 * positive value DPRINTK() always prints, while DNPRINTK() prints only when
 * its level argument is <= DEBUG_EPOLL. The message argument of both macros
 * is a complete, parenthesized printk() argument list.
 */
#define DEBUG_EPOLL 0

#if DEBUG_EPOLL > 0
#define DPRINTK(x) printk x
#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
#else
#define DPRINTK(x) (void) 0
#define DNPRINTK(n, x) (void) 0
#endif
83
/*
 * Set DEBUG_EPI to non-zero to add slab debugging flags (free checking and
 * red zoning) to the caches created in eventpoll_init(). With 0 no extra
 * flags are requested.
 */
#define DEBUG_EPI 0

#if DEBUG_EPI != 0
#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE )
#else
#define EPI_SLAB_DEBUG 0
#endif
91
92
/*
 * Bits of epoll_event.events that are epoll control flags rather than poll
 * conditions. An item whose mask has nothing set outside these bits is
 * treated as disarmed by ep_poll_callback().
 */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Maximum number of nested wakeups one task may have in flight in ep_poll_safewake(). */
#define EP_MAX_POLLWAKE_NESTS 4

/*
 * Millisecond timeouts at or above this value are treated by ep_poll() as
 * "wait forever" instead of being converted to jiffies.
 */
#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)

/* Largest "maxevents" accepted by epoll_wait(). */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/* Sentinel stored in eventpoll.ovflist and epitem.next meaning "not in use / not linked". */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Memory charged per watch when eventpoll_init() computes max_user_watches. */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
106
/*
 * Key identifying a watched target: the file pointer together with the
 * descriptor number it was registered under. ep_cmp_ffd() orders these keys
 * for the per-eventpoll rbtree.
 */
struct epoll_filefd {
	struct file *file;	/* target file */
	int fd;			/* descriptor number passed to epoll_ctl() */
};
111
112
113
114
115
116
117
118
/*
 * One in-progress wakeup tracked by ep_poll_safewake(). The node is a local
 * variable of that function and is linked on poll_safewake.wake_task_list
 * only for the duration of the wake_up_nested() call.
 */
struct wake_task_node {
	struct list_head llink;		/* link on poll_safewake.wake_task_list */
	struct task_struct *task;	/* task performing the wakeup */
	wait_queue_head_t *wq;		/* wait queue being woken */
};
124
125
126
127
128
/*
 * State used by ep_poll_safewake() to detect wakeup loops and excessive
 * nesting: the list of wakeups currently in flight and the lock guarding it.
 */
struct poll_safewake {
	struct list_head wake_task_list;	/* list of struct wake_task_node */
	spinlock_t lock;			/* protects wake_task_list */
};
133
134
135
136
137
/*
 * One watched (file, fd) pair inside an eventpoll instance. Allocated from
 * epi_cache by ep_insert() and released by ep_remove().
 */
struct epitem {
	/* Node linking this item into eventpoll.rbr, keyed by ffd */
	struct rb_node rbn;

	/* Link on eventpoll.rdllist (the ready list) */
	struct list_head rdllink;

	/*
	 * Link on the singly linked eventpoll.ovflist chain; holds
	 * EP_UNACTIVE_PTR while the item is not on that chain.
	 */
	struct epitem *next;

	/* The file/descriptor pair this item refers to */
	struct epoll_filefd ffd;

	/*
	 * Number of wait queue entries attached through
	 * ep_ptable_queue_proc(); set to -1 there on allocation failure.
	 */
	int nwait;

	/* List of struct eppoll_entry hooked onto the target's wait queues */
	struct list_head pwqlist;

	/* The eventpoll instance that owns this item */
	struct eventpoll *ep;

	/* Link on the target file's f_ep_links list */
	struct list_head fllink;

	/* Requested event mask and the user data returned with events */
	struct epoll_event event;
};
169
170
171
172
173
174
/*
 * Per-epoll-file state, stored in file->private_data of the epoll file.
 * Created by ep_alloc() and destroyed by ep_free().
 */
struct eventpoll {
	/*
	 * Spinlock guarding rdllist and ovflist; always taken with
	 * spin_lock_irqsave() in this file.
	 */
	spinlock_t lock;

	/*
	 * Mutex held across epoll_ctl() operations, across event delivery in
	 * ep_send_events(), and around ep_remove() in
	 * eventpoll_release_file().
	 */
	struct mutex mtx;

	/* Wait queue that ep_poll() (epoll_wait) sleeps on */
	wait_queue_head_t wq;

	/* Wait queue handed to poll_wait() by ep_eventpoll_poll() */
	wait_queue_head_t poll_wait;

	/* List of ready items, linked through epitem.rdllink */
	struct list_head rdllist;

	/* Root of the rbtree of all items, linked through epitem.rbn */
	struct rb_root rbr;

	/*
	 * Singly linked chain (through epitem.next) of items that became
	 * ready while ep_send_events() was delivering events. Holds
	 * EP_UNACTIVE_PTR when no delivery is in progress.
	 */
	struct epitem *ovflist;

	/* User reference taken in ep_alloc(); its epoll_watches is the watch count */
	struct user_struct *user;
};
209
210
/*
 * One hook onto a target file's wait queue. Allocated from pwq_cache in
 * ep_ptable_queue_proc() and freed in ep_unregister_pollwait().
 */
struct eppoll_entry {
	/* Link on the owning epitem.pwqlist */
	struct list_head llink;

	/* The owning epitem (read back by ep_item_from_wait()) */
	void *base;

	/*
	 * Wait queue entry added to *whead; its wakeup function is
	 * ep_poll_callback().
	 */
	wait_queue_t wait;

	/* The wait queue head "wait" is queued on */
	wait_queue_head_t *whead;
};
227
228
/*
 * Wrapper that carries an epitem alongside the poll_table passed to the
 * target's poll method, so ep_ptable_queue_proc() can recover the item with
 * ep_item_from_epqueue().
 */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};
233
234
235
236
237
/*
 * Upper bound on user->epoll_watches checked by ep_insert(). Computed in
 * eventpoll_init() and exposed through the sysctl table below when
 * CONFIG_SYSCTL is set.
 */
static int max_user_watches __read_mostly;

/*
 * Global mutex taken by ep_free() and eventpoll_release_file(), which
 * serializes those two teardown paths against each other.
 */
static DEFINE_MUTEX(epmutex);

/* Wakeup-tracking state passed to every ep_poll_safewake() call */
static struct poll_safewake psw;

/* Slab cache for struct epitem */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache for struct eppoll_entry */
static struct kmem_cache *pwq_cache __read_mostly;
253
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Lower bound handed to proc_dointvec_minmax through .extra1 */
static int zero;

/*
 * Sysctl table exposing max_user_watches as a read/write (mode 0644) integer
 * that cannot be set below zero. The all-zero entry terminates the table.
 * NOTE(review): where this table is registered is not visible in this file.
 */
ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.extra1		= &zero,
	},
	{ .ctl_name = 0 }
};
#endif
272
273
274
275static inline void ep_set_ffd(struct epoll_filefd *ffd,
276 struct file *file, int fd)
277{
278 ffd->file = file;
279 ffd->fd = fd;
280}
281
282
283static inline int ep_cmp_ffd(struct epoll_filefd *p1,
284 struct epoll_filefd *p2)
285{
286 return (p1->file > p2->file ? +1:
287 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
288}
289
290
/* Return 1 if the list node @p is currently linked on some list, 0 if not. */
static inline int ep_is_linked(struct list_head *p)
{
	return list_empty(p) ? 0 : 1;
}
295
296
297static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
298{
299 return container_of(p, struct eppoll_entry, wait)->base;
300}
301
302
303static inline struct epitem *ep_item_from_epqueue(poll_table *p)
304{
305 return container_of(p, struct ep_pqueue, pt)->epi;
306}
307
308
/*
 * Tell whether an epoll_ctl() operation carries an event structure:
 * returns 0 for EPOLL_CTL_DEL and 1 for every other operation code.
 */
static inline int ep_op_has_event(int op)
{
	if (op == EPOLL_CTL_DEL)
		return 0;

	return 1;
}
313
314
315static void ep_poll_safewake_init(struct poll_safewake *psw)
316{
317
318 INIT_LIST_HEAD(&psw->wake_task_list);
319 spin_lock_init(&psw->lock);
320}
321
322
323
324
325
326
327
328
329
330
331
332
333
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
335{
336 int wake_nests = 0;
337 unsigned long flags;
338 struct task_struct *this_task = current;
339 struct list_head *lsthead = &psw->wake_task_list;
340 struct wake_task_node *tncur;
341 struct wake_task_node tnode;
342
343 spin_lock_irqsave(&psw->lock, flags);
344
345
346 list_for_each_entry(tncur, lsthead, llink) {
347
348 if (tncur->wq == wq ||
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350
351
352
353
354 spin_unlock_irqrestore(&psw->lock, flags);
355 return;
356 }
357 }
358
359
360 tnode.task = this_task;
361 tnode.wq = wq;
362 list_add(&tnode.llink, lsthead);
363
364 spin_unlock_irqrestore(&psw->lock, flags);
365
366
367 wake_up_nested(wq, 1 + wake_nests);
368
369
370 spin_lock_irqsave(&psw->lock, flags);
371 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags);
373}
374
375
376
377
378
379
380static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
381{
382 int nwait;
383 struct list_head *lsthead = &epi->pwqlist;
384 struct eppoll_entry *pwq;
385
386
387 nwait = xchg(&epi->nwait, 0);
388
389 if (nwait) {
390 while (!list_empty(lsthead)) {
391 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
392
393 list_del_init(&pwq->llink);
394 remove_wait_queue(pwq->whead, &pwq->wait);
395 kmem_cache_free(pwq_cache, pwq);
396 }
397 }
398}
399
400
401
402
403
/*
 * Tear down one epitem and release its memory. Always returns 0.
 *
 * Callers visible in this file hold either ep->mtx (epoll_ctl(),
 * eventpoll_release_file()) or epmutex (ep_free()) when calling this.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/*
	 * Detach the wait queue hooks first, so that ep_poll_callback() can
	 * no longer be invoked through this item's entries while the rest of
	 * the teardown runs.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Unlink from the target file's list of epoll items, if still linked */
	spin_lock(&file->f_ep_lock);
	if (ep_is_linked(&epi->fllink))
		list_del_init(&epi->fllink);
	spin_unlock(&file->f_ep_lock);

	/* Drop the item from the lookup tree */
	rb_erase(&epi->rbn, &ep->rbr);

	/* Drop the item from the ready list, if queued there */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	/* Nothing references the item any more: free it */
	kmem_cache_free(epi_cache, epi);

	/* One watch less charged to the owning user */
	atomic_dec(&ep->user->epoll_watches);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
		     current, ep, file));

	return 0;
}
442
/*
 * Destroy an eventpoll instance: remove every item it still holds, then
 * destroy its mutex, drop the user reference taken in ep_alloc() and free
 * the structure itself.
 */
static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* Wake anyone still polling the epoll file itself */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&psw, &ep->poll_wait);

	/*
	 * epmutex is also taken by eventpoll_release_file(), so holding it
	 * keeps that path from removing items of this eventpoll while the
	 * tree is being torn down here. Note that ep->mtx is not taken.
	 */
	mutex_lock(&epmutex);

	/*
	 * First pass: walk the whole tree and only detach the wait queue
	 * hooks. After this loop no item of this eventpoll has a wait queue
	 * entry left.
	 */
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
	}

	/*
	 * Second pass: repeatedly remove the first item until the tree is
	 * empty. ep_remove() erases the node, so rb_first() makes progress.
	 */
	while ((rbp = rb_first(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
	}

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	kfree(ep);
}
487
488static int ep_eventpoll_release(struct inode *inode, struct file *file)
489{
490 struct eventpoll *ep = file->private_data;
491
492 if (ep)
493 ep_free(ep);
494
495 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
496 return 0;
497}
498
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{
501 unsigned int pollflags = 0;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data;
504
505
506 poll_wait(file, &ep->poll_wait, wait);
507
508
509 spin_lock_irqsave(&ep->lock, flags);
510 if (!list_empty(&ep->rdllist))
511 pollflags = POLLIN | POLLRDNORM;
512 spin_unlock_irqrestore(&ep->lock, flags);
513
514 return pollflags;
515}
516
517
/*
 * File operations of the epoll file: only release and poll are provided.
 * The address of this table is also what is_file_epoll() compares against.
 */
static const struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll
};
522
523
524static inline int is_file_epoll(struct file *f)
525{
526 return f->f_op == &eventpoll_fops;
527}
528
529
530
531
532
533
/*
 * Remove every epitem that refers to @file, across all eventpoll instances,
 * by draining file->f_ep_links.
 * NOTE(review): presumably invoked when a watched file is being released;
 * the caller is not visible in this file.
 */
void eventpoll_release_file(struct file *file)
{
	struct list_head *lsthead = &file->f_ep_links;
	struct eventpoll *ep;
	struct epitem *epi;

	/*
	 * epmutex is the same mutex ep_free() holds while tearing down an
	 * eventpoll, so an eventpoll reached through epi->ep below cannot be
	 * freed by ep_free() while it is being used here.
	 * NOTE(review): file->f_ep_lock is not taken around this list walk;
	 * confirm no concurrent insertion on this file is possible here.
	 */
	mutex_lock(&epmutex);

	while (!list_empty(lsthead)) {
		epi = list_first_entry(lsthead, struct epitem, fllink);

		ep = epi->ep;
		/*
		 * Unlink here; ep_remove() then finds fllink unlinked and
		 * skips its own list_del_init().
		 */
		list_del_init(&epi->fllink);
		mutex_lock(&ep->mtx);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}

	mutex_unlock(&epmutex);
}
565
566static int ep_alloc(struct eventpoll **pep)
567{
568 int error;
569 struct user_struct *user;
570 struct eventpoll *ep;
571
572 user = get_current_user();
573 error = -ENOMEM;
574 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
575 if (unlikely(!ep))
576 goto free_uid;
577
578 spin_lock_init(&ep->lock);
579 mutex_init(&ep->mtx);
580 init_waitqueue_head(&ep->wq);
581 init_waitqueue_head(&ep->poll_wait);
582 INIT_LIST_HEAD(&ep->rdllist);
583 ep->rbr = RB_ROOT;
584 ep->ovflist = EP_UNACTIVE_PTR;
585 ep->user = user;
586
587 *pep = ep;
588
589 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
590 current, ep));
591 return 0;
592
593free_uid:
594 free_uid(user);
595 return error;
596}
597
598
599
600
601
602
603static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
604{
605 int kcmp;
606 struct rb_node *rbp;
607 struct epitem *epi, *epir = NULL;
608 struct epoll_filefd ffd;
609
610 ep_set_ffd(&ffd, file, fd);
611 for (rbp = ep->rbr.rb_node; rbp; ) {
612 epi = rb_entry(rbp, struct epitem, rbn);
613 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
614 if (kcmp > 0)
615 rbp = rbp->rb_right;
616 else if (kcmp < 0)
617 rbp = rbp->rb_left;
618 else {
619 epir = epi;
620 break;
621 }
622 }
623
624 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
625 current, file, epir));
626
627 return epir;
628}
629
630
631
632
633
634
/*
 * Wait queue wakeup function installed on each eppoll_entry by
 * ep_ptable_queue_proc(). It marks the epitem that owns the entry as ready
 * and wakes up waiters on the eventpoll. Always returns 1.
 *
 * @mode, @sync and @key are not used.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
		     current, epi->ffd.file, epi, ep));

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask has no bits left besides the private control
	 * flags, the item is disarmed (ep_send_events() strips the event
	 * bits after delivering an EPOLLONESHOT item), so it must not be
	 * queued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * ep->ovflist differs from EP_UNACTIVE_PTR only while
	 * ep_send_events() is processing ready items without holding
	 * ep->lock. During that window the item is chained onto ovflist
	 * (only once, hence the epi->next check) instead of rdllist, and no
	 * wakeup is issued from here; ep_send_events() requeues the chain
	 * and performs the wakeups when it finishes.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}

	/* Already on the ready list: skip the insertion, still do the wakeups */
	if (ep_is_linked(&epi->rdllink))
		goto is_linked;

	list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
	/*
	 * Wake up epoll_wait() sleepers directly (ep->lock is held, hence
	 * the _locked variant); waiters on poll_wait are only counted here
	 * and woken after the lock is dropped.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* The poll_wait wakeup is done outside ep->lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	return 1;
}
695
696
697
698
699
700static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
701 poll_table *pt)
702{
703 struct epitem *epi = ep_item_from_epqueue(pt);
704 struct eppoll_entry *pwq;
705
706 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
707 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
708 pwq->whead = whead;
709 pwq->base = epi;
710 add_wait_queue(whead, &pwq->wait);
711 list_add_tail(&pwq->llink, &epi->pwqlist);
712 epi->nwait++;
713 } else {
714
715 epi->nwait = -1;
716 }
717}
718
719static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
720{
721 int kcmp;
722 struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
723 struct epitem *epic;
724
725 while (*p) {
726 parent = *p;
727 epic = rb_entry(parent, struct epitem, rbn);
728 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
729 if (kcmp > 0)
730 p = &parent->rb_right;
731 else
732 p = &parent->rb_left;
733 }
734 rb_link_node(&epi->rbn, parent, p);
735 rb_insert_color(&epi->rbn, &ep->rbr);
736}
737
738
739
740
/*
 * Register a new watch for (@tfile, @fd) with the interest described by
 * @event. Called from epoll_ctl() with ep->mtx held.
 *
 * Returns 0 on success, -ENOSPC when the user already has max_user_watches
 * watches, or -ENOMEM when the item or one of its wait queue entries cannot
 * be allocated.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	/* Enforce the per-user watch limit before allocating anything */
	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
		     max_user_watches))
		return -ENOSPC;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization: all links empty, not on the overflow chain */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/* Prepare the poll table that routes queueing to ep_ptable_queue_proc() */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Poll the target with our poll table. The target's poll method is
	 * expected to call back into ep_ptable_queue_proc() for each wait
	 * queue it uses, which attaches the hooks; the return value is the
	 * set of events that are ready right now. From this point on
	 * ep_poll_callback() may already run for this item.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * ep_ptable_queue_proc() leaves nwait at -1 if it failed to allocate
	 * a wait queue entry; back out in that case.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Link the item on the target file's list of epoll items */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* Make the item findable by ep_find() */
	ep_rbtree_insert(ep, epi);

	/* rdllist is protected by ep->lock */
	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the target is already ready for a requested event and the
	 * callback has not queued the item in the meantime, queue it now
	 * and wake up the waiters.
	 */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* poll_wait waiters are woken after dropping the lock */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	/* Charge the watch to the user only once insertion has succeeded */
	atomic_inc(&ep->user->epoll_watches);

	/* Wakeup of poll_wait waiters happens outside ep->lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;

error_unregister:
	/* Detach whatever hooks were attached before the failure */
	ep_unregister_pollwait(ep, epi);

	/*
	 * The callback may have queued the item on the ready list between
	 * the poll call above and the unregistration; unlink it under
	 * ep->lock before freeing.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}
843
844
845
846
847
/*
 * Replace the interest mask and user data of an existing item with the ones
 * in @event. Called from epoll_ctl() with ep->mtx held. Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;
	unsigned long flags;

	/*
	 * The new mask is stored before polling the target and outside
	 * ep->lock.
	 * NOTE(review): presumably so that ep_poll_callback() already sees
	 * the new mask for events arriving during the poll below — confirm.
	 */
	epi->event.events = event->events;

	/*
	 * Sample the current readiness of the target. A NULL poll table is
	 * passed, so no new wait queue hooks are attached.
	 */
	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

	spin_lock_irqsave(&ep->lock, flags);

	/* The user data is updated under ep->lock */
	epi->event.data = event->data;

	/*
	 * If the target is already ready for one of the newly requested
	 * events and the item is not queued yet, queue it and wake up the
	 * waiters.
	 */
	if (revents & event->events) {
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);

			/* poll_wait waiters are woken after dropping the lock */
			if (waitqueue_active(&ep->wq))
				wake_up_locked(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	/* Wakeup of poll_wait waiters happens outside ep->lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	return 0;
}
896
897static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
898 int maxevents)
899{
900 int eventcnt, error = -EFAULT, pwake = 0;
901 unsigned int revents;
902 unsigned long flags;
903 struct epitem *epi, *nepi;
904 struct list_head txlist;
905
906 INIT_LIST_HEAD(&txlist);
907
908
909
910
911
912 mutex_lock(&ep->mtx);
913
914
915
916
917
918
919
920
921 spin_lock_irqsave(&ep->lock, flags);
922 list_splice(&ep->rdllist, &txlist);
923 INIT_LIST_HEAD(&ep->rdllist);
924 ep->ovflist = NULL;
925 spin_unlock_irqrestore(&ep->lock, flags);
926
927
928
929
930
931
932 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
933 epi = list_first_entry(&txlist, struct epitem, rdllink);
934
935 list_del_init(&epi->rdllink);
936
937
938
939
940
941
942 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
943 revents &= epi->event.events;
944
945
946
947
948
949
950
951 if (revents) {
952 if (__put_user(revents,
953 &events[eventcnt].events) ||
954 __put_user(epi->event.data,
955 &events[eventcnt].data))
956 goto errxit;
957 if (epi->event.events & EPOLLONESHOT)
958 epi->event.events &= EP_PRIVATE_BITS;
959 eventcnt++;
960 }
961
962
963
964
965
966 if (!(epi->event.events & EPOLLET) &&
967 (revents & epi->event.events))
968 list_add_tail(&epi->rdllink, &ep->rdllist);
969 }
970 error = 0;
971
972errxit:
973
974 spin_lock_irqsave(&ep->lock, flags);
975
976
977
978
979
980 for (nepi = ep->ovflist; (epi = nepi) != NULL;
981 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
982
983
984
985
986
987 if (!ep_is_linked(&epi->rdllink))
988 list_add_tail(&epi->rdllink, &ep->rdllist);
989 }
990
991
992
993
994
995 ep->ovflist = EP_UNACTIVE_PTR;
996
997
998
999
1000
1001
1002 list_splice(&txlist, &ep->rdllist);
1003
1004 if (!list_empty(&ep->rdllist)) {
1005
1006
1007
1008
1009 if (waitqueue_active(&ep->wq))
1010 wake_up_locked(&ep->wq);
1011 if (waitqueue_active(&ep->poll_wait))
1012 pwake++;
1013 }
1014 spin_unlock_irqrestore(&ep->lock, flags);
1015
1016 mutex_unlock(&ep->mtx);
1017
1018
1019 if (pwake)
1020 ep_poll_safewake(&psw, &ep->poll_wait);
1021
1022 return eventcnt == 0 ? error: eventcnt;
1023}
1024
/*
 * Wait for events on @ep and deliver them to @events.
 *
 * @timeout is in milliseconds. A negative value, or one at or above
 * EP_MAX_MSTIMEO, means wait without a time limit; 0 means do not sleep.
 *
 * Returns the value of ep_send_events() when events were available, 0 when
 * the timeout expired with nothing to report, or -EINTR when a signal was
 * pending while waiting.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Convert the millisecond timeout to jiffies, rounding up. Values
	 * that are negative or too large for the conversion become
	 * MAX_SCHEDULE_TIMEOUT.
	 */
	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	spin_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * Nothing ready: queue ourselves on ep->wq as an exclusive
		 * waiter. ep->lock is held, hence the __ variants of the
		 * wait queue helpers.
		 */
		init_waitqueue_entry(&wait, current);
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * Set the task state before testing the conditions,
			 * so a wakeup arriving between the test and
			 * schedule_timeout() is not lost.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Sleep without the lock; keep the remaining time */
			spin_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Sample readiness while still holding the lock */
	eavail = !list_empty(&ep->rdllist);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to deliver. If ep_send_events() stored nothing (the items were
	 * no longer ready when re-polled) and there is time left, go back to
	 * waiting.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}
1094
1095
1096
1097
1098SYSCALL_DEFINE1(epoll_create1, int, flags)
1099{
1100 int error, fd = -1;
1101 struct eventpoll *ep;
1102
1103
1104 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1105
1106 if (flags & ~EPOLL_CLOEXEC)
1107 return -EINVAL;
1108
1109 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1110 current, flags));
1111
1112
1113
1114
1115 error = ep_alloc(&ep);
1116 if (error < 0) {
1117 fd = error;
1118 goto error_return;
1119 }
1120
1121
1122
1123
1124
1125 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1126 flags & O_CLOEXEC);
1127 if (fd < 0)
1128 ep_free(ep);
1129
1130error_return:
1131 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1132 current, flags, fd));
1133
1134 return fd;
1135}
1136
1137SYSCALL_DEFINE1(epoll_create, int, size)
1138{
1139 if (size < 0)
1140 return -EINVAL;
1141
1142 return sys_epoll_create1(0);
1143}
1144
1145
1146
1147
1148
1149
/*
 * epoll_ctl() system call: add, modify or remove the watch for descriptor
 * @fd in the epoll instance referred to by @epfd.
 *
 * Returns 0 on success or a negative error code:
 *   -EFAULT  @event could not be copied from user space (ADD/MOD only)
 *   -EBADF   @epfd or @fd is not a valid descriptor
 *   -EPERM   the target file has no poll method
 *   -EINVAL  @epfd is not an epoll file, @epfd and @fd refer to the same
 *            file, or @op is not a known operation
 *   -EEXIST  ADD for a pair that is already registered
 *   -ENOENT  MOD/DEL for a pair that is not registered
 * plus whatever ep_insert() reports (-ENOSPC, -ENOMEM).
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	/* Only operations other than EPOLL_CTL_DEL read the user's event */
	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the epoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/*
	 * The descriptor passed as epfd must really be an epoll file, and an
	 * epoll file cannot be added to itself.
	 */
	error = -EINVAL;
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * is_file_epoll() succeeded, so private_data holds the eventpoll
	 * installed by epoll_create1().
	 */
	ep = file->private_data;

	/* All item operations on this eventpoll are serialized by ep->mtx */
	mutex_lock(&ep->mtx);

	/*
	 * Look the pair up once; every operation below needs to know whether
	 * it is already registered.
	 */
	epi = ep_find(ep, tfile, fd);

	/* Unknown operation codes fall through the switch with -EINVAL */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* Error and hang-up conditions are always watched */
			epds.events |= POLLERR | POLLHUP;

			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			/* Error and hang-up conditions are always watched */
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	fput(tfile);
error_fput:
	fput(file);
error_return:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, error));

	return error;
}
1243
1244
1245
1246
1247
1248SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1249 int, maxevents, int, timeout)
1250{
1251 int error;
1252 struct file *file;
1253 struct eventpoll *ep;
1254
1255 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1256 current, epfd, events, maxevents, timeout));
1257
1258
1259 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1260 return -EINVAL;
1261
1262
1263 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
1264 error = -EFAULT;
1265 goto error_return;
1266 }
1267
1268
1269 error = -EBADF;
1270 file = fget(epfd);
1271 if (!file)
1272 goto error_return;
1273
1274
1275
1276
1277
1278 error = -EINVAL;
1279 if (!is_file_epoll(file))
1280 goto error_fput;
1281
1282
1283
1284
1285
1286 ep = file->private_data;
1287
1288
1289 error = ep_poll(ep, events, maxevents, timeout);
1290
1291error_fput:
1292 fput(file);
1293error_return:
1294 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1295 current, epfd, events, maxevents, timeout, error));
1296
1297 return error;
1298}
1299
1300#ifdef HAVE_SET_RESTORE_SIGMASK
1301
1302
1303
1304
1305
1306SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
1307 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
1308 size_t, sigsetsize)
1309{
1310 int error;
1311 sigset_t ksigmask, sigsaved;
1312
1313
1314
1315
1316
1317 if (sigmask) {
1318 if (sigsetsize != sizeof(sigset_t))
1319 return -EINVAL;
1320 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
1321 return -EFAULT;
1322 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
1323 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1324 }
1325
1326 error = sys_epoll_wait(epfd, events, maxevents, timeout);
1327
1328
1329
1330
1331
1332
1333
1334 if (sigmask) {
1335 if (error == -EINTR) {
1336 memcpy(¤t->saved_sigmask, &sigsaved,
1337 sizeof(sigsaved));
1338 set_restore_sigmask();
1339 } else
1340 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1341 }
1342
1343 return error;
1344}
1345
1346#endif
1347
/*
 * Boot-time initialization (registered with fs_initcall): computes the
 * default per-user watch limit, initializes the safe-wakeup state and
 * creates the two slab caches. Always returns 0; the caches are created
 * with SLAB_PANIC, so no failure handling is done here.
 */
static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);

	/*
	 * Allow watches to consume up to 1/25 (4%) of the non-high memory:
	 * (totalram - totalhigh) is scaled to bytes with PAGE_SHIFT and
	 * divided by the per-watch cost.
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;

	/* Initialize the structure used to perform safe poll wait head wakeups */
	ep_poll_safewake_init(&psw);

	/* Cache for "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
			NULL);

	/* Cache for "struct eppoll_entry" wait queue hooks */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0,
			EPI_SLAB_DEBUG|SLAB_PANIC, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
1375