1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/fs.h>
18#include <linux/file.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/poll.h>
24#include <linux/string.h>
25#include <linux/list.h>
26#include <linux/hash.h>
27#include <linux/spinlock.h>
28#include <linux/syscalls.h>
29#include <linux/rbtree.h>
30#include <linux/wait.h>
31#include <linux/eventpoll.h>
32#include <linux/mount.h>
33#include <linux/bitops.h>
34#include <linux/mutex.h>
35#include <linux/anon_inodes.h>
36#include <asm/uaccess.h>
37#include <asm/system.h>
38#include <asm/io.h>
39#include <asm/mman.h>
40#include <asm/atomic.h>
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/*
 * Debug tracing switch. With DEBUG_EPOLL at 0 both macros expand to a no-op.
 * With a positive value DPRINTK() always prints and DNPRINTK(n, x) prints
 * when n <= DEBUG_EPOLL. "x" is a parenthesised printk() argument list.
 */
#define DEBUG_EPOLL 0

#if DEBUG_EPOLL > 0
#define DPRINTK(x) printk x
#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
#else
#define DPRINTK(x) (void) 0
#define DNPRINTK(n, x) (void) 0
#endif
83
/*
 * Slab debugging switch for the epitem/eppoll_entry caches. When DEBUG_EPI is
 * non-zero, EPI_SLAB_DEBUG adds slab debug flags to the kmem_cache_create()
 * calls in eventpoll_init(); otherwise it contributes no flags.
 */
#define DEBUG_EPI 0

#if DEBUG_EPI != 0
#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE )
#else
#define EPI_SLAB_DEBUG 0
#endif
91
92
/*
 * Behaviour flags stored alongside the event mask. They are masked out when
 * testing whether an item still has any event of interest enabled.
 */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Maximum number of nested wakeups by one task accepted by ep_poll_safewake(). */
#define EP_MAX_POLLWAKE_NESTS 4

/*
 * Millisecond timeouts at or above this value are treated by ep_poll() as
 * "wait forever" (MAX_SCHEDULE_TIMEOUT), which also keeps the ms -> jiffies
 * conversion there from overflowing.
 */
#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)

/* Largest "maxevents" accepted by sys_epoll_wait(). */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/*
 * Sentinel pointer: an epitem whose "next" holds it is not chained on the
 * overflow list, and an eventpoll whose "ovflist" holds it is not currently
 * diverting events to the overflow list.
 */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Memory cost of one watch, used by eventpoll_init() to size max_user_watches. */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
106
/*
 * Lookup key of an epoll item: the target file together with the descriptor
 * number it was registered under (see ep_set_ffd() and ep_cmp_ffd()).
 */
struct epoll_filefd {
	struct file *file;
	int fd;
};
111
112
113
114
115
116
117
118
/*
 * Record of one wakeup in progress, linked on poll_safewake.wake_task_list.
 * ep_poll_safewake() keeps a node of this type on its stack for the duration
 * of its wake_up_nested() call.
 */
struct wake_task_node {
	struct list_head llink;		/* link in poll_safewake.wake_task_list */
	struct task_struct *task;	/* task performing the wakeup */
	wait_queue_head_t *wq;		/* wait queue being woken */
};
124
125
126
127
128
/*
 * State used by ep_poll_safewake() to detect wakeup loops and excessive
 * nesting: the list of wakeups currently in progress and the lock guarding it.
 */
struct poll_safewake {
	struct list_head wake_task_list;	/* list of struct wake_task_node */
	spinlock_t lock;			/* protects wake_task_list */
};
133
134
135
136
137
/*
 * One watched (file, fd) pair inside an eventpoll instance. Allocated from
 * epi_cache by ep_insert() and released by ep_remove().
 */
struct epitem {
	/* Node linking this item into eventpoll.rbr, keyed by "ffd". */
	struct rb_node rbn;

	/* Link into eventpoll.rdllist while the item is on the ready list. */
	struct list_head rdllink;

	/*
	 * Single-linked chain used for eventpoll.ovflist; holds
	 * EP_UNACTIVE_PTR while the item is not on that chain.
	 */
	struct epitem *next;

	/* The file and descriptor number this item refers to. */
	struct epoll_filefd ffd;

	/*
	 * Number of poll wait queue entries attached to this item; set to -1
	 * by ep_ptable_queue_proc() when a wait entry could not be allocated.
	 */
	int nwait;

	/* List of struct eppoll_entry hooked on the target's wait queues. */
	struct list_head pwqlist;

	/* The eventpoll instance owning this item. */
	struct eventpoll *ep;

	/* Link into the target file's f_ep_links list. */
	struct list_head fllink;

	/* Event mask and user data supplied through epoll_ctl(). */
	struct epoll_event event;
};
169
170
171
172
173
174
/*
 * Per-instance state of an epoll file; stored in file->private_data of the
 * file created by sys_epoll_create1().
 */
struct eventpoll {
	/*
	 * Spinlock taken with interrupts saved around accesses to "rdllist"
	 * and "ovflist", including from ep_poll_callback().
	 */
	spinlock_t lock;

	/*
	 * Mutex held by sys_epoll_ctl() around item lookup/insert/remove/
	 * modify, by ep_send_events() while delivering events, and by
	 * eventpoll_release_file() around ep_remove().
	 */
	struct mutex mtx;

	/* Wait queue that ep_poll() sleeps on. */
	wait_queue_head_t wq;

	/* Wait queue registered by ep_eventpoll_poll(), i.e. poll() on the epoll fd. */
	wait_queue_head_t poll_wait;

	/* List of ready items, linked through epitem.rdllink. */
	struct list_head rdllist;

	/* Root of the RB tree holding every item of this instance. */
	struct rb_root rbr;

	/*
	 * While ep_send_events() is running this is not EP_UNACTIVE_PTR, and
	 * ep_poll_callback() chains newly ready items here (through
	 * epitem.next) instead of touching "rdllist".
	 */
	struct epitem *ovflist;

	/* User that created the instance; used for per-user accounting. */
	struct user_struct *user;
};
209
210
/*
 * Hook placed on one wait queue of a watched file. Allocated from pwq_cache
 * by ep_ptable_queue_proc() and freed by ep_unregister_pollwait().
 */
struct eppoll_entry {
	/* Link into epitem.pwqlist. */
	struct list_head llink;

	/* The owning struct epitem (see ep_item_from_wait()). */
	void *base;

	/*
	 * Wait queue entry added to "whead"; its wakeup function is
	 * ep_poll_callback().
	 */
	wait_queue_t wait;

	/* Wait queue head that "wait" is linked to. */
	wait_queue_head_t *whead;
};
227
228
/*
 * Wrapper handed to the target file's ->poll() by ep_insert() so that
 * ep_ptable_queue_proc() can recover the item from the poll_table pointer.
 */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};
233
234
235
236
237
/* Per-user limit on epoll instances, checked in ep_alloc(). */
static int max_user_instances __read_mostly;

/* Per-user limit on epoll watches, checked in ep_insert(). */
static int max_user_watches __read_mostly;

/* Held by both ep_free() and eventpoll_release_file() while they remove items. */
static DEFINE_MUTEX(epmutex);

/* Wakeup-in-progress bookkeeping shared by every ep_poll_safewake() call. */
static struct poll_safewake psw;

/* Slab cache for struct epitem. */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache for struct eppoll_entry. */
static struct kmem_cache *pwq_cache __read_mostly;
255
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Lower bound handed to proc_dointvec_minmax() through .extra1. */
static int zero;

/*
 * sysctl entries exposing the two per-user limits as integers with mode 0644
 * and a minimum of zero.
 * NOTE(review): the place where this table is registered is not visible in
 * this file.
 */
ctl_table epoll_table[] = {
	{
		.procname = "max_user_instances",
		.data = &max_user_instances,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_minmax,
		.extra1 = &zero,
	},
	{
		.procname = "max_user_watches",
		.data = &max_user_watches,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = &proc_dointvec_minmax,
		.extra1 = &zero,
	},
	{ .ctl_name = 0 }	/* terminating entry */
};
#endif
282
283
284
285static inline void ep_set_ffd(struct epoll_filefd *ffd,
286 struct file *file, int fd)
287{
288 ffd->file = file;
289 ffd->fd = fd;
290}
291
292
293static inline int ep_cmp_ffd(struct epoll_filefd *p1,
294 struct epoll_filefd *p2)
295{
296 return (p1->file > p2->file ? +1:
297 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
298}
299
300
/* Return 1 if the list node "p" is currently linked into a list, 0 otherwise. */
static inline int ep_is_linked(struct list_head *p)
{
	if (list_empty(p))
		return 0;

	return 1;
}
305
306
307static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
308{
309 return container_of(p, struct eppoll_entry, wait)->base;
310}
311
312
313static inline struct epitem *ep_item_from_epqueue(poll_table *p)
314{
315 return container_of(p, struct ep_pqueue, pt)->epi;
316}
317
318
319static inline int ep_op_has_event(int op)
320{
321 return op != EPOLL_CTL_DEL;
322}
323
324
325static void ep_poll_safewake_init(struct poll_safewake *psw)
326{
327
328 INIT_LIST_HEAD(&psw->wake_task_list);
329 spin_lock_init(&psw->lock);
330}
331
332
333
334
335
336
337
338
339
340
341
342
343
344static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
345{
346 int wake_nests = 0;
347 unsigned long flags;
348 struct task_struct *this_task = current;
349 struct list_head *lsthead = &psw->wake_task_list;
350 struct wake_task_node *tncur;
351 struct wake_task_node tnode;
352
353 spin_lock_irqsave(&psw->lock, flags);
354
355
356 list_for_each_entry(tncur, lsthead, llink) {
357
358 if (tncur->wq == wq ||
359 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
360
361
362
363
364 spin_unlock_irqrestore(&psw->lock, flags);
365 return;
366 }
367 }
368
369
370 tnode.task = this_task;
371 tnode.wq = wq;
372 list_add(&tnode.llink, lsthead);
373
374 spin_unlock_irqrestore(&psw->lock, flags);
375
376
377 wake_up_nested(wq, 1 + wake_nests);
378
379
380 spin_lock_irqsave(&psw->lock, flags);
381 list_del(&tnode.llink);
382 spin_unlock_irqrestore(&psw->lock, flags);
383}
384
385
386
387
388
389
390static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
391{
392 int nwait;
393 struct list_head *lsthead = &epi->pwqlist;
394 struct eppoll_entry *pwq;
395
396
397 nwait = xchg(&epi->nwait, 0);
398
399 if (nwait) {
400 while (!list_empty(lsthead)) {
401 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
402
403 list_del_init(&pwq->llink);
404 remove_wait_queue(pwq->whead, &pwq->wait);
405 kmem_cache_free(pwq_cache, pwq);
406 }
407 }
408}
409
410
411
412
413
414static int ep_remove(struct eventpoll *ep, struct epitem *epi)
415{
416 unsigned long flags;
417 struct file *file = epi->ffd.file;
418
419
420
421
422
423
424
425
426
427 ep_unregister_pollwait(ep, epi);
428
429
430 spin_lock(&file->f_ep_lock);
431 if (ep_is_linked(&epi->fllink))
432 list_del_init(&epi->fllink);
433 spin_unlock(&file->f_ep_lock);
434
435 rb_erase(&epi->rbn, &ep->rbr);
436
437 spin_lock_irqsave(&ep->lock, flags);
438 if (ep_is_linked(&epi->rdllink))
439 list_del_init(&epi->rdllink);
440 spin_unlock_irqrestore(&ep->lock, flags);
441
442
443 kmem_cache_free(epi_cache, epi);
444
445 atomic_dec(&ep->user->epoll_watches);
446
447 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
448 current, ep, file));
449
450 return 0;
451}
452
453static void ep_free(struct eventpoll *ep)
454{
455 struct rb_node *rbp;
456 struct epitem *epi;
457
458
459 if (waitqueue_active(&ep->poll_wait))
460 ep_poll_safewake(&psw, &ep->poll_wait);
461
462
463
464
465
466
467
468
469
470 mutex_lock(&epmutex);
471
472
473
474
475 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
476 epi = rb_entry(rbp, struct epitem, rbn);
477
478 ep_unregister_pollwait(ep, epi);
479 }
480
481
482
483
484
485
486
487 while ((rbp = rb_first(&ep->rbr)) != NULL) {
488 epi = rb_entry(rbp, struct epitem, rbn);
489 ep_remove(ep, epi);
490 }
491
492 mutex_unlock(&epmutex);
493 mutex_destroy(&ep->mtx);
494 atomic_dec(&ep->user->epoll_devs);
495 free_uid(ep->user);
496 kfree(ep);
497}
498
499static int ep_eventpoll_release(struct inode *inode, struct file *file)
500{
501 struct eventpoll *ep = file->private_data;
502
503 if (ep)
504 ep_free(ep);
505
506 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
507 return 0;
508}
509
510static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
511{
512 unsigned int pollflags = 0;
513 unsigned long flags;
514 struct eventpoll *ep = file->private_data;
515
516
517 poll_wait(file, &ep->poll_wait, wait);
518
519
520 spin_lock_irqsave(&ep->lock, flags);
521 if (!list_empty(&ep->rdllist))
522 pollflags = POLLIN | POLLRDNORM;
523 spin_unlock_irqrestore(&ep->lock, flags);
524
525 return pollflags;
526}
527
528
/*
 * File operations of the epoll file. The address of this table also serves
 * as the identity test in is_file_epoll().
 */
static const struct file_operations eventpoll_fops = {
	.release = ep_eventpoll_release,
	.poll = ep_eventpoll_poll
};
533
534
535static inline int is_file_epoll(struct file *f)
536{
537 return f->f_op == &eventpoll_fops;
538}
539
540
541
542
543
544
545void eventpoll_release_file(struct file *file)
546{
547 struct list_head *lsthead = &file->f_ep_links;
548 struct eventpoll *ep;
549 struct epitem *epi;
550
551
552
553
554
555
556
557
558
559
560
561
562 mutex_lock(&epmutex);
563
564 while (!list_empty(lsthead)) {
565 epi = list_first_entry(lsthead, struct epitem, fllink);
566
567 ep = epi->ep;
568 list_del_init(&epi->fllink);
569 mutex_lock(&ep->mtx);
570 ep_remove(ep, epi);
571 mutex_unlock(&ep->mtx);
572 }
573
574 mutex_unlock(&epmutex);
575}
576
577static int ep_alloc(struct eventpoll **pep)
578{
579 int error;
580 struct user_struct *user;
581 struct eventpoll *ep;
582
583 user = get_current_user();
584 error = -EMFILE;
585 if (unlikely(atomic_read(&user->epoll_devs) >=
586 max_user_instances))
587 goto free_uid;
588 error = -ENOMEM;
589 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
590 if (unlikely(!ep))
591 goto free_uid;
592
593 spin_lock_init(&ep->lock);
594 mutex_init(&ep->mtx);
595 init_waitqueue_head(&ep->wq);
596 init_waitqueue_head(&ep->poll_wait);
597 INIT_LIST_HEAD(&ep->rdllist);
598 ep->rbr = RB_ROOT;
599 ep->ovflist = EP_UNACTIVE_PTR;
600 ep->user = user;
601
602 *pep = ep;
603
604 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
605 current, ep));
606 return 0;
607
608free_uid:
609 free_uid(user);
610 return error;
611}
612
613
614
615
616
617
618static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
619{
620 int kcmp;
621 struct rb_node *rbp;
622 struct epitem *epi, *epir = NULL;
623 struct epoll_filefd ffd;
624
625 ep_set_ffd(&ffd, file, fd);
626 for (rbp = ep->rbr.rb_node; rbp; ) {
627 epi = rb_entry(rbp, struct epitem, rbn);
628 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
629 if (kcmp > 0)
630 rbp = rbp->rb_right;
631 else if (kcmp < 0)
632 rbp = rbp->rb_left;
633 else {
634 epir = epi;
635 break;
636 }
637 }
638
639 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
640 current, file, epir));
641
642 return epir;
643}
644
645
646
647
648
649
650static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
651{
652 int pwake = 0;
653 unsigned long flags;
654 struct epitem *epi = ep_item_from_wait(wait);
655 struct eventpoll *ep = epi->ep;
656
657 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
658 current, epi->ffd.file, epi, ep));
659
660 spin_lock_irqsave(&ep->lock, flags);
661
662
663
664
665
666
667
668 if (!(epi->event.events & ~EP_PRIVATE_BITS))
669 goto out_unlock;
670
671
672
673
674
675
676
677 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
678 if (epi->next == EP_UNACTIVE_PTR) {
679 epi->next = ep->ovflist;
680 ep->ovflist = epi;
681 }
682 goto out_unlock;
683 }
684
685
686 if (ep_is_linked(&epi->rdllink))
687 goto is_linked;
688
689 list_add_tail(&epi->rdllink, &ep->rdllist);
690
691is_linked:
692
693
694
695
696 if (waitqueue_active(&ep->wq))
697 wake_up_locked(&ep->wq);
698 if (waitqueue_active(&ep->poll_wait))
699 pwake++;
700
701out_unlock:
702 spin_unlock_irqrestore(&ep->lock, flags);
703
704
705 if (pwake)
706 ep_poll_safewake(&psw, &ep->poll_wait);
707
708 return 1;
709}
710
711
712
713
714
715static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
716 poll_table *pt)
717{
718 struct epitem *epi = ep_item_from_epqueue(pt);
719 struct eppoll_entry *pwq;
720
721 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
722 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
723 pwq->whead = whead;
724 pwq->base = epi;
725 add_wait_queue(whead, &pwq->wait);
726 list_add_tail(&pwq->llink, &epi->pwqlist);
727 epi->nwait++;
728 } else {
729
730 epi->nwait = -1;
731 }
732}
733
734static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
735{
736 int kcmp;
737 struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
738 struct epitem *epic;
739
740 while (*p) {
741 parent = *p;
742 epic = rb_entry(parent, struct epitem, rbn);
743 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
744 if (kcmp > 0)
745 p = &parent->rb_right;
746 else
747 p = &parent->rb_left;
748 }
749 rb_link_node(&epi->rbn, parent, p);
750 rb_insert_color(&epi->rbn, &ep->rbr);
751}
752
753
754
755
756static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
757 struct file *tfile, int fd)
758{
759 int error, revents, pwake = 0;
760 unsigned long flags;
761 struct epitem *epi;
762 struct ep_pqueue epq;
763
764 if (unlikely(atomic_read(&ep->user->epoll_watches) >=
765 max_user_watches))
766 return -ENOSPC;
767 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
768 return -ENOMEM;
769
770
771 INIT_LIST_HEAD(&epi->rdllink);
772 INIT_LIST_HEAD(&epi->fllink);
773 INIT_LIST_HEAD(&epi->pwqlist);
774 epi->ep = ep;
775 ep_set_ffd(&epi->ffd, tfile, fd);
776 epi->event = *event;
777 epi->nwait = 0;
778 epi->next = EP_UNACTIVE_PTR;
779
780
781 epq.epi = epi;
782 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
783
784
785
786
787
788
789
790
791 revents = tfile->f_op->poll(tfile, &epq.pt);
792
793
794
795
796
797
798 error = -ENOMEM;
799 if (epi->nwait < 0)
800 goto error_unregister;
801
802
803 spin_lock(&tfile->f_ep_lock);
804 list_add_tail(&epi->fllink, &tfile->f_ep_links);
805 spin_unlock(&tfile->f_ep_lock);
806
807
808
809
810
811 ep_rbtree_insert(ep, epi);
812
813
814 spin_lock_irqsave(&ep->lock, flags);
815
816
817 if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
818 list_add_tail(&epi->rdllink, &ep->rdllist);
819
820
821 if (waitqueue_active(&ep->wq))
822 wake_up_locked(&ep->wq);
823 if (waitqueue_active(&ep->poll_wait))
824 pwake++;
825 }
826
827 spin_unlock_irqrestore(&ep->lock, flags);
828
829 atomic_inc(&ep->user->epoll_watches);
830
831
832 if (pwake)
833 ep_poll_safewake(&psw, &ep->poll_wait);
834
835 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
836 current, ep, tfile, fd));
837
838 return 0;
839
840error_unregister:
841 ep_unregister_pollwait(ep, epi);
842
843
844
845
846
847
848
849 spin_lock_irqsave(&ep->lock, flags);
850 if (ep_is_linked(&epi->rdllink))
851 list_del_init(&epi->rdllink);
852 spin_unlock_irqrestore(&ep->lock, flags);
853
854 kmem_cache_free(epi_cache, epi);
855
856 return error;
857}
858
859
860
861
862
863static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
864{
865 int pwake = 0;
866 unsigned int revents;
867 unsigned long flags;
868
869
870
871
872
873
874
875 epi->event.events = event->events;
876
877
878
879
880
881 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
882
883 spin_lock_irqsave(&ep->lock, flags);
884
885
886 epi->event.data = event->data;
887
888
889
890
891
892 if (revents & event->events) {
893 if (!ep_is_linked(&epi->rdllink)) {
894 list_add_tail(&epi->rdllink, &ep->rdllist);
895
896
897 if (waitqueue_active(&ep->wq))
898 wake_up_locked(&ep->wq);
899 if (waitqueue_active(&ep->poll_wait))
900 pwake++;
901 }
902 }
903 spin_unlock_irqrestore(&ep->lock, flags);
904
905
906 if (pwake)
907 ep_poll_safewake(&psw, &ep->poll_wait);
908
909 return 0;
910}
911
912static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
913 int maxevents)
914{
915 int eventcnt, error = -EFAULT, pwake = 0;
916 unsigned int revents;
917 unsigned long flags;
918 struct epitem *epi, *nepi;
919 struct list_head txlist;
920
921 INIT_LIST_HEAD(&txlist);
922
923
924
925
926
927 mutex_lock(&ep->mtx);
928
929
930
931
932
933
934
935
936 spin_lock_irqsave(&ep->lock, flags);
937 list_splice(&ep->rdllist, &txlist);
938 INIT_LIST_HEAD(&ep->rdllist);
939 ep->ovflist = NULL;
940 spin_unlock_irqrestore(&ep->lock, flags);
941
942
943
944
945
946
947 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
948 epi = list_first_entry(&txlist, struct epitem, rdllink);
949
950 list_del_init(&epi->rdllink);
951
952
953
954
955
956
957 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
958 revents &= epi->event.events;
959
960
961
962
963
964
965
966 if (revents) {
967 if (__put_user(revents,
968 &events[eventcnt].events) ||
969 __put_user(epi->event.data,
970 &events[eventcnt].data))
971 goto errxit;
972 if (epi->event.events & EPOLLONESHOT)
973 epi->event.events &= EP_PRIVATE_BITS;
974 eventcnt++;
975 }
976
977
978
979
980
981 if (!(epi->event.events & EPOLLET) &&
982 (revents & epi->event.events))
983 list_add_tail(&epi->rdllink, &ep->rdllist);
984 }
985 error = 0;
986
987errxit:
988
989 spin_lock_irqsave(&ep->lock, flags);
990
991
992
993
994
995 for (nepi = ep->ovflist; (epi = nepi) != NULL;
996 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
997 if (!ep_is_linked(&epi->rdllink) &&
998 (epi->event.events & ~EP_PRIVATE_BITS))
999 list_add_tail(&epi->rdllink, &ep->rdllist);
1000 }
1001
1002
1003
1004
1005
1006 ep->ovflist = EP_UNACTIVE_PTR;
1007
1008
1009
1010
1011
1012
1013 list_splice(&txlist, &ep->rdllist);
1014
1015 if (!list_empty(&ep->rdllist)) {
1016
1017
1018
1019
1020 if (waitqueue_active(&ep->wq))
1021 wake_up_locked(&ep->wq);
1022 if (waitqueue_active(&ep->poll_wait))
1023 pwake++;
1024 }
1025 spin_unlock_irqrestore(&ep->lock, flags);
1026
1027 mutex_unlock(&ep->mtx);
1028
1029
1030 if (pwake)
1031 ep_poll_safewake(&psw, &ep->poll_wait);
1032
1033 return eventcnt == 0 ? error: eventcnt;
1034}
1035
1036static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1037 int maxevents, long timeout)
1038{
1039 int res, eavail;
1040 unsigned long flags;
1041 long jtimeout;
1042 wait_queue_t wait;
1043
1044
1045
1046
1047
1048
1049 jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
1050 MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
1051
1052retry:
1053 spin_lock_irqsave(&ep->lock, flags);
1054
1055 res = 0;
1056 if (list_empty(&ep->rdllist)) {
1057
1058
1059
1060
1061
1062 init_waitqueue_entry(&wait, current);
1063 wait.flags |= WQ_FLAG_EXCLUSIVE;
1064 __add_wait_queue(&ep->wq, &wait);
1065
1066 for (;;) {
1067
1068
1069
1070
1071
1072 set_current_state(TASK_INTERRUPTIBLE);
1073 if (!list_empty(&ep->rdllist) || !jtimeout)
1074 break;
1075 if (signal_pending(current)) {
1076 res = -EINTR;
1077 break;
1078 }
1079
1080 spin_unlock_irqrestore(&ep->lock, flags);
1081 jtimeout = schedule_timeout(jtimeout);
1082 spin_lock_irqsave(&ep->lock, flags);
1083 }
1084 __remove_wait_queue(&ep->wq, &wait);
1085
1086 set_current_state(TASK_RUNNING);
1087 }
1088
1089
1090 eavail = !list_empty(&ep->rdllist);
1091
1092 spin_unlock_irqrestore(&ep->lock, flags);
1093
1094
1095
1096
1097
1098
1099 if (!res && eavail &&
1100 !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
1101 goto retry;
1102
1103 return res;
1104}
1105
1106
1107
1108
1109asmlinkage long sys_epoll_create1(int flags)
1110{
1111 int error, fd = -1;
1112 struct eventpoll *ep;
1113
1114
1115 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1116
1117 if (flags & ~EPOLL_CLOEXEC)
1118 return -EINVAL;
1119
1120 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1121 current, flags));
1122
1123
1124
1125
1126 error = ep_alloc(&ep);
1127 if (error < 0) {
1128 fd = error;
1129 goto error_return;
1130 }
1131
1132
1133
1134
1135
1136 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1137 flags & O_CLOEXEC);
1138 if (fd < 0)
1139 ep_free(ep);
1140 atomic_inc(&ep->user->epoll_devs);
1141
1142error_return:
1143 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1144 current, flags, fd));
1145
1146 return fd;
1147}
1148
1149asmlinkage long sys_epoll_create(int size)
1150{
1151 if (size < 0)
1152 return -EINVAL;
1153
1154 return sys_epoll_create1(0);
1155}
1156
1157
1158
1159
1160
1161
1162asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
1163 struct epoll_event __user *event)
1164{
1165 int error;
1166 struct file *file, *tfile;
1167 struct eventpoll *ep;
1168 struct epitem *epi;
1169 struct epoll_event epds;
1170
1171 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
1172 current, epfd, op, fd, event));
1173
1174 error = -EFAULT;
1175 if (ep_op_has_event(op) &&
1176 copy_from_user(&epds, event, sizeof(struct epoll_event)))
1177 goto error_return;
1178
1179
1180 error = -EBADF;
1181 file = fget(epfd);
1182 if (!file)
1183 goto error_return;
1184
1185
1186 tfile = fget(fd);
1187 if (!tfile)
1188 goto error_fput;
1189
1190
1191 error = -EPERM;
1192 if (!tfile->f_op || !tfile->f_op->poll)
1193 goto error_tgt_fput;
1194
1195
1196
1197
1198
1199
1200 error = -EINVAL;
1201 if (file == tfile || !is_file_epoll(file))
1202 goto error_tgt_fput;
1203
1204
1205
1206
1207
1208 ep = file->private_data;
1209
1210 mutex_lock(&ep->mtx);
1211
1212
1213
1214
1215
1216
1217 epi = ep_find(ep, tfile, fd);
1218
1219 error = -EINVAL;
1220 switch (op) {
1221 case EPOLL_CTL_ADD:
1222 if (!epi) {
1223 epds.events |= POLLERR | POLLHUP;
1224
1225 error = ep_insert(ep, &epds, tfile, fd);
1226 } else
1227 error = -EEXIST;
1228 break;
1229 case EPOLL_CTL_DEL:
1230 if (epi)
1231 error = ep_remove(ep, epi);
1232 else
1233 error = -ENOENT;
1234 break;
1235 case EPOLL_CTL_MOD:
1236 if (epi) {
1237 epds.events |= POLLERR | POLLHUP;
1238 error = ep_modify(ep, epi, &epds);
1239 } else
1240 error = -ENOENT;
1241 break;
1242 }
1243 mutex_unlock(&ep->mtx);
1244
1245error_tgt_fput:
1246 fput(tfile);
1247error_fput:
1248 fput(file);
1249error_return:
1250 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
1251 current, epfd, op, fd, event, error));
1252
1253 return error;
1254}
1255
1256
1257
1258
1259
1260asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
1261 int maxevents, int timeout)
1262{
1263 int error;
1264 struct file *file;
1265 struct eventpoll *ep;
1266
1267 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1268 current, epfd, events, maxevents, timeout));
1269
1270
1271 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1272 return -EINVAL;
1273
1274
1275 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
1276 error = -EFAULT;
1277 goto error_return;
1278 }
1279
1280
1281 error = -EBADF;
1282 file = fget(epfd);
1283 if (!file)
1284 goto error_return;
1285
1286
1287
1288
1289
1290 error = -EINVAL;
1291 if (!is_file_epoll(file))
1292 goto error_fput;
1293
1294
1295
1296
1297
1298 ep = file->private_data;
1299
1300
1301 error = ep_poll(ep, events, maxevents, timeout);
1302
1303error_fput:
1304 fput(file);
1305error_return:
1306 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1307 current, epfd, events, maxevents, timeout, error));
1308
1309 return error;
1310}
1311
1312#ifdef HAVE_SET_RESTORE_SIGMASK
1313
1314
1315
1316
1317
1318asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
1319 int maxevents, int timeout, const sigset_t __user *sigmask,
1320 size_t sigsetsize)
1321{
1322 int error;
1323 sigset_t ksigmask, sigsaved;
1324
1325
1326
1327
1328
1329 if (sigmask) {
1330 if (sigsetsize != sizeof(sigset_t))
1331 return -EINVAL;
1332 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
1333 return -EFAULT;
1334 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
1335 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1336 }
1337
1338 error = sys_epoll_wait(epfd, events, maxevents, timeout);
1339
1340
1341
1342
1343
1344
1345
1346 if (sigmask) {
1347 if (error == -EINTR) {
1348 memcpy(¤t->saved_sigmask, &sigsaved,
1349 sizeof(sigsaved));
1350 set_restore_sigmask();
1351 } else
1352 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1353 }
1354
1355 return error;
1356}
1357
1358#endif
1359
1360static int __init eventpoll_init(void)
1361{
1362 struct sysinfo si;
1363
1364 si_meminfo(&si);
1365 max_user_instances = 128;
1366 max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
1367 EP_ITEM_COST;
1368
1369
1370 ep_poll_safewake_init(&psw);
1371
1372
1373 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1374 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
1375 NULL);
1376
1377
1378 pwq_cache = kmem_cache_create("eventpoll_pwq",
1379 sizeof(struct eppoll_entry), 0,
1380 EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
1381
1382 return 0;
1383}
1384fs_initcall(eventpoll_init);
1385