1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/module.h>
15#include <linux/init.h>
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/signal.h>
21#include <linux/errno.h>
22#include <linux/mm.h>
23#include <linux/slab.h>
24#include <linux/poll.h>
25#include <linux/smp_lock.h>
26#include <linux/string.h>
27#include <linux/list.h>
28#include <linux/hash.h>
29#include <linux/spinlock.h>
30#include <linux/rwsem.h>
31#include <linux/wait.h>
32#include <linux/eventpoll.h>
33#include <linux/mount.h>
34#include <asm/bitops.h>
35#include <asm/uaccess.h>
36#include <asm/system.h>
37#include <asm/io.h>
38#include <asm/mman.h>
39#include <asm/atomic.h>
40#include <asm/semaphore.h>
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/* Magic number handed to get_sb_pseudo() for the eventpollfs super block */
#define EVENTPOLLFS_MAGIC 0x03111965

/* Debug verbosity; 0 compiles every debug message out */
#define DEBUG_EPOLL 0

/*
 * DPRINTK(x) prints unconditionally, DNPRINTK(n, x) prints only when the
 * message level n does not exceed DEBUG_EPOLL. "x" is a parenthesized
 * printk() argument list. With debugging off both expand to a no-op.
 */
#if DEBUG_EPOLL <= 0
#define DPRINTK(x) (void) 0
#define DNPRINTK(n, x) (void) 0
#else /* DEBUG_EPOLL > 0 */
#define DPRINTK(x) printk x
#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
#endif /* DEBUG_EPOLL */

/* Non-zero turns on slab debugging flags for the two eventpoll caches */
#define DEBUG_EPI 0

#if DEBUG_EPI == 0
#define EPI_SLAB_DEBUG 0
#else /* DEBUG_EPI != 0 */
#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE )
#endif /* DEBUG_EPI */
95
96
97
/* Most wakeups one task may have in flight inside ep_poll_safewake() */
#define EP_MAX_POLLWAKE_NESTS 4

/* Upper limit for the number of hash bits chosen by ep_get_hash_bits() */
#define EP_MAX_HASH_BITS 17

/* Lower limit for the number of hash bits chosen by ep_get_hash_bits() */
#define EP_MIN_HASH_BITS 9

/* How many bucket heads (struct list_head) fit in a single page */
#define EP_HENTRY_X_PAGE (PAGE_SIZE / sizeof(struct list_head))

/* Dimension of eventpoll::hpages, enough for the largest hash table */
#define EP_MAX_HPAGES ((1 << EP_MAX_HASH_BITS) / EP_HENTRY_X_PAGE + 1)

/* Number of pages needed to hold 2^hbits bucket heads (rounded up) */
#define EP_HASH_PAGES(hbits) \
	((int) (((1 << (hbits)) + EP_HENTRY_X_PAGE - 1) / EP_HENTRY_X_PAGE))
115
116
/* Allocate / release a "struct epitem" from the epi_cache slab */
#define EPI_MEM_ALLOC() \
	((struct epitem *) kmem_cache_alloc(epi_cache, SLAB_KERNEL))
#define EPI_MEM_FREE(p) kmem_cache_free(epi_cache, p)

/* Allocate / release a "struct eppoll_entry" from the pwq_cache slab */
#define PWQ_MEM_ALLOC() \
	((struct eppoll_entry *) kmem_cache_alloc(pwq_cache, SLAB_KERNEL))
#define PWQ_MEM_FREE(p) kmem_cache_free(pwq_cache, p)

/* True when file "f" is an eventpoll file, judged by its file operations */
#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)
130
131
132
133
134
135
/*
 * Unlink "p" and re-initialize it as an empty list, so that a later
 * EP_IS_LINKED(p) reports the node as not linked.
 */
#define EP_LIST_DEL(p) \
	do { \
		list_del(p); \
		INIT_LIST_HEAD(p); \
	} while (0)

/* A node counts as linked when its list head is not empty */
#define EP_IS_LINKED(p) (!list_empty(p))

/* Map a wait queue entry to the epitem stored in its eppoll_entry::base */
#define EP_ITEM_FROM_WAIT(p) \
	((struct epitem *) container_of(p, struct eppoll_entry, wait)->base)

/* Map a poll_table embedded in a struct ep_pqueue to that queue's epitem */
#define EP_ITEM_FROM_EPQUEUE(p) (container_of(p, struct ep_pqueue, pt)->epi)

/* Number of events ep_send_events() buffers on the stack per user copy */
#define EP_MAX_BUF_EVENTS 32
152
153
154
155
156
157
158
159
160
161
162struct wake_task_node {
163 struct list_head llink;
164 task_t *task;
165 wait_queue_head_t *wq;
166};
167
168
169
170
171
172struct poll_safewake {
173 struct list_head wake_task_list;
174 spinlock_t lock;
175};
176
177
178
179
180
181
182struct eventpoll {
183
184 rwlock_t lock;
185
186
187
188
189
190
191
192 struct rw_semaphore sem;
193
194
195 wait_queue_head_t wq;
196
197
198 wait_queue_head_t poll_wait;
199
200
201 struct list_head rdllist;
202
203
204 unsigned int hashbits;
205
206
207 char *hpages[EP_MAX_HPAGES];
208};
209
210
211struct eppoll_entry {
212
213 struct list_head llink;
214
215
216 void *base;
217
218
219
220
221
222 wait_queue_t wait;
223
224
225 wait_queue_head_t *whead;
226};
227
228
229
230
231
232struct epitem {
233
234 struct list_head llink;
235
236
237 struct list_head rdllink;
238
239
240 int nwait;
241
242
243 struct list_head pwqlist;
244
245
246 struct eventpoll *ep;
247
248
249 int fd;
250
251
252 struct file *file;
253
254
255 struct epoll_event event;
256
257
258
259
260
261 atomic_t usecnt;
262
263
264 struct list_head fllink;
265
266
267 struct list_head txlink;
268
269
270
271
272
273 unsigned int revents;
274};
275
276
277struct ep_pqueue {
278 poll_table pt;
279 struct epitem *epi;
280};
281
282
283
284static void ep_poll_safewake_init(struct poll_safewake *psw);
285static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
286static unsigned int ep_get_hash_bits(unsigned int hintsize);
287static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
288static int ep_alloc_pages(char **pages, int numpages);
289static int ep_free_pages(char **pages, int numpages);
290static int ep_file_init(struct file *file, unsigned int hashbits);
291static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file,
292 int fd);
293static struct list_head *ep_hash_entry(struct eventpoll *ep,
294 unsigned int index);
295static int ep_init(struct eventpoll *ep, unsigned int hashbits);
296static void ep_free(struct eventpoll *ep);
297static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
298static void ep_use_epitem(struct epitem *epi);
299static void ep_release_epitem(struct epitem *epi);
300static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
301 poll_table *pt);
302static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
303 struct file *tfile, int fd);
304static int ep_modify(struct eventpoll *ep, struct epitem *epi,
305 struct epoll_event *event);
306static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
307static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
308static int ep_remove(struct eventpoll *ep, struct epitem *epi);
309static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync);
310static int ep_eventpoll_close(struct inode *inode, struct file *file);
311static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
312static int ep_collect_ready_items(struct eventpoll *ep,
313 struct list_head *txlist, int maxevents);
314static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
315 struct epoll_event __user *events);
316static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist);
317static int ep_events_transfer(struct eventpoll *ep,
318 struct epoll_event __user *events,
319 int maxevents);
320static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
321 int maxevents, long timeout);
322static int eventpollfs_delete_dentry(struct dentry *dentry);
323static struct inode *ep_eventpoll_inode(void);
324static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
325 int flags, const char *dev_name,
326 void *data);
327
328
329
330
331struct semaphore epsem;
332
333
334static struct poll_safewake psw;
335
336
337static kmem_cache_t *epi_cache;
338
339
340static kmem_cache_t *pwq_cache;
341
342
343static struct vfsmount *eventpoll_mnt;
344
345
346static struct file_operations eventpoll_fops = {
347 .release = ep_eventpoll_close,
348 .poll = ep_eventpoll_poll
349};
350
351
352
353
354
355static struct file_system_type eventpoll_fs_type = {
356 .name = "eventpollfs",
357 .get_sb = eventpollfs_get_sb,
358 .kill_sb = kill_anon_super,
359};
360
361
362static struct dentry_operations eventpollfs_dentry_operations = {
363 .d_delete = eventpollfs_delete_dentry,
364};
365
366
367
368
369static void ep_poll_safewake_init(struct poll_safewake *psw)
370{
371
372 INIT_LIST_HEAD(&psw->wake_task_list);
373 spin_lock_init(&psw->lock);
374}
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
390{
391 int wake_nests = 0;
392 unsigned long flags;
393 task_t *this_task = current;
394 struct list_head *lsthead = &psw->wake_task_list, *lnk;
395 struct wake_task_node *tncur;
396 struct wake_task_node tnode;
397
398 spin_lock_irqsave(&psw->lock, flags);
399
400
401 list_for_each(lnk, lsthead) {
402 tncur = list_entry(lnk, struct wake_task_node, llink);
403
404 if (tncur->wq == wq ||
405 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
406
407
408
409
410 spin_unlock_irqrestore(&psw->lock, flags);
411 return;
412 }
413 }
414
415
416 tnode.task = this_task;
417 tnode.wq = wq;
418 list_add(&tnode.llink, lsthead);
419
420 spin_unlock_irqrestore(&psw->lock, flags);
421
422
423 wake_up(wq);
424
425
426 spin_lock_irqsave(&psw->lock, flags);
427 list_del(&tnode.llink);
428 spin_unlock_irqrestore(&psw->lock, flags);
429}
430
431
432
433
434
435
436static unsigned int ep_get_hash_bits(unsigned int hintsize)
437{
438 unsigned int i, val;
439
440 for (i = 0, val = 1; val < hintsize && i < EP_MAX_HASH_BITS; i++, val <<= 1);
441 return i < EP_MIN_HASH_BITS ? EP_MIN_HASH_BITS: i;
442}
443
444
445
446void eventpoll_init_file(struct file *file)
447{
448
449 INIT_LIST_HEAD(&file->f_ep_links);
450 spin_lock_init(&file->f_ep_lock);
451}
452
453
454
455
456
457
458
459void eventpoll_release_file(struct file *file)
460{
461 struct list_head *lsthead = &file->f_ep_links;
462 struct eventpoll *ep;
463 struct epitem *epi;
464
465
466
467
468
469
470
471
472
473
474 down(&epsem);
475
476 while (!list_empty(lsthead)) {
477 epi = list_entry(lsthead->next, struct epitem, fllink);
478
479 ep = epi->ep;
480 EP_LIST_DEL(&epi->fllink);
481 down_write(&ep->sem);
482 ep_remove(ep, epi);
483 up_write(&ep->sem);
484 }
485
486 up(&epsem);
487}
488
489
490
491
492
493
494
495
496
497asmlinkage long sys_epoll_create(int size)
498{
499 int error, fd;
500 unsigned int hashbits;
501 struct inode *inode;
502 struct file *file;
503
504 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
505 current, size));
506
507
508 hashbits = ep_get_hash_bits((unsigned int) size);
509
510
511
512
513
514 error = ep_getfd(&fd, &inode, &file);
515 if (error)
516 goto eexit_1;
517
518
519 error = ep_file_init(file, hashbits);
520 if (error)
521 goto eexit_2;
522
523
524 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
525 current, size, fd));
526
527 return fd;
528
529eexit_2:
530 sys_close(fd);
531eexit_1:
532 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
533 current, size, error));
534 return error;
535}
536
537
538
539
540
541
542
543
544asmlinkage long
545sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
546{
547 int error;
548 struct file *file, *tfile;
549 struct eventpoll *ep;
550 struct epitem *epi;
551 struct epoll_event epds;
552
553 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
554 current, epfd, op, fd, event));
555
556 error = -EFAULT;
557 if (copy_from_user(&epds, event, sizeof(struct epoll_event)))
558 goto eexit_1;
559
560
561 error = -EBADF;
562 file = fget(epfd);
563 if (!file)
564 goto eexit_1;
565
566
567 tfile = fget(fd);
568 if (!tfile)
569 goto eexit_2;
570
571
572 error = -EPERM;
573 if (!tfile->f_op || !tfile->f_op->poll)
574 goto eexit_3;
575
576
577
578
579
580
581 error = -EINVAL;
582 if (file == tfile || !IS_FILE_EPOLL(file))
583 goto eexit_3;
584
585
586
587
588
589 ep = file->private_data;
590
591 down_write(&ep->sem);
592
593
594 epi = ep_find(ep, tfile, fd);
595
596 error = -EINVAL;
597 switch (op) {
598 case EPOLL_CTL_ADD:
599 if (!epi) {
600 epds.events |= POLLERR | POLLHUP;
601
602 error = ep_insert(ep, &epds, tfile, fd);
603 } else
604 error = -EEXIST;
605 break;
606 case EPOLL_CTL_DEL:
607 if (epi)
608 error = ep_remove(ep, epi);
609 else
610 error = -ENOENT;
611 break;
612 case EPOLL_CTL_MOD:
613 if (epi) {
614 epds.events |= POLLERR | POLLHUP;
615 error = ep_modify(ep, epi, &epds);
616 } else
617 error = -ENOENT;
618 break;
619 }
620
621
622
623
624
625 if (epi)
626 ep_release_epitem(epi);
627
628 up_write(&ep->sem);
629
630eexit_3:
631 fput(tfile);
632eexit_2:
633 fput(file);
634eexit_1:
635 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
636 current, epfd, op, fd, event, error));
637
638 return error;
639}
640
641
642
643
644
645
646asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
647 int maxevents, int timeout)
648{
649 int error;
650 struct file *file;
651 struct eventpoll *ep;
652
653 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
654 current, epfd, events, maxevents, timeout));
655
656
657 if (maxevents <= 0)
658 return -EINVAL;
659
660
661 if ((error = verify_area(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))))
662 goto eexit_1;
663
664
665 error = -EBADF;
666 file = fget(epfd);
667 if (!file)
668 goto eexit_1;
669
670
671
672
673
674 error = -EINVAL;
675 if (!IS_FILE_EPOLL(file))
676 goto eexit_2;
677
678
679
680
681
682 ep = file->private_data;
683
684
685 error = ep_poll(ep, events, maxevents, timeout);
686
687eexit_2:
688 fput(file);
689eexit_1:
690 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
691 current, epfd, events, maxevents, timeout, error));
692
693 return error;
694}
695
696
697
698
699
700static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
701{
702 struct qstr this;
703 char name[32];
704 struct dentry *dentry;
705 struct inode *inode;
706 struct file *file;
707 int error, fd;
708
709
710 error = -ENFILE;
711 file = get_empty_filp();
712 if (!file)
713 goto eexit_1;
714
715
716 inode = ep_eventpoll_inode();
717 error = PTR_ERR(inode);
718 if (IS_ERR(inode))
719 goto eexit_2;
720
721
722 error = get_unused_fd();
723 if (error < 0)
724 goto eexit_3;
725 fd = error;
726
727
728
729
730
731 error = -ENOMEM;
732 sprintf(name, "[%lu]", inode->i_ino);
733 this.name = name;
734 this.len = strlen(name);
735 this.hash = inode->i_ino;
736 dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
737 if (!dentry)
738 goto eexit_4;
739 dentry->d_op = &eventpollfs_dentry_operations;
740 d_add(dentry, inode);
741 file->f_vfsmnt = mntget(eventpoll_mnt);
742 file->f_dentry = dget(dentry);
743
744 file->f_pos = 0;
745 file->f_flags = O_RDONLY;
746 file->f_op = &eventpoll_fops;
747 file->f_mode = FMODE_READ;
748 file->f_version = 0;
749 file->private_data = NULL;
750
751
752 fd_install(fd, file);
753
754 *efd = fd;
755 *einode = inode;
756 *efile = file;
757 return 0;
758
759eexit_4:
760 put_unused_fd(fd);
761eexit_3:
762 iput(inode);
763eexit_2:
764 put_filp(file);
765eexit_1:
766 return error;
767}
768
769
770static int ep_alloc_pages(char **pages, int numpages)
771{
772 int i;
773
774 for (i = 0; i < numpages; i++) {
775 pages[i] = (char *) __get_free_pages(GFP_KERNEL, 0);
776 if (!pages[i]) {
777 for (--i; i >= 0; i--) {
778 ClearPageReserved(virt_to_page(pages[i]));
779 free_pages((unsigned long) pages[i], 0);
780 }
781 return -ENOMEM;
782 }
783 SetPageReserved(virt_to_page(pages[i]));
784 }
785 return 0;
786}
787
788
/*
 * Undo ep_alloc_pages(): clear the reserved flag of each of the first
 * "numpages" pages and free it. Always returns 0.
 */
static int ep_free_pages(char **pages, int numpages)
{
	char **cur = pages;
	char **end = pages + numpages;

	while (cur < end) {
		ClearPageReserved(virt_to_page(*cur));
		free_pages((unsigned long) *cur, 0);
		cur++;
	}
	return 0;
}
799
800
801static int ep_file_init(struct file *file, unsigned int hashbits)
802{
803 int error;
804 struct eventpoll *ep;
805
806 if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
807 return -ENOMEM;
808
809 memset(ep, 0, sizeof(*ep));
810
811 error = ep_init(ep, hashbits);
812 if (error) {
813 kfree(ep);
814 return error;
815 }
816
817 file->private_data = ep;
818
819 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
820 current, ep));
821 return 0;
822}
823
824
825
826
827
828static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file, int fd)
829{
830 unsigned long ptr = (unsigned long) file ^ (fd << ep->hashbits);
831
832 return (unsigned int) hash_ptr((void *) ptr, ep->hashbits);
833}
834
835
836
837
838
839static struct list_head *ep_hash_entry(struct eventpoll *ep, unsigned int index)
840{
841
842 return (struct list_head *) (ep->hpages[index / EP_HENTRY_X_PAGE] +
843 (index % EP_HENTRY_X_PAGE) * sizeof(struct list_head));
844}
845
846
847static int ep_init(struct eventpoll *ep, unsigned int hashbits)
848{
849 int error;
850 unsigned int i, hsize;
851
852 rwlock_init(&ep->lock);
853 init_rwsem(&ep->sem);
854 init_waitqueue_head(&ep->wq);
855 init_waitqueue_head(&ep->poll_wait);
856 INIT_LIST_HEAD(&ep->rdllist);
857
858
859 ep->hashbits = hashbits;
860 error = ep_alloc_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
861 if (error)
862 goto eexit_1;
863
864
865 for (i = 0, hsize = 1 << hashbits; i < hsize; i++)
866 INIT_LIST_HEAD(ep_hash_entry(ep, i));
867
868 return 0;
869eexit_1:
870 return error;
871}
872
873
874static void ep_free(struct eventpoll *ep)
875{
876 unsigned int i, hsize;
877 struct list_head *lsthead, *lnk;
878 struct epitem *epi;
879
880
881 if (waitqueue_active(&ep->poll_wait))
882 ep_poll_safewake(&psw, &ep->poll_wait);
883
884
885
886
887
888
889
890
891
892 down(&epsem);
893
894
895
896
897 for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
898 lsthead = ep_hash_entry(ep, i);
899
900 list_for_each(lnk, lsthead) {
901 epi = list_entry(lnk, struct epitem, llink);
902
903 ep_unregister_pollwait(ep, epi);
904 }
905 }
906
907
908
909
910
911
912
913 for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
914 lsthead = ep_hash_entry(ep, i);
915
916 while (!list_empty(lsthead)) {
917 epi = list_entry(lsthead->next, struct epitem, llink);
918
919 ep_remove(ep, epi);
920 }
921 }
922
923 up(&epsem);
924
925
926 ep_free_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
927}
928
929
930
931
932
933
934
935static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
936{
937 unsigned long flags;
938 struct list_head *lsthead, *lnk;
939 struct epitem *epi = NULL;
940
941 read_lock_irqsave(&ep->lock, flags);
942
943 lsthead = ep_hash_entry(ep, ep_hash_index(ep, file, fd));
944 list_for_each(lnk, lsthead) {
945 epi = list_entry(lnk, struct epitem, llink);
946
947 if (epi->file == file && epi->fd == fd) {
948 ep_use_epitem(epi);
949 break;
950 }
951 epi = NULL;
952 }
953
954 read_unlock_irqrestore(&ep->lock, flags);
955
956 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
957 current, file, epi));
958
959 return epi;
960}
961
962
963
964
965
966
967static void ep_use_epitem(struct epitem *epi)
968{
969
970 atomic_inc(&epi->usecnt);
971}
972
973
974
975
976
977
978
979static void ep_release_epitem(struct epitem *epi)
980{
981
982 if (atomic_dec_and_test(&epi->usecnt))
983 EPI_MEM_FREE(epi);
984}
985
986
987
988
989
990
991static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
992 poll_table *pt)
993{
994 struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
995 struct eppoll_entry *pwq;
996
997 if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
998 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
999 pwq->whead = whead;
1000 pwq->base = epi;
1001 add_wait_queue(whead, &pwq->wait);
1002 list_add_tail(&pwq->llink, &epi->pwqlist);
1003 epi->nwait++;
1004 } else {
1005
1006 epi->nwait = -1;
1007 }
1008}
1009
1010
1011static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1012 struct file *tfile, int fd)
1013{
1014 int error, revents, pwake = 0;
1015 unsigned long flags;
1016 struct epitem *epi;
1017 struct ep_pqueue epq;
1018
1019 error = -ENOMEM;
1020 if (!(epi = EPI_MEM_ALLOC()))
1021 goto eexit_1;
1022
1023
1024 INIT_LIST_HEAD(&epi->llink);
1025 INIT_LIST_HEAD(&epi->rdllink);
1026 INIT_LIST_HEAD(&epi->fllink);
1027 INIT_LIST_HEAD(&epi->txlink);
1028 INIT_LIST_HEAD(&epi->pwqlist);
1029 epi->ep = ep;
1030 epi->file = tfile;
1031 epi->fd = fd;
1032 epi->event = *event;
1033 atomic_set(&epi->usecnt, 1);
1034 epi->nwait = 0;
1035
1036
1037 epq.epi = epi;
1038 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
1039
1040
1041
1042
1043
1044
1045 revents = tfile->f_op->poll(tfile, &epq.pt);
1046
1047
1048
1049
1050
1051
1052 if (epi->nwait < 0)
1053 goto eexit_2;
1054
1055
1056 spin_lock(&tfile->f_ep_lock);
1057 list_add_tail(&epi->fllink, &tfile->f_ep_links);
1058 spin_unlock(&tfile->f_ep_lock);
1059
1060
1061 write_lock_irqsave(&ep->lock, flags);
1062
1063
1064 list_add(&epi->llink, ep_hash_entry(ep, ep_hash_index(ep, tfile, fd)));
1065
1066
1067 if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
1068 list_add_tail(&epi->rdllink, &ep->rdllist);
1069
1070
1071 if (waitqueue_active(&ep->wq))
1072 wake_up(&ep->wq);
1073 if (waitqueue_active(&ep->poll_wait))
1074 pwake++;
1075 }
1076
1077 write_unlock_irqrestore(&ep->lock, flags);
1078
1079
1080 if (pwake)
1081 ep_poll_safewake(&psw, &ep->poll_wait);
1082
1083 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
1084 current, ep, tfile, fd));
1085
1086 return 0;
1087
1088eexit_2:
1089 ep_unregister_pollwait(ep, epi);
1090
1091
1092
1093
1094
1095 write_lock_irqsave(&ep->lock, flags);
1096 if (EP_IS_LINKED(&epi->rdllink))
1097 EP_LIST_DEL(&epi->rdllink);
1098 write_unlock_irqrestore(&ep->lock, flags);
1099
1100 EPI_MEM_FREE(epi);
1101eexit_1:
1102 return error;
1103}
1104
1105
1106
1107
1108
1109
1110static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
1111{
1112 int pwake = 0;
1113 unsigned int revents;
1114 unsigned long flags;
1115
1116
1117
1118
1119
1120
1121
1122 epi->event.events = event->events;
1123
1124
1125
1126
1127
1128 revents = epi->file->f_op->poll(epi->file, NULL);
1129
1130 write_lock_irqsave(&ep->lock, flags);
1131
1132
1133 epi->event.data = event->data;
1134
1135
1136
1137
1138
1139 if (EP_IS_LINKED(&epi->llink)) {
1140
1141
1142
1143
1144
1145 if (revents & event->events) {
1146 if (!EP_IS_LINKED(&epi->rdllink)) {
1147 list_add_tail(&epi->rdllink, &ep->rdllist);
1148
1149
1150 if (waitqueue_active(&ep->wq))
1151 wake_up(&ep->wq);
1152 if (waitqueue_active(&ep->poll_wait))
1153 pwake++;
1154 }
1155 } else if (EP_IS_LINKED(&epi->rdllink))
1156 EP_LIST_DEL(&epi->rdllink);
1157 }
1158
1159 write_unlock_irqrestore(&ep->lock, flags);
1160
1161
1162 if (pwake)
1163 ep_poll_safewake(&psw, &ep->poll_wait);
1164
1165 return 0;
1166}
1167
1168
1169
1170
1171
1172
1173
1174static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
1175{
1176 int nwait;
1177 struct list_head *lsthead = &epi->pwqlist;
1178 struct eppoll_entry *pwq;
1179
1180
1181 nwait = xchg(&epi->nwait, 0);
1182
1183 if (nwait) {
1184 while (!list_empty(lsthead)) {
1185 pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
1186
1187 EP_LIST_DEL(&pwq->llink);
1188 remove_wait_queue(pwq->whead, &pwq->wait);
1189 PWQ_MEM_FREE(pwq);
1190 }
1191 }
1192}
1193
1194
1195
1196
1197
1198
1199static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
1200{
1201 int error;
1202
1203
1204
1205
1206
1207 error = -ENOENT;
1208 if (!EP_IS_LINKED(&epi->llink))
1209 goto eexit_1;
1210
1211
1212
1213
1214
1215
1216 epi->event.events = 0;
1217
1218
1219
1220
1221
1222
1223 EP_LIST_DEL(&epi->llink);
1224
1225
1226
1227
1228
1229 if (EP_IS_LINKED(&epi->rdllink))
1230 EP_LIST_DEL(&epi->rdllink);
1231
1232 error = 0;
1233eexit_1:
1234
1235 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
1236 current, ep, epi->file, error));
1237
1238 return error;
1239}
1240
1241
1242
1243
1244
1245
1246static int ep_remove(struct eventpoll *ep, struct epitem *epi)
1247{
1248 int error;
1249 unsigned long flags;
1250 struct file *file = epi->file;
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260 ep_unregister_pollwait(ep, epi);
1261
1262
1263 spin_lock(&file->f_ep_lock);
1264 if (EP_IS_LINKED(&epi->fllink))
1265 EP_LIST_DEL(&epi->fllink);
1266 spin_unlock(&file->f_ep_lock);
1267
1268
1269 write_lock_irqsave(&ep->lock, flags);
1270
1271
1272 error = ep_unlink(ep, epi);
1273
1274 write_unlock_irqrestore(&ep->lock, flags);
1275
1276 if (error)
1277 goto eexit_1;
1278
1279
1280 ep_release_epitem(epi);
1281
1282 error = 0;
1283eexit_1:
1284 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
1285 current, ep, file, error));
1286
1287 return error;
1288}
1289
1290
1291
1292
1293
1294
1295
1296static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync)
1297{
1298 int pwake = 0;
1299 unsigned long flags;
1300 struct epitem *epi = EP_ITEM_FROM_WAIT(wait);
1301 struct eventpoll *ep = epi->ep;
1302
1303 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
1304 current, epi->file, epi, ep));
1305
1306 write_lock_irqsave(&ep->lock, flags);
1307
1308
1309 if (EP_IS_LINKED(&epi->rdllink))
1310 goto is_linked;
1311
1312 list_add_tail(&epi->rdllink, &ep->rdllist);
1313
1314is_linked:
1315
1316
1317
1318
1319 if (waitqueue_active(&ep->wq))
1320 wake_up(&ep->wq);
1321 if (waitqueue_active(&ep->poll_wait))
1322 pwake++;
1323
1324 write_unlock_irqrestore(&ep->lock, flags);
1325
1326
1327 if (pwake)
1328 ep_poll_safewake(&psw, &ep->poll_wait);
1329
1330 return 1;
1331}
1332
1333
1334static int ep_eventpoll_close(struct inode *inode, struct file *file)
1335{
1336 struct eventpoll *ep = file->private_data;
1337
1338 if (ep) {
1339 ep_free(ep);
1340 kfree(ep);
1341 }
1342
1343 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
1344 return 0;
1345}
1346
1347
1348static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
1349{
1350 unsigned int pollflags = 0;
1351 unsigned long flags;
1352 struct eventpoll *ep = file->private_data;
1353
1354
1355 poll_wait(file, &ep->poll_wait, wait);
1356
1357
1358 read_lock_irqsave(&ep->lock, flags);
1359 if (!list_empty(&ep->rdllist))
1360 pollflags = POLLIN | POLLRDNORM;
1361 read_unlock_irqrestore(&ep->lock, flags);
1362
1363 return pollflags;
1364}
1365
1366
1367
1368
1369
1370
1371
1372static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
1373{
1374 int nepi;
1375 unsigned long flags;
1376 struct list_head *lsthead = &ep->rdllist, *lnk;
1377 struct epitem *epi;
1378
1379 write_lock_irqsave(&ep->lock, flags);
1380
1381 for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
1382 epi = list_entry(lnk, struct epitem, rdllink);
1383
1384 lnk = lnk->next;
1385
1386
1387 if (!EP_IS_LINKED(&epi->txlink)) {
1388
1389
1390
1391
1392
1393 epi->revents = epi->event.events;
1394
1395
1396 list_add(&epi->txlink, txlist);
1397 nepi++;
1398
1399
1400
1401
1402 EP_LIST_DEL(&epi->rdllink);
1403 }
1404 }
1405
1406 write_unlock_irqrestore(&ep->lock, flags);
1407
1408 return nepi;
1409}
1410
1411
1412
1413
1414
1415
1416
1417static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
1418 struct epoll_event __user *events)
1419{
1420 int eventcnt = 0, eventbuf = 0;
1421 unsigned int revents;
1422 struct list_head *lnk;
1423 struct epitem *epi;
1424 struct epoll_event event[EP_MAX_BUF_EVENTS];
1425
1426
1427
1428
1429
1430
1431
1432 list_for_each(lnk, txlist) {
1433 epi = list_entry(lnk, struct epitem, txlink);
1434
1435
1436
1437
1438
1439
1440 revents = epi->file->f_op->poll(epi->file, NULL);
1441
1442
1443
1444
1445
1446
1447 epi->revents = revents & epi->event.events;
1448
1449 if (epi->revents) {
1450 event[eventbuf] = epi->event;
1451 event[eventbuf].events &= revents;
1452 eventbuf++;
1453 if (eventbuf == EP_MAX_BUF_EVENTS) {
1454 if (__copy_to_user(&events[eventcnt], event,
1455 eventbuf * sizeof(struct epoll_event)))
1456 return -EFAULT;
1457 eventcnt += eventbuf;
1458 eventbuf = 0;
1459 }
1460 }
1461 }
1462
1463 if (eventbuf) {
1464 if (__copy_to_user(&events[eventcnt], event,
1465 eventbuf * sizeof(struct epoll_event)))
1466 return -EFAULT;
1467 eventcnt += eventbuf;
1468 }
1469
1470 return eventcnt;
1471}
1472
1473
1474
1475
1476
1477
1478
1479
1480static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
1481{
1482 int ricnt = 0, pwake = 0;
1483 unsigned long flags;
1484 struct epitem *epi;
1485
1486 write_lock_irqsave(&ep->lock, flags);
1487
1488 while (!list_empty(txlist)) {
1489 epi = list_entry(txlist->next, struct epitem, txlink);
1490
1491
1492 EP_LIST_DEL(&epi->txlink);
1493
1494
1495
1496
1497
1498
1499
1500
1501 if (EP_IS_LINKED(&epi->llink) && !(epi->event.events & EPOLLET) &&
1502 (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) {
1503 list_add_tail(&epi->rdllink, &ep->rdllist);
1504 ricnt++;
1505 }
1506 }
1507
1508 if (ricnt) {
1509
1510
1511
1512
1513 if (waitqueue_active(&ep->wq))
1514 wake_up(&ep->wq);
1515 if (waitqueue_active(&ep->poll_wait))
1516 pwake++;
1517 }
1518
1519 write_unlock_irqrestore(&ep->lock, flags);
1520
1521
1522 if (pwake)
1523 ep_poll_safewake(&psw, &ep->poll_wait);
1524}
1525
1526
1527
1528
1529
1530static int ep_events_transfer(struct eventpoll *ep,
1531 struct epoll_event __user *events, int maxevents)
1532{
1533 int eventcnt = 0;
1534 struct list_head txlist;
1535
1536 INIT_LIST_HEAD(&txlist);
1537
1538
1539
1540
1541
1542 down_read(&ep->sem);
1543
1544
1545 if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
1546
1547 eventcnt = ep_send_events(ep, &txlist, events);
1548
1549
1550 ep_reinject_items(ep, &txlist);
1551 }
1552
1553 up_read(&ep->sem);
1554
1555 return eventcnt;
1556}
1557
1558
1559static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1560 int maxevents, long timeout)
1561{
1562 int res, eavail;
1563 unsigned long flags;
1564 long jtimeout;
1565 wait_queue_t wait;
1566
1567
1568
1569
1570
1571
1572 jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
1573 MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;
1574
1575retry:
1576 write_lock_irqsave(&ep->lock, flags);
1577
1578 res = 0;
1579 if (list_empty(&ep->rdllist)) {
1580
1581
1582
1583
1584
1585 init_waitqueue_entry(&wait, current);
1586 add_wait_queue(&ep->wq, &wait);
1587
1588 for (;;) {
1589
1590
1591
1592
1593
1594 set_current_state(TASK_INTERRUPTIBLE);
1595 if (!list_empty(&ep->rdllist) || !jtimeout)
1596 break;
1597 if (signal_pending(current)) {
1598 res = -EINTR;
1599 break;
1600 }
1601
1602 write_unlock_irqrestore(&ep->lock, flags);
1603 jtimeout = schedule_timeout(jtimeout);
1604 write_lock_irqsave(&ep->lock, flags);
1605 }
1606 remove_wait_queue(&ep->wq, &wait);
1607
1608 set_current_state(TASK_RUNNING);
1609 }
1610
1611
1612 eavail = !list_empty(&ep->rdllist);
1613
1614 write_unlock_irqrestore(&ep->lock, flags);
1615
1616
1617
1618
1619
1620
1621 if (!res && eavail &&
1622 !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
1623 goto retry;
1624
1625 return res;
1626}
1627
1628
/* ->d_delete() for eventpollfs dentries: unconditionally returns 1. */
static int eventpollfs_delete_dentry(struct dentry *dentry)
{
	const int verdict = 1;

	return verdict;
}
1634
1635
1636static struct inode *ep_eventpoll_inode(void)
1637{
1638 int error = -ENOMEM;
1639 struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
1640
1641 if (!inode)
1642 goto eexit_1;
1643
1644 inode->i_fop = &eventpoll_fops;
1645
1646
1647
1648
1649
1650
1651
1652 inode->i_state = I_DIRTY;
1653 inode->i_mode = S_IRUSR | S_IWUSR;
1654 inode->i_uid = current->fsuid;
1655 inode->i_gid = current->fsgid;
1656 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1657 inode->i_blksize = PAGE_SIZE;
1658 return inode;
1659
1660eexit_1:
1661 return ERR_PTR(error);
1662}
1663
1664
1665static struct super_block *
1666eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
1667 const char *dev_name, void *data)
1668{
1669 return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
1670}
1671
1672
1673static int __init eventpoll_init(void)
1674{
1675 int error;
1676
1677 init_MUTEX(&epsem);
1678
1679
1680 ep_poll_safewake_init(&psw);
1681
1682
1683 error = -ENOMEM;
1684 epi_cache = kmem_cache_create("eventpoll_epi",
1685 sizeof(struct epitem),
1686 0,
1687 SLAB_HWCACHE_ALIGN | EPI_SLAB_DEBUG, NULL, NULL);
1688 if (!epi_cache)
1689 goto eexit_1;
1690
1691
1692 error = -ENOMEM;
1693 pwq_cache = kmem_cache_create("eventpoll_pwq",
1694 sizeof(struct eppoll_entry),
1695 0,
1696 EPI_SLAB_DEBUG, NULL, NULL);
1697 if (!pwq_cache)
1698 goto eexit_2;
1699
1700
1701
1702
1703
1704 error = register_filesystem(&eventpoll_fs_type);
1705 if (error)
1706 goto eexit_3;
1707
1708
1709 eventpoll_mnt = kern_mount(&eventpoll_fs_type);
1710 error = PTR_ERR(eventpoll_mnt);
1711 if (IS_ERR(eventpoll_mnt))
1712 goto eexit_4;
1713
1714 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n", current));
1715
1716 return 0;
1717
1718eexit_4:
1719 unregister_filesystem(&eventpoll_fs_type);
1720eexit_3:
1721 kmem_cache_destroy(pwq_cache);
1722eexit_2:
1723 kmem_cache_destroy(epi_cache);
1724eexit_1:
1725
1726 return error;
1727}
1728
1729
1730static void __exit eventpoll_exit(void)
1731{
1732
1733 unregister_filesystem(&eventpoll_fs_type);
1734 mntput(eventpoll_mnt);
1735 kmem_cache_destroy(pwq_cache);
1736 kmem_cache_destroy(epi_cache);
1737}
1738
1739module_init(eventpoll_init);
1740module_exit(eventpoll_exit);
1741
1742MODULE_LICENSE("GPL");
1743
1744