linux/fs/eventpoll.c
   1/*
   2 *  fs/eventpoll.c (Efficient event retrieval implementation)
   3 *  Copyright (C) 2001,...,2009  Davide Libenzi
   4 *
   5 *  This program is free software; you can redistribute it and/or modify
   6 *  it under the terms of the GNU General Public License as published by
   7 *  the Free Software Foundation; either version 2 of the License, or
   8 *  (at your option) any later version.
   9 *
  10 *  Davide Libenzi <davidel@xmailserver.org>
  11 *
  12 */
  13
  14#include <linux/init.h>
  15#include <linux/kernel.h>
  16#include <linux/sched.h>
  17#include <linux/fs.h>
  18#include <linux/file.h>
  19#include <linux/signal.h>
  20#include <linux/errno.h>
  21#include <linux/mm.h>
  22#include <linux/slab.h>
  23#include <linux/poll.h>
  24#include <linux/string.h>
  25#include <linux/list.h>
  26#include <linux/hash.h>
  27#include <linux/spinlock.h>
  28#include <linux/syscalls.h>
  29#include <linux/rbtree.h>
  30#include <linux/wait.h>
  31#include <linux/eventpoll.h>
  32#include <linux/mount.h>
  33#include <linux/bitops.h>
  34#include <linux/mutex.h>
  35#include <linux/anon_inodes.h>
  36#include <linux/device.h>
  37#include <asm/uaccess.h>
  38#include <asm/io.h>
  39#include <asm/mman.h>
  40#include <linux/atomic.h>
  41
  42/*
  43 * LOCKING:
   44 * There are three levels of locking required by epoll:
  45 *
  46 * 1) epmutex (mutex)
  47 * 2) ep->mtx (mutex)
  48 * 3) ep->lock (spinlock)
  49 *
  50 * The acquire order is the one listed above, from 1 to 3.
  51 * We need a spinlock (ep->lock) because we manipulate objects
  52 * from inside the poll callback, that might be triggered from
  53 * a wake_up() that in turn might be called from IRQ context.
  54 * So we can't sleep inside the poll callback and hence we need
  55 * a spinlock. During the event transfer loop (from kernel to
   56 * user space) we could end up sleeping due to a copy_to_user(), so
  57 * we need a lock that will allow us to sleep. This lock is a
  58 * mutex (ep->mtx). It is acquired during the event transfer loop,
  59 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
  60 * Then we also need a global mutex to serialize eventpoll_release_file()
  61 * and ep_free().
  62 * This mutex is acquired by ep_free() during the epoll file
  63 * cleanup path and it is also acquired by eventpoll_release_file()
  64 * if a file has been pushed inside an epoll set and it is then
  65 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
  66 * It is also acquired when inserting an epoll fd onto another epoll
  67 * fd. We do this so that we walk the epoll tree and ensure that this
  68 * insertion does not create a cycle of epoll file descriptors, which
  69 * could lead to deadlock. We need a global mutex to prevent two
  70 * simultaneous inserts (A into B and B into A) from racing and
   71 * constructing a cycle without either insert noticing that it is
   72 * about to do so.
  73 * It is necessary to acquire multiple "ep->mtx"es at once in the
  74 * case when one epoll fd is added to another. In this case, we
  75 * always acquire the locks in the order of nesting (i.e. after
  76 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
  77 * before e2->mtx). Since we disallow cycles of epoll file
  78 * descriptors, this ensures that the mutexes are well-ordered. In
  79 * order to communicate this nesting to lockdep, when walking a tree
  80 * of epoll file descriptors, we use the current recursion depth as
  81 * the lockdep subkey.
  82 * It is possible to drop the "ep->mtx" and to use the global
  83 * mutex "epmutex" (together with "ep->lock") to have it working,
  84 * but having "ep->mtx" will make the interface more scalable.
  85 * Events that require holding "epmutex" are very rare, while for
  86 * normal operations the epoll private "ep->mtx" will guarantee
  87 * a better scalability.
  88 */
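/*
 * A minimal illustrative sketch of the acquire order described above, for a
 * hypothetical path that needed all three locks at once (no single real code
 * path necessarily takes all three together; this only shows the ordering
 * that must be respected whenever more than one of them is held):
 *
 *   mutex_lock(&epmutex);                    (1: global mutex)
 *   mutex_lock(&ep->mtx);                    (2: per-epoll mutex, may sleep)
 *   spin_lock_irqsave(&ep->lock, flags);     (3: per-epoll spinlock)
 *   ...
 *   spin_unlock_irqrestore(&ep->lock, flags);
 *   mutex_unlock(&ep->mtx);
 *   mutex_unlock(&epmutex);
 */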
  89
  90/* Epoll private bits inside the event mask */
  91#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
  92
  93/* Maximum number of nesting allowed inside epoll sets */
  94#define EP_MAX_NESTS 4
  95
  96#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
  97
  98#define EP_UNACTIVE_PTR ((void *) -1L)
  99
 100#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
 101
 102struct epoll_filefd {
 103        struct file *file;
 104        int fd;
 105};
 106
 107/*
  108 * Structure used to track possible nested calls, to detect too-deep
  109 * recursion and loop cycles.
 110 */
 111struct nested_call_node {
 112        struct list_head llink;
 113        void *cookie;
 114        void *ctx;
 115};
 116
 117/*
  118 * This structure is used as a collector for nested calls, to check for
  119 * maximum recursion depth and loop cycles.
 120 */
 121struct nested_calls {
 122        struct list_head tasks_call_list;
 123        spinlock_t lock;
 124};
 125
 126/*
 127 * Each file descriptor added to the eventpoll interface will
 128 * have an entry of this type linked to the "rbr" RB tree.
 129 */
 130struct epitem {
 131        /* RB tree node used to link this structure to the eventpoll RB tree */
 132        struct rb_node rbn;
 133
 134        /* List header used to link this structure to the eventpoll ready list */
 135        struct list_head rdllink;
 136
 137        /*
  138         * Works together with "struct eventpoll"->ovflist in keeping the
  139         * singly linked chain of items.
 140         */
 141        struct epitem *next;
 142
 143        /* The file descriptor information this item refers to */
 144        struct epoll_filefd ffd;
 145
  146        /* Number of active wait queues attached to poll operations */
 147        int nwait;
 148
 149        /* List containing poll wait queues */
 150        struct list_head pwqlist;
 151
 152        /* The "container" of this item */
 153        struct eventpoll *ep;
 154
 155        /* List header used to link this item to the "struct file" items list */
 156        struct list_head fllink;
 157
 158        /* wakeup_source used when EPOLLWAKEUP is set */
 159        struct wakeup_source *ws;
 160
  161        /* The structure that describes the interested events and the source fd */
 162        struct epoll_event event;
 163};
 164
 165/*
 166 * This structure is stored inside the "private_data" member of the file
 167 * structure and represents the main data structure for the eventpoll
 168 * interface.
 169 */
 170struct eventpoll {
 171        /* Protect the access to this structure */
 172        spinlock_t lock;
 173
 174        /*
 175         * This mutex is used to ensure that files are not removed
 176         * while epoll is using them. This is held during the event
 177         * collection loop, the file cleanup path, the epoll file exit
 178         * code and the ctl operations.
 179         */
 180        struct mutex mtx;
 181
 182        /* Wait queue used by sys_epoll_wait() */
 183        wait_queue_head_t wq;
 184
 185        /* Wait queue used by file->poll() */
 186        wait_queue_head_t poll_wait;
 187
 188        /* List of ready file descriptors */
 189        struct list_head rdllist;
 190
 191        /* RB tree root used to store monitored fd structs */
 192        struct rb_root rbr;
 193
 194        /*
  195         * This is a singly linked list that chains all the "struct epitem" whose
  196         * events got triggered while transferring ready events to userspace
  197         * without holding ->lock.
 198         */
 199        struct epitem *ovflist;
 200
 201        /* wakeup_source used when ep_scan_ready_list is running */
 202        struct wakeup_source *ws;
 203
 204        /* The user that created the eventpoll descriptor */
 205        struct user_struct *user;
 206
 207        struct file *file;
 208
 209        /* used to optimize loop detection check */
 210        int visited;
 211        struct list_head visited_list_link;
 212};
 213
 214/* Wait structure used by the poll hooks */
 215struct eppoll_entry {
 216        /* List header used to link this structure to the "struct epitem" */
 217        struct list_head llink;
 218
 219        /* The "base" pointer is set to the container "struct epitem" */
 220        struct epitem *base;
 221
 222        /*
 223         * Wait queue item that will be linked to the target file wait
 224         * queue head.
 225         */
 226        wait_queue_t wait;
 227
  228        /* The wait queue head to which the "wait" wait queue item is linked */
 229        wait_queue_head_t *whead;
 230};
 231
 232/* Wrapper struct used by poll queueing */
 233struct ep_pqueue {
 234        poll_table pt;
 235        struct epitem *epi;
 236};
 237
 238/* Used by the ep_send_events() function as callback private data */
 239struct ep_send_events_data {
 240        int maxevents;
 241        struct epoll_event __user *events;
 242};
 243
 244/*
 245 * Configuration options available inside /proc/sys/fs/epoll/
 246 */
 247/* Maximum number of epoll watched descriptors, per user */
 248static long max_user_watches __read_mostly;
 249
 250/*
 251 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 252 */
 253static DEFINE_MUTEX(epmutex);
 254
 255/* Used to check for epoll file descriptor inclusion loops */
 256static struct nested_calls poll_loop_ncalls;
 257
 258/* Used for safe wake up implementation */
 259static struct nested_calls poll_safewake_ncalls;
 260
 261/* Used to call file's f_op->poll() under the nested calls boundaries */
 262static struct nested_calls poll_readywalk_ncalls;
 263
 264/* Slab cache used to allocate "struct epitem" */
 265static struct kmem_cache *epi_cache __read_mostly;
 266
 267/* Slab cache used to allocate "struct eppoll_entry" */
 268static struct kmem_cache *pwq_cache __read_mostly;
 269
 270/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
 271static LIST_HEAD(visited_list);
 272
 273/*
 274 * List of files with newly added links, where we may need to limit the number
 275 * of emanating paths. Protected by the epmutex.
 276 */
 277static LIST_HEAD(tfile_check_list);
 278
 279#ifdef CONFIG_SYSCTL
 280
 281#include <linux/sysctl.h>
 282
 283static long zero;
 284static long long_max = LONG_MAX;
 285
 286ctl_table epoll_table[] = {
 287        {
 288                .procname       = "max_user_watches",
 289                .data           = &max_user_watches,
 290                .maxlen         = sizeof(max_user_watches),
 291                .mode           = 0644,
 292                .proc_handler   = proc_doulongvec_minmax,
 293                .extra1         = &zero,
 294                .extra2         = &long_max,
 295        },
 296        { }
 297};
 298#endif /* CONFIG_SYSCTL */
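/*
 * Illustrative usage of the knob above (a sketch, not a new interface): the
 * table is exposed under /proc/sys/fs/epoll/, so the per-user watch limit
 * can be inspected or raised from userspace, e.g.:
 *
 *   $ cat /proc/sys/fs/epoll/max_user_watches
 *   # echo 200000 > /proc/sys/fs/epoll/max_user_watches
 *
 * The default value is computed at boot time from the available low memory
 * (see eventpoll_init() further down in this file), so it varies between
 * systems.
 */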
 299
 300static const struct file_operations eventpoll_fops;
 301
 302static inline int is_file_epoll(struct file *f)
 303{
 304        return f->f_op == &eventpoll_fops;
 305}
 306
 307/* Setup the structure that is used as key for the RB tree */
 308static inline void ep_set_ffd(struct epoll_filefd *ffd,
 309                              struct file *file, int fd)
 310{
 311        ffd->file = file;
 312        ffd->fd = fd;
 313}
 314
 315/* Compare RB tree keys */
 316static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 317                             struct epoll_filefd *p2)
 318{
 319        return (p1->file > p2->file ? +1:
 320                (p1->file < p2->file ? -1 : p1->fd - p2->fd));
 321}
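/*
 * For illustration: the key is the (file, fd) pair, ordered by the struct
 * file pointer first and by fd only as a tie breaker. So if the same file
 * is registered through two different descriptors (say fd 7 and fd 9, both
 * dup()s of one open file), the two epitems compare unequal and can coexist
 * in the RB tree, while epitems for distinct files are ordered purely by
 * their "struct file *" addresses.
 */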
 322
 323/* Tells us if the item is currently linked */
 324static inline int ep_is_linked(struct list_head *p)
 325{
 326        return !list_empty(p);
 327}
 328
 329static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
 330{
 331        return container_of(p, struct eppoll_entry, wait);
 332}
 333
 334/* Get the "struct epitem" from a wait queue pointer */
 335static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
 336{
 337        return container_of(p, struct eppoll_entry, wait)->base;
 338}
 339
 340/* Get the "struct epitem" from an epoll queue wrapper */
 341static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 342{
 343        return container_of(p, struct ep_pqueue, pt)->epi;
 344}
 345
 346/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 347static inline int ep_op_has_event(int op)
 348{
 349        return op != EPOLL_CTL_DEL;
 350}
 351
 352/* Initialize the poll safe wake up structure */
 353static void ep_nested_calls_init(struct nested_calls *ncalls)
 354{
 355        INIT_LIST_HEAD(&ncalls->tasks_call_list);
 356        spin_lock_init(&ncalls->lock);
 357}
 358
 359/**
 360 * ep_events_available - Checks if ready events might be available.
 361 *
 362 * @ep: Pointer to the eventpoll context.
 363 *
  364 * Returns: Returns a value different from zero if ready events are available,
 365 *          or zero otherwise.
 366 */
 367static inline int ep_events_available(struct eventpoll *ep)
 368{
 369        return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
 370}
 371
 372/**
 373 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 374 *                  that the recursion limit is not exceeded, and that
  375 *                  the same nested call (identified by its cookie) is
  376 *                  not re-entered.
 377 *
 378 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 379 * @max_nests: Maximum number of allowed nesting calls.
 380 * @nproc: Nested call core function pointer.
 381 * @priv: Opaque data to be passed to the @nproc callback.
 382 * @cookie: Cookie to be used to identify this nested call.
 383 * @ctx: This instance context.
 384 *
 385 * Returns: Returns the code returned by the @nproc callback, or -1 if
 386 *          the maximum recursion limit has been exceeded.
 387 */
 388static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 389                          int (*nproc)(void *, void *, int), void *priv,
 390                          void *cookie, void *ctx)
 391{
 392        int error, call_nests = 0;
 393        unsigned long flags;
 394        struct list_head *lsthead = &ncalls->tasks_call_list;
 395        struct nested_call_node *tncur;
 396        struct nested_call_node tnode;
 397
 398        spin_lock_irqsave(&ncalls->lock, flags);
 399
 400        /*
 401         * Try to see if the current task is already inside this wakeup call.
 402         * We use a list here, since the population inside this set is always
 403         * very much limited.
 404         */
 405        list_for_each_entry(tncur, lsthead, llink) {
 406                if (tncur->ctx == ctx &&
 407                    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 408                        /*
  409                         * Oops ... loop detected or maximum nest level reached.
 410                         * We abort this wake by breaking the cycle itself.
 411                         */
 412                        error = -1;
 413                        goto out_unlock;
 414                }
 415        }
 416
 417        /* Add the current task and cookie to the list */
 418        tnode.ctx = ctx;
 419        tnode.cookie = cookie;
 420        list_add(&tnode.llink, lsthead);
 421
 422        spin_unlock_irqrestore(&ncalls->lock, flags);
 423
 424        /* Call the nested function */
 425        error = (*nproc)(priv, cookie, call_nests);
 426
 427        /* Remove the current task from the list */
 428        spin_lock_irqsave(&ncalls->lock, flags);
 429        list_del(&tnode.llink);
 430out_unlock:
 431        spin_unlock_irqrestore(&ncalls->lock, flags);
 432
 433        return error;
 434}
 435
 436/*
  437 * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
  438 * the use of wait queues by epoll is done in a very controlled
  439 * manner. Wake ups can nest inside each other, but are never done
 440 * with the same locking. For example:
 441 *
 442 *   dfd = socket(...);
 443 *   efd1 = epoll_create();
 444 *   efd2 = epoll_create();
 445 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 446 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 447 *
  448 * When a packet arrives at the device underneath "dfd", the net code will
 449 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 450 * callback wakeup entry on that queue, and the wake_up() performed by the
 451 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 452 * (efd1) notices that it may have some event ready, so it needs to wake up
 453 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
  454 * that ends up in another wake_up(), after having checked the
  455 * recursion constraints. That is, no more than EP_MAX_NESTS nesting
  456 * levels, to avoid stack blasting.
 457 *
 458 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 459 * this special case of epoll.
 460 */
 461#ifdef CONFIG_DEBUG_LOCK_ALLOC
 462static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
 463                                     unsigned long events, int subclass)
 464{
 465        unsigned long flags;
 466
 467        spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
 468        wake_up_locked_poll(wqueue, events);
 469        spin_unlock_irqrestore(&wqueue->lock, flags);
 470}
 471#else
 472static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
 473                                     unsigned long events, int subclass)
 474{
 475        wake_up_poll(wqueue, events);
 476}
 477#endif
 478
 479static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
 480{
 481        ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
 482                          1 + call_nests);
 483        return 0;
 484}
 485
 486/*
 487 * Perform a safe wake up of the poll wait list. The problem is that
 488 * with the new callback'd wake up system, it is possible that the
 489 * poll callback is reentered from inside the call to wake_up() done
 490 * on the poll wait queue head. The rule is that we cannot reenter the
 491 * wake up code from the same task more than EP_MAX_NESTS times,
  492 * and we cannot reenter the same wait queue head at all. This
  493 * allows a hierarchy of epoll file descriptors no more than
  494 * EP_MAX_NESTS deep.
 495 */
 496static void ep_poll_safewake(wait_queue_head_t *wq)
 497{
 498        int this_cpu = get_cpu();
 499
 500        ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
 501                       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
 502
 503        put_cpu();
 504}
 505
 506static void ep_remove_wait_queue(struct eppoll_entry *pwq)
 507{
 508        wait_queue_head_t *whead;
 509
 510        rcu_read_lock();
 511        /* If it is cleared by POLLFREE, it should be rcu-safe */
 512        whead = rcu_dereference(pwq->whead);
 513        if (whead)
 514                remove_wait_queue(whead, &pwq->wait);
 515        rcu_read_unlock();
 516}
 517
 518/*
 519 * This function unregisters poll callbacks from the associated file
 520 * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
 521 * ep_free).
 522 */
 523static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 524{
 525        struct list_head *lsthead = &epi->pwqlist;
 526        struct eppoll_entry *pwq;
 527
 528        while (!list_empty(lsthead)) {
 529                pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
 530
 531                list_del(&pwq->llink);
 532                ep_remove_wait_queue(pwq);
 533                kmem_cache_free(pwq_cache, pwq);
 534        }
 535}
 536
 537/**
  538 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
  539 *                      for the scan code to call f_op->poll(). Also allows for
 540 *                      O(NumReady) performance.
 541 *
 542 * @ep: Pointer to the epoll private data structure.
 543 * @sproc: Pointer to the scan callback.
 544 * @priv: Private opaque data passed to the @sproc callback.
 545 * @depth: The current depth of recursive f_op->poll calls.
 546 *
 547 * Returns: The same integer error code returned by the @sproc callback.
 548 */
 549static int ep_scan_ready_list(struct eventpoll *ep,
 550                              int (*sproc)(struct eventpoll *,
 551                                           struct list_head *, void *),
 552                              void *priv,
 553                              int depth)
 554{
 555        int error, pwake = 0;
 556        unsigned long flags;
 557        struct epitem *epi, *nepi;
 558        LIST_HEAD(txlist);
 559
 560        /*
 561         * We need to lock this because we could be hit by
 562         * eventpoll_release_file() and epoll_ctl().
 563         */
 564        mutex_lock_nested(&ep->mtx, depth);
 565
 566        /*
 567         * Steal the ready list, and re-init the original one to the
 568         * empty list. Also, set ep->ovflist to NULL so that events
  569         * happening while looping without locks are not lost. We cannot
  570         * have the poll callback queue directly on ep->rdllist,
 571         * because we want the "sproc" callback to be able to do it
 572         * in a lockless way.
 573         */
 574        spin_lock_irqsave(&ep->lock, flags);
 575        list_splice_init(&ep->rdllist, &txlist);
 576        ep->ovflist = NULL;
 577        spin_unlock_irqrestore(&ep->lock, flags);
 578
 579        /*
 580         * Now call the callback function.
 581         */
 582        error = (*sproc)(ep, &txlist, priv);
 583
 584        spin_lock_irqsave(&ep->lock, flags);
 585        /*
 586         * During the time we spent inside the "sproc" callback, some
 587         * other events might have been queued by the poll callback.
 588         * We re-insert them inside the main ready-list here.
 589         */
 590        for (nepi = ep->ovflist; (epi = nepi) != NULL;
 591             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
 592                /*
 593                 * We need to check if the item is already in the list.
 594                 * During the "sproc" callback execution time, items are
 595                 * queued into ->ovflist but the "txlist" might already
 596                 * contain them, and the list_splice() below takes care of them.
 597                 */
 598                if (!ep_is_linked(&epi->rdllink)) {
 599                        list_add_tail(&epi->rdllink, &ep->rdllist);
 600                        __pm_stay_awake(epi->ws);
 601                }
 602        }
 603        /*
 604         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
 605         * releasing the lock, events will be queued in the normal way inside
 606         * ep->rdllist.
 607         */
 608        ep->ovflist = EP_UNACTIVE_PTR;
 609
 610        /*
 611         * Quickly re-inject items left on "txlist".
 612         */
 613        list_splice(&txlist, &ep->rdllist);
 614        __pm_relax(ep->ws);
 615
 616        if (!list_empty(&ep->rdllist)) {
 617                /*
 618                 * Wake up (if active) both the eventpoll wait list and
 619                 * the ->poll() wait list (delayed after we release the lock).
 620                 */
 621                if (waitqueue_active(&ep->wq))
 622                        wake_up_locked(&ep->wq);
 623                if (waitqueue_active(&ep->poll_wait))
 624                        pwake++;
 625        }
 626        spin_unlock_irqrestore(&ep->lock, flags);
 627
 628        mutex_unlock(&ep->mtx);
 629
 630        /* We have to call this outside the lock */
 631        if (pwake)
 632                ep_poll_safewake(&ep->poll_wait);
 633
 634        return error;
 635}
 636
 637/*
 638 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 639 * all the associated resources. Must be called with "mtx" held.
 640 */
 641static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 642{
 643        unsigned long flags;
 644        struct file *file = epi->ffd.file;
 645
 646        /*
 647         * Removes poll wait queue hooks. We _have_ to do this without holding
  648         * the "ep->lock", otherwise a deadlock might occur. This is because of
  649         * the sequence of lock acquisition. Here we take "ep->lock" and then the
  650         * wait queue head lock when unregistering the wait queue. The wakeup
  651         * callback runs while holding the wait queue head lock and will call our
  652         * callback, which will try to take "ep->lock".
 653         */
 654        ep_unregister_pollwait(ep, epi);
 655
 656        /* Remove the current item from the list of epoll hooks */
 657        spin_lock(&file->f_lock);
 658        if (ep_is_linked(&epi->fllink))
 659                list_del_init(&epi->fllink);
 660        spin_unlock(&file->f_lock);
 661
 662        rb_erase(&epi->rbn, &ep->rbr);
 663
 664        spin_lock_irqsave(&ep->lock, flags);
 665        if (ep_is_linked(&epi->rdllink))
 666                list_del_init(&epi->rdllink);
 667        spin_unlock_irqrestore(&ep->lock, flags);
 668
 669        wakeup_source_unregister(epi->ws);
 670
 671        /* At this point it is safe to free the eventpoll item */
 672        kmem_cache_free(epi_cache, epi);
 673
 674        atomic_long_dec(&ep->user->epoll_watches);
 675
 676        return 0;
 677}
 678
 679static void ep_free(struct eventpoll *ep)
 680{
 681        struct rb_node *rbp;
 682        struct epitem *epi;
 683
  684        /* We need to release all tasks waiting on this file */
 685        if (waitqueue_active(&ep->poll_wait))
 686                ep_poll_safewake(&ep->poll_wait);
 687
 688        /*
 689         * We need to lock this because we could be hit by
 690         * eventpoll_release_file() while we're freeing the "struct eventpoll".
 691         * We do not need to hold "ep->mtx" here because the epoll file
 692         * is on the way to be removed and no one has references to it
 693         * anymore. The only hit might come from eventpoll_release_file() but
 694         * holding "epmutex" is sufficient here.
 695         */
 696        mutex_lock(&epmutex);
 697
 698        /*
 699         * Walks through the whole tree by unregistering poll callbacks.
 700         */
 701        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 702                epi = rb_entry(rbp, struct epitem, rbn);
 703
 704                ep_unregister_pollwait(ep, epi);
 705        }
 706
 707        /*
 708         * Walks through the whole tree by freeing each "struct epitem". At this
 709         * point we are sure no poll callbacks will be lingering around, and also by
 710         * holding "epmutex" we can be sure that no file cleanup code will hit
 711         * us during this operation. So we can avoid the lock on "ep->lock".
 712         */
 713        while ((rbp = rb_first(&ep->rbr)) != NULL) {
 714                epi = rb_entry(rbp, struct epitem, rbn);
 715                ep_remove(ep, epi);
 716        }
 717
 718        mutex_unlock(&epmutex);
 719        mutex_destroy(&ep->mtx);
 720        free_uid(ep->user);
 721        wakeup_source_unregister(ep->ws);
 722        kfree(ep);
 723}
 724
 725static int ep_eventpoll_release(struct inode *inode, struct file *file)
 726{
 727        struct eventpoll *ep = file->private_data;
 728
 729        if (ep)
 730                ep_free(ep);
 731
 732        return 0;
 733}
 734
 735static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 736                               void *priv)
 737{
 738        struct epitem *epi, *tmp;
 739        poll_table pt;
 740
 741        init_poll_funcptr(&pt, NULL);
 742        list_for_each_entry_safe(epi, tmp, head, rdllink) {
 743                pt._key = epi->event.events;
 744                if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 745                    epi->event.events)
 746                        return POLLIN | POLLRDNORM;
 747                else {
 748                        /*
 749                         * Item has been dropped into the ready list by the poll
  750                         * callback, but it's not actually ready, as far as the
  751                         * caller-requested events go. We can remove it here.
 752                         */
 753                        __pm_relax(epi->ws);
 754                        list_del_init(&epi->rdllink);
 755                }
 756        }
 757
 758        return 0;
 759}
 760
 761static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 762{
 763        return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
 764}
 765
 766static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 767{
 768        int pollflags;
 769        struct eventpoll *ep = file->private_data;
 770
 771        /* Insert inside our poll wait queue */
 772        poll_wait(file, &ep->poll_wait, wait);
 773
 774        /*
 775         * Proceed to find out if wanted events are really available inside
  776         * the ready list. This needs to be done under ep_call_nested()
 777         * supervision, since the call to f_op->poll() done on listed files
 778         * could re-enter here.
 779         */
 780        pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
 781                                   ep_poll_readyevents_proc, ep, ep, current);
 782
 783        return pollflags != -1 ? pollflags : 0;
 784}
 785
 786/* File callbacks that implement the eventpoll file behaviour */
 787static const struct file_operations eventpoll_fops = {
 788        .release        = ep_eventpoll_release,
 789        .poll           = ep_eventpoll_poll,
 790        .llseek         = noop_llseek,
 791};
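/*
 * A minimal illustrative sketch of how the file implemented by the operations
 * above is driven from userspace (assumes <sys/epoll.h>, an already-open
 * socket descriptor "sock" and a hypothetical handle() helper):
 *
 *   int epfd = epoll_create(1);
 *   struct epoll_event ev = { .events = EPOLLIN, .data.fd = sock };
 *   struct epoll_event ready[64];
 *   int i, n;
 *
 *   epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev);
 *   n = epoll_wait(epfd, ready, 64, -1);       (-1: block until ready)
 *   for (i = 0; i < n; i++)
 *           handle(ready[i].data.fd);
 *
 * Because eventpoll_fops provides a ->poll method, the epoll fd itself can in
 * turn be added to another epoll set; that nested case is what
 * ep_eventpoll_poll() above has to handle.
 */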
 792
 793/*
 794 * This is called from eventpoll_release() to unlink files from the eventpoll
  795 * interface. We need this facility to correctly clean up files that are
  796 * closed without being removed from the eventpoll interface.
 797 */
 798void eventpoll_release_file(struct file *file)
 799{
 800        struct list_head *lsthead = &file->f_ep_links;
 801        struct eventpoll *ep;
 802        struct epitem *epi;
 803
 804        /*
  805         * We don't want to take "file->f_lock" here because it is not
  806         * necessary: we're in the "struct file" cleanup path, and this
  807         * means that no one is using this file anymore.
  808         * So, for example, epoll_ctl() cannot hit here, since if we reach this
  809         * point the file counter has already gone to zero and fget() would fail.
  810         * The only hit might come from ep_free(), but holding the mutex
  811         * will correctly serialize the operation. We do need to acquire
 812         * "ep->mtx" after "epmutex" because ep_remove() requires it when called
 813         * from anywhere but ep_free().
 814         *
 815         * Besides, ep_remove() acquires the lock, so we can't hold it here.
 816         */
 817        mutex_lock(&epmutex);
 818
 819        while (!list_empty(lsthead)) {
 820                epi = list_first_entry(lsthead, struct epitem, fllink);
 821
 822                ep = epi->ep;
 823                list_del_init(&epi->fllink);
 824                mutex_lock_nested(&ep->mtx, 0);
 825                ep_remove(ep, epi);
 826                mutex_unlock(&ep->mtx);
 827        }
 828
 829        mutex_unlock(&epmutex);
 830}
 831
 832static int ep_alloc(struct eventpoll **pep)
 833{
 834        int error;
 835        struct user_struct *user;
 836        struct eventpoll *ep;
 837
 838        user = get_current_user();
 839        error = -ENOMEM;
 840        ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 841        if (unlikely(!ep))
 842                goto free_uid;
 843
 844        spin_lock_init(&ep->lock);
 845        mutex_init(&ep->mtx);
 846        init_waitqueue_head(&ep->wq);
 847        init_waitqueue_head(&ep->poll_wait);
 848        INIT_LIST_HEAD(&ep->rdllist);
 849        ep->rbr = RB_ROOT;
 850        ep->ovflist = EP_UNACTIVE_PTR;
 851        ep->user = user;
 852
 853        *pep = ep;
 854
 855        return 0;
 856
 857free_uid:
 858        free_uid(user);
 859        return error;
 860}
 861
 862/*
 863 * Search the file inside the eventpoll tree. The RB tree operations
 864 * are protected by the "mtx" mutex, and ep_find() must be called with
 865 * "mtx" held.
 866 */
 867static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 868{
 869        int kcmp;
 870        struct rb_node *rbp;
 871        struct epitem *epi, *epir = NULL;
 872        struct epoll_filefd ffd;
 873
 874        ep_set_ffd(&ffd, file, fd);
 875        for (rbp = ep->rbr.rb_node; rbp; ) {
 876                epi = rb_entry(rbp, struct epitem, rbn);
 877                kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
 878                if (kcmp > 0)
 879                        rbp = rbp->rb_right;
 880                else if (kcmp < 0)
 881                        rbp = rbp->rb_left;
 882                else {
 883                        epir = epi;
 884                        break;
 885                }
 886        }
 887
 888        return epir;
 889}
 890
 891/*
 892 * This is the callback that is passed to the wait queue wakeup
 893 * mechanism. It is called by the stored file descriptors when they
 894 * have events to report.
 895 */
 896static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
 897{
 898        int pwake = 0;
 899        unsigned long flags;
 900        struct epitem *epi = ep_item_from_wait(wait);
 901        struct eventpoll *ep = epi->ep;
 902
 903        if ((unsigned long)key & POLLFREE) {
 904                ep_pwq_from_wait(wait)->whead = NULL;
 905                /*
 906                 * whead = NULL above can race with ep_remove_wait_queue()
 907                 * which can do another remove_wait_queue() after us, so we
 908                 * can't use __remove_wait_queue(). whead->lock is held by
 909                 * the caller.
 910                 */
 911                list_del_init(&wait->task_list);
 912        }
 913
 914        spin_lock_irqsave(&ep->lock, flags);
 915
 916        /*
 917         * If the event mask does not contain any poll(2) event, we consider the
 918         * descriptor to be disabled. This condition is likely the effect of the
 919         * EPOLLONESHOT bit that disables the descriptor when an event is received,
  920         * until the next EPOLL_CTL_MOD is issued.
 921         */
 922        if (!(epi->event.events & ~EP_PRIVATE_BITS))
 923                goto out_unlock;
 924
 925        /*
 926         * Check the events coming with the callback. At this stage, not
 927         * every device reports the events in the "key" parameter of the
 928         * callback. We need to be able to handle both cases here, hence the
 929         * test for "key" != NULL before the event match test.
 930         */
 931        if (key && !((unsigned long) key & epi->event.events))
 932                goto out_unlock;
 933
 934        /*
 935         * If we are transferring events to userspace, we can hold no locks
 936         * (because we're accessing user memory, and because of linux f_op->poll()
 937         * semantics). All the events that happen during that period of time are
 938         * chained in ep->ovflist and requeued later on.
 939         */
 940        if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
 941                if (epi->next == EP_UNACTIVE_PTR) {
 942                        epi->next = ep->ovflist;
 943                        ep->ovflist = epi;
 944                        if (epi->ws) {
 945                                /*
 946                                 * Activate ep->ws since epi->ws may get
 947                                 * deactivated at any time.
 948                                 */
 949                                __pm_stay_awake(ep->ws);
 950                        }
 951
 952                }
 953                goto out_unlock;
 954        }
 955
 956        /* If this file is already in the ready list we exit soon */
 957        if (!ep_is_linked(&epi->rdllink)) {
 958                list_add_tail(&epi->rdllink, &ep->rdllist);
 959                __pm_stay_awake(epi->ws);
 960        }
 961
 962        /*
 963         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 964         * wait list.
 965         */
 966        if (waitqueue_active(&ep->wq))
 967                wake_up_locked(&ep->wq);
 968        if (waitqueue_active(&ep->poll_wait))
 969                pwake++;
 970
 971out_unlock:
 972        spin_unlock_irqrestore(&ep->lock, flags);
 973
 974        /* We have to call this outside the lock */
 975        if (pwake)
 976                ep_poll_safewake(&ep->poll_wait);
 977
 978        return 1;
 979}
 980
 981/*
 982 * This is the callback that is used to add our wait queue to the
 983 * target file wakeup lists.
 984 */
 985static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 986                                 poll_table *pt)
 987{
 988        struct epitem *epi = ep_item_from_epqueue(pt);
 989        struct eppoll_entry *pwq;
 990
 991        if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
 992                init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
 993                pwq->whead = whead;
 994                pwq->base = epi;
 995                add_wait_queue(whead, &pwq->wait);
 996                list_add_tail(&pwq->llink, &epi->pwqlist);
 997                epi->nwait++;
 998        } else {
 999                /* We have to signal that an error occurred */
1000                epi->nwait = -1;
1001        }
1002}
1003
1004static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1005{
1006        int kcmp;
1007        struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
1008        struct epitem *epic;
1009
1010        while (*p) {
1011                parent = *p;
1012                epic = rb_entry(parent, struct epitem, rbn);
1013                kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1014                if (kcmp > 0)
1015                        p = &parent->rb_right;
1016                else
1017                        p = &parent->rb_left;
1018        }
1019        rb_link_node(&epi->rbn, parent, p);
1020        rb_insert_color(&epi->rbn, &ep->rbr);
1021}
1022
1023
1024
1025#define PATH_ARR_SIZE 5
1026/*
 1027 * These are the number of paths of length 1 to 5 that we allow to emanate
 1028 * from a single file of interest. For example, we allow 1000 paths of length
 1029 * 1 to emanate from each file of interest. This essentially represents the
1030 * potential wakeup paths, which need to be limited in order to avoid massive
1031 * uncontrolled wakeup storms. The common use case should be a single ep which
1032 * is connected to n file sources. In this case each file source has 1 path
1033 * of length 1. Thus, the numbers below should be more than sufficient. These
1034 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
1035 * and delete can't add additional paths. Protected by the epmutex.
1036 */
1037static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
1038static int path_count[PATH_ARR_SIZE];
1039
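/*
 * Worked example of the limits above: a single file of interest may be
 * watched directly (length-1 paths) by an effectively unbounded number of
 * epoll sets, since path_count_inc(0) below always succeeds, but at most
 * 500 length-2, 100 length-3, 50 length-4 and 10 length-5 wakeup paths may
 * lead back to it once nesting is involved.
 */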
1040static int path_count_inc(int nests)
1041{
1042        /* Allow an arbitrary number of depth 1 paths */
1043        if (nests == 0)
1044                return 0;
1045
1046        if (++path_count[nests] > path_limits[nests])
1047                return -1;
1048        return 0;
1049}
1050
1051static void path_count_init(void)
1052{
1053        int i;
1054
1055        for (i = 0; i < PATH_ARR_SIZE; i++)
1056                path_count[i] = 0;
1057}
1058
1059static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
1060{
1061        int error = 0;
1062        struct file *file = priv;
1063        struct file *child_file;
1064        struct epitem *epi;
1065
1066        list_for_each_entry(epi, &file->f_ep_links, fllink) {
1067                child_file = epi->ep->file;
1068                if (is_file_epoll(child_file)) {
1069                        if (list_empty(&child_file->f_ep_links)) {
1070                                if (path_count_inc(call_nests)) {
1071                                        error = -1;
1072                                        break;
1073                                }
1074                        } else {
1075                                error = ep_call_nested(&poll_loop_ncalls,
1076                                                        EP_MAX_NESTS,
1077                                                        reverse_path_check_proc,
1078                                                        child_file, child_file,
1079                                                        current);
1080                        }
1081                        if (error != 0)
1082                                break;
1083                } else {
1084                        printk(KERN_ERR "reverse_path_check_proc: "
1085                                "file is not an ep!\n");
1086                }
1087        }
1088        return error;
1089}
1090
1091/**
 1092 * reverse_path_check - The tfile_check_list is a list of file *, which have
1093 *                      links that are proposed to be newly added. We need to
1094 *                      make sure that those added links don't add too many
1095 *                      paths such that we will spend all our time waking up
1096 *                      eventpoll objects.
1097 *
1098 * Returns: Returns zero if the proposed links don't create too many paths,
1099 *          -1 otherwise.
1100 */
1101static int reverse_path_check(void)
1102{
1103        int error = 0;
1104        struct file *current_file;
1105
1106        /* let's call this for all tfiles */
1107        list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1108                path_count_init();
1109                error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1110                                        reverse_path_check_proc, current_file,
1111                                        current_file, current);
1112                if (error)
1113                        break;
1114        }
1115        return error;
1116}
1117
1118static int ep_create_wakeup_source(struct epitem *epi)
1119{
1120        const char *name;
1121
1122        if (!epi->ep->ws) {
1123                epi->ep->ws = wakeup_source_register("eventpoll");
1124                if (!epi->ep->ws)
1125                        return -ENOMEM;
1126        }
1127
1128        name = epi->ffd.file->f_path.dentry->d_name.name;
1129        epi->ws = wakeup_source_register(name);
1130        if (!epi->ws)
1131                return -ENOMEM;
1132
1133        return 0;
1134}
1135
1136static void ep_destroy_wakeup_source(struct epitem *epi)
1137{
1138        wakeup_source_unregister(epi->ws);
1139        epi->ws = NULL;
1140}
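/*
 * Illustrative userspace side of the two helpers above (a sketch; "epfd" and
 * "fd" are assumed to be valid descriptors): a watch that should keep the
 * system awake while its events are pending or being processed is registered
 * with the EPOLLWAKEUP flag, e.g.:
 *
 *   struct epoll_event ev = { .events = EPOLLIN | EPOLLWAKEUP, .data.fd = fd };
 *   epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
 *
 * The epoll_ctl() syscall path drops EPOLLWAKEUP for callers lacking
 * CAP_BLOCK_SUSPEND, in which case epi->ws simply stays NULL.
 */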
1141
1142/*
1143 * Must be called with "mtx" held.
1144 */
1145static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1146                     struct file *tfile, int fd)
1147{
1148        int error, revents, pwake = 0;
1149        unsigned long flags;
1150        long user_watches;
1151        struct epitem *epi;
1152        struct ep_pqueue epq;
1153
1154        user_watches = atomic_long_read(&ep->user->epoll_watches);
1155        if (unlikely(user_watches >= max_user_watches))
1156                return -ENOSPC;
1157        if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
1158                return -ENOMEM;
1159
 1160        /* Item initialization follows here ... */
1161        INIT_LIST_HEAD(&epi->rdllink);
1162        INIT_LIST_HEAD(&epi->fllink);
1163        INIT_LIST_HEAD(&epi->pwqlist);
1164        epi->ep = ep;
1165        ep_set_ffd(&epi->ffd, tfile, fd);
1166        epi->event = *event;
1167        epi->nwait = 0;
1168        epi->next = EP_UNACTIVE_PTR;
1169        if (epi->event.events & EPOLLWAKEUP) {
1170                error = ep_create_wakeup_source(epi);
1171                if (error)
1172                        goto error_create_wakeup_source;
1173        } else {
1174                epi->ws = NULL;
1175        }
1176
1177        /* Initialize the poll table using the queue callback */
1178        epq.epi = epi;
1179        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
1180        epq.pt._key = event->events;
1181
1182        /*
1183         * Attach the item to the poll hooks and get current event bits.
1184         * We can safely use the file* here because its usage count has
1185         * been increased by the caller of this function. Note that after
1186         * this operation completes, the poll callback can start hitting
1187         * the new item.
1188         */
1189        revents = tfile->f_op->poll(tfile, &epq.pt);
1190
1191        /*
1192         * We have to check if something went wrong during the poll wait queue
 1193         * install process. Namely, an allocation for a wait queue failed due
 1194         * to high memory pressure.
1195         */
1196        error = -ENOMEM;
1197        if (epi->nwait < 0)
1198                goto error_unregister;
1199
 1200        /* Add the current item to the list of active epoll hooks for this file */
1201        spin_lock(&tfile->f_lock);
1202        list_add_tail(&epi->fllink, &tfile->f_ep_links);
1203        spin_unlock(&tfile->f_lock);
1204
1205        /*
1206         * Add the current item to the RB tree. All RB tree operations are
1207         * protected by "mtx", and ep_insert() is called with "mtx" held.
1208         */
1209        ep_rbtree_insert(ep, epi);
1210
1211        /* now check if we've created too many backpaths */
1212        error = -EINVAL;
1213        if (reverse_path_check())
1214                goto error_remove_epi;
1215
1216        /* We have to drop the new item inside our item list to keep track of it */
1217        spin_lock_irqsave(&ep->lock, flags);
1218
1219        /* If the file is already "ready" we drop it inside the ready list */
1220        if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
1221                list_add_tail(&epi->rdllink, &ep->rdllist);
1222                __pm_stay_awake(epi->ws);
1223
1224                /* Notify waiting tasks that events are available */
1225                if (waitqueue_active(&ep->wq))
1226                        wake_up_locked(&ep->wq);
1227                if (waitqueue_active(&ep->poll_wait))
1228                        pwake++;
1229        }
1230
1231        spin_unlock_irqrestore(&ep->lock, flags);
1232
1233        atomic_long_inc(&ep->user->epoll_watches);
1234
1235        /* We have to call this outside the lock */
1236        if (pwake)
1237                ep_poll_safewake(&ep->poll_wait);
1238
1239        return 0;
1240
1241error_remove_epi:
1242        spin_lock(&tfile->f_lock);
1243        if (ep_is_linked(&epi->fllink))
1244                list_del_init(&epi->fllink);
1245        spin_unlock(&tfile->f_lock);
1246
1247        rb_erase(&epi->rbn, &ep->rbr);
1248
1249error_unregister:
1250        ep_unregister_pollwait(ep, epi);
1251
1252        /*
 1253         * We need to do this because an event could have arrived on some
1254         * allocated wait queue. Note that we don't care about the ep->ovflist
1255         * list, since that is used/cleaned only inside a section bound by "mtx".
1256         * And ep_insert() is called with "mtx" held.
1257         */
1258        spin_lock_irqsave(&ep->lock, flags);
1259        if (ep_is_linked(&epi->rdllink))
1260                list_del_init(&epi->rdllink);
1261        spin_unlock_irqrestore(&ep->lock, flags);
1262
1263        wakeup_source_unregister(epi->ws);
1264
1265error_create_wakeup_source:
1266        kmem_cache_free(epi_cache, epi);
1267
1268        return error;
1269}
1270
1271/*
 1272 * Modify the interest event mask and, if the new mask matches the current
 1273 * file status, queue the item as ready. Must be called with "mtx" held.
1274 */
1275static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
1276{
1277        int pwake = 0;
1278        unsigned int revents;
1279        poll_table pt;
1280
1281        init_poll_funcptr(&pt, NULL);
1282
1283        /*
1284         * Set the new event interest mask before calling f_op->poll();
1285         * otherwise we might miss an event that happens between the
1286         * f_op->poll() call and the new event set registering.
1287         */
1288        epi->event.events = event->events; /* need barrier below */
1289        pt._key = event->events;
1290        epi->event.data = event->data; /* protected by mtx */
1291        if (epi->event.events & EPOLLWAKEUP) {
1292                if (!epi->ws)
1293                        ep_create_wakeup_source(epi);
1294        } else if (epi->ws) {
1295                ep_destroy_wakeup_source(epi);
1296        }
1297
1298        /*
1299         * The following barrier has two effects:
1300         *
1301         * 1) Flush epi changes above to other CPUs.  This ensures
1302         *    we do not miss events from ep_poll_callback if an
1303         *    event occurs immediately after we call f_op->poll().
1304         *    We need this because we did not take ep->lock while
1305         *    changing epi above (but ep_poll_callback does take
1306         *    ep->lock).
1307         *
1308         * 2) We also need to ensure we do not miss _past_ events
1309         *    when calling f_op->poll().  This barrier also
1310         *    pairs with the barrier in wq_has_sleeper (see
1311         *    comments for wq_has_sleeper).
1312         *
1313         * This barrier will now guarantee ep_poll_callback or f_op->poll
1314         * (or both) will notice the readiness of an item.
1315         */
1316        smp_mb();
1317
1318        /*
1319         * Get current event bits. We can safely use the file* here because
1320         * its usage count has been increased by the caller of this function.
1321         */
1322        revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
1323
1324        /*
1325         * If the item is "hot" and it is not registered inside the ready
1326         * list, push it inside.
1327         */
1328        if (revents & event->events) {
1329                spin_lock_irq(&ep->lock);
1330                if (!ep_is_linked(&epi->rdllink)) {
1331                        list_add_tail(&epi->rdllink, &ep->rdllist);
1332                        __pm_stay_awake(epi->ws);
1333
1334                        /* Notify waiting tasks that events are available */
1335                        if (waitqueue_active(&ep->wq))
1336                                wake_up_locked(&ep->wq);
1337                        if (waitqueue_active(&ep->poll_wait))
1338                                pwake++;
1339                }
1340                spin_unlock_irq(&ep->lock);
1341        }
1342
1343        /* We have to call this outside the lock */
1344        if (pwake)
1345                ep_poll_safewake(&ep->poll_wait);
1346
1347        return 0;
1348}
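/*
 * Illustrative userspace pattern for the EPOLLONESHOT behaviour handled by
 * ep_poll_callback() and ep_send_events_proc() (a sketch; "epfd" and "fd"
 * are assumed to be valid): once a one-shot event has been delivered, the
 * descriptor stays effectively disabled until it is re-armed through the
 * ep_modify() path above:
 *
 *   struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT, .data.fd = fd };
 *   epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);    (re-arm after handling)
 */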
1349
1350static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1351                               void *priv)
1352{
1353        struct ep_send_events_data *esed = priv;
1354        int eventcnt;
1355        unsigned int revents;
1356        struct epitem *epi;
1357        struct epoll_event __user *uevent;
1358        poll_table pt;
1359
1360        init_poll_funcptr(&pt, NULL);
1361
1362        /*
1363         * We can loop without lock because we are passed a task private list.
1364         * Items cannot vanish during the loop because ep_scan_ready_list() is
1365         * holding "mtx" during this call.
1366         */
1367        for (eventcnt = 0, uevent = esed->events;
1368             !list_empty(head) && eventcnt < esed->maxevents;) {
1369                epi = list_first_entry(head, struct epitem, rdllink);
1370
1371                /*
1372                 * Activate ep->ws before deactivating epi->ws to prevent
 1373                 * triggering auto-suspend here (in case we reactivate epi->ws
1374                 * below).
1375                 *
1376                 * This could be rearranged to delay the deactivation of epi->ws
1377                 * instead, but then epi->ws would temporarily be out of sync
1378                 * with ep_is_linked().
1379                 */
1380                if (epi->ws && epi->ws->active)
1381                        __pm_stay_awake(ep->ws);
1382                __pm_relax(epi->ws);
1383                list_del_init(&epi->rdllink);
1384
1385                pt._key = epi->event.events;
1386                revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
1387                        epi->event.events;
1388
1389                /*
 1390                 * If the event mask intersects the caller-requested one,
1391                 * deliver the event to userspace. Again, ep_scan_ready_list()
1392                 * is holding "mtx", so no operations coming from userspace
1393                 * can change the item.
1394                 */
1395                if (revents) {
1396                        if (__put_user(revents, &uevent->events) ||
1397                            __put_user(epi->event.data, &uevent->data)) {
1398                                list_add(&epi->rdllink, head);
1399                                __pm_stay_awake(epi->ws);
1400                                return eventcnt ? eventcnt : -EFAULT;
1401                        }
1402                        eventcnt++;
1403                        uevent++;
1404                        if (epi->event.events & EPOLLONESHOT)
1405                                epi->event.events &= EP_PRIVATE_BITS;
1406                        else if (!(epi->event.events & EPOLLET)) {
1407                                /*
1408                                 * If this file has been added with Level
 1409                                 * Trigger mode, we need to insert it back inside
 1410                                 * the ready list, so that the next call to
 1411                                 * epoll_wait() will check again for available
 1412                                 * events. At this point, no one can insert
1413                                 * into ep->rdllist besides us. The epoll_ctl()
1414                                 * callers are locked out by
1415                                 * ep_scan_ready_list() holding "mtx" and the
1416                                 * poll callback will queue them in ep->ovflist.
1417                                 */
1418                                list_add_tail(&epi->rdllink, &ep->rdllist);
1419                                __pm_stay_awake(epi->ws);
1420                        }
1421                }
1422        }
1423
1424        return eventcnt;
1425}
1426
1427static int ep_send_events(struct eventpoll *ep,
1428                          struct epoll_event __user *events, int maxevents)
1429{
1430        struct ep_send_events_data esed;
1431
1432        esed.maxevents = maxevents;
1433        esed.events = events;
1434
1435        return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1436}
1437
1438static inline struct timespec ep_set_mstimeout(long ms)
1439{
1440        struct timespec now, ts = {
1441                .tv_sec = ms / MSEC_PER_SEC,
1442                .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1443        };
1444
1445        ktime_get_ts(&now);
1446        return timespec_add_safe(now, ts);
1447}
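/*
 * Worked example: for ms = 2500 the timespec built above is
 * { .tv_sec = 2, .tv_nsec = 500 * NSEC_PER_MSEC }, i.e. 2.5s, which is then
 * added to the current monotonic time so that ep_poll() can pass an absolute
 * expiry to schedule_hrtimeout_range(..., HRTIMER_MODE_ABS).
 */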
1448
1449/**
1450 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1451 *           event buffer.
1452 *
1453 * @ep: Pointer to the eventpoll context.
1454 * @events: Pointer to the userspace buffer where the ready events should be
1455 *          stored.
1456 * @maxevents: Size (in terms of number of events) of the caller event buffer.
1457 * @timeout: Maximum timeout for the ready events fetch operation, in
1458 *           milliseconds. If the @timeout is zero, the function will not block,
1459 *           while if the @timeout is less than zero, the function will block
1460 *           until at least one event has been retrieved (or an error
1461 *           occurred).
1462 *
1463 * Returns: Returns the number of ready events which have been fetched, or an
1464 *          error code, in case of error.
1465 */
1466static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1467                   int maxevents, long timeout)
1468{
1469        int res = 0, eavail, timed_out = 0;
1470        unsigned long flags;
1471        long slack = 0;
1472        wait_queue_t wait;
1473        ktime_t expires, *to = NULL;
1474
1475        if (timeout > 0) {
1476                struct timespec end_time = ep_set_mstimeout(timeout);
1477
1478                slack = select_estimate_accuracy(&end_time);
1479                to = &expires;
1480                *to = timespec_to_ktime(end_time);
1481        } else if (timeout == 0) {
1482                /*
1483                 * Avoid the unnecessary trip to the wait queue loop if the
1484                 * caller specified a non-blocking operation.
1485                 */
1486                timed_out = 1;
1487                spin_lock_irqsave(&ep->lock, flags);
1488                goto check_events;
1489        }
1490
1491fetch_events:
1492        spin_lock_irqsave(&ep->lock, flags);
1493
1494        if (!ep_events_available(ep)) {
1495                /*
1496                 * We don't have any available event to return to the caller.
1497                 * We need to sleep here, and we will be woken up by
1498                 * ep_poll_callback() when events become available.
1499                 */
1500                init_waitqueue_entry(&wait, current);
1501                __add_wait_queue_exclusive(&ep->wq, &wait);
1502
1503                for (;;) {
1504                        /*
1505                         * We don't want to sleep if ep_poll_callback() sends us
1506                         * a wakeup in between. That's why we set the task state
1507                         * to TASK_INTERRUPTIBLE before doing the checks.
1508                         */
1509                        set_current_state(TASK_INTERRUPTIBLE);
1510                        if (ep_events_available(ep) || timed_out)
1511                                break;
1512                        if (signal_pending(current)) {
1513                                res = -EINTR;
1514                                break;
1515                        }
1516
1517                        spin_unlock_irqrestore(&ep->lock, flags);
1518                        if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
1519                                timed_out = 1;
1520
1521                        spin_lock_irqsave(&ep->lock, flags);
1522                }
1523                __remove_wait_queue(&ep->wq, &wait);
1524
1525                set_current_state(TASK_RUNNING);
1526        }
1527check_events:
1528        /* Is it worth trying to dig for events? */
1529        eavail = ep_events_available(ep);
1530
1531        spin_unlock_irqrestore(&ep->lock, flags);
1532
1533        /*
1534         * Try to transfer events to user space. If we get 0 events and
1535         * there is still timeout left over, we go back and try again in
1536         * search of more events.
1537         */
1538        if (!res && eavail &&
1539            !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1540                goto fetch_events;
1541
1542        return res;
1543}
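
/*
 * Illustrative userspace sketch (not part of the kernel code): the three
 * timeout cases handled above are exactly what epoll_wait(2) callers
 * see. "epfd", the "events" array and NEVENTS are placeholders:
 *
 *	#include <sys/epoll.h>
 *
 *	int n;
 *
 *	n = epoll_wait(epfd, events, NEVENTS, 0);    // never blocks
 *	n = epoll_wait(epfd, events, NEVENTS, 2500); // blocks for at most 2.5 s
 *	n = epoll_wait(epfd, events, NEVENTS, -1);   // blocks until an event
 *						     // or a signal arrives
 */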
1544
1545/**
1546 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1547 *                      API, to verify that adding an epoll file inside another
1548 *                      epoll structure does not violate the constraints, in
1549 *                      terms of closed loops or overly deep chains (which can
1550 *                      result in excessive stack usage).
1551 *
1552 * @priv: Pointer to the epoll file currently being checked.
1553 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1554 *          data structure pointer.
1555 * @call_nests: Current depth of the @ep_call_nested() call stack.
1556 *
1557 * Returns: Zero if adding the epoll @file inside the current epoll
1558 *          structure @ep does not violate the constraints, or -1 otherwise.
1559 */
1560static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1561{
1562        int error = 0;
1563        struct file *file = priv;
1564        struct eventpoll *ep = file->private_data;
1565        struct eventpoll *ep_tovisit;
1566        struct rb_node *rbp;
1567        struct epitem *epi;
1568
1569        mutex_lock_nested(&ep->mtx, call_nests + 1);
1570        ep->visited = 1;
1571        list_add(&ep->visited_list_link, &visited_list);
1572        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1573                epi = rb_entry(rbp, struct epitem, rbn);
1574                if (unlikely(is_file_epoll(epi->ffd.file))) {
1575                        ep_tovisit = epi->ffd.file->private_data;
1576                        if (ep_tovisit->visited)
1577                                continue;
1578                        error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1579                                        ep_loop_check_proc, epi->ffd.file,
1580                                        ep_tovisit, current);
1581                        if (error != 0)
1582                                break;
1583                } else {
1584                        /*
1585                         * If we've reached a file that is not associated with
1586                         * an ep, then we need to check if the newly added
1587                         * links are going to add too many wakeup paths. We do
1588                         * this by adding it to the tfile_check_list, if it's
1589                         * not already there, and calling reverse_path_check()
1590                         * during ep_insert().
1591                         */
1592                        if (list_empty(&epi->ffd.file->f_tfile_llink))
1593                                list_add(&epi->ffd.file->f_tfile_llink,
1594                                         &tfile_check_list);
1595                }
1596        }
1597        mutex_unlock(&ep->mtx);
1598
1599        return error;
1600}
1601
1602/**
1603 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1604 *                 inside another epoll file (represented by @ep) does not create
1605 *                 closed loops or overly deep chains.
1606 *
1607 * @ep: Pointer to the epoll private data structure.
1608 * @file: Pointer to the epoll file to be checked.
1609 *
1610 * Returns: Zero if adding the epoll @file inside the current epoll
1611 *          structure @ep does not violate the constraints, or -1 otherwise.
1612 */
1613static int ep_loop_check(struct eventpoll *ep, struct file *file)
1614{
1615        int ret;
1616        struct eventpoll *ep_cur, *ep_next;
1617
1618        ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1619                              ep_loop_check_proc, file, ep, current);
1620        /* clear visited list */
1621        list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1622                                                        visited_list_link) {
1623                ep_cur->visited = 0;
1624                list_del(&ep_cur->visited_list_link);
1625        }
1626        return ret;
1627}
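
/*
 * Illustrative userspace sketch (not part of the kernel code): the check
 * above is what turns a cyclic nesting attempt into -ELOOP at
 * epoll_ctl() time:
 *
 *	#include <sys/epoll.h>
 *
 *	int ep1 = epoll_create1(0);
 *	int ep2 = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = ep2 };
 *
 *	epoll_ctl(ep1, EPOLL_CTL_ADD, ep2, &ev);  // ok: ep2 nested in ep1
 *	ev.data.fd = ep1;
 *	epoll_ctl(ep2, EPOLL_CTL_ADD, ep1, &ev);  // fails, errno == ELOOP
 */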
1628
1629static void clear_tfile_check_list(void)
1630{
1631        struct file *file;
1632
1633        /* first clear the tfile_check_list */
1634        while (!list_empty(&tfile_check_list)) {
1635                file = list_first_entry(&tfile_check_list, struct file,
1636                                        f_tfile_llink);
1637                list_del_init(&file->f_tfile_llink);
1638        }
1639        INIT_LIST_HEAD(&tfile_check_list);
1640}
1641
1642/*
1643 * Open an eventpoll file descriptor.
1644 */
1645SYSCALL_DEFINE1(epoll_create1, int, flags)
1646{
1647        int error, fd;
1648        struct eventpoll *ep = NULL;
1649        struct file *file;
1650
1651        /* Check the EPOLL_* constant for consistency.  */
1652        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1653
1654        if (flags & ~EPOLL_CLOEXEC)
1655                return -EINVAL;
1656        /*
1657         * Create the internal data structure ("struct eventpoll").
1658         */
1659        error = ep_alloc(&ep);
1660        if (error < 0)
1661                return error;
1662        /*
1663         * Creates all the items needed to set up an eventpoll file. That is,
1664         * a file structure and a free file descriptor.
1665         */
1666        fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1667        if (fd < 0) {
1668                error = fd;
1669                goto out_free_ep;
1670        }
1671        file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1672                                 O_RDWR | (flags & O_CLOEXEC));
1673        if (IS_ERR(file)) {
1674                error = PTR_ERR(file);
1675                goto out_free_fd;
1676        }
1677        ep->file = file;
1678        fd_install(fd, file);
1679        return fd;
1680
1681out_free_fd:
1682        put_unused_fd(fd);
1683out_free_ep:
1684        ep_free(ep);
1685        return error;
1686}
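
/*
 * Illustrative userspace sketch (not part of the kernel code): typical
 * callers create the instance with the close-on-exec flag and check for
 * failure before populating the interest set:
 *
 *	#include <sys/epoll.h>
 *	#include <err.h>
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
 *
 *	if (epfd < 0)
 *		err(1, "epoll_create1");
 */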
1687
1688SYSCALL_DEFINE1(epoll_create, int, size)
1689{
1690        if (size <= 0)
1691                return -EINVAL;
1692
1693        return sys_epoll_create1(0);
1694}
1695
1696/*
1697 * The following function implements the controller interface for
1698 * the eventpoll file that enables the insertion/removal/change of
1699 * file descriptors inside the interest set.
1700 */
1701SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1702                struct epoll_event __user *, event)
1703{
1704        int error;
1705        int did_lock_epmutex = 0;
1706        struct file *file, *tfile;
1707        struct eventpoll *ep;
1708        struct epitem *epi;
1709        struct epoll_event epds;
1710
1711        error = -EFAULT;
1712        if (ep_op_has_event(op) &&
1713            copy_from_user(&epds, event, sizeof(struct epoll_event)))
1714                goto error_return;
1715
1716        /* Get the "struct file *" for the eventpoll file */
1717        error = -EBADF;
1718        file = fget(epfd);
1719        if (!file)
1720                goto error_return;
1721
1722        /* Get the "struct file *" for the target file */
1723        tfile = fget(fd);
1724        if (!tfile)
1725                goto error_fput;
1726
1727        /* The target file descriptor must support poll */
1728        error = -EPERM;
1729        if (!tfile->f_op || !tfile->f_op->poll)
1730                goto error_tgt_fput;
1731
1732        /* Check if EPOLLWAKEUP is allowed */
1733        if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
1734                epds.events &= ~EPOLLWAKEUP;
1735
1736        /*
1737         * We have to check that the file structure underneath the file descriptor
1738         * the user passed to us _is_ an eventpoll file. We also do not permit
1739         * adding an epoll file descriptor inside itself.
1740         */
1741        error = -EINVAL;
1742        if (file == tfile || !is_file_epoll(file))
1743                goto error_tgt_fput;
1744
1745        /*
1746         * At this point it is safe to assume that the "private_data" contains
1747         * our own data structure.
1748         */
1749        ep = file->private_data;
1750
1751        /*
1752         * When we insert an epoll file descriptor inside another epoll file
1753         * descriptor, there is the chance of creating closed loops, which are
1754         * better handled here than in more critical paths. While we are
1755         * checking for loops we also determine the list of files reachable
1756         * and hang them on the tfile_check_list, so we can check that we
1757         * haven't created too many possible wakeup paths.
1758         *
1759         * We need to hold the epmutex across both ep_insert and ep_remove
1760         * because we want to make sure we are looking at a coherent view of
1761         * the epoll network.
1762         */
1763        if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1764                mutex_lock(&epmutex);
1765                did_lock_epmutex = 1;
1766        }
1767        if (op == EPOLL_CTL_ADD) {
1768                if (is_file_epoll(tfile)) {
1769                        error = -ELOOP;
1770                        if (ep_loop_check(ep, tfile) != 0) {
1771                                clear_tfile_check_list();
1772                                goto error_tgt_fput;
1773                        }
1774                } else
1775                        list_add(&tfile->f_tfile_llink, &tfile_check_list);
1776        }
1777
1778        mutex_lock_nested(&ep->mtx, 0);
1779
1780        /*
1781         * Try to look up the file inside our RB tree. Since we grabbed "mtx"
1782         * above, we can be sure that the item looked up by ep_find() remains
1783         * valid until we release the mutex.
1784         */
1785        epi = ep_find(ep, tfile, fd);
1786
1787        error = -EINVAL;
1788        switch (op) {
1789        case EPOLL_CTL_ADD:
1790                if (!epi) {
1791                        epds.events |= POLLERR | POLLHUP;
1792                        error = ep_insert(ep, &epds, tfile, fd);
1793                } else
1794                        error = -EEXIST;
1795                clear_tfile_check_list();
1796                break;
1797        case EPOLL_CTL_DEL:
1798                if (epi)
1799                        error = ep_remove(ep, epi);
1800                else
1801                        error = -ENOENT;
1802                break;
1803        case EPOLL_CTL_MOD:
1804                if (epi) {
1805                        epds.events |= POLLERR | POLLHUP;
1806                        error = ep_modify(ep, epi, &epds);
1807                } else
1808                        error = -ENOENT;
1809                break;
1810        }
1811        mutex_unlock(&ep->mtx);
1812
1813error_tgt_fput:
1814        if (did_lock_epmutex)
1815                mutex_unlock(&epmutex);
1816
1817        fput(tfile);
1818error_fput:
1819        fput(file);
1820error_return:
1821
1822        return error;
1823}
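
/*
 * Illustrative userspace sketch (not part of the kernel code): the three
 * opcodes dispatched above cover the whole lifetime of an interest-set
 * entry. "epfd" and "fd" are placeholders; "fd" must be pollable (e.g. a
 * regular file typically lacks f_op->poll and is rejected with EPERM by
 * the check above):
 *
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);   // insert
 *	ev.events = EPOLLIN | EPOLLOUT;
 *	epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);   // change requested events
 *	epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);  // remove
 */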
1824
1825/*
1826 * Implement the event wait interface for the eventpoll file. It is the kernel
1827 * part of the user space epoll_wait(2).
1828 */
1829SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1830                int, maxevents, int, timeout)
1831{
1832        int error;
1833        struct fd f;
1834        struct eventpoll *ep;
1835
1836        /* The maximum number of events must be greater than zero */
1837        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1838                return -EINVAL;
1839
1840        /* Verify that the area passed by the user is writeable */
1841        if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
1842                return -EFAULT;
1843
1844        /* Get the "struct file *" for the eventpoll file */
1845        f = fdget(epfd);
1846        if (!f.file)
1847                return -EBADF;
1848
1849        /*
1850         * We have to check that the file structure underneath the fd
1851         * the user passed to us _is_ an eventpoll file.
1852         */
1853        error = -EINVAL;
1854        if (!is_file_epoll(f.file))
1855                goto error_fput;
1856
1857        /*
1858         * At this point it is safe to assume that the "private_data" contains
1859         * our own data structure.
1860         */
1861        ep = f.file->private_data;
1862
1863        /* Time to fish for events ... */
1864        error = ep_poll(ep, events, maxevents, timeout);
1865
1866error_fput:
1867        fdput(f);
1868        return error;
1869}
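
/*
 * Illustrative userspace sketch (not part of the kernel code): a minimal
 * event loop over the syscall above. "epfd" is an already populated
 * epoll fd and handle_fd() is a hypothetical per-descriptor handler; the
 * data field comes back exactly as it was passed to epoll_ctl():
 *
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event events[64];
 *	int i, n;
 *
 *	for (;;) {
 *		n = epoll_wait(epfd, events, 64, -1);
 *		if (n < 0)
 *			continue;	// e.g. EINTR; real code should check errno
 *		for (i = 0; i < n; i++)
 *			handle_fd(events[i].data.fd, events[i].events);
 *	}
 */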
1870
1871/*
1872 * Implement the event wait interface for the eventpoll file. It is the kernel
1873 * part of the user space epoll_pwait(2).
1874 */
1875SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
1876                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
1877                size_t, sigsetsize)
1878{
1879        int error;
1880        sigset_t ksigmask, sigsaved;
1881
1882        /*
1883         * If the caller wants a certain signal mask to be set during the wait,
1884         * we apply it here.
1885         */
1886        if (sigmask) {
1887                if (sigsetsize != sizeof(sigset_t))
1888                        return -EINVAL;
1889                if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
1890                        return -EFAULT;
1891                sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
1892                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
1893        }
1894
1895        error = sys_epoll_wait(epfd, events, maxevents, timeout);
1896
1897        /*
1898         * If we changed the signal mask, we need to restore the original one.
1899         * In case we've got a signal while waiting, we do not restore the
1900         * signal mask yet, and we allow do_signal() to deliver the signal on
1901         * the way back to userspace, before the signal mask is restored.
1902         */
1903        if (sigmask) {
1904                if (error == -EINTR) {
1905                        memcpy(&current->saved_sigmask, &sigsaved,
1906                               sizeof(sigsaved));
1907                        set_restore_sigmask();
1908                } else
1909                        sigprocmask(SIG_SETMASK, &sigsaved, NULL);
1910        }
1911
1912        return error;
1913}
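
/*
 * Illustrative userspace sketch (not part of the kernel code): the
 * sigmask handling above gives epoll_pwait(2) the same guarantee as
 * pselect(2): the mask is applied and the wait is started atomically,
 * so a signal that is normally blocked can only be delivered while the
 * wait is in progress. "epfd" is a placeholder as before:
 *
 *	#include <errno.h>
 *	#include <signal.h>
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event events[64];
 *	sigset_t during_wait;
 *	int n;
 *
 *	sigemptyset(&during_wait);	// unblock everything for the wait only
 *	n = epoll_pwait(epfd, events, 64, -1, &during_wait);
 *	if (n < 0 && errno == EINTR)
 *		;	// a signal was delivered during the wait
 */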
1914
1915static int __init eventpoll_init(void)
1916{
1917        struct sysinfo si;
1918
1919        si_meminfo(&si);
1920        /*
1921         * Allows top 4% of lowmem to be allocated for epoll watches (per user).
1922         */
1923        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1924                EP_ITEM_COST;
1925        BUG_ON(max_user_watches < 0);
1926
1927        /*
1928         * Initialize the structure used to perform epoll file descriptor
1929         * inclusion loop checks.
1930         */
1931        ep_nested_calls_init(&poll_loop_ncalls);
1932
1933        /* Initialize the structure used to perform safe poll wait head wake ups */
1934        ep_nested_calls_init(&poll_safewake_ncalls);
1935
1936        /* Initialize the structure used to perform file's f_op->poll() calls */
1937        ep_nested_calls_init(&poll_readywalk_ncalls);
1938
1939        /* Allocates slab cache used to allocate "struct epitem" items */
1940        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1941                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1942
1943        /* Allocates slab cache used to allocate "struct eppoll_entry" */
1944        pwq_cache = kmem_cache_create("eventpoll_pwq",
1945                        sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1946
1947        return 0;
1948}
1949fs_initcall(eventpoll_init);
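
/*
 * Worked example (illustrative, not part of the kernel code): with 4 KiB
 * pages, 1 GiB of lowmem and an EP_ITEM_COST of roughly 200 bytes (the
 * real value depends on the sizes of struct epitem and struct
 * eppoll_entry for the given architecture and config), the computation
 * in eventpoll_init() gives
 *
 *	(262144 pages / 25) << 12  ~=  43 MB of lowmem budget per user
 *	43 MB / ~200 bytes         ~=  215000 default max_user_watches
 *
 * The limit is tunable at runtime via
 * /proc/sys/fs/epoll/max_user_watches.
 */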
1950