linux-bk/kernel/futex.c
<<
>>
Prefs
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
   6 *  enough at me, Linus for the original (flawed) idea, Matthew
   7 *  Kirkwood for proof-of-concept implementation.
   8 *
   9 *  "The futexes are also cursed."
  10 *  "But they come in a choice of three flavours!"
  11 *
  12 *  This program is free software; you can redistribute it and/or modify
  13 *  it under the terms of the GNU General Public License as published by
  14 *  the Free Software Foundation; either version 2 of the License, or
  15 *  (at your option) any later version.
  16 *
  17 *  This program is distributed in the hope that it will be useful,
  18 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 *  GNU General Public License for more details.
  21 *
  22 *  You should have received a copy of the GNU General Public License
  23 *  along with this program; if not, write to the Free Software
  24 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  25 */
  26#include <linux/kernel.h>
  27#include <linux/spinlock.h>
  28#include <linux/sched.h>
  29#include <linux/mm.h>
  30#include <linux/hash.h>
  31#include <linux/init.h>
  32#include <linux/fs.h>
  33#include <linux/futex.h>
  34#include <linux/highmem.h>
  35#include <linux/time.h>
  36#include <linux/pagemap.h>
  37#include <linux/slab.h>
  38#include <linux/poll.h>
  39#include <linux/file.h>
  40#include <linux/dcache.h>
  41#include <asm/uaccess.h>
  42
  43/* Simple "sleep if unchanged" interface. */
  44
  45/* FIXME: This may be way too small. --RR */
  46#define FUTEX_HASHBITS 6
  47
  48extern void send_sigio(struct fown_struct *fown, int fd, int band);
  49
  50/* Everyone needs a dentry and inode */
  51static struct vfsmount *futex_mnt;
  52
  53/* We use this instead of a normal wait_queue_t, so we can wake only
  54   the relevent ones (hashed queues may be shared) */
  55struct futex_q {
  56        struct list_head list;
  57        wait_queue_head_t waiters;
  58        /* Page struct and offset within it. */
  59        struct page *page;
  60        unsigned int offset;
  61        /* For fd, sigio sent using these. */
  62        int fd;
  63        struct file *filp;
  64};
  65
  66/* The key for the hash is the address + index + offset within page */
  67static struct list_head futex_queues[1<<FUTEX_HASHBITS];
  68static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
  69
  70static inline struct list_head *hash_futex(struct page *page,
  71                                           unsigned long offset)
  72{
  73        unsigned long h;
  74
  75        /* struct page is shared, so we can hash on its address */
  76        h = (unsigned long)page + offset;
  77        return &futex_queues[hash_long(h, FUTEX_HASHBITS)];
  78}
  79
  80/* Waiter either waiting in FUTEX_WAIT or poll(), or expecting signal */
  81static inline void tell_waiter(struct futex_q *q)
  82{
  83        wake_up_all(&q->waiters);
  84        if (q->filp)
  85                send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
  86}
  87
  88static inline void unpin_page(struct page *page)
  89{
  90        /* Avoid releasing the page which is on the LRU list.  I don't
  91           know if this is correct, but it stops the BUG() in
  92           __free_pages_ok(). */
  93        page_cache_release(page);
  94}
  95
  96static int futex_wake(struct list_head *head,
  97                      struct page *page,
  98                      unsigned int offset,
  99                      int num)
 100{
 101        struct list_head *i, *next;
 102        int num_woken = 0;
 103
 104        spin_lock(&futex_lock);
 105        list_for_each_safe(i, next, head) {
 106                struct futex_q *this = list_entry(i, struct futex_q, list);
 107
 108                if (this->page == page && this->offset == offset) {
 109                        list_del_init(i);
 110                        tell_waiter(this);
 111                        num_woken++;
 112                        if (num_woken >= num) break;
 113                }
 114        }
 115        spin_unlock(&futex_lock);
 116        return num_woken;
 117}
 118
 119/* Add at end to avoid starvation */
 120static inline void queue_me(struct list_head *head,
 121                            struct futex_q *q,
 122                            struct page *page,
 123                            unsigned int offset,
 124                            int fd,
 125                            struct file *filp)
 126{
 127        q->page = page;
 128        q->offset = offset;
 129        q->fd = fd;
 130        q->filp = filp;
 131
 132        spin_lock(&futex_lock);
 133        list_add_tail(&q->list, head);
 134        spin_unlock(&futex_lock);
 135}
 136
 137/* Return 1 if we were still queued (ie. 0 means we were woken) */
 138static inline int unqueue_me(struct futex_q *q)
 139{
 140        int ret = 0;
 141        spin_lock(&futex_lock);
 142        if (!list_empty(&q->list)) {
 143                list_del(&q->list);
 144                ret = 1;
 145        }
 146        spin_unlock(&futex_lock);
 147        return ret;
 148}
 149
 150/* Get kernel address of the user page and pin it. */
 151static struct page *pin_page(unsigned long page_start)
 152{
 153        struct mm_struct *mm = current->mm;
 154        struct page *page;
 155        int err;
 156
 157        down_read(&mm->mmap_sem);
 158        err = get_user_pages(current, mm, page_start,
 159                             1 /* one page */,
 160                             0 /* writable not important */,
 161                             0 /* don't force */,
 162                             &page,
 163                             NULL /* don't return vmas */);
 164        up_read(&mm->mmap_sem);
 165
 166        if (err < 0)
 167                return ERR_PTR(err);
 168        return page;
 169}
 170
 171static int futex_wait(struct list_head *head,
 172                      struct page *page,
 173                      int offset,
 174                      int val,
 175                      int *uaddr,
 176                      unsigned long time)
 177{
 178        int curval;
 179        struct futex_q q;
 180        DECLARE_WAITQUEUE(wait, current);
 181        int ret = 0;
 182
 183        set_current_state(TASK_INTERRUPTIBLE);
 184        init_waitqueue_head(&q.waiters);
 185        add_wait_queue(&q.waiters, &wait);
 186        queue_me(head, &q, page, offset, -1, NULL);
 187
 188        /* Page is pinned, but may no longer be in this address space. */
 189        if (get_user(curval, uaddr) != 0) {
 190                ret = -EFAULT;
 191                goto out;
 192        }
 193
 194        if (curval != val) {
 195                ret = -EWOULDBLOCK;
 196                goto out;
 197        }
 198        time = schedule_timeout(time);
 199        if (time == 0) {
 200                ret = -ETIMEDOUT;
 201                goto out;
 202        }
 203        if (signal_pending(current)) {
 204                ret = -EINTR;
 205                goto out;
 206        }
 207 out:
 208        set_current_state(TASK_RUNNING);
 209        /* Were we woken up anyway? */
 210        if (!unqueue_me(&q))
 211                return 0;
 212        return ret;
 213}
 214
 215static int futex_close(struct inode *inode, struct file *filp)
 216{
 217        struct futex_q *q = filp->private_data;
 218
 219        spin_lock(&futex_lock);
 220        if (!list_empty(&q->list)) {
 221                list_del(&q->list);
 222                /* Noone can be polling on us now. */
 223                BUG_ON(waitqueue_active(&q->waiters));
 224        }
 225        spin_unlock(&futex_lock);
 226        unpin_page(q->page);
 227        kfree(filp->private_data);
 228        return 0;
 229}
 230
 231/* This is one-shot: once it's gone off you need a new fd */
 232static unsigned int futex_poll(struct file *filp,
 233                               struct poll_table_struct *wait)
 234{
 235        struct futex_q *q = filp->private_data;
 236        int ret = 0;
 237
 238        poll_wait(filp, &q->waiters, wait);
 239        spin_lock(&futex_lock);
 240        if (list_empty(&q->list))
 241                ret = POLLIN | POLLRDNORM;
 242        spin_unlock(&futex_lock);
 243
 244        return ret;
 245}
 246
 247static struct file_operations futex_fops = {
 248        .release        = futex_close,
 249        .poll           = futex_poll,
 250};
 251
 252/* Signal allows caller to avoid the race which would occur if they
 253   set the sigio stuff up afterwards. */
 254static int futex_fd(struct list_head *head,
 255                    struct page *page,
 256                    int offset,
 257                    int signal)
 258{
 259        int fd;
 260        struct futex_q *q;
 261        struct file *filp;
 262
 263        if (signal < 0 || signal > _NSIG)
 264                return -EINVAL;
 265
 266        fd = get_unused_fd();
 267        if (fd < 0)
 268                return fd;
 269        filp = get_empty_filp();
 270        if (!filp) {
 271                put_unused_fd(fd);
 272                return -ENFILE;
 273        }
 274        filp->f_op = &futex_fops;
 275        filp->f_vfsmnt = mntget(futex_mnt);
 276        filp->f_dentry = dget(futex_mnt->mnt_root);
 277
 278        if (signal) {
 279                int ret;
 280                
 281                ret = f_setown(filp, current->tgid, 1);
 282                if (ret) {
 283                        put_unused_fd(fd);
 284                        put_filp(filp);
 285                        return ret;
 286                }
 287                filp->f_owner.signum = signal;
 288        }
 289
 290        q = kmalloc(sizeof(*q), GFP_KERNEL);
 291        if (!q) {
 292                put_unused_fd(fd);
 293                put_filp(filp);
 294                return -ENOMEM;
 295        }
 296
 297        /* Initialize queue structure, and add to hash table. */
 298        filp->private_data = q;
 299        init_waitqueue_head(&q->waiters);
 300        queue_me(head, q, page, offset, fd, filp);
 301
 302        /* Now we map fd to filp, so userspace can access it */
 303        fd_install(fd, filp);
 304        return fd;
 305}
 306
 307asmlinkage int sys_futex(void *uaddr, int op, int val, struct timespec *utime)
 308{
 309        int ret;
 310        unsigned long pos_in_page;
 311        struct list_head *head;
 312        struct page *page;
 313        unsigned long time = MAX_SCHEDULE_TIMEOUT;
 314
 315        if (utime) {
 316                struct timespec t;
 317                if (copy_from_user(&t, utime, sizeof(t)) != 0)
 318                        return -EFAULT;
 319                time = timespec_to_jiffies(&t) + 1;
 320        }
 321
 322        pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE;
 323
 324        /* Must be "naturally" aligned, and not on page boundary. */
 325        if ((pos_in_page % __alignof__(int)) != 0
 326            || pos_in_page + sizeof(int) > PAGE_SIZE)
 327                return -EINVAL;
 328
 329        /* Simpler if it doesn't vanish underneath us. */
 330        page = pin_page((unsigned long)uaddr - pos_in_page);
 331        if (IS_ERR(page))
 332                return PTR_ERR(page);
 333
 334        head = hash_futex(page, pos_in_page);
 335        switch (op) {
 336        case FUTEX_WAIT:
 337                ret = futex_wait(head, page, pos_in_page, val, uaddr, time);
 338                break;
 339        case FUTEX_WAKE:
 340                ret = futex_wake(head, page, pos_in_page, val);
 341                break;
 342        case FUTEX_FD:
 343                /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
 344                ret = futex_fd(head, page, pos_in_page, val);
 345                if (ret >= 0)
 346                        /* Leave page pinned (attached to fd). */
 347                        return ret;
 348                break;
 349        default:
 350                ret = -EINVAL;
 351        }
 352        unpin_page(page);
 353
 354        return ret;
 355}
 356
 357static struct super_block *
 358futexfs_get_sb(struct file_system_type *fs_type,
 359               int flags, char *dev_name, void *data)
 360{
 361        return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
 362}
 363
 364static struct file_system_type futex_fs_type = {
 365        .name           = "futexfs",
 366        .get_sb         = futexfs_get_sb,
 367};
 368
 369static int __init init(void)
 370{
 371        unsigned int i;
 372
 373        register_filesystem(&futex_fs_type);
 374        futex_mnt = kern_mount(&futex_fs_type);
 375
 376        for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
 377                INIT_LIST_HEAD(&futex_queues[i]);
 378        return 0;
 379}
 380__initcall(init);
 381
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.