linux/kernel/futex.c
<<
>>
Prefs
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003, 2004 Jamie Lokier
  10 *
  11 *  Robust futex support started by Ingo Molnar
  12 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  13 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  14 *
  15 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 *
  19 *  PRIVATE futexes by Eric Dumazet
  20 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  21 *
  22 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  23 *  enough at me, Linus for the original (flawed) idea, Matthew
  24 *  Kirkwood for proof-of-concept implementation.
  25 *
  26 *  "The futexes are also cursed."
  27 *  "But they come in a choice of three flavours!"
  28 *
  29 *  This program is free software; you can redistribute it and/or modify
  30 *  it under the terms of the GNU General Public License as published by
  31 *  the Free Software Foundation; either version 2 of the License, or
  32 *  (at your option) any later version.
  33 *
  34 *  This program is distributed in the hope that it will be useful,
  35 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  36 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  37 *  GNU General Public License for more details.
  38 *
  39 *  You should have received a copy of the GNU General Public License
  40 *  along with this program; if not, write to the Free Software
  41 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  42 */
  43#include <linux/slab.h>
  44#include <linux/poll.h>
  45#include <linux/fs.h>
  46#include <linux/file.h>
  47#include <linux/jhash.h>
  48#include <linux/init.h>
  49#include <linux/futex.h>
  50#include <linux/mount.h>
  51#include <linux/pagemap.h>
  52#include <linux/syscalls.h>
  53#include <linux/signal.h>
  54#include <linux/module.h>
  55#include <linux/magic.h>
  56#include <linux/pid.h>
  57#include <linux/nsproxy.h>
  58
  59#include <asm/futex.h>
  60
  61#include "rtmutex_common.h"
  62
  63int __read_mostly futex_cmpxchg_enabled;
  64
  65#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
  66
  67/*
  68 * Priority Inheritance state:
  69 */
  70struct futex_pi_state {
  71        /*
  72         * list of 'owned' pi_state instances - these have to be
  73         * cleaned up in do_exit() if the task exits prematurely:
  74         */
  75        struct list_head list;
  76
  77        /*
  78         * The PI object:
  79         */
  80        struct rt_mutex pi_mutex;
  81
  82        struct task_struct *owner;
  83        atomic_t refcount;
  84
  85        union futex_key key;
  86};
  87
  88/*
  89 * We use this hashed waitqueue instead of a normal wait_queue_t, so
  90 * we can wake only the relevant ones (hashed queues may be shared).
  91 *
  92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
  93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
  94 * The order of wakup is always to make the first condition true, then
  95 * wake up q->waiters, then make the second condition true.
  96 */
  97struct futex_q {
  98        struct plist_node list;
  99        wait_queue_head_t waiters;
 100
 101        /* Which hash list lock to use: */
 102        spinlock_t *lock_ptr;
 103
 104        /* Key which the futex is hashed on: */
 105        union futex_key key;
 106
 107        /* Optional priority inheritance state: */
 108        struct futex_pi_state *pi_state;
 109        struct task_struct *task;
 110
 111        /* Bitset for the optional bitmasked wakeup */
 112        u32 bitset;
 113};
 114
 115/*
 116 * Split the global futex_lock into every hash list lock.
 117 */
 118struct futex_hash_bucket {
 119        spinlock_t lock;
 120        struct plist_head chain;
 121};
 122
 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 124
 125/*
 126 * Take mm->mmap_sem, when futex is shared
 127 */
 128static inline void futex_lock_mm(struct rw_semaphore *fshared)
 129{
 130        if (fshared)
 131                down_read(fshared);
 132}
 133
 134/*
 135 * Release mm->mmap_sem, when the futex is shared
 136 */
 137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
 138{
 139        if (fshared)
 140                up_read(fshared);
 141}
 142
 143/*
 144 * We hash on the keys returned from get_futex_key (see below).
 145 */
 146static struct futex_hash_bucket *hash_futex(union futex_key *key)
 147{
 148        u32 hash = jhash2((u32*)&key->both.word,
 149                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 150                          key->both.offset);
 151        return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
 152}
 153
 154/*
 155 * Return 1 if two futex_keys are equal, 0 otherwise.
 156 */
 157static inline int match_futex(union futex_key *key1, union futex_key *key2)
 158{
 159        return (key1->both.word == key2->both.word
 160                && key1->both.ptr == key2->both.ptr
 161                && key1->both.offset == key2->both.offset);
 162}
 163
 164/**
 165 * get_futex_key - Get parameters which are the keys for a futex.
 166 * @uaddr: virtual address of the futex
 167 * @shared: NULL for a PROCESS_PRIVATE futex,
 168 *      &current->mm->mmap_sem for a PROCESS_SHARED futex
 169 * @key: address where result is stored.
 170 *
 171 * Returns a negative error code or 0
 172 * The key words are stored in *key on success.
 173 *
 174 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 175 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 176 * We can usually work out the index without swapping in the page.
 177 *
 178 * fshared is NULL for PROCESS_PRIVATE futexes
 179 * For other futexes, it points to &current->mm->mmap_sem and
 180 * caller must have taken the reader lock. but NOT any spinlocks.
 181 */
 182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 183                         union futex_key *key)
 184{
 185        unsigned long address = (unsigned long)uaddr;
 186        struct mm_struct *mm = current->mm;
 187        struct vm_area_struct *vma;
 188        struct page *page;
 189        int err;
 190
 191        /*
 192         * The futex address must be "naturally" aligned.
 193         */
 194        key->both.offset = address % PAGE_SIZE;
 195        if (unlikely((address % sizeof(u32)) != 0))
 196                return -EINVAL;
 197        address -= key->both.offset;
 198
 199        /*
 200         * PROCESS_PRIVATE futexes are fast.
 201         * As the mm cannot disappear under us and the 'key' only needs
 202         * virtual address, we dont even have to find the underlying vma.
 203         * Note : We do have to check 'uaddr' is a valid user address,
 204         *        but access_ok() should be faster than find_vma()
 205         */
 206        if (!fshared) {
 207                if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
 208                        return -EFAULT;
 209                key->private.mm = mm;
 210                key->private.address = address;
 211                return 0;
 212        }
 213        /*
 214         * The futex is hashed differently depending on whether
 215         * it's in a shared or private mapping.  So check vma first.
 216         */
 217        vma = find_extend_vma(mm, address);
 218        if (unlikely(!vma))
 219                return -EFAULT;
 220
 221        /*
 222         * Permissions.
 223         */
 224        if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
 225                return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
 226
 227        /*
 228         * Private mappings are handled in a simple way.
 229         *
 230         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 231         * it's a read-only handle, it's expected that futexes attach to
 232         * the object not the particular process.  Therefore we use
 233         * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
 234         * mappings of _writable_ handles.
 235         */
 236        if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
 237                key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
 238                key->private.mm = mm;
 239                key->private.address = address;
 240                return 0;
 241        }
 242
 243        /*
 244         * Linear file mappings are also simple.
 245         */
 246        key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
 247        key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
 248        if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 249                key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 250                                     + vma->vm_pgoff);
 251                return 0;
 252        }
 253
 254        /*
 255         * We could walk the page table to read the non-linear
 256         * pte, and get the page index without fetching the page
 257         * from swap.  But that's a lot of code to duplicate here
 258         * for a rare case, so we simply fetch the page.
 259         */
 260        err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
 261        if (err >= 0) {
 262                key->shared.pgoff =
 263                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 264                put_page(page);
 265                return 0;
 266        }
 267        return err;
 268}
 269
 270/*
 271 * Take a reference to the resource addressed by a key.
 272 * Can be called while holding spinlocks.
 273 *
 274 */
 275static void get_futex_key_refs(union futex_key *key)
 276{
 277        if (key->both.ptr == NULL)
 278                return;
 279        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 280                case FUT_OFF_INODE:
 281                        atomic_inc(&key->shared.inode->i_count);
 282                        break;
 283                case FUT_OFF_MMSHARED:
 284                        atomic_inc(&key->private.mm->mm_count);
 285                        break;
 286        }
 287}
 288
 289/*
 290 * Drop a reference to the resource addressed by a key.
 291 * The hash bucket spinlock must not be held.
 292 */
 293static void drop_futex_key_refs(union futex_key *key)
 294{
 295        if (!key->both.ptr)
 296                return;
 297        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 298                case FUT_OFF_INODE:
 299                        iput(key->shared.inode);
 300                        break;
 301                case FUT_OFF_MMSHARED:
 302                        mmdrop(key->private.mm);
 303                        break;
 304        }
 305}
 306
 307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
 308{
 309        u32 curval;
 310
 311        pagefault_disable();
 312        curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
 313        pagefault_enable();
 314
 315        return curval;
 316}
 317
 318static int get_futex_value_locked(u32 *dest, u32 __user *from)
 319{
 320        int ret;
 321
 322        pagefault_disable();
 323        ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 324        pagefault_enable();
 325
 326        return ret ? -EFAULT : 0;
 327}
 328
 329/*
 330 * Fault handling.
 331 * if fshared is non NULL, current->mm->mmap_sem is already held
 332 */
 333static int futex_handle_fault(unsigned long address,
 334                              struct rw_semaphore *fshared, int attempt)
 335{
 336        struct vm_area_struct * vma;
 337        struct mm_struct *mm = current->mm;
 338        int ret = -EFAULT;
 339
 340        if (attempt > 2)
 341                return ret;
 342
 343        if (!fshared)
 344                down_read(&mm->mmap_sem);
 345        vma = find_vma(mm, address);
 346        if (vma && address >= vma->vm_start &&
 347            (vma->vm_flags & VM_WRITE)) {
 348                int fault;
 349                fault = handle_mm_fault(mm, vma, address, 1);
 350                if (unlikely((fault & VM_FAULT_ERROR))) {
 351#if 0
 352                        /* XXX: let's do this when we verify it is OK */
 353                        if (ret & VM_FAULT_OOM)
 354                                ret = -ENOMEM;
 355#endif
 356                } else {
 357                        ret = 0;
 358                        if (fault & VM_FAULT_MAJOR)
 359                                current->maj_flt++;
 360                        else
 361                                current->min_flt++;
 362                }
 363        }
 364        if (!fshared)
 365                up_read(&mm->mmap_sem);
 366        return ret;
 367}
 368
 369/*
 370 * PI code:
 371 */
 372static int refill_pi_state_cache(void)
 373{
 374        struct futex_pi_state *pi_state;
 375
 376        if (likely(current->pi_state_cache))
 377                return 0;
 378
 379        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 380
 381        if (!pi_state)
 382                return -ENOMEM;
 383
 384        INIT_LIST_HEAD(&pi_state->list);
 385        /* pi_mutex gets initialized later */
 386        pi_state->owner = NULL;
 387        atomic_set(&pi_state->refcount, 1);
 388
 389        current->pi_state_cache = pi_state;
 390
 391        return 0;
 392}
 393
 394static struct futex_pi_state * alloc_pi_state(void)
 395{
 396        struct futex_pi_state *pi_state = current->pi_state_cache;
 397
 398        WARN_ON(!pi_state);
 399        current->pi_state_cache = NULL;
 400
 401        return pi_state;
 402}
 403
 404static void free_pi_state(struct futex_pi_state *pi_state)
 405{
 406        if (!atomic_dec_and_test(&pi_state->refcount))
 407                return;
 408
 409        /*
 410         * If pi_state->owner is NULL, the owner is most probably dying
 411         * and has cleaned up the pi_state already
 412         */
 413        if (pi_state->owner) {
 414                spin_lock_irq(&pi_state->owner->pi_lock);
 415                list_del_init(&pi_state->list);
 416                spin_unlock_irq(&pi_state->owner->pi_lock);
 417
 418                rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
 419        }
 420
 421        if (current->pi_state_cache)
 422                kfree(pi_state);
 423        else {
 424                /*
 425                 * pi_state->list is already empty.
 426                 * clear pi_state->owner.
 427                 * refcount is at 0 - put it back to 1.
 428                 */
 429                pi_state->owner = NULL;
 430                atomic_set(&pi_state->refcount, 1);
 431                current->pi_state_cache = pi_state;
 432        }
 433}
 434
 435/*
 436 * Look up the task based on what TID userspace gave us.
 437 * We dont trust it.
 438 */
 439static struct task_struct * futex_find_get_task(pid_t pid)
 440{
 441        struct task_struct *p;
 442
 443        rcu_read_lock();
 444        p = find_task_by_vpid(pid);
 445        if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
 446                p = ERR_PTR(-ESRCH);
 447        else
 448                get_task_struct(p);
 449
 450        rcu_read_unlock();
 451
 452        return p;
 453}
 454
 455/*
 456 * This task is holding PI mutexes at exit time => bad.
 457 * Kernel cleans up PI-state, but userspace is likely hosed.
 458 * (Robust-futex cleanup is separate and might save the day for userspace.)
 459 */
 460void exit_pi_state_list(struct task_struct *curr)
 461{
 462        struct list_head *next, *head = &curr->pi_state_list;
 463        struct futex_pi_state *pi_state;
 464        struct futex_hash_bucket *hb;
 465        union futex_key key;
 466
 467        if (!futex_cmpxchg_enabled)
 468                return;
 469        /*
 470         * We are a ZOMBIE and nobody can enqueue itself on
 471         * pi_state_list anymore, but we have to be careful
 472         * versus waiters unqueueing themselves:
 473         */
 474        spin_lock_irq(&curr->pi_lock);
 475        while (!list_empty(head)) {
 476
 477                next = head->next;
 478                pi_state = list_entry(next, struct futex_pi_state, list);
 479                key = pi_state->key;
 480                hb = hash_futex(&key);
 481                spin_unlock_irq(&curr->pi_lock);
 482
 483                spin_lock(&hb->lock);
 484
 485                spin_lock_irq(&curr->pi_lock);
 486                /*
 487                 * We dropped the pi-lock, so re-check whether this
 488                 * task still owns the PI-state:
 489                 */
 490                if (head->next != next) {
 491                        spin_unlock(&hb->lock);
 492                        continue;
 493                }
 494
 495                WARN_ON(pi_state->owner != curr);
 496                WARN_ON(list_empty(&pi_state->list));
 497                list_del_init(&pi_state->list);
 498                pi_state->owner = NULL;
 499                spin_unlock_irq(&curr->pi_lock);
 500
 501                rt_mutex_unlock(&pi_state->pi_mutex);
 502
 503                spin_unlock(&hb->lock);
 504
 505                spin_lock_irq(&curr->pi_lock);
 506        }
 507        spin_unlock_irq(&curr->pi_lock);
 508}
 509
 510static int
 511lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 512                union futex_key *key, struct futex_pi_state **ps)
 513{
 514        struct futex_pi_state *pi_state = NULL;
 515        struct futex_q *this, *next;
 516        struct plist_head *head;
 517        struct task_struct *p;
 518        pid_t pid = uval & FUTEX_TID_MASK;
 519
 520        head = &hb->chain;
 521
 522        plist_for_each_entry_safe(this, next, head, list) {
 523                if (match_futex(&this->key, key)) {
 524                        /*
 525                         * Another waiter already exists - bump up
 526                         * the refcount and return its pi_state:
 527                         */
 528                        pi_state = this->pi_state;
 529                        /*
 530                         * Userspace might have messed up non PI and PI futexes
 531                         */
 532                        if (unlikely(!pi_state))
 533                                return -EINVAL;
 534
 535                        WARN_ON(!atomic_read(&pi_state->refcount));
 536                        WARN_ON(pid && pi_state->owner &&
 537                                pi_state->owner->pid != pid);
 538
 539                        atomic_inc(&pi_state->refcount);
 540                        *ps = pi_state;
 541
 542                        return 0;
 543                }
 544        }
 545
 546        /*
 547         * We are the first waiter - try to look up the real owner and attach
 548         * the new pi_state to it, but bail out when TID = 0
 549         */
 550        if (!pid)
 551                return -ESRCH;
 552        p = futex_find_get_task(pid);
 553        if (IS_ERR(p))
 554                return PTR_ERR(p);
 555
 556        /*
 557         * We need to look at the task state flags to figure out,
 558         * whether the task is exiting. To protect against the do_exit
 559         * change of the task flags, we do this protected by
 560         * p->pi_lock:
 561         */
 562        spin_lock_irq(&p->pi_lock);
 563        if (unlikely(p->flags & PF_EXITING)) {
 564                /*
 565                 * The task is on the way out. When PF_EXITPIDONE is
 566                 * set, we know that the task has finished the
 567                 * cleanup:
 568                 */
 569                int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
 570
 571                spin_unlock_irq(&p->pi_lock);
 572                put_task_struct(p);
 573                return ret;
 574        }
 575
 576        pi_state = alloc_pi_state();
 577
 578        /*
 579         * Initialize the pi_mutex in locked state and make 'p'
 580         * the owner of it:
 581         */
 582        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 583
 584        /* Store the key for possible exit cleanups: */
 585        pi_state->key = *key;
 586
 587        WARN_ON(!list_empty(&pi_state->list));
 588        list_add(&pi_state->list, &p->pi_state_list);
 589        pi_state->owner = p;
 590        spin_unlock_irq(&p->pi_lock);
 591
 592        put_task_struct(p);
 593
 594        *ps = pi_state;
 595
 596        return 0;
 597}
 598
 599/*
 600 * The hash bucket lock must be held when this is called.
 601 * Afterwards, the futex_q must not be accessed.
 602 */
 603static void wake_futex(struct futex_q *q)
 604{
 605        plist_del(&q->list, &q->list.plist);
 606        /*
 607         * The lock in wake_up_all() is a crucial memory barrier after the
 608         * plist_del() and also before assigning to q->lock_ptr.
 609         */
 610        wake_up_all(&q->waiters);
 611        /*
 612         * The waiting task can free the futex_q as soon as this is written,
 613         * without taking any locks.  This must come last.
 614         *
 615         * A memory barrier is required here to prevent the following store
 616         * to lock_ptr from getting ahead of the wakeup. Clearing the lock
 617         * at the end of wake_up_all() does not prevent this store from
 618         * moving.
 619         */
 620        smp_wmb();
 621        q->lock_ptr = NULL;
 622}
 623
 624static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 625{
 626        struct task_struct *new_owner;
 627        struct futex_pi_state *pi_state = this->pi_state;
 628        u32 curval, newval;
 629
 630        if (!pi_state)
 631                return -EINVAL;
 632
 633        spin_lock(&pi_state->pi_mutex.wait_lock);
 634        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 635
 636        /*
 637         * This happens when we have stolen the lock and the original
 638         * pending owner did not enqueue itself back on the rt_mutex.
 639         * Thats not a tragedy. We know that way, that a lock waiter
 640         * is on the fly. We make the futex_q waiter the pending owner.
 641         */
 642        if (!new_owner)
 643                new_owner = this->task;
 644
 645        /*
 646         * We pass it to the next owner. (The WAITERS bit is always
 647         * kept enabled while there is PI state around. We must also
 648         * preserve the owner died bit.)
 649         */
 650        if (!(uval & FUTEX_OWNER_DIED)) {
 651                int ret = 0;
 652
 653                newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 654
 655                curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
 656
 657                if (curval == -EFAULT)
 658                        ret = -EFAULT;
 659                else if (curval != uval)
 660                        ret = -EINVAL;
 661                if (ret) {
 662                        spin_unlock(&pi_state->pi_mutex.wait_lock);
 663                        return ret;
 664                }
 665        }
 666
 667        spin_lock_irq(&pi_state->owner->pi_lock);
 668        WARN_ON(list_empty(&pi_state->list));
 669        list_del_init(&pi_state->list);
 670        spin_unlock_irq(&pi_state->owner->pi_lock);
 671
 672        spin_lock_irq(&new_owner->pi_lock);
 673        WARN_ON(!list_empty(&pi_state->list));
 674        list_add(&pi_state->list, &new_owner->pi_state_list);
 675        pi_state->owner = new_owner;
 676        spin_unlock_irq(&new_owner->pi_lock);
 677
 678        spin_unlock(&pi_state->pi_mutex.wait_lock);
 679        rt_mutex_unlock(&pi_state->pi_mutex);
 680
 681        return 0;
 682}
 683
 684static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 685{
 686        u32 oldval;
 687
 688        /*
 689         * There is no waiter, so we unlock the futex. The owner died
 690         * bit has not to be preserved here. We are the owner:
 691         */
 692        oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
 693
 694        if (oldval == -EFAULT)
 695                return oldval;
 696        if (oldval != uval)
 697                return -EAGAIN;
 698
 699        return 0;
 700}
 701
 702/*
 703 * Express the locking dependencies for lockdep:
 704 */
 705static inline void
 706double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 707{
 708        if (hb1 <= hb2) {
 709                spin_lock(&hb1->lock);
 710                if (hb1 < hb2)
 711                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
 712        } else { /* hb1 > hb2 */
 713                spin_lock(&hb2->lock);
 714                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
 715        }
 716}
 717
 718/*
 719 * Wake up all waiters hashed on the physical page that is mapped
 720 * to this virtual address:
 721 */
 722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 723                      int nr_wake, u32 bitset)
 724{
 725        struct futex_hash_bucket *hb;
 726        struct futex_q *this, *next;
 727        struct plist_head *head;
 728        union futex_key key;
 729        int ret;
 730
 731        if (!bitset)
 732                return -EINVAL;
 733
 734        futex_lock_mm(fshared);
 735
 736        ret = get_futex_key(uaddr, fshared, &key);
 737        if (unlikely(ret != 0))
 738                goto out;
 739
 740        hb = hash_futex(&key);
 741        spin_lock(&hb->lock);
 742        head = &hb->chain;
 743
 744        plist_for_each_entry_safe(this, next, head, list) {
 745                if (match_futex (&this->key, &key)) {
 746                        if (this->pi_state) {
 747                                ret = -EINVAL;
 748                                break;
 749                        }
 750
 751                        /* Check if one of the bits is set in both bitsets */
 752                        if (!(this->bitset & bitset))
 753                                continue;
 754
 755                        wake_futex(this);
 756                        if (++ret >= nr_wake)
 757                                break;
 758                }
 759        }
 760
 761        spin_unlock(&hb->lock);
 762out:
 763        futex_unlock_mm(fshared);
 764        return ret;
 765}
 766
 767/*
 768 * Wake up all waiters hashed on the physical page that is mapped
 769 * to this virtual address:
 770 */
 771static int
 772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
 773              u32 __user *uaddr2,
 774              int nr_wake, int nr_wake2, int op)
 775{
 776        union futex_key key1, key2;
 777        struct futex_hash_bucket *hb1, *hb2;
 778        struct plist_head *head;
 779        struct futex_q *this, *next;
 780        int ret, op_ret, attempt = 0;
 781
 782retryfull:
 783        futex_lock_mm(fshared);
 784
 785        ret = get_futex_key(uaddr1, fshared, &key1);
 786        if (unlikely(ret != 0))
 787                goto out;
 788        ret = get_futex_key(uaddr2, fshared, &key2);
 789        if (unlikely(ret != 0))
 790                goto out;
 791
 792        hb1 = hash_futex(&key1);
 793        hb2 = hash_futex(&key2);
 794
 795retry:
 796        double_lock_hb(hb1, hb2);
 797
 798        op_ret = futex_atomic_op_inuser(op, uaddr2);
 799        if (unlikely(op_ret < 0)) {
 800                u32 dummy;
 801
 802                spin_unlock(&hb1->lock);
 803                if (hb1 != hb2)
 804                        spin_unlock(&hb2->lock);
 805
 806#ifndef CONFIG_MMU
 807                /*
 808                 * we don't get EFAULT from MMU faults if we don't have an MMU,
 809                 * but we might get them from range checking
 810                 */
 811                ret = op_ret;
 812                goto out;
 813#endif
 814
 815                if (unlikely(op_ret != -EFAULT)) {
 816                        ret = op_ret;
 817                        goto out;
 818                }
 819
 820                /*
 821                 * futex_atomic_op_inuser needs to both read and write
 822                 * *(int __user *)uaddr2, but we can't modify it
 823                 * non-atomically.  Therefore, if get_user below is not
 824                 * enough, we need to handle the fault ourselves, while
 825                 * still holding the mmap_sem.
 826                 */
 827                if (attempt++) {
 828                        ret = futex_handle_fault((unsigned long)uaddr2,
 829                                                 fshared, attempt);
 830                        if (ret)
 831                                goto out;
 832                        goto retry;
 833                }
 834
 835                /*
 836                 * If we would have faulted, release mmap_sem,
 837                 * fault it in and start all over again.
 838                 */
 839                futex_unlock_mm(fshared);
 840
 841                ret = get_user(dummy, uaddr2);
 842                if (ret)
 843                        return ret;
 844
 845                goto retryfull;
 846        }
 847
 848        head = &hb1->chain;
 849
 850        plist_for_each_entry_safe(this, next, head, list) {
 851                if (match_futex (&this->key, &key1)) {
 852                        wake_futex(this);
 853                        if (++ret >= nr_wake)
 854                                break;
 855                }
 856        }
 857
 858        if (op_ret > 0) {
 859                head = &hb2->chain;
 860
 861                op_ret = 0;
 862                plist_for_each_entry_safe(this, next, head, list) {
 863                        if (match_futex (&this->key, &key2)) {
 864                                wake_futex(this);
 865                                if (++op_ret >= nr_wake2)
 866                                        break;
 867                        }
 868                }
 869                ret += op_ret;
 870        }
 871
 872        spin_unlock(&hb1->lock);
 873        if (hb1 != hb2)
 874                spin_unlock(&hb2->lock);
 875out:
 876        futex_unlock_mm(fshared);
 877
 878        return ret;
 879}
 880
 881/*
 882 * Requeue all waiters hashed on one physical page to another
 883 * physical page.
 884 */
 885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 886                         u32 __user *uaddr2,
 887                         int nr_wake, int nr_requeue, u32 *cmpval)
 888{
 889        union futex_key key1, key2;
 890        struct futex_hash_bucket *hb1, *hb2;
 891        struct plist_head *head1;
 892        struct futex_q *this, *next;
 893        int ret, drop_count = 0;
 894
 895 retry:
 896        futex_lock_mm(fshared);
 897
 898        ret = get_futex_key(uaddr1, fshared, &key1);
 899        if (unlikely(ret != 0))
 900                goto out;
 901        ret = get_futex_key(uaddr2, fshared, &key2);
 902        if (unlikely(ret != 0))
 903                goto out;
 904
 905        hb1 = hash_futex(&key1);
 906        hb2 = hash_futex(&key2);
 907
 908        double_lock_hb(hb1, hb2);
 909
 910        if (likely(cmpval != NULL)) {
 911                u32 curval;
 912
 913                ret = get_futex_value_locked(&curval, uaddr1);
 914
 915                if (unlikely(ret)) {
 916                        spin_unlock(&hb1->lock);
 917                        if (hb1 != hb2)
 918                                spin_unlock(&hb2->lock);
 919
 920                        /*
 921                         * If we would have faulted, release mmap_sem, fault
 922                         * it in and start all over again.
 923                         */
 924                        futex_unlock_mm(fshared);
 925
 926                        ret = get_user(curval, uaddr1);
 927
 928                        if (!ret)
 929                                goto retry;
 930
 931                        return ret;
 932                }
 933                if (curval != *cmpval) {
 934                        ret = -EAGAIN;
 935                        goto out_unlock;
 936                }
 937        }
 938
 939        head1 = &hb1->chain;
 940        plist_for_each_entry_safe(this, next, head1, list) {
 941                if (!match_futex (&this->key, &key1))
 942                        continue;
 943                if (++ret <= nr_wake) {
 944                        wake_futex(this);
 945                } else {
 946                        /*
 947                         * If key1 and key2 hash to the same bucket, no need to
 948                         * requeue.
 949                         */
 950                        if (likely(head1 != &hb2->chain)) {
 951                                plist_del(&this->list, &hb1->chain);
 952                                plist_add(&this->list, &hb2->chain);
 953                                this->lock_ptr = &hb2->lock;
 954#ifdef CONFIG_DEBUG_PI_LIST
 955                                this->list.plist.lock = &hb2->lock;
 956#endif
 957                        }
 958                        this->key = key2;
 959                        get_futex_key_refs(&key2);
 960                        drop_count++;
 961
 962                        if (ret - nr_wake >= nr_requeue)
 963                                break;
 964                }
 965        }
 966
 967out_unlock:
 968        spin_unlock(&hb1->lock);
 969        if (hb1 != hb2)
 970                spin_unlock(&hb2->lock);
 971
 972        /* drop_futex_key_refs() must be called outside the spinlocks. */
 973        while (--drop_count >= 0)
 974                drop_futex_key_refs(&key1);
 975
 976out:
 977        futex_unlock_mm(fshared);
 978        return ret;
 979}
 980
 981/* The key must be already stored in q->key. */
 982static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 983{
 984        struct futex_hash_bucket *hb;
 985
 986        init_waitqueue_head(&q->waiters);
 987
 988        get_futex_key_refs(&q->key);
 989        hb = hash_futex(&q->key);
 990        q->lock_ptr = &hb->lock;
 991
 992        spin_lock(&hb->lock);
 993        return hb;
 994}
 995
 996static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 997{
 998        int prio;
 999
1000        /*
1001         * The priority used to register this element is
1002         * - either the real thread-priority for the real-time threads
1003         * (i.e. threads with a priority lower than MAX_RT_PRIO)
1004         * - or MAX_RT_PRIO for non-RT threads.
1005         * Thus, all RT-threads are woken first in priority order, and
1006         * the others are woken last, in FIFO order.
1007         */
1008        prio = min(current->normal_prio, MAX_RT_PRIO);
1009
1010        plist_node_init(&q->list, prio);
1011#ifdef CONFIG_DEBUG_PI_LIST
1012        q->list.plist.lock = &hb->lock;
1013#endif
1014        plist_add(&q->list, &hb->chain);
1015        q->task = current;
1016        spin_unlock(&hb->lock);
1017}
1018
1019static inline void
1020queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1021{
1022        spin_unlock(&hb->lock);
1023        drop_futex_key_refs(&q->key);
1024}
1025
1026/*
1027 * queue_me and unqueue_me must be called as a pair, each
1028 * exactly once.  They are called with the hashed spinlock held.
1029 */
1030
1031/* Return 1 if we were still queued (ie. 0 means we were woken) */
1032static int unqueue_me(struct futex_q *q)
1033{
1034        spinlock_t *lock_ptr;
1035        int ret = 0;
1036
1037        /* In the common case we don't take the spinlock, which is nice. */
1038 retry:
1039        lock_ptr = q->lock_ptr;
1040        barrier();
1041        if (lock_ptr != NULL) {
1042                spin_lock(lock_ptr);
1043                /*
1044                 * q->lock_ptr can change between reading it and
1045                 * spin_lock(), causing us to take the wrong lock.  This
1046                 * corrects the race condition.
1047                 *
1048                 * Reasoning goes like this: if we have the wrong lock,
1049                 * q->lock_ptr must have changed (maybe several times)
1050                 * between reading it and the spin_lock().  It can
1051                 * change again after the spin_lock() but only if it was
1052                 * already changed before the spin_lock().  It cannot,
1053                 * however, change back to the original value.  Therefore
1054                 * we can detect whether we acquired the correct lock.
1055                 */
1056                if (unlikely(lock_ptr != q->lock_ptr)) {
1057                        spin_unlock(lock_ptr);
1058                        goto retry;
1059                }
1060                WARN_ON(plist_node_empty(&q->list));
1061                plist_del(&q->list, &q->list.plist);
1062
1063                BUG_ON(q->pi_state);
1064
1065                spin_unlock(lock_ptr);
1066                ret = 1;
1067        }
1068
1069        drop_futex_key_refs(&q->key);
1070        return ret;
1071}
1072
1073/*
1074 * PI futexes can not be requeued and must remove themself from the
1075 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1076 * and dropped here.
1077 */
1078static void unqueue_me_pi(struct futex_q *q)
1079{
1080        WARN_ON(plist_node_empty(&q->list));
1081        plist_del(&q->list, &q->list.plist);
1082
1083        BUG_ON(!q->pi_state);
1084        free_pi_state(q->pi_state);
1085        q->pi_state = NULL;
1086
1087        spin_unlock(q->lock_ptr);
1088
1089        drop_futex_key_refs(&q->key);
1090}
1091
1092/*
1093 * Fixup the pi_state owner with the new owner.
1094 *
1095 * Must be called with hash bucket lock held and mm->sem held for non
1096 * private futexes.
1097 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099                                struct task_struct *newowner,
1100                                struct rw_semaphore *fshared)
1101{
1102        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103        struct futex_pi_state *pi_state = q->pi_state;
1104        struct task_struct *oldowner = pi_state->owner;
1105        u32 uval, curval, newval;
1106        int ret, attempt = 0;
1107
1108        /* Owner died? */
1109        if (!pi_state->owner)
1110                newtid |= FUTEX_OWNER_DIED;
1111
1112        /*
1113         * We are here either because we stole the rtmutex from the
1114         * pending owner or we are the pending owner which failed to
1115         * get the rtmutex. We have to replace the pending owner TID
1116         * in the user space variable. This must be atomic as we have
1117         * to preserve the owner died bit here.
1118         *
1119         * Note: We write the user space value _before_ changing the
1120         * pi_state because we can fault here. Imagine swapped out
1121         * pages or a fork, which was running right before we acquired
1122         * mmap_sem, that marked all the anonymous memory readonly for
1123         * cow.
1124         *
1125         * Modifying pi_state _before_ the user space value would
1126         * leave the pi_state in an inconsistent state when we fault
1127         * here, because we need to drop the hash bucket lock to
1128         * handle the fault. This might be observed in the PID check
1129         * in lookup_pi_state.
1130         */
1131retry:
1132        if (get_futex_value_locked(&uval, uaddr))
1133                goto handle_fault;
1134
1135        while (1) {
1136                newval = (uval & FUTEX_OWNER_DIED) | newtid;
1137
1138                curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1139
1140                if (curval == -EFAULT)
1141                        goto handle_fault;
1142                if (curval == uval)
1143                        break;
1144                uval = curval;
1145        }
1146
1147        /*
1148         * We fixed up user space. Now we need to fix the pi_state
1149         * itself.
1150         */
1151        if (pi_state->owner != NULL) {
1152                spin_lock_irq(&pi_state->owner->pi_lock);
1153                WARN_ON(list_empty(&pi_state->list));
1154                list_del_init(&pi_state->list);
1155                spin_unlock_irq(&pi_state->owner->pi_lock);
1156        }
1157
1158        pi_state->owner = newowner;
1159
1160        spin_lock_irq(&newowner->pi_lock);
1161        WARN_ON(!list_empty(&pi_state->list));
1162        list_add(&pi_state->list, &newowner->pi_state_list);
1163        spin_unlock_irq(&newowner->pi_lock);
1164        return 0;
1165
1166        /*
1167         * To handle the page fault we need to drop the hash bucket
1168         * lock here. That gives the other task (either the pending
1169         * owner itself or the task which stole the rtmutex) the
1170         * chance to try the fixup of the pi_state. So once we are
1171         * back from handling the fault we need to check the pi_state
1172         * after reacquiring the hash bucket lock and before trying to
1173         * do another fixup. When the fixup has been done already we
1174         * simply return.
1175         */
1176handle_fault:
1177        spin_unlock(q->lock_ptr);
1178
1179        ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
1180
1181        spin_lock(q->lock_ptr);
1182
1183        /*
1184         * Check if someone else fixed it for us:
1185         */
1186        if (pi_state->owner != oldowner)
1187                return 0;
1188
1189        if (ret)
1190                return ret;
1191
1192        goto retry;
1193}
1194
1195/*
1196 * In case we must use restart_block to restart a futex_wait,
1197 * we encode in the 'flags' shared capability
1198 */
1199#define FLAGS_SHARED  1
1200
1201static long futex_wait_restart(struct restart_block *restart);
1202
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1204                      u32 val, ktime_t *abs_time, u32 bitset)
1205{
1206        struct task_struct *curr = current;
1207        DECLARE_WAITQUEUE(wait, curr);
1208        struct futex_hash_bucket *hb;
1209        struct futex_q q;
1210        u32 uval;
1211        int ret;
1212        struct hrtimer_sleeper t;
1213        int rem = 0;
1214
1215        if (!bitset)
1216                return -EINVAL;
1217
1218        q.pi_state = NULL;
1219        q.bitset = bitset;
1220 retry:
1221        futex_lock_mm(fshared);
1222
1223        ret = get_futex_key(uaddr, fshared, &q.key);
1224        if (unlikely(ret != 0))
1225                goto out_release_sem;
1226
1227        hb = queue_lock(&q);
1228
1229        /*
1230         * Access the page AFTER the futex is queued.
1231         * Order is important:
1232         *
1233         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1234         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1235         *
1236         * The basic logical guarantee of a futex is that it blocks ONLY
1237         * if cond(var) is known to be true at the time of blocking, for
1238         * any cond.  If we queued after testing *uaddr, that would open
1239         * a race condition where we could block indefinitely with
1240         * cond(var) false, which would violate the guarantee.
1241         *
1242         * A consequence is that futex_wait() can return zero and absorb
1243         * a wakeup when *uaddr != val on entry to the syscall.  This is
1244         * rare, but normal.
1245         *
1246         * for shared futexes, we hold the mmap semaphore, so the mapping
1247         * cannot have changed since we looked it up in get_futex_key.
1248         */
1249        ret = get_futex_value_locked(&uval, uaddr);
1250
1251        if (unlikely(ret)) {
1252                queue_unlock(&q, hb);
1253
1254                /*
1255                 * If we would have faulted, release mmap_sem, fault it in and
1256                 * start all over again.
1257                 */
1258                futex_unlock_mm(fshared);
1259
1260                ret = get_user(uval, uaddr);
1261
1262                if (!ret)
1263                        goto retry;
1264                return ret;
1265        }
1266        ret = -EWOULDBLOCK;
1267        if (uval != val)
1268                goto out_unlock_release_sem;
1269
1270        /* Only actually queue if *uaddr contained val.  */
1271        queue_me(&q, hb);
1272
1273        /*
1274         * Now the futex is queued and we have checked the data, we
1275         * don't want to hold mmap_sem while we sleep.
1276         */
1277        futex_unlock_mm(fshared);
1278
1279        /*
1280         * There might have been scheduling since the queue_me(), as we
1281         * cannot hold a spinlock across the get_user() in case it
1282         * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1283         * queueing ourselves into the futex hash.  This code thus has to
1284         * rely on the futex_wake() code removing us from hash when it
1285         * wakes us up.
1286         */
1287
1288        /* add_wait_queue is the barrier after __set_current_state. */
1289        __set_current_state(TASK_INTERRUPTIBLE);
1290        add_wait_queue(&q.waiters, &wait);
1291        /*
1292         * !plist_node_empty() is safe here without any lock.
1293         * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1294         */
1295        if (likely(!plist_node_empty(&q.list))) {
1296                if (!abs_time)
1297                        schedule();
1298                else {
1299                        hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1300                                                HRTIMER_MODE_ABS);
1301                        hrtimer_init_sleeper(&t, current);
1302                        t.timer.expires = *abs_time;
1303
1304                        hrtimer_start(&t.timer, t.timer.expires,
1305                                                HRTIMER_MODE_ABS);
1306                        if (!hrtimer_active(&t.timer))
1307                                t.task = NULL;
1308
1309                        /*
1310                         * the timer could have already expired, in which
1311                         * case current would be flagged for rescheduling.
1312                         * Don't bother calling schedule.
1313                         */
1314                        if (likely(t.task))
1315                                schedule();
1316
1317                        hrtimer_cancel(&t.timer);
1318
1319                        /* Flag if a timeout occured */
1320                        rem = (t.task == NULL);
1321
1322                        destroy_hrtimer_on_stack(&t.timer);
1323                }
1324        }
1325        __set_current_state(TASK_RUNNING);
1326
1327        /*
1328         * NOTE: we don't remove ourselves from the waitqueue because
1329         * we are the only user of it.
1330         */
1331
1332        /* If we were woken (and unqueued), we succeeded, whatever. */
1333        if (!unqueue_me(&q))
1334                return 0;
1335        if (rem)
1336                return -ETIMEDOUT;
1337
1338        /*
1339         * We expect signal_pending(current), but another thread may
1340         * have handled it for us already.
1341         */
1342        if (!abs_time)
1343                return -ERESTARTSYS;
1344        else {
1345                struct restart_block *restart;
1346                restart = &current_thread_info()->restart_block;
1347                restart->fn = futex_wait_restart;
1348                restart->futex.uaddr = (u32 *)uaddr;
1349                restart->futex.val = val;
1350                restart->futex.time = abs_time->tv64;
1351                restart->futex.bitset = bitset;
1352                restart->futex.flags = 0;
1353
1354                if (fshared)
1355                        restart->futex.flags |= FLAGS_SHARED;
1356                return -ERESTART_RESTARTBLOCK;
1357        }
1358
1359 out_unlock_release_sem:
1360        queue_unlock(&q, hb);
1361
1362 out_release_sem:
1363        futex_unlock_mm(fshared);
1364        return ret;
1365}
1366
1367
1368static long futex_wait_restart(struct restart_block *restart)
1369{
1370        u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1371        struct rw_semaphore *fshared = NULL;
1372        ktime_t t;
1373
1374        t.tv64 = restart->futex.time;
1375        restart->fn = do_no_restart_syscall;
1376        if (restart->futex.flags & FLAGS_SHARED)
1377                fshared = &current->mm->mmap_sem;
1378        return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1379                                restart->futex.bitset);
1380}
1381
1382
1383/*
1384 * Userspace tried a 0 -> TID atomic transition of the futex value
1385 * and failed. The kernel side here does the whole locking operation:
1386 * if there are waiters then it will block, it does PI, etc. (Due to
1387 * races the kernel might see a 0 value of the futex too.)
1388 */
1389static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1390                         int detect, ktime_t *time, int trylock)
1391{
1392        struct hrtimer_sleeper timeout, *to = NULL;
1393        struct task_struct *curr = current;
1394        struct futex_hash_bucket *hb;
1395        u32 uval, newval, curval;
1396        struct futex_q q;
1397        int ret, lock_taken, ownerdied = 0, attempt = 0;
1398
1399        if (refill_pi_state_cache())
1400                return -ENOMEM;
1401
1402        if (time) {
1403                to = &timeout;
1404                hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1405                                      HRTIMER_MODE_ABS);
1406                hrtimer_init_sleeper(to, current);
1407                to->timer.expires = *time;
1408        }
1409
1410        q.pi_state = NULL;
1411 retry:
1412        futex_lock_mm(fshared);
1413
1414        ret = get_futex_key(uaddr, fshared, &q.key);
1415        if (unlikely(ret != 0))
1416                goto out_release_sem;
1417
1418 retry_unlocked:
1419        hb = queue_lock(&q);
1420
1421 retry_locked:
1422        ret = lock_taken = 0;
1423
1424        /*
1425         * To avoid races, we attempt to take the lock here again
1426         * (by doing a 0 -> TID atomic cmpxchg), while holding all
1427         * the locks. It will most likely not succeed.
1428         */
1429        newval = task_pid_vnr(current);
1430
1431        curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1432
1433        if (unlikely(curval == -EFAULT))
1434                goto uaddr_faulted;
1435
1436        /*
1437         * Detect deadlocks. In case of REQUEUE_PI this is a valid
1438         * situation and we return success to user space.
1439         */
1440        if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1441                ret = -EDEADLK;
1442                goto out_unlock_release_sem;
1443        }
1444
1445        /*
1446         * Surprise - we got the lock. Just return to userspace:
1447         */
1448        if (unlikely(!curval))
1449                goto out_unlock_release_sem;
1450
1451        uval = curval;
1452
1453        /*
1454         * Set the WAITERS flag, so the owner will know it has someone
1455         * to wake at next unlock
1456         */
1457        newval = curval | FUTEX_WAITERS;
1458
1459        /*
1460         * There are two cases, where a futex might have no owner (the
1461         * owner TID is 0): OWNER_DIED. We take over the futex in this
1462         * case. We also do an unconditional take over, when the owner
1463         * of the futex died.
1464         *
1465         * This is safe as we are protected by the hash bucket lock !
1466         */
1467        if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1468                /* Keep the OWNER_DIED bit */
1469                newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1470                ownerdied = 0;
1471                lock_taken = 1;
1472        }
1473
1474        curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1475
1476        if (unlikely(curval == -EFAULT))
1477                goto uaddr_faulted;
1478        if (unlikely(curval != uval))
1479                goto retry_locked;
1480
1481        /*
1482         * We took the lock due to owner died take over.
1483         */
1484        if (unlikely(lock_taken))
1485                goto out_unlock_release_sem;
1486
1487        /*
1488         * We dont have the lock. Look up the PI state (or create it if
1489         * we are the first waiter):
1490         */
1491        ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1492
1493        if (unlikely(ret)) {
1494                switch (ret) {
1495
1496                case -EAGAIN:
1497                        /*
1498                         * Task is exiting and we just wait for the
1499                         * exit to complete.
1500                         */
1501                        queue_unlock(&q, hb);
1502                        futex_unlock_mm(fshared);
1503                        cond_resched();
1504                        goto retry;
1505
1506                case -ESRCH:
1507                        /*
1508                         * No owner found for this futex. Check if the
1509                         * OWNER_DIED bit is set to figure out whether
1510                         * this is a robust futex or not.
1511                         */
1512                        if (get_futex_value_locked(&curval, uaddr))
1513                                goto uaddr_faulted;
1514
1515                        /*
1516                         * We simply start over in case of a robust
1517                         * futex. The code above will take the futex
1518                         * and return happy.
1519                         */
1520                        if (curval & FUTEX_OWNER_DIED) {
1521                                ownerdied = 1;
1522                                goto retry_locked;
1523                        }
1524                default:
1525                        goto out_unlock_release_sem;
1526                }
1527        }
1528
1529        /*
1530         * Only actually queue now that the atomic ops are done:
1531         */
1532        queue_me(&q, hb);
1533
1534        /*
1535         * Now the futex is queued and we have checked the data, we
1536         * don't want to hold mmap_sem while we sleep.
1537         */
1538        futex_unlock_mm(fshared);
1539
1540        WARN_ON(!q.pi_state);
1541        /*
1542         * Block on the PI mutex:
1543         */
1544        if (!trylock)
1545                ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1546        else {
1547                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1548                /* Fixup the trylock return value: */
1549                ret = ret ? 0 : -EWOULDBLOCK;
1550        }
1551
1552        futex_lock_mm(fshared);
1553        spin_lock(q.lock_ptr);
1554
1555        if (!ret) {
1556                /*
1557                 * Got the lock. We might not be the anticipated owner
1558                 * if we did a lock-steal - fix up the PI-state in
1559                 * that case:
1560                 */
1561                if (q.pi_state->owner != curr)
1562                        ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
1563        } else {
1564                /*
1565                 * Catch the rare case, where the lock was released
1566                 * when we were on the way back before we locked the
1567                 * hash bucket.
1568                 */
1569                if (q.pi_state->owner == curr) {
1570                        /*
1571                         * Try to get the rt_mutex now. This might
1572                         * fail as some other task acquired the
1573                         * rt_mutex after we removed ourself from the
1574                         * rt_mutex waiters list.
1575                         */
1576                        if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1577                                ret = 0;
1578                        else {
1579                                /*
1580                                 * pi_state is incorrect, some other
1581                                 * task did a lock steal and we
1582                                 * returned due to timeout or signal
1583                                 * without taking the rt_mutex. Too
1584                                 * late. We can access the
1585                                 * rt_mutex_owner without locking, as
1586                                 * the other task is now blocked on
1587                                 * the hash bucket lock. Fix the state
1588                                 * up.
1589                                 */
1590                                struct task_struct *owner;
1591                                int res;
1592
1593                                owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1594                                res = fixup_pi_state_owner(uaddr, &q, owner,
1595                                                           fshared);
1596
1597                                /* propagate -EFAULT, if the fixup failed */
1598                                if (res)
1599                                        ret = res;
1600                        }
1601                } else {
1602                        /*
1603                         * Paranoia check. If we did not take the lock
1604                         * in the trylock above, then we should not be
1605                         * the owner of the rtmutex, neither the real
1606                         * nor the pending one:
1607                         */
1608                        if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1609                                printk(KERN_ERR "futex_lock_pi: ret = %d "
1610                                       "pi-mutex: %p pi-state %p\n", ret,
1611                                       q.pi_state->pi_mutex.owner,
1612                                       q.pi_state->owner);
1613                }
1614        }
1615
1616        /* Unqueue and drop the lock */
1617        unqueue_me_pi(&q);
1618        futex_unlock_mm(fshared);
1619
1620        if (to)
1621                destroy_hrtimer_on_stack(&to->timer);
1622        return ret != -EINTR ? ret : -ERESTARTNOINTR;
1623
1624 out_unlock_release_sem:
1625        queue_unlock(&q, hb);
1626
1627 out_release_sem:
1628        futex_unlock_mm(fshared);
1629        if (to)
1630                destroy_hrtimer_on_stack(&to->timer);
1631        return ret;
1632
1633 uaddr_faulted:
1634        /*
1635         * We have to r/w  *(int __user *)uaddr, but we can't modify it
1636         * non-atomically.  Therefore, if get_user below is not
1637         * enough, we need to handle the fault ourselves, while
1638         * still holding the mmap_sem.
1639         *
1640         * ... and hb->lock. :-) --ANK
1641         */
1642        queue_unlock(&q, hb);
1643
1644        if (attempt++) {
1645                ret = futex_handle_fault((unsigned long)uaddr, fshared,
1646                                         attempt);
1647                if (ret)
1648                        goto out_release_sem;
1649                goto retry_unlocked;
1650        }
1651
1652        futex_unlock_mm(fshared);
1653
1654        ret = get_user(uval, uaddr);
1655        if (!ret && (uval != -EFAULT))
1656                goto retry;
1657
1658        if (to)
1659                destroy_hrtimer_on_stack(&to->timer);
1660        return ret;
1661}
1662
1663/*
1664 * Userspace attempted a TID -> 0 atomic transition, and failed.
1665 * This is the in-kernel slowpath: we look up the PI state (if any),
1666 * and do the rt-mutex unlock.
1667 */
1668static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1669{
1670        struct futex_hash_bucket *hb;
1671        struct futex_q *this, *next;
1672        u32 uval;
1673        struct plist_head *head;
1674        union futex_key key;
1675        int ret, attempt = 0;
1676
1677retry:
1678        if (get_user(uval, uaddr))
1679                return -EFAULT;
1680        /*
1681         * We release only a lock we actually own:
1682         */
1683        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1684                return -EPERM;
1685        /*
1686         * First take all the futex related locks:
1687         */
1688        futex_lock_mm(fshared);
1689
1690        ret = get_futex_key(uaddr, fshared, &key);
1691        if (unlikely(ret != 0))
1692                goto out;
1693
1694        hb = hash_futex(&key);
1695retry_unlocked:
1696        spin_lock(&hb->lock);
1697
1698        /*
1699         * To avoid races, try to do the TID -> 0 atomic transition
1700         * again. If it succeeds then we can return without waking
1701         * anyone else up:
1702         */
1703        if (!(uval & FUTEX_OWNER_DIED))
1704                uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
1705
1706
1707        if (unlikely(uval == -EFAULT))
1708                goto pi_faulted;
1709        /*
1710         * Rare case: we managed to release the lock atomically,
1711         * no need to wake anyone else up:
1712         */
1713        if (unlikely(uval == task_pid_vnr(current)))
1714                goto out_unlock;
1715
1716        /*
1717         * Ok, other tasks may need to be woken up - check waiters
1718         * and do the wakeup if necessary:
1719         */
1720        head = &hb->chain;
1721
1722        plist_for_each_entry_safe(this, next, head, list) {
1723                if (!match_futex (&this->key, &key))
1724                        continue;
1725                ret = wake_futex_pi(uaddr, uval, this);
1726                /*
1727                 * The atomic access to the futex value
1728                 * generated a pagefault, so retry the
1729                 * user-access and the wakeup:
1730                 */
1731                if (ret == -EFAULT)
1732                        goto pi_faulted;
1733                goto out_unlock;
1734        }
1735        /*
1736         * No waiters - kernel unlocks the futex:
1737         */
1738        if (!(uval & FUTEX_OWNER_DIED)) {
1739                ret = unlock_futex_pi(uaddr, uval);
1740                if (ret == -EFAULT)
1741                        goto pi_faulted;
1742        }
1743
1744out_unlock:
1745        spin_unlock(&hb->lock);
1746out:
1747        futex_unlock_mm(fshared);
1748
1749        return ret;
1750
1751pi_faulted:
1752        /*
1753         * We have to r/w  *(int __user *)uaddr, but we can't modify it
1754         * non-atomically.  Therefore, if get_user below is not
1755         * enough, we need to handle the fault ourselves, while
1756         * still holding the mmap_sem.
1757         *
1758         * ... and hb->lock. --ANK
1759         */
1760        spin_unlock(&hb->lock);
1761
1762        if (attempt++) {
1763                ret = futex_handle_fault((unsigned long)uaddr, fshared,
1764                                         attempt);
1765                if (ret)
1766                        goto out;
1767                uval = 0;
1768                goto retry_unlocked;
1769        }
1770
1771        futex_unlock_mm(fshared);
1772
1773        ret = get_user(uval, uaddr);
1774        if (!ret && (uval != -EFAULT))
1775                goto retry;
1776
1777        return ret;
1778}
1779
1780/*
1781 * Support for robust futexes: the kernel cleans up held futexes at
1782 * thread exit time.
1783 *
1784 * Implementation: user-space maintains a per-thread list of locks it
1785 * is holding. Upon do_exit(), the kernel carefully walks this list,
1786 * and marks all locks that are owned by this thread with the
1787 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
1788 * always manipulated with the lock held, so the list is private and
1789 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
1790 * field, to allow the kernel to clean up if the thread dies after
1791 * acquiring the lock, but just before it could have added itself to
1792 * the list. There can only be one such pending lock.
1793 */
1794
1795/**
1796 * sys_set_robust_list - set the robust-futex list head of a task
1797 * @head: pointer to the list-head
1798 * @len: length of the list-head, as userspace expects
1799 */
1800asmlinkage long
1801sys_set_robust_list(struct robust_list_head __user *head,
1802                    size_t len)
1803{
1804        if (!futex_cmpxchg_enabled)
1805                return -ENOSYS;
1806        /*
1807         * The kernel knows only one size for now:
1808         */
1809        if (unlikely(len != sizeof(*head)))
1810                return -EINVAL;
1811
1812        current->robust_list = head;
1813
1814        return 0;
1815}
1816
1817/**
1818 * sys_get_robust_list - get the robust-futex list head of a task
1819 * @pid: pid of the process [zero for current task]
1820 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
1821 * @len_ptr: pointer to a length field, the kernel fills in the header size
1822 */
1823asmlinkage long
1824sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1825                    size_t __user *len_ptr)
1826{
1827        struct robust_list_head __user *head;
1828        unsigned long ret;
1829
1830        if (!futex_cmpxchg_enabled)
1831                return -ENOSYS;
1832
1833        if (!pid)
1834                head = current->robust_list;
1835        else {
1836                struct task_struct *p;
1837
1838                ret = -ESRCH;
1839                rcu_read_lock();
1840                p = find_task_by_vpid(pid);
1841                if (!p)
1842                        goto err_unlock;
1843                ret = -EPERM;
1844                if ((current->euid != p->euid) && (current->euid != p->uid) &&
1845                                !capable(CAP_SYS_PTRACE))
1846                        goto err_unlock;
1847                head = p->robust_list;
1848                rcu_read_unlock();
1849        }
1850
1851        if (put_user(sizeof(*head), len_ptr))
1852                return -EFAULT;
1853        return put_user(head, head_ptr);
1854
1855err_unlock:
1856        rcu_read_unlock();
1857
1858        return ret;
1859}
1860
1861/*
1862 * Process a futex-list entry, check whether it's owned by the
1863 * dying task, and do notification if so:
1864 */
1865int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
1866{
1867        u32 uval, nval, mval;
1868
1869retry:
1870        if (get_user(uval, uaddr))
1871                return -1;
1872
1873        if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
1874                /*
1875                 * Ok, this dying thread is truly holding a futex
1876                 * of interest. Set the OWNER_DIED bit atomically
1877                 * via cmpxchg, and if the value had FUTEX_WAITERS
1878                 * set, wake up a waiter (if any). (We have to do a
1879                 * futex_wake() even if OWNER_DIED is already set -
1880                 * to handle the rare but possible case of recursive
1881                 * thread-death.) The rest of the cleanup is done in
1882                 * userspace.
1883                 */
1884                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
1885                nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1886
1887                if (nval == -EFAULT)
1888                        return -1;
1889
1890                if (nval != uval)
1891                        goto retry;
1892
1893                /*
1894                 * Wake robust non-PI futexes here. The wakeup of
1895                 * PI futexes happens in exit_pi_state():
1896                 */
1897                if (!pi && (uval & FUTEX_WAITERS))
1898                        futex_wake(uaddr, &curr->mm->mmap_sem, 1,
1899                                   FUTEX_BITSET_MATCH_ANY);
1900        }
1901        return 0;
1902}
1903
1904/*
1905 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1906 */
1907static inline int fetch_robust_entry(struct robust_list __user **entry,
1908                                     struct robust_list __user * __user *head,
1909                                     int *pi)
1910{
1911        unsigned long uentry;
1912
1913        if (get_user(uentry, (unsigned long __user *)head))
1914                return -EFAULT;
1915
1916        *entry = (void __user *)(uentry & ~1UL);
1917        *pi = uentry & 1;
1918
1919        return 0;
1920}
1921
1922/*
1923 * Walk curr->robust_list (very carefully, it's a userspace list!)
1924 * and mark any locks found there dead, and notify any waiters.
1925 *
1926 * We silently return on any sign of list-walking problem.
1927 */
1928void exit_robust_list(struct task_struct *curr)
1929{
1930        struct robust_list_head __user *head = curr->robust_list;
1931        struct robust_list __user *entry, *next_entry, *pending;
1932        unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
1933        unsigned long futex_offset;
1934        int rc;
1935
1936        if (!futex_cmpxchg_enabled)
1937                return;
1938
1939        /*
1940         * Fetch the list head (which was registered earlier, via
1941         * sys_set_robust_list()):
1942         */
1943        if (fetch_robust_entry(&entry, &head->list.next, &pi))
1944                return;
1945        /*
1946         * Fetch the relative futex offset:
1947         */
1948        if (get_user(futex_offset, &head->futex_offset))
1949                return;
1950        /*
1951         * Fetch any possibly pending lock-add first, and handle it
1952         * if it exists:
1953         */
1954        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
1955                return;
1956
1957        next_entry = NULL;      /* avoid warning with gcc */
1958        while (entry != &head->list) {
1959                /*
1960                 * Fetch the next entry in the list before calling
1961                 * handle_futex_death:
1962                 */
1963                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
1964                /*
1965                 * A pending lock might already be on the list, so
1966                 * don't process it twice:
1967                 */
1968                if (entry != pending)
1969                        if (handle_futex_death((void __user *)entry + futex_offset,
1970                                                curr, pi))
1971                                return;
1972                if (rc)
1973                        return;
1974                entry = next_entry;
1975                pi = next_pi;
1976                /*
1977                 * Avoid excessively long or circular lists:
1978                 */
1979                if (!--limit)
1980                        break;
1981
1982                cond_resched();
1983        }
1984
1985        if (pending)
1986                handle_futex_death((void __user *)pending + futex_offset,
1987                                   curr, pip);
1988}
1989
1990long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1991                u32 __user *uaddr2, u32 val2, u32 val3)
1992{
1993        int ret = -ENOSYS;
1994        int cmd = op & FUTEX_CMD_MASK;
1995        struct rw_semaphore *fshared = NULL;
1996
1997        if (!(op & FUTEX_PRIVATE_FLAG))
1998                fshared = &current->mm->mmap_sem;
1999
2000        switch (cmd) {
2001        case FUTEX_WAIT:
2002                val3 = FUTEX_BITSET_MATCH_ANY;
2003        case FUTEX_WAIT_BITSET:
2004                ret = futex_wait(uaddr, fshared, val, timeout, val3);
2005                break;
2006        case FUTEX_WAKE:
2007                val3 = FUTEX_BITSET_MATCH_ANY;
2008        case FUTEX_WAKE_BITSET:
2009                ret = futex_wake(uaddr, fshared, val, val3);
2010                break;
2011        case FUTEX_REQUEUE:
2012                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
2013                break;
2014        case FUTEX_CMP_REQUEUE:
2015                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
2016                break;
2017        case FUTEX_WAKE_OP:
2018                ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
2019                break;
2020        case FUTEX_LOCK_PI:
2021                if (futex_cmpxchg_enabled)
2022                        ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
2023                break;
2024        case FUTEX_UNLOCK_PI:
2025                if (futex_cmpxchg_enabled)
2026                        ret = futex_unlock_pi(uaddr, fshared);
2027                break;
2028        case FUTEX_TRYLOCK_PI:
2029                if (futex_cmpxchg_enabled)
2030                        ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2031                break;
2032        default:
2033                ret = -ENOSYS;
2034        }
2035        return ret;
2036}
2037
2038
2039asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2040                          struct timespec __user *utime, u32 __user *uaddr2,
2041                          u32 val3)
2042{
2043        struct timespec ts;
2044        ktime_t t, *tp = NULL;
2045        u32 val2 = 0;
2046        int cmd = op & FUTEX_CMD_MASK;
2047
2048        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2049                      cmd == FUTEX_WAIT_BITSET)) {
2050                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2051                        return -EFAULT;
2052                if (!timespec_valid(&ts))
2053                        return -EINVAL;
2054
2055                t = timespec_to_ktime(ts);
2056                if (cmd == FUTEX_WAIT)
2057                        t = ktime_add_safe(ktime_get(), t);
2058                tp = &t;
2059        }
2060        /*
2061         * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
2062         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2063         */
2064        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2065            cmd == FUTEX_WAKE_OP)
2066                val2 = (u32) (unsigned long) utime;
2067
2068        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2069}
2070
2071static int __init futex_init(void)
2072{
2073        u32 curval;
2074        int i;
2075
2076        /*
2077         * This will fail and we want it. Some arch implementations do
2078         * runtime detection of the futex_atomic_cmpxchg_inatomic()
2079         * functionality. We want to know that before we call in any
2080         * of the complex code paths. Also we want to prevent
2081         * registration of robust lists in that case. NULL is
2082         * guaranteed to fault and we get -EFAULT on functional
2083         * implementation, the non functional ones will return
2084         * -ENOSYS.
2085         */
2086        curval = cmpxchg_futex_value_locked(NULL, 0, 0);
2087        if (curval == -EFAULT)
2088                futex_cmpxchg_enabled = 1;
2089
2090        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2091                plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
2092                spin_lock_init(&futex_queues[i].lock);
2093        }
2094
2095        return 0;
2096}
2097__initcall(futex_init);
2098
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.