linux/kernel/workqueue.c
<<
>>
Prefs
   1/*
   2 * kernel/workqueue.c - generic async execution with shared worker pool
   3 *
   4 * Copyright (C) 2002           Ingo Molnar
   5 *
   6 *   Derived from the taskqueue/keventd code by:
   7 *     David Woodhouse <dwmw2@infradead.org>
   8 *     Andrew Morton
   9 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
  10 *     Theodore Ts'o <tytso@mit.edu>
  11 *
  12 * Made to use alloc_percpu by Christoph Lameter.
  13 *
  14 * Copyright (C) 2010           SUSE Linux Products GmbH
  15 * Copyright (C) 2010           Tejun Heo <tj@kernel.org>
  16 *
  17 * This is the generic async execution mechanism.  Work items as are
  18 * executed in process context.  The worker pool is shared and
  19 * automatically managed.  There is one worker pool for each CPU and
  20 * one extra for works which are better served by workers which are
  21 * not bound to any specific CPU.
  22 *
  23 * Please read Documentation/workqueue.txt for details.
  24 */
  25
  26#include <linux/export.h>
  27#include <linux/kernel.h>
  28#include <linux/sched.h>
  29#include <linux/init.h>
  30#include <linux/signal.h>
  31#include <linux/completion.h>
  32#include <linux/workqueue.h>
  33#include <linux/slab.h>
  34#include <linux/cpu.h>
  35#include <linux/notifier.h>
  36#include <linux/kthread.h>
  37#include <linux/hardirq.h>
  38#include <linux/mempolicy.h>
  39#include <linux/freezer.h>
  40#include <linux/kallsyms.h>
  41#include <linux/debug_locks.h>
  42#include <linux/lockdep.h>
  43#include <linux/idr.h>
  44
  45#include "workqueue_sched.h"
  46
  47enum {
  48        /* global_cwq flags */
  49        GCWQ_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
  50        GCWQ_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
  51        GCWQ_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
  52        GCWQ_FREEZING           = 1 << 3,       /* freeze in progress */
  53        GCWQ_HIGHPRI_PENDING    = 1 << 4,       /* highpri works on queue */
  54
  55        /* worker flags */
  56        WORKER_STARTED          = 1 << 0,       /* started */
  57        WORKER_DIE              = 1 << 1,       /* die die die */
  58        WORKER_IDLE             = 1 << 2,       /* is idle */
  59        WORKER_PREP             = 1 << 3,       /* preparing to run works */
  60        WORKER_ROGUE            = 1 << 4,       /* not bound to any cpu */
  61        WORKER_REBIND           = 1 << 5,       /* mom is home, come back */
  62        WORKER_CPU_INTENSIVE    = 1 << 6,       /* cpu intensive */
  63        WORKER_UNBOUND          = 1 << 7,       /* worker is unbound */
  64
  65        WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
  66                                  WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
  67
  68        /* gcwq->trustee_state */
  69        TRUSTEE_START           = 0,            /* start */
  70        TRUSTEE_IN_CHARGE       = 1,            /* trustee in charge of gcwq */
  71        TRUSTEE_BUTCHER         = 2,            /* butcher workers */
  72        TRUSTEE_RELEASE         = 3,            /* release workers */
  73        TRUSTEE_DONE            = 4,            /* trustee is done */
  74
  75        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */
  76        BUSY_WORKER_HASH_SIZE   = 1 << BUSY_WORKER_HASH_ORDER,
  77        BUSY_WORKER_HASH_MASK   = BUSY_WORKER_HASH_SIZE - 1,
  78
  79        MAX_IDLE_WORKERS_RATIO  = 4,            /* 1/4 of busy can be idle */
  80        IDLE_WORKER_TIMEOUT     = 300 * HZ,     /* keep idle ones for 5 mins */
  81
  82        MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
  83                                                /* call for help after 10ms
  84                                                   (min two ticks) */
  85        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
  86        CREATE_COOLDOWN         = HZ,           /* time to breath after fail */
  87        TRUSTEE_COOLDOWN        = HZ / 10,      /* for trustee draining */
  88
  89        /*
  90         * Rescue workers are used only on emergencies and shared by
  91         * all cpus.  Give -20.
  92         */
  93        RESCUER_NICE_LEVEL      = -20,
  94};
  95
  96/*
  97 * Structure fields follow one of the following exclusion rules.
  98 *
  99 * I: Modifiable by initialization/destruction paths and read-only for
 100 *    everyone else.
 101 *
 102 * P: Preemption protected.  Disabling preemption is enough and should
 103 *    only be modified and accessed from the local cpu.
 104 *
 105 * L: gcwq->lock protected.  Access with gcwq->lock held.
 106 *
 107 * X: During normal operation, modification requires gcwq->lock and
 108 *    should be done only from local cpu.  Either disabling preemption
 109 *    on local cpu or grabbing gcwq->lock is enough for read access.
 110 *    If GCWQ_DISASSOCIATED is set, it's identical to L.
 111 *
 112 * F: wq->flush_mutex protected.
 113 *
 114 * W: workqueue_lock protected.
 115 */
 116
 117struct global_cwq;
 118
 119/*
 120 * The poor guys doing the actual heavy lifting.  All on-duty workers
 121 * are either serving the manager role, on idle list or on busy hash.
 122 */
 123struct worker {
 124        /* on idle list while idle, on busy hash table while busy */
 125        union {
 126                struct list_head        entry;  /* L: while idle */
 127                struct hlist_node       hentry; /* L: while busy */
 128        };
 129
 130        struct work_struct      *current_work;  /* L: work being processed */
 131        struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
 132        struct list_head        scheduled;      /* L: scheduled works */
 133        struct task_struct      *task;          /* I: worker task */
 134        struct global_cwq       *gcwq;          /* I: the associated gcwq */
 135        /* 64 bytes boundary on 64bit, 32 on 32bit */
 136        unsigned long           last_active;    /* L: last active timestamp */
 137        unsigned int            flags;          /* X: flags */
 138        int                     id;             /* I: worker id */
 139        struct work_struct      rebind_work;    /* L: rebind worker to cpu */
 140};
 141
 142/*
 143 * Global per-cpu workqueue.  There's one and only one for each cpu
 144 * and all works are queued and processed here regardless of their
 145 * target workqueues.
 146 */
 147struct global_cwq {
 148        spinlock_t              lock;           /* the gcwq lock */
 149        struct list_head        worklist;       /* L: list of pending works */
 150        unsigned int            cpu;            /* I: the associated cpu */
 151        unsigned int            flags;          /* L: GCWQ_* flags */
 152
 153        int                     nr_workers;     /* L: total number of workers */
 154        int                     nr_idle;        /* L: currently idle ones */
 155
 156        /* workers are chained either in the idle_list or busy_hash */
 157        struct list_head        idle_list;      /* X: list of idle workers */
 158        struct hlist_head       busy_hash[BUSY_WORKER_HASH_SIZE];
 159                                                /* L: hash of busy workers */
 160
 161        struct timer_list       idle_timer;     /* L: worker idle timeout */
 162        struct timer_list       mayday_timer;   /* L: SOS timer for dworkers */
 163
 164        struct ida              worker_ida;     /* L: for worker IDs */
 165
 166        struct task_struct      *trustee;       /* L: for gcwq shutdown */
 167        unsigned int            trustee_state;  /* L: trustee state */
 168        wait_queue_head_t       trustee_wait;   /* trustee wait */
 169        struct worker           *first_idle;    /* L: first idle worker */
 170} ____cacheline_aligned_in_smp;
 171
 172/*
 173 * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
 174 * work_struct->data are used for flags and thus cwqs need to be
 175 * aligned at two's power of the number of flag bits.
 176 */
 177struct cpu_workqueue_struct {
 178        struct global_cwq       *gcwq;          /* I: the associated gcwq */
 179        struct workqueue_struct *wq;            /* I: the owning workqueue */
 180        int                     work_color;     /* L: current color */
 181        int                     flush_color;    /* L: flushing color */
 182        int                     nr_in_flight[WORK_NR_COLORS];
 183                                                /* L: nr of in_flight works */
 184        int                     nr_active;      /* L: nr of active works */
 185        int                     max_active;     /* L: max active works */
 186        struct list_head        delayed_works;  /* L: delayed works */
 187};
 188
 189/*
 190 * Structure used to wait for workqueue flush.
 191 */
 192struct wq_flusher {
 193        struct list_head        list;           /* F: list of flushers */
 194        int                     flush_color;    /* F: flush color waiting for */
 195        struct completion       done;           /* flush completion */
 196};
 197
 198/*
 199 * All cpumasks are assumed to be always set on UP and thus can't be
 200 * used to determine whether there's something to be done.
 201 */
 202#ifdef CONFIG_SMP
 203typedef cpumask_var_t mayday_mask_t;
 204#define mayday_test_and_set_cpu(cpu, mask)      \
 205        cpumask_test_and_set_cpu((cpu), (mask))
 206#define mayday_clear_cpu(cpu, mask)             cpumask_clear_cpu((cpu), (mask))
 207#define for_each_mayday_cpu(cpu, mask)          for_each_cpu((cpu), (mask))
 208#define alloc_mayday_mask(maskp, gfp)           zalloc_cpumask_var((maskp), (gfp))
 209#define free_mayday_mask(mask)                  free_cpumask_var((mask))
 210#else
 211typedef unsigned long mayday_mask_t;
 212#define mayday_test_and_set_cpu(cpu, mask)      test_and_set_bit(0, &(mask))
 213#define mayday_clear_cpu(cpu, mask)             clear_bit(0, &(mask))
 214#define for_each_mayday_cpu(cpu, mask)          if ((cpu) = 0, (mask))
 215#define alloc_mayday_mask(maskp, gfp)           true
 216#define free_mayday_mask(mask)                  do { } while (0)
 217#endif
 218
 219/*
 220 * The externally visible workqueue abstraction is an array of
 221 * per-CPU workqueues:
 222 */
 223struct workqueue_struct {
 224        unsigned int            flags;          /* W: WQ_* flags */
 225        union {
 226                struct cpu_workqueue_struct __percpu    *pcpu;
 227                struct cpu_workqueue_struct             *single;
 228                unsigned long                           v;
 229        } cpu_wq;                               /* I: cwq's */
 230        struct list_head        list;           /* W: list of all workqueues */
 231
 232        struct mutex            flush_mutex;    /* protects wq flushing */
 233        int                     work_color;     /* F: current work color */
 234        int                     flush_color;    /* F: current flush color */
 235        atomic_t                nr_cwqs_to_flush; /* flush in progress */
 236        struct wq_flusher       *first_flusher; /* F: first flusher */
 237        struct list_head        flusher_queue;  /* F: flush waiters */
 238        struct list_head        flusher_overflow; /* F: flush overflow list */
 239
 240        mayday_mask_t           mayday_mask;    /* cpus requesting rescue */
 241        struct worker           *rescuer;       /* I: rescue worker */
 242
 243        int                     nr_drainers;    /* W: drain in progress */
 244        int                     saved_max_active; /* W: saved cwq max_active */
 245#ifdef CONFIG_LOCKDEP
 246        struct lockdep_map      lockdep_map;
 247#endif
 248        char                    name[];         /* I: workqueue name */
 249};
 250
 251struct workqueue_struct *system_wq __read_mostly;
 252struct workqueue_struct *system_long_wq __read_mostly;
 253struct workqueue_struct *system_nrt_wq __read_mostly;
 254struct workqueue_struct *system_unbound_wq __read_mostly;
 255struct workqueue_struct *system_freezable_wq __read_mostly;
 256struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
 257EXPORT_SYMBOL_GPL(system_wq);
 258EXPORT_SYMBOL_GPL(system_long_wq);
 259EXPORT_SYMBOL_GPL(system_nrt_wq);
 260EXPORT_SYMBOL_GPL(system_unbound_wq);
 261EXPORT_SYMBOL_GPL(system_freezable_wq);
 262EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 263
 264#define CREATE_TRACE_POINTS
 265#include <trace/events/workqueue.h>
 266
 267#define for_each_busy_worker(worker, i, pos, gcwq)                      \
 268        for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)                     \
 269                hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
 270
 271static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
 272                                  unsigned int sw)
 273{
 274        if (cpu < nr_cpu_ids) {
 275                if (sw & 1) {
 276                        cpu = cpumask_next(cpu, mask);
 277                        if (cpu < nr_cpu_ids)
 278                                return cpu;
 279                }
 280                if (sw & 2)
 281                        return WORK_CPU_UNBOUND;
 282        }
 283        return WORK_CPU_NONE;
 284}
 285
 286static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
 287                                struct workqueue_struct *wq)
 288{
 289        return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
 290}
 291
 292/*
 293 * CPU iterators
 294 *
 295 * An extra gcwq is defined for an invalid cpu number
 296 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
 297 * specific CPU.  The following iterators are similar to
 298 * for_each_*_cpu() iterators but also considers the unbound gcwq.
 299 *
 300 * for_each_gcwq_cpu()          : possible CPUs + WORK_CPU_UNBOUND
 301 * for_each_online_gcwq_cpu()   : online CPUs + WORK_CPU_UNBOUND
 302 * for_each_cwq_cpu()           : possible CPUs for bound workqueues,
 303 *                                WORK_CPU_UNBOUND for unbound workqueues
 304 */
 305#define for_each_gcwq_cpu(cpu)                                          \
 306        for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);         \
 307             (cpu) < WORK_CPU_NONE;                                     \
 308             (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
 309
 310#define for_each_online_gcwq_cpu(cpu)                                   \
 311        for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);           \
 312             (cpu) < WORK_CPU_NONE;                                     \
 313             (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
 314
 315#define for_each_cwq_cpu(cpu, wq)                                       \
 316        for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));        \
 317             (cpu) < WORK_CPU_NONE;                                     \
 318             (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
 319
 320#ifdef CONFIG_DEBUG_OBJECTS_WORK
 321
 322static struct debug_obj_descr work_debug_descr;
 323
 324static void *work_debug_hint(void *addr)
 325{
 326        return ((struct work_struct *) addr)->func;
 327}
 328
 329/*
 330 * fixup_init is called when:
 331 * - an active object is initialized
 332 */
 333static int work_fixup_init(void *addr, enum debug_obj_state state)
 334{
 335        struct work_struct *work = addr;
 336
 337        switch (state) {
 338        case ODEBUG_STATE_ACTIVE:
 339                cancel_work_sync(work);
 340                debug_object_init(work, &work_debug_descr);
 341                return 1;
 342        default:
 343                return 0;
 344        }
 345}
 346
 347/*
 348 * fixup_activate is called when:
 349 * - an active object is activated
 350 * - an unknown object is activated (might be a statically initialized object)
 351 */
 352static int work_fixup_activate(void *addr, enum debug_obj_state state)
 353{
 354        struct work_struct *work = addr;
 355
 356        switch (state) {
 357
 358        case ODEBUG_STATE_NOTAVAILABLE:
 359                /*
 360                 * This is not really a fixup. The work struct was
 361                 * statically initialized. We just make sure that it
 362                 * is tracked in the object tracker.
 363                 */
 364                if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
 365                        debug_object_init(work, &work_debug_descr);
 366                        debug_object_activate(work, &work_debug_descr);
 367                        return 0;
 368                }
 369                WARN_ON_ONCE(1);
 370                return 0;
 371
 372        case ODEBUG_STATE_ACTIVE:
 373                WARN_ON(1);
 374
 375        default:
 376                return 0;
 377        }
 378}
 379
 380/*
 381 * fixup_free is called when:
 382 * - an active object is freed
 383 */
 384static int work_fixup_free(void *addr, enum debug_obj_state state)
 385{
 386        struct work_struct *work = addr;
 387
 388        switch (state) {
 389        case ODEBUG_STATE_ACTIVE:
 390                cancel_work_sync(work);
 391                debug_object_free(work, &work_debug_descr);
 392                return 1;
 393        default:
 394                return 0;
 395        }
 396}
 397
 398static struct debug_obj_descr work_debug_descr = {
 399        .name           = "work_struct",
 400        .debug_hint     = work_debug_hint,
 401        .fixup_init     = work_fixup_init,
 402        .fixup_activate = work_fixup_activate,
 403        .fixup_free     = work_fixup_free,
 404};
 405
 406static inline void debug_work_activate(struct work_struct *work)
 407{
 408        debug_object_activate(work, &work_debug_descr);
 409}
 410
 411static inline void debug_work_deactivate(struct work_struct *work)
 412{
 413        debug_object_deactivate(work, &work_debug_descr);
 414}
 415
 416void __init_work(struct work_struct *work, int onstack)
 417{
 418        if (onstack)
 419                debug_object_init_on_stack(work, &work_debug_descr);
 420        else
 421                debug_object_init(work, &work_debug_descr);
 422}
 423EXPORT_SYMBOL_GPL(__init_work);
 424
 425void destroy_work_on_stack(struct work_struct *work)
 426{
 427        debug_object_free(work, &work_debug_descr);
 428}
 429EXPORT_SYMBOL_GPL(destroy_work_on_stack);
 430
 431#else
 432static inline void debug_work_activate(struct work_struct *work) { }
 433static inline void debug_work_deactivate(struct work_struct *work) { }
 434#endif
 435
 436/* Serializes the accesses to the list of workqueues. */
 437static DEFINE_SPINLOCK(workqueue_lock);
 438static LIST_HEAD(workqueues);
 439static bool workqueue_freezing;         /* W: have wqs started freezing? */
 440
 441/*
 442 * The almighty global cpu workqueues.  nr_running is the only field
 443 * which is expected to be used frequently by other cpus via
 444 * try_to_wake_up().  Put it in a separate cacheline.
 445 */
 446static DEFINE_PER_CPU(struct global_cwq, global_cwq);
 447static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
 448
 449/*
 450 * Global cpu workqueue and nr_running counter for unbound gcwq.  The
 451 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
 452 * workers have WORKER_UNBOUND set.
 453 */
 454static struct global_cwq unbound_global_cwq;
 455static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);       /* always 0 */
 456
 457static int worker_thread(void *__worker);
 458
 459static struct global_cwq *get_gcwq(unsigned int cpu)
 460{
 461        if (cpu != WORK_CPU_UNBOUND)
 462                return &per_cpu(global_cwq, cpu);
 463        else
 464                return &unbound_global_cwq;
 465}
 466
 467static atomic_t *get_gcwq_nr_running(unsigned int cpu)
 468{
 469        if (cpu != WORK_CPU_UNBOUND)
 470                return &per_cpu(gcwq_nr_running, cpu);
 471        else
 472                return &unbound_gcwq_nr_running;
 473}
 474
 475static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 476                                            struct workqueue_struct *wq)
 477{
 478        if (!(wq->flags & WQ_UNBOUND)) {
 479                if (likely(cpu < nr_cpu_ids))
 480                        return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
 481        } else if (likely(cpu == WORK_CPU_UNBOUND))
 482                return wq->cpu_wq.single;
 483        return NULL;
 484}
 485
 486static unsigned int work_color_to_flags(int color)
 487{
 488        return color << WORK_STRUCT_COLOR_SHIFT;
 489}
 490
 491static int get_work_color(struct work_struct *work)
 492{
 493        return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
 494                ((1 << WORK_STRUCT_COLOR_BITS) - 1);
 495}
 496
 497static int work_next_color(int color)
 498{
 499        return (color + 1) % WORK_NR_COLORS;
 500}
 501
 502/*
 503 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
 504 * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
 505 * cleared and the work data contains the cpu number it was last on.
 506 *
 507 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
 508 * cwq, cpu or clear work->data.  These functions should only be
 509 * called while the work is owned - ie. while the PENDING bit is set.
 510 *
 511 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
 512 * corresponding to a work.  gcwq is available once the work has been
 513 * queued anywhere after initialization.  cwq is available only from
 514 * queueing until execution starts.
 515 */
 516static inline void set_work_data(struct work_struct *work, unsigned long data,
 517                                 unsigned long flags)
 518{
 519        BUG_ON(!work_pending(work));
 520        atomic_long_set(&work->data, data | flags | work_static(work));
 521}
 522
 523static void set_work_cwq(struct work_struct *work,
 524                         struct cpu_workqueue_struct *cwq,
 525                         unsigned long extra_flags)
 526{
 527        set_work_data(work, (unsigned long)cwq,
 528                      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
 529}
 530
 531static void set_work_cpu(struct work_struct *work, unsigned int cpu)
 532{
 533        set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
 534}
 535
 536static void clear_work_data(struct work_struct *work)
 537{
 538        set_work_data(work, WORK_STRUCT_NO_CPU, 0);
 539}
 540
 541static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
 542{
 543        unsigned long data = atomic_long_read(&work->data);
 544
 545        if (data & WORK_STRUCT_CWQ)
 546                return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
 547        else
 548                return NULL;
 549}
 550
 551static struct global_cwq *get_work_gcwq(struct work_struct *work)
 552{
 553        unsigned long data = atomic_long_read(&work->data);
 554        unsigned int cpu;
 555
 556        if (data & WORK_STRUCT_CWQ)
 557                return ((struct cpu_workqueue_struct *)
 558                        (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
 559
 560        cpu = data >> WORK_STRUCT_FLAG_BITS;
 561        if (cpu == WORK_CPU_NONE)
 562                return NULL;
 563
 564        BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
 565        return get_gcwq(cpu);
 566}
 567
 568/*
 569 * Policy functions.  These define the policies on how the global
 570 * worker pool is managed.  Unless noted otherwise, these functions
 571 * assume that they're being called with gcwq->lock held.
 572 */
 573
 574static bool __need_more_worker(struct global_cwq *gcwq)
 575{
 576        return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
 577                gcwq->flags & GCWQ_HIGHPRI_PENDING;
 578}
 579
 580/*
 581 * Need to wake up a worker?  Called from anything but currently
 582 * running workers.
 583 */
 584static bool need_more_worker(struct global_cwq *gcwq)
 585{
 586        return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
 587}
 588
 589/* Can I start working?  Called from busy but !running workers. */
 590static bool may_start_working(struct global_cwq *gcwq)
 591{
 592        return gcwq->nr_idle;
 593}
 594
 595/* Do I need to keep working?  Called from currently running workers. */
 596static bool keep_working(struct global_cwq *gcwq)
 597{
 598        atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
 599
 600        return !list_empty(&gcwq->worklist) &&
 601                (atomic_read(nr_running) <= 1 ||
 602                 gcwq->flags & GCWQ_HIGHPRI_PENDING);
 603}
 604
 605/* Do we need a new worker?  Called from manager. */
 606static bool need_to_create_worker(struct global_cwq *gcwq)
 607{
 608        return need_more_worker(gcwq) && !may_start_working(gcwq);
 609}
 610
 611/* Do I need to be the manager? */
 612static bool need_to_manage_workers(struct global_cwq *gcwq)
 613{
 614        return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
 615}
 616
 617/* Do we have too many workers and should some go away? */
 618static bool too_many_workers(struct global_cwq *gcwq)
 619{
 620        bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
 621        int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
 622        int nr_busy = gcwq->nr_workers - nr_idle;
 623
 624        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 625}
 626
 627/*
 628 * Wake up functions.
 629 */
 630
 631/* Return the first worker.  Safe with preemption disabled */
 632static struct worker *first_worker(struct global_cwq *gcwq)
 633{
 634        if (unlikely(list_empty(&gcwq->idle_list)))
 635                return NULL;
 636
 637        return list_first_entry(&gcwq->idle_list, struct worker, entry);
 638}
 639
 640/**
 641 * wake_up_worker - wake up an idle worker
 642 * @gcwq: gcwq to wake worker for
 643 *
 644 * Wake up the first idle worker of @gcwq.
 645 *
 646 * CONTEXT:
 647 * spin_lock_irq(gcwq->lock).
 648 */
 649static void wake_up_worker(struct global_cwq *gcwq)
 650{
 651        struct worker *worker = first_worker(gcwq);
 652
 653        if (likely(worker))
 654                wake_up_process(worker->task);
 655}
 656
 657/**
 658 * wq_worker_waking_up - a worker is waking up
 659 * @task: task waking up
 660 * @cpu: CPU @task is waking up to
 661 *
 662 * This function is called during try_to_wake_up() when a worker is
 663 * being awoken.
 664 *
 665 * CONTEXT:
 666 * spin_lock_irq(rq->lock)
 667 */
 668void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
 669{
 670        struct worker *worker = kthread_data(task);
 671
 672        if (!(worker->flags & WORKER_NOT_RUNNING))
 673                atomic_inc(get_gcwq_nr_running(cpu));
 674}
 675
 676/**
 677 * wq_worker_sleeping - a worker is going to sleep
 678 * @task: task going to sleep
 679 * @cpu: CPU in question, must be the current CPU number
 680 *
 681 * This function is called during schedule() when a busy worker is
 682 * going to sleep.  Worker on the same cpu can be woken up by
 683 * returning pointer to its task.
 684 *
 685 * CONTEXT:
 686 * spin_lock_irq(rq->lock)
 687 *
 688 * RETURNS:
 689 * Worker task on @cpu to wake up, %NULL if none.
 690 */
 691struct task_struct *wq_worker_sleeping(struct task_struct *task,
 692                                       unsigned int cpu)
 693{
 694        struct worker *worker = kthread_data(task), *to_wakeup = NULL;
 695        struct global_cwq *gcwq = get_gcwq(cpu);
 696        atomic_t *nr_running = get_gcwq_nr_running(cpu);
 697
 698        if (worker->flags & WORKER_NOT_RUNNING)
 699                return NULL;
 700
 701        /* this can only happen on the local cpu */
 702        BUG_ON(cpu != raw_smp_processor_id());
 703
 704        /*
 705         * The counterpart of the following dec_and_test, implied mb,
 706         * worklist not empty test sequence is in insert_work().
 707         * Please read comment there.
 708         *
 709         * NOT_RUNNING is clear.  This means that trustee is not in
 710         * charge and we're running on the local cpu w/ rq lock held
 711         * and preemption disabled, which in turn means that none else
 712         * could be manipulating idle_list, so dereferencing idle_list
 713         * without gcwq lock is safe.
 714         */
 715        if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
 716                to_wakeup = first_worker(gcwq);
 717        return to_wakeup ? to_wakeup->task : NULL;
 718}
 719
 720/**
 721 * worker_set_flags - set worker flags and adjust nr_running accordingly
 722 * @worker: self
 723 * @flags: flags to set
 724 * @wakeup: wakeup an idle worker if necessary
 725 *
 726 * Set @flags in @worker->flags and adjust nr_running accordingly.  If
 727 * nr_running becomes zero and @wakeup is %true, an idle worker is
 728 * woken up.
 729 *
 730 * CONTEXT:
 731 * spin_lock_irq(gcwq->lock)
 732 */
 733static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 734                                    bool wakeup)
 735{
 736        struct global_cwq *gcwq = worker->gcwq;
 737
 738        WARN_ON_ONCE(worker->task != current);
 739
 740        /*
 741         * If transitioning into NOT_RUNNING, adjust nr_running and
 742         * wake up an idle worker as necessary if requested by
 743         * @wakeup.
 744         */
 745        if ((flags & WORKER_NOT_RUNNING) &&
 746            !(worker->flags & WORKER_NOT_RUNNING)) {
 747                atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
 748
 749                if (wakeup) {
 750                        if (atomic_dec_and_test(nr_running) &&
 751                            !list_empty(&gcwq->worklist))
 752                                wake_up_worker(gcwq);
 753                } else
 754                        atomic_dec(nr_running);
 755        }
 756
 757        worker->flags |= flags;
 758}
 759
 760/**
 761 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 762 * @worker: self
 763 * @flags: flags to clear
 764 *
 765 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 766 *
 767 * CONTEXT:
 768 * spin_lock_irq(gcwq->lock)
 769 */
 770static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 771{
 772        struct global_cwq *gcwq = worker->gcwq;
 773        unsigned int oflags = worker->flags;
 774
 775        WARN_ON_ONCE(worker->task != current);
 776
 777        worker->flags &= ~flags;
 778
 779        /*
 780         * If transitioning out of NOT_RUNNING, increment nr_running.  Note
 781         * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
 782         * of multiple flags, not a single flag.
 783         */
 784        if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 785                if (!(worker->flags & WORKER_NOT_RUNNING))
 786                        atomic_inc(get_gcwq_nr_running(gcwq->cpu));
 787}
 788
 789/**
 790 * busy_worker_head - return the busy hash head for a work
 791 * @gcwq: gcwq of interest
 792 * @work: work to be hashed
 793 *
 794 * Return hash head of @gcwq for @work.
 795 *
 796 * CONTEXT:
 797 * spin_lock_irq(gcwq->lock).
 798 *
 799 * RETURNS:
 800 * Pointer to the hash head.
 801 */
 802static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
 803                                           struct work_struct *work)
 804{
 805        const int base_shift = ilog2(sizeof(struct work_struct));
 806        unsigned long v = (unsigned long)work;
 807
 808        /* simple shift and fold hash, do we need something better? */
 809        v >>= base_shift;
 810        v += v >> BUSY_WORKER_HASH_ORDER;
 811        v &= BUSY_WORKER_HASH_MASK;
 812
 813        return &gcwq->busy_hash[v];
 814}
 815
 816/**
 817 * __find_worker_executing_work - find worker which is executing a work
 818 * @gcwq: gcwq of interest
 819 * @bwh: hash head as returned by busy_worker_head()
 820 * @work: work to find worker for
 821 *
 822 * Find a worker which is executing @work on @gcwq.  @bwh should be
 823 * the hash head obtained by calling busy_worker_head() with the same
 824 * work.
 825 *
 826 * CONTEXT:
 827 * spin_lock_irq(gcwq->lock).
 828 *
 829 * RETURNS:
 830 * Pointer to worker which is executing @work if found, NULL
 831 * otherwise.
 832 */
 833static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
 834                                                   struct hlist_head *bwh,
 835                                                   struct work_struct *work)
 836{
 837        struct worker *worker;
 838        struct hlist_node *tmp;
 839
 840        hlist_for_each_entry(worker, tmp, bwh, hentry)
 841                if (worker->current_work == work)
 842                        return worker;
 843        return NULL;
 844}
 845
 846/**
 847 * find_worker_executing_work - find worker which is executing a work
 848 * @gcwq: gcwq of interest
 849 * @work: work to find worker for
 850 *
 851 * Find a worker which is executing @work on @gcwq.  This function is
 852 * identical to __find_worker_executing_work() except that this
 853 * function calculates @bwh itself.
 854 *
 855 * CONTEXT:
 856 * spin_lock_irq(gcwq->lock).
 857 *
 858 * RETURNS:
 859 * Pointer to worker which is executing @work if found, NULL
 860 * otherwise.
 861 */
 862static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 863                                                 struct work_struct *work)
 864{
 865        return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
 866                                            work);
 867}
 868
 869/**
 870 * gcwq_determine_ins_pos - find insertion position
 871 * @gcwq: gcwq of interest
 872 * @cwq: cwq a work is being queued for
 873 *
 874 * A work for @cwq is about to be queued on @gcwq, determine insertion
 875 * position for the work.  If @cwq is for HIGHPRI wq, the work is
 876 * queued at the head of the queue but in FIFO order with respect to
 877 * other HIGHPRI works; otherwise, at the end of the queue.  This
 878 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
 879 * there are HIGHPRI works pending.
 880 *
 881 * CONTEXT:
 882 * spin_lock_irq(gcwq->lock).
 883 *
 884 * RETURNS:
 885 * Pointer to inserstion position.
 886 */
 887static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
 888                                               struct cpu_workqueue_struct *cwq)
 889{
 890        struct work_struct *twork;
 891
 892        if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
 893                return &gcwq->worklist;
 894
 895        list_for_each_entry(twork, &gcwq->worklist, entry) {
 896                struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
 897
 898                if (!(tcwq->wq->flags & WQ_HIGHPRI))
 899                        break;
 900        }
 901
 902        gcwq->flags |= GCWQ_HIGHPRI_PENDING;
 903        return &twork->entry;
 904}
 905
 906/**
 907 * insert_work - insert a work into gcwq
 908 * @cwq: cwq @work belongs to
 909 * @work: work to insert
 910 * @head: insertion point
 911 * @extra_flags: extra WORK_STRUCT_* flags to set
 912 *
 913 * Insert @work which belongs to @cwq into @gcwq after @head.
 914 * @extra_flags is or'd to work_struct flags.
 915 *
 916 * CONTEXT:
 917 * spin_lock_irq(gcwq->lock).
 918 */
 919static void insert_work(struct cpu_workqueue_struct *cwq,
 920                        struct work_struct *work, struct list_head *head,
 921                        unsigned int extra_flags)
 922{
 923        struct global_cwq *gcwq = cwq->gcwq;
 924
 925        /* we own @work, set data and link */
 926        set_work_cwq(work, cwq, extra_flags);
 927
 928        /*
 929         * Ensure that we get the right work->data if we see the
 930         * result of list_add() below, see try_to_grab_pending().
 931         */
 932        smp_wmb();
 933
 934        list_add_tail(&work->entry, head);
 935
 936        /*
 937         * Ensure either worker_sched_deactivated() sees the above
 938         * list_add_tail() or we see zero nr_running to avoid workers
 939         * lying around lazily while there are works to be processed.
 940         */
 941        smp_mb();
 942
 943        if (__need_more_worker(gcwq))
 944                wake_up_worker(gcwq);
 945}
 946
 947/*
 948 * Test whether @work is being queued from another work executing on the
 949 * same workqueue.  This is rather expensive and should only be used from
 950 * cold paths.
 951 */
 952static bool is_chained_work(struct workqueue_struct *wq)
 953{
 954        unsigned long flags;
 955        unsigned int cpu;
 956
 957        for_each_gcwq_cpu(cpu) {
 958                struct global_cwq *gcwq = get_gcwq(cpu);
 959                struct worker *worker;
 960                struct hlist_node *pos;
 961                int i;
 962
 963                spin_lock_irqsave(&gcwq->lock, flags);
 964                for_each_busy_worker(worker, i, pos, gcwq) {
 965                        if (worker->task != current)
 966                                continue;
 967                        spin_unlock_irqrestore(&gcwq->lock, flags);
 968                        /*
 969                         * I'm @worker, no locking necessary.  See if @work
 970                         * is headed to the same workqueue.
 971                         */
 972                        return worker->current_cwq->wq == wq;
 973                }
 974                spin_unlock_irqrestore(&gcwq->lock, flags);
 975        }
 976        return false;
 977}
 978
 979static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 980                         struct work_struct *work)
 981{
 982        struct global_cwq *gcwq;
 983        struct cpu_workqueue_struct *cwq;
 984        struct list_head *worklist;
 985        unsigned int work_flags;
 986        unsigned long flags;
 987
 988        debug_work_activate(work);
 989
 990        /* if dying, only works from the same workqueue are allowed */
 991        if (unlikely(wq->flags & WQ_DRAINING) &&
 992            WARN_ON_ONCE(!is_chained_work(wq)))
 993                return;
 994
 995        /* determine gcwq to use */
 996        if (!(wq->flags & WQ_UNBOUND)) {
 997                struct global_cwq *last_gcwq;
 998
 999                if (unlikely(cpu == WORK_CPU_UNBOUND))
1000                        cpu = raw_smp_processor_id();
1001
1002                /*
1003                 * It's multi cpu.  If @wq is non-reentrant and @work
1004                 * was previously on a different cpu, it might still
1005                 * be running there, in which case the work needs to
1006                 * be queued on that cpu to guarantee non-reentrance.
1007                 */
1008                gcwq = get_gcwq(cpu);
1009                if (wq->flags & WQ_NON_REENTRANT &&
1010                    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
1011                        struct worker *worker;
1012
1013                        spin_lock_irqsave(&last_gcwq->lock, flags);
1014
1015                        worker = find_worker_executing_work(last_gcwq, work);
1016
1017                        if (worker && worker->current_cwq->wq == wq)
1018                                gcwq = last_gcwq;
1019                        else {
1020                                /* meh... not running there, queue here */
1021                                spin_unlock_irqrestore(&last_gcwq->lock, flags);
1022                                spin_lock_irqsave(&gcwq->lock, flags);
1023                        }
1024                } else
1025                        spin_lock_irqsave(&gcwq->lock, flags);
1026        } else {
1027                gcwq = get_gcwq(WORK_CPU_UNBOUND);
1028                spin_lock_irqsave(&gcwq->lock, flags);
1029        }
1030
1031        /* gcwq determined, get cwq and queue */
1032        cwq = get_cwq(gcwq->cpu, wq);
1033        trace_workqueue_queue_work(cpu, cwq, work);
1034
1035        if (WARN_ON(!list_empty(&work->entry))) {
1036                spin_unlock_irqrestore(&gcwq->lock, flags);
1037                return;
1038        }
1039
1040        cwq->nr_in_flight[cwq->work_color]++;
1041        work_flags = work_color_to_flags(cwq->work_color);
1042
1043        if (likely(cwq->nr_active < cwq->max_active)) {
1044                trace_workqueue_activate_work(work);
1045                cwq->nr_active++;
1046                worklist = gcwq_determine_ins_pos(gcwq, cwq);
1047        } else {
1048                work_flags |= WORK_STRUCT_DELAYED;
1049                worklist = &cwq->delayed_works;
1050        }
1051
1052        insert_work(cwq, work, worklist, work_flags);
1053
1054        spin_unlock_irqrestore(&gcwq->lock, flags);
1055}
1056
1057/**
1058 * queue_work - queue work on a workqueue
1059 * @wq: workqueue to use
1060 * @work: work to queue
1061 *
1062 * Returns 0 if @work was already on a queue, non-zero otherwise.
1063 *
1064 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1065 * it can be processed by another CPU.
1066 */
1067int queue_work(struct workqueue_struct *wq, struct work_struct *work)
1068{
1069        int ret;
1070
1071        ret = queue_work_on(get_cpu(), wq, work);
1072        put_cpu();
1073
1074        return ret;
1075}
1076EXPORT_SYMBOL_GPL(queue_work);
1077
1078/**
1079 * queue_work_on - queue work on specific cpu
1080 * @cpu: CPU number to execute work on
1081 * @wq: workqueue to use
1082 * @work: work to queue
1083 *
1084 * Returns 0 if @work was already on a queue, non-zero otherwise.
1085 *
1086 * We queue the work to a specific CPU, the caller must ensure it
1087 * can't go away.
1088 */
1089int
1090queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1091{
1092        int ret = 0;
1093
1094        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1095                __queue_work(cpu, wq, work);
1096                ret = 1;
1097        }
1098        return ret;
1099}
1100EXPORT_SYMBOL_GPL(queue_work_on);
1101
1102static void delayed_work_timer_fn(unsigned long __data)
1103{
1104        struct delayed_work *dwork = (struct delayed_work *)__data;
1105        struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1106
1107        __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1108}
1109
1110/**
1111 * queue_delayed_work - queue work on a workqueue after delay
1112 * @wq: workqueue to use
1113 * @dwork: delayable work to queue
1114 * @delay: number of jiffies to wait before queueing
1115 *
1116 * Returns 0 if @work was already on a queue, non-zero otherwise.
1117 */
1118int queue_delayed_work(struct workqueue_struct *wq,
1119                        struct delayed_work *dwork, unsigned long delay)
1120{
1121        if (delay == 0)
1122                return queue_work(wq, &dwork->work);
1123
1124        return queue_delayed_work_on(-1, wq, dwork, delay);
1125}
1126EXPORT_SYMBOL_GPL(queue_delayed_work);
1127
1128/**
1129 * queue_delayed_work_on - queue work on specific CPU after delay
1130 * @cpu: CPU number to execute work on
1131 * @wq: workqueue to use
1132 * @dwork: work to queue
1133 * @delay: number of jiffies to wait before queueing
1134 *
1135 * Returns 0 if @work was already on a queue, non-zero otherwise.
1136 */
1137int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1138                        struct delayed_work *dwork, unsigned long delay)
1139{
1140        int ret = 0;
1141        struct timer_list *timer = &dwork->timer;
1142        struct work_struct *work = &dwork->work;
1143
1144        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1145                unsigned int lcpu;
1146
1147                BUG_ON(timer_pending(timer));
1148                BUG_ON(!list_empty(&work->entry));
1149
1150                timer_stats_timer_set_start_info(&dwork->timer);
1151
1152                /*
1153                 * This stores cwq for the moment, for the timer_fn.
1154                 * Note that the work's gcwq is preserved to allow
1155                 * reentrance detection for delayed works.
1156                 */
1157                if (!(wq->flags & WQ_UNBOUND)) {
1158                        struct global_cwq *gcwq = get_work_gcwq(work);
1159
1160                        if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1161                                lcpu = gcwq->cpu;
1162                        else
1163                                lcpu = raw_smp_processor_id();
1164                } else
1165                        lcpu = WORK_CPU_UNBOUND;
1166
1167                set_work_cwq(work, get_cwq(lcpu, wq), 0);
1168
1169                timer->expires = jiffies + delay;
1170                timer->data = (unsigned long)dwork;
1171                timer->function = delayed_work_timer_fn;
1172
1173                if (unlikely(cpu >= 0))
1174                        add_timer_on(timer, cpu);
1175                else
1176                        add_timer(timer);
1177                ret = 1;
1178        }
1179        return ret;
1180}
1181EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1182
1183/**
1184 * worker_enter_idle - enter idle state
1185 * @worker: worker which is entering idle state
1186 *
1187 * @worker is entering idle state.  Update stats and idle timer if
1188 * necessary.
1189 *
1190 * LOCKING:
1191 * spin_lock_irq(gcwq->lock).
1192 */
1193static void worker_enter_idle(struct worker *worker)
1194{
1195        struct global_cwq *gcwq = worker->gcwq;
1196
1197        BUG_ON(worker->flags & WORKER_IDLE);
1198        BUG_ON(!list_empty(&worker->entry) &&
1199               (worker->hentry.next || worker->hentry.pprev));
1200
1201        /* can't use worker_set_flags(), also called from start_worker() */
1202        worker->flags |= WORKER_IDLE;
1203        gcwq->nr_idle++;
1204        worker->last_active = jiffies;
1205
1206        /* idle_list is LIFO */
1207        list_add(&worker->entry, &gcwq->idle_list);
1208
1209        if (likely(!(worker->flags & WORKER_ROGUE))) {
1210                if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1211                        mod_timer(&gcwq->idle_timer,
1212                                  jiffies + IDLE_WORKER_TIMEOUT);
1213        } else
1214                wake_up_all(&gcwq->trustee_wait);
1215
1216        /*
1217         * Sanity check nr_running.  Because trustee releases gcwq->lock
1218         * between setting %WORKER_ROGUE and zapping nr_running, the
1219         * warning may trigger spuriously.  Check iff trustee is idle.
1220         */
1221        WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
1222                     gcwq->nr_workers == gcwq->nr_idle &&
1223                     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1224}
1225
1226/**
1227 * worker_leave_idle - leave idle state
1228 * @worker: worker which is leaving idle state
1229 *
1230 * @worker is leaving idle state.  Update stats.
1231 *
1232 * LOCKING:
1233 * spin_lock_irq(gcwq->lock).
1234 */
1235static void worker_leave_idle(struct worker *worker)
1236{
1237        struct global_cwq *gcwq = worker->gcwq;
1238
1239        BUG_ON(!(worker->flags & WORKER_IDLE));
1240        worker_clr_flags(worker, WORKER_IDLE);
1241        gcwq->nr_idle--;
1242        list_del_init(&worker->entry);
1243}
1244
1245/**
1246 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1247 * @worker: self
1248 *
1249 * Works which are scheduled while the cpu is online must at least be
1250 * scheduled to a worker which is bound to the cpu so that if they are
1251 * flushed from cpu callbacks while cpu is going down, they are
1252 * guaranteed to execute on the cpu.
1253 *
1254 * This function is to be used by rogue workers and rescuers to bind
1255 * themselves to the target cpu and may race with cpu going down or
1256 * coming online.  kthread_bind() can't be used because it may put the
1257 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1258 * verbatim as it's best effort and blocking and gcwq may be
1259 * [dis]associated in the meantime.
1260 *
1261 * This function tries set_cpus_allowed() and locks gcwq and verifies
1262 * the binding against GCWQ_DISASSOCIATED which is set during
1263 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1264 * idle state or fetches works without dropping lock, it can guarantee
1265 * the scheduling requirement described in the first paragraph.
1266 *
1267 * CONTEXT:
1268 * Might sleep.  Called without any lock but returns with gcwq->lock
1269 * held.
1270 *
1271 * RETURNS:
1272 * %true if the associated gcwq is online (@worker is successfully
1273 * bound), %false if offline.
1274 */
1275static bool worker_maybe_bind_and_lock(struct worker *worker)
1276__acquires(&gcwq->lock)
1277{
1278        struct global_cwq *gcwq = worker->gcwq;
1279        struct task_struct *task = worker->task;
1280
1281        while (true) {
1282                /*
1283                 * The following call may fail, succeed or succeed
1284                 * without actually migrating the task to the cpu if
1285                 * it races with cpu hotunplug operation.  Verify
1286                 * against GCWQ_DISASSOCIATED.
1287                 */
1288                if (!(gcwq->flags & GCWQ_DISASSOCIATED))
1289                        set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
1290
1291                spin_lock_irq(&gcwq->lock);
1292                if (gcwq->flags & GCWQ_DISASSOCIATED)
1293                        return false;
1294                if (task_cpu(task) == gcwq->cpu &&
1295                    cpumask_equal(&current->cpus_allowed,
1296                                  get_cpu_mask(gcwq->cpu)))
1297                        return true;
1298                spin_unlock_irq(&gcwq->lock);
1299
1300                /*
1301                 * We've raced with CPU hot[un]plug.  Give it a breather
1302                 * and retry migration.  cond_resched() is required here;
1303                 * otherwise, we might deadlock against cpu_stop trying to
1304                 * bring down the CPU on non-preemptive kernel.
1305                 */
1306                cpu_relax();
1307                cond_resched();
1308        }
1309}
1310
1311/*
1312 * Function for worker->rebind_work used to rebind rogue busy workers
1313 * to the associated cpu which is coming back online.  This is
1314 * scheduled by cpu up but can race with other cpu hotplug operations
1315 * and may be executed twice without intervening cpu down.
1316 */
1317static void worker_rebind_fn(struct work_struct *work)
1318{
1319        struct worker *worker = container_of(work, struct worker, rebind_work);
1320        struct global_cwq *gcwq = worker->gcwq;
1321
1322        if (worker_maybe_bind_and_lock(worker))
1323                worker_clr_flags(worker, WORKER_REBIND);
1324
1325        spin_unlock_irq(&gcwq->lock);
1326}
1327
1328static struct worker *alloc_worker(void)
1329{
1330        struct worker *worker;
1331
1332        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1333        if (worker) {
1334                INIT_LIST_HEAD(&worker->entry);
1335                INIT_LIST_HEAD(&worker->scheduled);
1336                INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1337                /* on creation a worker is in !idle && prep state */
1338                worker->flags = WORKER_PREP;
1339        }
1340        return worker;
1341}
1342
1343/**
1344 * create_worker - create a new workqueue worker
1345 * @gcwq: gcwq the new worker will belong to
1346 * @bind: whether to set affinity to @cpu or not
1347 *
1348 * Create a new worker which is bound to @gcwq.  The returned worker
1349 * can be started by calling start_worker() or destroyed using
1350 * destroy_worker().
1351 *
1352 * CONTEXT:
1353 * Might sleep.  Does GFP_KERNEL allocations.
1354 *
1355 * RETURNS:
1356 * Pointer to the newly created worker.
1357 */
1358static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1359{
1360        bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1361        struct worker *worker = NULL;
1362        int id = -1;
1363
1364        spin_lock_irq(&gcwq->lock);
1365        while (ida_get_new(&gcwq->worker_ida, &id)) {
1366                spin_unlock_irq(&gcwq->lock);
1367                if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1368                        goto fail;
1369                spin_lock_irq(&gcwq->lock);
1370        }
1371        spin_unlock_irq(&gcwq->lock);
1372
1373        worker = alloc_worker();
1374        if (!worker)
1375                goto fail;
1376
1377        worker->gcwq = gcwq;
1378        worker->id = id;
1379
1380        if (!on_unbound_cpu)
1381                worker->task = kthread_create_on_node(worker_thread,
1382                                                      worker,
1383                                                      cpu_to_node(gcwq->cpu),
1384                                                      "kworker/%u:%d", gcwq->cpu, id);
1385        else
1386                worker->task = kthread_create(worker_thread, worker,
1387                                              "kworker/u:%d", id);
1388        if (IS_ERR(worker->task))
1389                goto fail;
1390
1391        /*
1392         * A rogue worker will become a regular one if CPU comes
1393         * online later on.  Make sure every worker has
1394         * PF_THREAD_BOUND set.
1395         */
1396        if (bind && !on_unbound_cpu)
1397                kthread_bind(worker->task, gcwq->cpu);
1398        else {
1399                worker->task->flags |= PF_THREAD_BOUND;
1400                if (on_unbound_cpu)
1401                        worker->flags |= WORKER_UNBOUND;
1402        }
1403
1404        return worker;
1405fail:
1406        if (id >= 0) {
1407                spin_lock_irq(&gcwq->lock);
1408                ida_remove(&gcwq->worker_ida, id);
1409                spin_unlock_irq(&gcwq->lock);
1410        }
1411        kfree(worker);
1412        return NULL;
1413}
1414
1415/**
1416 * start_worker - start a newly created worker
1417 * @worker: worker to start
1418 *
1419 * Make the gcwq aware of @worker and start it.
1420 *
1421 * CONTEXT:
1422 * spin_lock_irq(gcwq->lock).
1423 */
1424static void start_worker(struct worker *worker)
1425{
1426        worker->flags |= WORKER_STARTED;
1427        worker->gcwq->nr_workers++;
1428        worker_enter_idle(worker);
1429        wake_up_process(worker->task);
1430}
1431
1432/**
1433 * destroy_worker - destroy a workqueue worker
1434 * @worker: worker to be destroyed
1435 *
1436 * Destroy @worker and adjust @gcwq stats accordingly.
1437 *
1438 * CONTEXT:
1439 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1440 */
1441static void destroy_worker(struct worker *worker)
1442{
1443        struct global_cwq *gcwq = worker->gcwq;
1444        int id = worker->id;
1445
1446        /* sanity check frenzy */
1447        BUG_ON(worker->current_work);
1448        BUG_ON(!list_empty(&worker->scheduled));
1449
1450        if (worker->flags & WORKER_STARTED)
1451                gcwq->nr_workers--;
1452        if (worker->flags & WORKER_IDLE)
1453                gcwq->nr_idle--;
1454
1455        list_del_init(&worker->entry);
1456        worker->flags |= WORKER_DIE;
1457
1458        spin_unlock_irq(&gcwq->lock);
1459
1460        kthread_stop(worker->task);
1461        kfree(worker);
1462
1463        spin_lock_irq(&gcwq->lock);
1464        ida_remove(&gcwq->worker_ida, id);
1465}
1466
1467static void idle_worker_timeout(unsigned long __gcwq)
1468{
1469        struct global_cwq *gcwq = (void *)__gcwq;
1470
1471        spin_lock_irq(&gcwq->lock);
1472
1473        if (too_many_workers(gcwq)) {
1474                struct worker *worker;
1475                unsigned long expires;
1476
1477                /* idle_list is kept in LIFO order, check the last one */
1478                worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1479                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1480
1481                if (time_before(jiffies, expires))
1482                        mod_timer(&gcwq->idle_timer, expires);
1483                else {
1484                        /* it's been idle for too long, wake up manager */
1485                        gcwq->flags |= GCWQ_MANAGE_WORKERS;
1486                        wake_up_worker(gcwq);
1487                }
1488        }
1489
1490        spin_unlock_irq(&gcwq->lock);
1491}
1492
1493static bool send_mayday(struct work_struct *work)
1494{
1495        struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1496        struct workqueue_struct *wq = cwq->wq;
1497        unsigned int cpu;
1498
1499        if (!(wq->flags & WQ_RESCUER))
1500                return false;
1501
1502        /* mayday mayday mayday */
1503        cpu = cwq->gcwq->cpu;
1504        /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1505        if (cpu == WORK_CPU_UNBOUND)
1506                cpu = 0;
1507        if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1508                wake_up_process(wq->rescuer->task);
1509        return true;
1510}
1511
1512static void gcwq_mayday_timeout(unsigned long __gcwq)
1513{
1514        struct global_cwq *gcwq = (void *)__gcwq;
1515        struct work_struct *work;
1516
1517        spin_lock_irq(&gcwq->lock);
1518
1519        if (need_to_create_worker(gcwq)) {
1520                /*
1521                 * We've been trying to create a new worker but
1522                 * haven't been successful.  We might be hitting an
1523                 * allocation deadlock.  Send distress signals to
1524                 * rescuers.
1525                 */
1526                list_for_each_entry(work, &gcwq->worklist, entry)
1527                        send_mayday(work);
1528        }
1529
1530        spin_unlock_irq(&gcwq->lock);
1531
1532        mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1533}
1534
1535/**
1536 * maybe_create_worker - create a new worker if necessary
1537 * @gcwq: gcwq to create a new worker for
1538 *
1539 * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
1540 * have at least one idle worker on return from this function.  If
1541 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1542 * sent to all rescuers with works scheduled on @gcwq to resolve
1543 * possible allocation deadlock.
1544 *
1545 * On return, need_to_create_worker() is guaranteed to be false and
1546 * may_start_working() true.
1547 *
1548 * LOCKING:
1549 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1550 * multiple times.  Does GFP_KERNEL allocations.  Called only from
1551 * manager.
1552 *
1553 * RETURNS:
1554 * false if no action was taken and gcwq->lock stayed locked, true
1555 * otherwise.
1556 */
1557static bool maybe_create_worker(struct global_cwq *gcwq)
1558__releases(&gcwq->lock)
1559__acquires(&gcwq->lock)
1560{
1561        if (!need_to_create_worker(gcwq))
1562                return false;
1563restart:
1564        spin_unlock_irq(&gcwq->lock);
1565
1566        /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1567        mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1568
1569        while (true) {
1570                struct worker *worker;
1571
1572                worker = create_worker(gcwq, true);
1573                if (worker) {
1574                        del_timer_sync(&gcwq->mayday_timer);
1575                        spin_lock_irq(&gcwq->lock);
1576                        start_worker(worker);
1577                        BUG_ON(need_to_create_worker(gcwq));
1578                        return true;
1579                }
1580
1581                if (!need_to_create_worker(gcwq))
1582                        break;
1583
1584                __set_current_state(TASK_INTERRUPTIBLE);
1585                schedule_timeout(CREATE_COOLDOWN);
1586
1587                if (!need_to_create_worker(gcwq))
1588                        break;
1589        }
1590
1591        del_timer_sync(&gcwq->mayday_timer);
1592        spin_lock_irq(&gcwq->lock);
1593        if (need_to_create_worker(gcwq))
1594                goto restart;
1595        return true;
1596}
1597
1598/**
1599 * maybe_destroy_worker - destroy workers which have been idle for a while
1600 * @gcwq: gcwq to destroy workers for
1601 *
1602 * Destroy @gcwq workers which have been idle for longer than
1603 * IDLE_WORKER_TIMEOUT.
1604 *
1605 * LOCKING:
1606 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1607 * multiple times.  Called only from manager.
1608 *
1609 * RETURNS:
1610 * false if no action was taken and gcwq->lock stayed locked, true
1611 * otherwise.
1612 */
1613static bool maybe_destroy_workers(struct global_cwq *gcwq)
1614{
1615        bool ret = false;
1616
1617        while (too_many_workers(gcwq)) {
1618                struct worker *worker;
1619                unsigned long expires;
1620
1621                worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1622                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1623
1624                if (time_before(jiffies, expires)) {
1625                        mod_timer(&gcwq->idle_timer, expires);
1626                        break;
1627                }
1628
1629                destroy_worker(worker);
1630                ret = true;
1631        }
1632
1633        return ret;
1634}
1635
1636/**
1637 * manage_workers - manage worker pool
1638 * @worker: self
1639 *
1640 * Assume the manager role and manage gcwq worker pool @worker belongs
1641 * to.  At any given time, there can be only zero or one manager per
1642 * gcwq.  The exclusion is handled automatically by this function.
1643 *
1644 * The caller can safely start processing works on false return.  On
1645 * true return, it's guaranteed that need_to_create_worker() is false
1646 * and may_start_working() is true.
1647 *
1648 * CONTEXT:
1649 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1650 * multiple times.  Does GFP_KERNEL allocations.
1651 *
1652 * RETURNS:
1653 * false if no action was taken and gcwq->lock stayed locked, true if
1654 * some action was taken.
1655 */
1656static bool manage_workers(struct worker *worker)
1657{
1658        struct global_cwq *gcwq = worker->gcwq;
1659        bool ret = false;
1660
1661        if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1662                return ret;
1663
1664        gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1665        gcwq->flags |= GCWQ_MANAGING_WORKERS;
1666
1667        /*
1668         * Destroy and then create so that may_start_working() is true
1669         * on return.
1670         */
1671        ret |= maybe_destroy_workers(gcwq);
1672        ret |= maybe_create_worker(gcwq);
1673
1674        gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1675
1676        /*
1677         * The trustee might be waiting to take over the manager
1678         * position, tell it we're done.
1679         */
1680        if (unlikely(gcwq->trustee))
1681                wake_up_all(&gcwq->trustee_wait);
1682
1683        return ret;
1684}
1685
1686/**
1687 * move_linked_works - move linked works to a list
1688 * @work: start of series of works to be scheduled
1689 * @head: target list to append @work to
1690 * @nextp: out paramter for nested worklist walking
1691 *
1692 * Schedule linked works starting from @work to @head.  Work series to
1693 * be scheduled starts at @work and includes any consecutive work with
1694 * WORK_STRUCT_LINKED set in its predecessor.
1695 *
1696 * If @nextp is not NULL, it's updated to point to the next work of
1697 * the last scheduled work.  This allows move_linked_works() to be
1698 * nested inside outer list_for_each_entry_safe().
1699 *
1700 * CONTEXT:
1701 * spin_lock_irq(gcwq->lock).
1702 */
1703static void move_linked_works(struct work_struct *work, struct list_head *head,
1704                              struct work_struct **nextp)
1705{
1706        struct work_struct *n;
1707
1708        /*
1709         * Linked worklist will always end before the end of the list,
1710         * use NULL for list head.
1711         */
1712        list_for_each_entry_safe_from(work, n, NULL, entry) {
1713                list_move_tail(&work->entry, head);
1714                if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1715                        break;
1716        }
1717
1718        /*
1719         * If we're already inside safe list traversal and have moved
1720         * multiple works to the scheduled queue, the next position
1721         * needs to be updated.
1722         */
1723        if (nextp)
1724                *nextp = n;
1725}
1726
1727static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1728{
1729        struct work_struct *work = list_first_entry(&cwq->delayed_works,
1730                                                    struct work_struct, entry);
1731        struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1732
1733        trace_workqueue_activate_work(work);
1734        move_linked_works(work, pos, NULL);
1735        __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1736        cwq->nr_active++;
1737}
1738
1739/**
1740 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1741 * @cwq: cwq of interest
1742 * @color: color of work which left the queue
1743 * @delayed: for a delayed work
1744 *
1745 * A work either has completed or is removed from pending queue,
1746 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1747 *
1748 * CONTEXT:
1749 * spin_lock_irq(gcwq->lock).
1750 */
1751static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1752                                 bool delayed)
1753{
1754        /* ignore uncolored works */
1755        if (color == WORK_NO_COLOR)
1756                return;
1757
1758        cwq->nr_in_flight[color]--;
1759
1760        if (!delayed) {
1761                cwq->nr_active--;
1762                if (!list_empty(&cwq->delayed_works)) {
1763                        /* one down, submit a delayed one */
1764                        if (cwq->nr_active < cwq->max_active)
1765                                cwq_activate_first_delayed(cwq);
1766                }
1767        }
1768
1769        /* is flush in progress and are we at the flushing tip? */
1770        if (likely(cwq->flush_color != color))
1771                return;
1772
1773        /* are there still in-flight works? */
1774        if (cwq->nr_in_flight[color])
1775                return;
1776
1777        /* this cwq is done, clear flush_color */
1778        cwq->flush_color = -1;
1779
1780        /*
1781         * If this was the last cwq, wake up the first flusher.  It
1782         * will handle the rest.
1783         */
1784        if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1785                complete(&cwq->wq->first_flusher->done);
1786}
1787
1788/**
1789 * process_one_work - process single work
1790 * @worker: self
1791 * @work: work to process
1792 *
1793 * Process @work.  This function contains all the logics necessary to
1794 * process a single work including synchronization against and
1795 * interaction with other workers on the same cpu, queueing and
1796 * flushing.  As long as context requirement is met, any worker can
1797 * call this function to process a work.
1798 *
1799 * CONTEXT:
1800 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1801 */
1802static void process_one_work(struct worker *worker, struct work_struct *work)
1803__releases(&gcwq->lock)
1804__acquires(&gcwq->lock)
1805{
1806        struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1807        struct global_cwq *gcwq = cwq->gcwq;
1808        struct hlist_head *bwh = busy_worker_head(gcwq, work);
1809        bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1810        work_func_t f = work->func;
1811        int work_color;
1812        struct worker *collision;
1813#ifdef CONFIG_LOCKDEP
1814        /*
1815         * It is permissible to free the struct work_struct from
1816         * inside the function that is called from it, this we need to
1817         * take into account for lockdep too.  To avoid bogus "held
1818         * lock freed" warnings as well as problems when looking into
1819         * work->lockdep_map, make a copy and use that here.
1820         */
1821        struct lockdep_map lockdep_map;
1822
1823        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1824#endif
1825        /*
1826         * A single work shouldn't be executed concurrently by
1827         * multiple workers on a single cpu.  Check whether anyone is
1828         * already processing the work.  If so, defer the work to the
1829         * currently executing one.
1830         */
1831        collision = __find_worker_executing_work(gcwq, bwh, work);
1832        if (unlikely(collision)) {
1833                move_linked_works(work, &collision->scheduled, NULL);
1834                return;
1835        }
1836
1837        /* claim and process */
1838        debug_work_deactivate(work);
1839        hlist_add_head(&worker->hentry, bwh);
1840        worker->current_work = work;
1841        worker->current_cwq = cwq;
1842        work_color = get_work_color(work);
1843
1844        /* record the current cpu number in the work data and dequeue */
1845        set_work_cpu(work, gcwq->cpu);
1846        list_del_init(&work->entry);
1847
1848        /*
1849         * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1850         * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1851         */
1852        if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1853                struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1854                                                struct work_struct, entry);
1855
1856                if (!list_empty(&gcwq->worklist) &&
1857                    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1858                        wake_up_worker(gcwq);
1859                else
1860                        gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1861        }
1862
1863        /*
1864         * CPU intensive works don't participate in concurrency
1865         * management.  They're the scheduler's responsibility.
1866         */
1867        if (unlikely(cpu_intensive))
1868                worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1869
1870        spin_unlock_irq(&gcwq->lock);
1871
1872        work_clear_pending(work);
1873        lock_map_acquire_read(&cwq->wq->lockdep_map);
1874        lock_map_acquire(&lockdep_map);
1875        trace_workqueue_execute_start(work);
1876        f(work);
1877        /*
1878         * While we must be careful to not use "work" after this, the trace
1879         * point will only record its address.
1880         */
1881        trace_workqueue_execute_end(work);
1882        lock_map_release(&lockdep_map);
1883        lock_map_release(&cwq->wq->lockdep_map);
1884
1885        if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1886                printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1887                       "%s/0x%08x/%d\n",
1888                       current->comm, preempt_count(), task_pid_nr(current));
1889                printk(KERN_ERR "    last function: ");
1890                print_symbol("%s\n", (unsigned long)f);
1891                debug_show_held_locks(current);
1892                dump_stack();
1893        }
1894
1895        spin_lock_irq(&gcwq->lock);
1896
1897        /* clear cpu intensive status */
1898        if (unlikely(cpu_intensive))
1899                worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1900
1901        /* we're done with it, release */
1902        hlist_del_init(&worker->hentry);
1903        worker->current_work = NULL;
1904        worker->current_cwq = NULL;
1905        cwq_dec_nr_in_flight(cwq, work_color, false);
1906}
1907
1908/**
1909 * process_scheduled_works - process scheduled works
1910 * @worker: self
1911 *
1912 * Process all scheduled works.  Please note that the scheduled list
1913 * may change while processing a work, so this function repeatedly
1914 * fetches a work from the top and executes it.
1915 *
1916 * CONTEXT:
1917 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1918 * multiple times.
1919 */
1920static void process_scheduled_works(struct worker *worker)
1921{
1922        while (!list_empty(&worker->scheduled)) {
1923                struct work_struct *work = list_first_entry(&worker->scheduled,
1924                                                struct work_struct, entry);
1925                process_one_work(worker, work);
1926        }
1927}
1928
1929/**
1930 * worker_thread - the worker thread function
1931 * @__worker: self
1932 *
1933 * The gcwq worker thread function.  There's a single dynamic pool of
1934 * these per each cpu.  These workers process all works regardless of
1935 * their specific target workqueue.  The only exception is works which
1936 * belong to workqueues with a rescuer which will be explained in
1937 * rescuer_thread().
1938 */
1939static int worker_thread(void *__worker)
1940{
1941        struct worker *worker = __worker;
1942        struct global_cwq *gcwq = worker->gcwq;
1943
1944        /* tell the scheduler that this is a workqueue worker */
1945        worker->task->flags |= PF_WQ_WORKER;
1946woke_up:
1947        spin_lock_irq(&gcwq->lock);
1948
1949        /* DIE can be set only while we're idle, checking here is enough */
1950        if (worker->flags & WORKER_DIE) {
1951                spin_unlock_irq(&gcwq->lock);
1952                worker->task->flags &= ~PF_WQ_WORKER;
1953                return 0;
1954        }
1955
1956        worker_leave_idle(worker);
1957recheck:
1958        /* no more worker necessary? */
1959        if (!need_more_worker(gcwq))
1960                goto sleep;
1961
1962        /* do we need to manage? */
1963        if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1964                goto recheck;
1965
1966        /*
1967         * ->scheduled list can only be filled while a worker is
1968         * preparing to process a work or actually processing it.
1969         * Make sure nobody diddled with it while I was sleeping.
1970         */
1971        BUG_ON(!list_empty(&worker->scheduled));
1972
1973        /*
1974         * When control reaches this point, we're guaranteed to have
1975         * at least one idle worker or that someone else has already
1976         * assumed the manager role.
1977         */
1978        worker_clr_flags(worker, WORKER_PREP);
1979
1980        do {
1981                struct work_struct *work =
1982                        list_first_entry(&gcwq->worklist,
1983                                         struct work_struct, entry);
1984
1985                if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1986                        /* optimization path, not strictly necessary */
1987                        process_one_work(worker, work);
1988                        if (unlikely(!list_empty(&worker->scheduled)))
1989                                process_scheduled_works(worker);
1990                } else {
1991                        move_linked_works(work, &worker->scheduled, NULL);
1992                        process_scheduled_works(worker);
1993                }
1994        } while (keep_working(gcwq));
1995
1996        worker_set_flags(worker, WORKER_PREP, false);
1997sleep:
1998        if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1999                goto recheck;
2000
2001        /*
2002         * gcwq->lock is held and there's no work to process and no
2003         * need to manage, sleep.  Workers are woken up only while
2004         * holding gcwq->lock or from local cpu, so setting the
2005         * current state before releasing gcwq->lock is enough to
2006         * prevent losing any event.
2007         */
2008        worker_enter_idle(worker);
2009        __set_current_state(TASK_INTERRUPTIBLE);
2010        spin_unlock_irq(&gcwq->lock);
2011        schedule();
2012        goto woke_up;
2013}
2014
2015/**
2016 * rescuer_thread - the rescuer thread function
2017 * @__wq: the associated workqueue
2018 *
2019 * Workqueue rescuer thread function.  There's one rescuer for each
2020 * workqueue which has WQ_RESCUER set.
2021 *
2022 * Regular work processing on a gcwq may block trying to create a new
2023 * worker which uses GFP_KERNEL allocation which has slight chance of
2024 * developing into deadlock if some works currently on the same queue
2025 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
2026 * the problem rescuer solves.
2027 *
2028 * When such condition is possible, the gcwq summons rescuers of all
2029 * workqueues which have works queued on the gcwq and let them process
2030 * those works so that forward progress can be guaranteed.
2031 *
2032 * This should happen rarely.
2033 */
2034static int rescuer_thread(void *__wq)
2035{
2036        struct workqueue_struct *wq = __wq;
2037        struct worker *rescuer = wq->rescuer;
2038        struct list_head *scheduled = &rescuer->scheduled;
2039        bool is_unbound = wq->flags & WQ_UNBOUND;
2040        unsigned int cpu;
2041
2042        set_user_nice(current, RESCUER_NICE_LEVEL);
2043repeat:
2044        set_current_state(TASK_INTERRUPTIBLE);
2045
2046        if (kthread_should_stop())
2047                return 0;
2048
2049        /*
2050         * See whether any cpu is asking for help.  Unbounded
2051         * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
2052         */
2053        for_each_mayday_cpu(cpu, wq->mayday_mask) {
2054                unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2055                struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2056                struct global_cwq *gcwq = cwq->gcwq;
2057                struct work_struct *work, *n;
2058
2059                __set_current_state(TASK_RUNNING);
2060                mayday_clear_cpu(cpu, wq->mayday_mask);
2061
2062                /* migrate to the target cpu if possible */
2063                rescuer->gcwq = gcwq;
2064                worker_maybe_bind_and_lock(rescuer);
2065
2066                /*
2067                 * Slurp in all works issued via this workqueue and
2068                 * process'em.
2069                 */
2070                BUG_ON(!list_empty(&rescuer->scheduled));
2071                list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2072                        if (get_work_cwq(work) == cwq)
2073                                move_linked_works(work, scheduled, &n);
2074
2075                process_scheduled_works(rescuer);
2076
2077                /*
2078                 * Leave this gcwq.  If keep_working() is %true, notify a
2079                 * regular worker; otherwise, we end up with 0 concurrency
2080                 * and stalling the execution.
2081                 */
2082                if (keep_working(gcwq))
2083                        wake_up_worker(gcwq);
2084
2085                spin_unlock_irq(&gcwq->lock);
2086        }
2087
2088        schedule();
2089        goto repeat;
2090}
2091
2092struct wq_barrier {
2093        struct work_struct      work;
2094        struct completion       done;
2095};
2096
2097static void wq_barrier_func(struct work_struct *work)
2098{
2099        struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2100        complete(&barr->done);
2101}
2102
2103/**
2104 * insert_wq_barrier - insert a barrier work
2105 * @cwq: cwq to insert barrier into
2106 * @barr: wq_barrier to insert
2107 * @target: target work to attach @barr to
2108 * @worker: worker currently executing @target, NULL if @target is not executing
2109 *
2110 * @barr is linked to @target such that @barr is completed only after
2111 * @target finishes execution.  Please note that the ordering
2112 * guarantee is observed only with respect to @target and on the local
2113 * cpu.
2114 *
2115 * Currently, a queued barrier can't be canceled.  This is because
2116 * try_to_grab_pending() can't determine whether the work to be
2117 * grabbed is at the head of the queue and thus can't clear LINKED
2118 * flag of the previous work while there must be a valid next work
2119 * after a work with LINKED flag set.
2120 *
2121 * Note that when @worker is non-NULL, @target may be modified
2122 * underneath us, so we can't reliably determine cwq from @target.
2123 *
2124 * CONTEXT:
2125 * spin_lock_irq(gcwq->lock).
2126 */
2127static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2128                              struct wq_barrier *barr,
2129                              struct work_struct *target, struct worker *worker)
2130{
2131        struct list_head *head;
2132        unsigned int linked = 0;
2133
2134        /*
2135         * debugobject calls are safe here even with gcwq->lock locked
2136         * as we know for sure that this will not trigger any of the
2137         * checks and call back into the fixup functions where we
2138         * might deadlock.
2139         */
2140        INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2141        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2142        init_completion(&barr->done);
2143
2144        /*
2145         * If @target is currently being executed, schedule the
2146         * barrier to the worker; otherwise, put it after @target.
2147         */
2148        if (worker)
2149                head = worker->scheduled.next;
2150        else {
2151                unsigned long *bits = work_data_bits(target);
2152
2153                head = target->entry.next;
2154                /* there can already be other linked works, inherit and set */
2155                linked = *bits & WORK_STRUCT_LINKED;
2156                __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2157        }
2158
2159        debug_work_activate(&barr->work);
2160        insert_work(cwq, &barr->work, head,
2161                    work_color_to_flags(WORK_NO_COLOR) | linked);
2162}
2163
2164/**
2165 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2166 * @wq: workqueue being flushed
2167 * @flush_color: new flush color, < 0 for no-op
2168 * @work_color: new work color, < 0 for no-op
2169 *
2170 * Prepare cwqs for workqueue flushing.
2171 *
2172 * If @flush_color is non-negative, flush_color on all cwqs should be
2173 * -1.  If no cwq has in-flight commands at the specified color, all
2174 * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
2175 * has in flight commands, its cwq->flush_color is set to
2176 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2177 * wakeup logic is armed and %true is returned.
2178 *
2179 * The caller should have initialized @wq->first_flusher prior to
2180 * calling this function with non-negative @flush_color.  If
2181 * @flush_color is negative, no flush color update is done and %false
2182 * is returned.
2183 *
2184 * If @work_color is non-negative, all cwqs should have the same
2185 * work_color which is previous to @work_color and all will be
2186 * advanced to @work_color.
2187 *
2188 * CONTEXT:
2189 * mutex_lock(wq->flush_mutex).
2190 *
2191 * RETURNS:
2192 * %true if @flush_color >= 0 and there's something to flush.  %false
2193 * otherwise.
2194 */
2195static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2196                                      int flush_color, int work_color)
2197{
2198        bool wait = false;
2199        unsigned int cpu;
2200
2201        if (flush_color >= 0) {
2202                BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
2203                atomic_set(&wq->nr_cwqs_to_flush, 1);
2204        }
2205
2206        for_each_cwq_cpu(cpu, wq) {
2207                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2208                struct global_cwq *gcwq = cwq->gcwq;
2209
2210                spin_lock_irq(&gcwq->lock);
2211
2212                if (flush_color >= 0) {
2213                        BUG_ON(cwq->flush_color != -1);
2214
2215                        if (cwq->nr_in_flight[flush_color]) {
2216                                cwq->flush_color = flush_color;
2217                                atomic_inc(&wq->nr_cwqs_to_flush);
2218                                wait = true;
2219                        }
2220                }
2221
2222                if (work_color >= 0) {
2223                        BUG_ON(work_color != work_next_color(cwq->work_color));
2224                        cwq->work_color = work_color;
2225                }
2226
2227                spin_unlock_irq(&gcwq->lock);
2228        }
2229
2230        if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2231                complete(&wq->first_flusher->done);
2232
2233        return wait;
2234}
2235
2236/**
2237 * flush_workqueue - ensure that any scheduled work has run to completion.
2238 * @wq: workqueue to flush
2239 *
2240 * Forces execution of the workqueue and blocks until its completion.
2241 * This is typically used in driver shutdown handlers.
2242 *
2243 * We sleep until all works which were queued on entry have been handled,
2244 * but we are not livelocked by new incoming ones.
2245 */
2246void flush_workqueue(struct workqueue_struct *wq)
2247{
2248        struct wq_flusher this_flusher = {
2249                .list = LIST_HEAD_INIT(this_flusher.list),
2250                .flush_color = -1,
2251                .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2252        };
2253        int next_color;
2254
2255        lock_map_acquire(&wq->lockdep_map);
2256        lock_map_release(&wq->lockdep_map);
2257
2258        mutex_lock(&wq->flush_mutex);
2259
2260        /*
2261         * Start-to-wait phase
2262         */
2263        next_color = work_next_color(wq->work_color);
2264
2265        if (next_color != wq->flush_color) {
2266                /*
2267                 * Color space is not full.  The current work_color
2268                 * becomes our flush_color and work_color is advanced
2269                 * by one.
2270                 */
2271                BUG_ON(!list_empty(&wq->flusher_overflow));
2272                this_flusher.flush_color = wq->work_color;
2273                wq->work_color = next_color;
2274
2275                if (!wq->first_flusher) {
2276                        /* no flush in progress, become the first flusher */
2277                        BUG_ON(wq->flush_color != this_flusher.flush_color);
2278
2279                        wq->first_flusher = &this_flusher;
2280
2281                        if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2282                                                       wq->work_color)) {
2283                                /* nothing to flush, done */
2284                                wq->flush_color = next_color;
2285                                wq->first_flusher = NULL;
2286                                goto out_unlock;
2287                        }
2288                } else {
2289                        /* wait in queue */
2290                        BUG_ON(wq->flush_color == this_flusher.flush_color);
2291                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
2292                        flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2293                }
2294        } else {
2295                /*
2296                 * Oops, color space is full, wait on overflow queue.
2297                 * The next flush completion will assign us
2298                 * flush_color and transfer to flusher_queue.
2299                 */
2300                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2301        }
2302
2303        mutex_unlock(&wq->flush_mutex);
2304
2305        wait_for_completion(&this_flusher.done);
2306
2307        /*
2308         * Wake-up-and-cascade phase
2309         *
2310         * First flushers are responsible for cascading flushes and
2311         * handling overflow.  Non-first flushers can simply return.
2312         */
2313        if (wq->first_flusher != &this_flusher)
2314                return;
2315
2316        mutex_lock(&wq->flush_mutex);
2317
2318        /* we might have raced, check again with mutex held */
2319        if (wq->first_flusher != &this_flusher)
2320                goto out_unlock;
2321
2322        wq->first_flusher = NULL;
2323
2324        BUG_ON(!list_empty(&this_flusher.list));
2325        BUG_ON(wq->flush_color != this_flusher.flush_color);
2326
2327        while (true) {
2328                struct wq_flusher *next, *tmp;
2329
2330                /* complete all the flushers sharing the current flush color */
2331                list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2332                        if (next->flush_color != wq->flush_color)
2333                                break;
2334                        list_del_init(&next->list);
2335                        complete(&next->done);
2336                }
2337
2338                BUG_ON(!list_empty(&wq->flusher_overflow) &&
2339                       wq->flush_color != work_next_color(wq->work_color));
2340
2341                /* this flush_color is finished, advance by one */
2342                wq->flush_color = work_next_color(wq->flush_color);
2343
2344                /* one color has been freed, handle overflow queue */
2345                if (!list_empty(&wq->flusher_overflow)) {
2346                        /*
2347                         * Assign the same color to all overflowed
2348                         * flushers, advance work_color and append to
2349                         * flusher_queue.  This is the start-to-wait
2350                         * phase for these overflowed flushers.
2351                         */
2352                        list_for_each_entry(tmp, &wq->flusher_overflow, list)
2353                                tmp->flush_color = wq->work_color;
2354
2355                        wq->work_color = work_next_color(wq->work_color);
2356
2357                        list_splice_tail_init(&wq->flusher_overflow,
2358                                              &wq->flusher_queue);
2359                        flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2360                }
2361
2362                if (list_empty(&wq->flusher_queue)) {
2363                        BUG_ON(wq->flush_color != wq->work_color);
2364                        break;
2365                }
2366
2367                /*
2368                 * Need to flush more colors.  Make the next flusher
2369                 * the new first flusher and arm cwqs.
2370                 */
2371                BUG_ON(wq->flush_color == wq->work_color);
2372                BUG_ON(wq->flush_color != next->flush_color);
2373
2374                list_del_init(&next->list);
2375                wq->first_flusher = next;
2376
2377                if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2378                        break;
2379
2380                /*
2381                 * Meh... this color is already done, clear first
2382                 * flusher and repeat cascading.
2383                 */
2384                wq->first_flusher = NULL;
2385        }
2386
2387out_unlock:
2388        mutex_unlock(&wq->flush_mutex);
2389}
2390EXPORT_SYMBOL_GPL(flush_workqueue);
2391
2392/**
2393 * drain_workqueue - drain a workqueue
2394 * @wq: workqueue to drain
2395 *
2396 * Wait until the workqueue becomes empty.  While draining is in progress,
2397 * only chain queueing is allowed.  IOW, only currently pending or running
2398 * work items on @wq can queue further work items on it.  @wq is flushed
2399 * repeatedly until it becomes empty.  The number of flushing is detemined
2400 * by the depth of chaining and should be relatively short.  Whine if it
2401 * takes too long.
2402 */
2403void drain_workqueue(struct workqueue_struct *wq)
2404{
2405        unsigned int flush_cnt = 0;
2406        unsigned int cpu;
2407
2408        /*
2409         * __queue_work() needs to test whether there are drainers, is much
2410         * hotter than drain_workqueue() and already looks at @wq->flags.
2411         * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2412         */
2413        spin_lock(&workqueue_lock);
2414        if (!wq->nr_drainers++)
2415                wq->flags |= WQ_DRAINING;
2416        spin_unlock(&workqueue_lock);
2417reflush:
2418        flush_workqueue(wq);
2419
2420        for_each_cwq_cpu(cpu, wq) {
2421                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2422                bool drained;
2423
2424                spin_lock_irq(&cwq->gcwq->lock);
2425                drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2426                spin_unlock_irq(&cwq->gcwq->lock);
2427
2428                if (drained)
2429                        continue;
2430
2431                if (++flush_cnt == 10 ||
2432                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2433                        pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
2434                                   wq->name, flush_cnt);
2435                goto reflush;
2436        }
2437
2438        spin_lock(&workqueue_lock);
2439        if (!--wq->nr_drainers)
2440                wq->flags &= ~WQ_DRAINING;
2441        spin_unlock(&workqueue_lock);
2442}
2443EXPORT_SYMBOL_GPL(drain_workqueue);
2444
2445static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2446                             bool wait_executing)
2447{
2448        struct worker *worker = NULL;
2449        struct global_cwq *gcwq;
2450        struct cpu_workqueue_struct *cwq;
2451
2452        might_sleep();
2453        gcwq = get_work_gcwq(work);
2454        if (!gcwq)
2455                return false;
2456
2457        spin_lock_irq(&gcwq->lock);
2458        if (!list_empty(&work->entry)) {
2459                /*
2460                 * See the comment near try_to_grab_pending()->smp_rmb().
2461                 * If it was re-queued to a different gcwq under us, we
2462                 * are not going to wait.
2463                 */
2464                smp_rmb();
2465                cwq = get_work_cwq(work);
2466                if (unlikely(!cwq || gcwq != cwq->gcwq))
2467                        goto already_gone;
2468        } else if (wait_executing) {
2469                worker = find_worker_executing_work(gcwq, work);
2470                if (!worker)
2471                        goto already_gone;
2472                cwq = worker->current_cwq;
2473        } else
2474                goto already_gone;
2475
2476        insert_wq_barrier(cwq, barr, work, worker);
2477        spin_unlock_irq(&gcwq->lock);
2478
2479        /*
2480         * If @max_active is 1 or rescuer is in use, flushing another work
2481         * item on the same workqueue may lead to deadlock.  Make sure the
2482         * flusher is not running on the same workqueue by verifying write
2483         * access.
2484         */
2485        if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2486                lock_map_acquire(&cwq->wq->lockdep_map);
2487        else
2488                lock_map_acquire_read(&cwq->wq->lockdep_map);
2489        lock_map_release(&cwq->wq->lockdep_map);
2490
2491        return true;
2492already_gone:
2493        spin_unlock_irq(&gcwq->lock);
2494        return false;
2495}
2496
2497/**
2498 * flush_work - wait for a work to finish executing the last queueing instance
2499 * @work: the work to flush
2500 *
2501 * Wait until @work has finished execution.  This function considers
2502 * only the last queueing instance of @work.  If @work has been
2503 * enqueued across different CPUs on a non-reentrant workqueue or on
2504 * multiple workqueues, @work might still be executing on return on
2505 * some of the CPUs from earlier queueing.
2506 *
2507 * If @work was queued only on a non-reentrant, ordered or unbound
2508 * workqueue, @work is guaranteed to be idle on return if it hasn't
2509 * been requeued since flush started.
2510 *
2511 * RETURNS:
2512 * %true if flush_work() waited for the work to finish execution,
2513 * %false if it was already idle.
2514 */
2515bool flush_work(struct work_struct *work)
2516{
2517        struct wq_barrier barr;
2518
2519        lock_map_acquire(&work->lockdep_map);
2520        lock_map_release(&work->lockdep_map);
2521
2522        if (start_flush_work(work, &barr, true)) {
2523                wait_for_completion(&barr.done);
2524                destroy_work_on_stack(&barr.work);
2525                return true;
2526        } else
2527                return false;
2528}
2529EXPORT_SYMBOL_GPL(flush_work);
2530
2531static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2532{
2533        struct wq_barrier barr;
2534        struct worker *worker;
2535
2536        spin_lock_irq(&gcwq->lock);
2537
2538        worker = find_worker_executing_work(gcwq, work);
2539        if (unlikely(worker))
2540                insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2541
2542        spin_unlock_irq(&gcwq->lock);
2543
2544        if (unlikely(worker)) {
2545                wait_for_completion(&barr.done);
2546                destroy_work_on_stack(&barr.work);
2547                return true;
2548        } else
2549                return false;
2550}
2551
2552static bool wait_on_work(struct work_struct *work)
2553{
2554        bool ret = false;
2555        int cpu;
2556
2557        might_sleep();
2558
2559        lock_map_acquire(&work->lockdep_map);
2560        lock_map_release(&work->lockdep_map);
2561
2562        for_each_gcwq_cpu(cpu)
2563                ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2564        return ret;
2565}
2566
2567/**
2568 * flush_work_sync - wait until a work has finished execution
2569 * @work: the work to flush
2570 *
2571 * Wait until @work has finished execution.  On return, it's
2572 * guaranteed that all queueing instances of @work which happened
2573 * before this function is called are finished.  In other words, if
2574 * @work hasn't been requeued since this function was called, @work is
2575 * guaranteed to be idle on return.
2576 *
2577 * RETURNS:
2578 * %true if flush_work_sync() waited for the work to finish execution,
2579 * %false if it was already idle.
2580 */
2581bool flush_work_sync(struct work_struct *work)
2582{
2583        struct wq_barrier barr;
2584        bool pending, waited;
2585
2586        /* we'll wait for executions separately, queue barr only if pending */
2587        pending = start_flush_work(work, &barr, false);
2588
2589        /* wait for executions to finish */
2590        waited = wait_on_work(work);
2591
2592        /* wait for the pending one */
2593        if (pending) {
2594                wait_for_completion(&barr.done);
2595                destroy_work_on_stack(&barr.work);
2596        }
2597
2598        return pending || waited;
2599}
2600EXPORT_SYMBOL_GPL(flush_work_sync);
2601
2602/*
2603 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2604 * so this work can't be re-armed in any way.
2605 */
2606static int try_to_grab_pending(struct work_struct *work)
2607{
2608        struct global_cwq *gcwq;
2609        int ret = -1;
2610
2611        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2612                return 0;
2613
2614        /*
2615         * The queueing is in progress, or it is already queued. Try to
2616         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2617         */
2618        gcwq = get_work_gcwq(work);
2619        if (!gcwq)
2620                return ret;
2621
2622        spin_lock_irq(&gcwq->lock);
2623        if (!list_empty(&work->entry)) {
2624                /*
2625                 * This work is queued, but perhaps we locked the wrong gcwq.
2626                 * In that case we must see the new value after rmb(), see
2627                 * insert_work()->wmb().
2628                 */
2629                smp_rmb();
2630                if (gcwq == get_work_gcwq(work)) {
2631                        debug_work_deactivate(work);
2632                        list_del_init(&work->entry);
2633                        cwq_dec_nr_in_flight(get_work_cwq(work),
2634                                get_work_color(work),
2635                                *work_data_bits(work) & WORK_STRUCT_DELAYED);
2636                        ret = 1;
2637                }
2638        }
2639        spin_unlock_irq(&gcwq->lock);
2640
2641        return ret;
2642}
2643
2644static bool __cancel_work_timer(struct work_struct *work,
2645                                struct timer_list* timer)
2646{
2647        int ret;
2648
2649        do {
2650                ret = (timer && likely(del_timer(timer)));
2651                if (!ret)
2652                        ret = try_to_grab_pending(work);
2653                wait_on_work(work);
2654        } while (unlikely(ret < 0));
2655
2656        clear_work_data(work);
2657        return ret;
2658}
2659
2660/**
2661 * cancel_work_sync - cancel a work and wait for it to finish
2662 * @work: the work to cancel
2663 *
2664 * Cancel @work and wait for its execution to finish.  This function
2665 * can be used even if the work re-queues itself or migrates to
2666 * another workqueue.  On return from this function, @work is
2667 * guaranteed to be not pending or executing on any CPU.
2668 *
2669 * cancel_work_sync(&delayed_work->work) must not be used for
2670 * delayed_work's.  Use cancel_delayed_work_sync() instead.
2671 *
2672 * The caller must ensure that the workqueue on which @work was last
2673 * queued can't be destroyed before this function returns.
2674 *
2675 * RETURNS:
2676 * %true if @work was pending, %false otherwise.
2677 */
2678bool cancel_work_sync(struct work_struct *work)
2679{
2680        return __cancel_work_timer(work, NULL);
2681}
2682EXPORT_SYMBOL_GPL(cancel_work_sync);
2683
2684/**
2685 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2686 * @dwork: the delayed work to flush
2687 *
2688 * Delayed timer is cancelled and the pending work is queued for
2689 * immediate execution.  Like flush_work(), this function only
2690 * considers the last queueing instance of @dwork.
2691 *
2692 * RETURNS:
2693 * %true if flush_work() waited for the work to finish execution,
2694 * %false if it was already idle.
2695 */
2696bool flush_delayed_work(struct delayed_work *dwork)
2697{
2698        if (del_timer_sync(&dwork->timer))
2699                __queue_work(raw_smp_processor_id(),
2700                             get_work_cwq(&dwork->work)->wq, &dwork->work);
2701        return flush_work(&dwork->work);
2702}
2703EXPORT_SYMBOL(flush_delayed_work);
2704
2705/**
2706 * flush_delayed_work_sync - wait for a dwork to finish
2707 * @dwork: the delayed work to flush
2708 *
2709 * Delayed timer is cancelled and the pending work is queued for
2710 * execution immediately.  Other than timer handling, its behavior
2711 * is identical to flush_work_sync().
2712 *
2713 * RETURNS:
2714 * %true if flush_work_sync() waited for the work to finish execution,
2715 * %false if it was already idle.
2716 */
2717bool flush_delayed_work_sync(struct delayed_work *dwork)
2718{
2719        if (del_timer_sync(&dwork->timer))
2720                __queue_work(raw_smp_processor_id(),
2721                             get_work_cwq(&dwork->work)->wq, &dwork->work);
2722        return flush_work_sync(&dwork->work);
2723}
2724EXPORT_SYMBOL(flush_delayed_work_sync);
2725
2726/**
2727 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2728 * @dwork: the delayed work cancel
2729 *
2730 * This is cancel_work_sync() for delayed works.
2731 *
2732 * RETURNS:
2733 * %true if @dwork was pending, %false otherwise.
2734 */
2735bool cancel_delayed_work_sync(struct delayed_work *dwork)
2736{
2737        return __cancel_work_timer(&dwork->work, &dwork->timer);
2738}
2739EXPORT_SYMBOL(cancel_delayed_work_sync);
2740
2741/**
2742 * schedule_work - put work task in global workqueue
2743 * @work: job to be done
2744 *
2745 * Returns zero if @work was already on the kernel-global workqueue and
2746 * non-zero otherwise.
2747 *
2748 * This puts a job in the kernel-global workqueue if it was not already
2749 * queued and leaves it in the same position on the kernel-global
2750 * workqueue otherwise.
2751 */
2752int schedule_work(struct work_struct *work)
2753{
2754        return queue_work(system_wq, work);
2755}
2756EXPORT_SYMBOL(schedule_work);
2757
2758/*
2759 * schedule_work_on - put work task on a specific cpu
2760 * @cpu: cpu to put the work task on
2761 * @work: job to be done
2762 *
2763 * This puts a job on a specific cpu
2764 */
2765int schedule_work_on(int cpu, struct work_struct *work)
2766{
2767        return queue_work_on(cpu, system_wq, work);
2768}
2769EXPORT_SYMBOL(schedule_work_on);
2770
2771/**
2772 * schedule_delayed_work - put work task in global workqueue after delay
2773 * @dwork: job to be done
2774 * @delay: number of jiffies to wait or 0 for immediate execution
2775 *
2776 * After waiting for a given time this puts a job in the kernel-global
2777 * workqueue.
2778 */
2779int schedule_delayed_work(struct delayed_work *dwork,
2780                                        unsigned long delay)
2781{
2782        return queue_delayed_work(system_wq, dwork, delay);
2783}
2784EXPORT_SYMBOL(schedule_delayed_work);
2785
2786/**
2787 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2788 * @cpu: cpu to use
2789 * @dwork: job to be done
2790 * @delay: number of jiffies to wait
2791 *
2792 * After waiting for a given time this puts a job in the kernel-global
2793 * workqueue on the specified CPU.
2794 */
2795int schedule_delayed_work_on(int cpu,
2796                        struct delayed_work *dwork, unsigned long delay)
2797{
2798        return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2799}
2800EXPORT_SYMBOL(schedule_delayed_work_on);
2801
2802/**
2803 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2804 * @func: the function to call
2805 *
2806 * schedule_on_each_cpu() executes @func on each online CPU using the
2807 * system workqueue and blocks until all CPUs have completed.
2808 * schedule_on_each_cpu() is very slow.
2809 *
2810 * RETURNS:
2811 * 0 on success, -errno on failure.
2812 */
2813int schedule_on_each_cpu(work_func_t func)
2814{
2815        int cpu;
2816        struct work_struct __percpu *works;
2817
2818        works = alloc_percpu(struct work_struct);
2819        if (!works)
2820                return -ENOMEM;
2821
2822        get_online_cpus();
2823
2824        for_each_online_cpu(cpu) {
2825                struct work_struct *work = per_cpu_ptr(works, cpu);
2826
2827                INIT_WORK(work, func);
2828                schedule_work_on(cpu, work);
2829        }
2830
2831        for_each_online_cpu(cpu)
2832                flush_work(per_cpu_ptr(works, cpu));
2833
2834        put_online_cpus();
2835        free_percpu(works);
2836        return 0;
2837}
2838
2839/**
2840 * flush_scheduled_work - ensure that any scheduled work has run to completion.
2841 *
2842 * Forces execution of the kernel-global workqueue and blocks until its
2843 * completion.
2844 *
2845 * Think twice before calling this function!  It's very easy to get into
2846 * trouble if you don't take great care.  Either of the following situations
2847 * will lead to deadlock:
2848 *
2849 *      One of the work items currently on the workqueue needs to acquire
2850 *      a lock held by your code or its caller.
2851 *
2852 *      Your code is running in the context of a work routine.
2853 *
2854 * They will be detected by lockdep when they occur, but the first might not
2855 * occur very often.  It depends on what work items are on the workqueue and
2856 * what locks they need, which you have no control over.
2857 *
2858 * In most situations flushing the entire workqueue is overkill; you merely
2859 * need to know that a particular work item isn't queued and isn't running.
2860 * In such cases you should use cancel_delayed_work_sync() or
2861 * cancel_work_sync() instead.
2862 */
2863void flush_scheduled_work(void)
2864{
2865        flush_workqueue(system_wq);
2866}
2867EXPORT_SYMBOL(flush_scheduled_work);
2868
2869/**
2870 * execute_in_process_context - reliably execute the routine with user context
2871 * @fn:         the function to execute
2872 * @ew:         guaranteed storage for the execute work structure (must
2873 *              be available when the work executes)
2874 *
2875 * Executes the function immediately if process context is available,
2876 * otherwise schedules the function for delayed execution.
2877 *
2878 * Returns:     0 - function was executed
2879 *              1 - function was scheduled for execution
2880 */
2881int execute_in_process_context(work_func_t fn, struct execute_work *ew)
2882{
2883        if (!in_interrupt()) {
2884                fn(&ew->work);
2885                return 0;
2886        }
2887
2888        INIT_WORK(&ew->work, fn);
2889        schedule_work(&ew->work);
2890
2891        return 1;
2892}
2893EXPORT_SYMBOL_GPL(execute_in_process_context);
2894
2895int keventd_up(void)
2896{
2897        return system_wq != NULL;
2898}
2899
2900static int alloc_cwqs(struct workqueue_struct *wq)
2901{
2902        /*
2903         * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
2904         * Make sure that the alignment isn't lower than that of
2905         * unsigned long long.
2906         */
2907        const size_t size = sizeof(struct cpu_workqueue_struct);
2908        const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2909                                   __alignof__(unsigned long long));
2910
2911        if (!(wq->flags & WQ_UNBOUND))
2912                wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2913        else {
2914                void *ptr;
2915
2916                /*
2917                 * Allocate enough room to align cwq and put an extra
2918                 * pointer at the end pointing back to the originally
2919                 * allocated pointer which will be used for free.
2920                 */
2921                ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2922                if (ptr) {
2923                        wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2924                        *(void **)(wq->cpu_wq.single + 1) = ptr;
2925                }
2926        }
2927
2928        /* just in case, make sure it's actually aligned */
2929        BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2930        return wq->cpu_wq.v ? 0 : -ENOMEM;
2931}
2932
2933static void free_cwqs(struct workqueue_struct *wq)
2934{
2935        if (!(wq->flags & WQ_UNBOUND))
2936                free_percpu(wq->cpu_wq.pcpu);
2937        else if (wq->cpu_wq.single) {
2938                /* the pointer to free is stored right after the cwq */
2939                kfree(*(void **)(wq->cpu_wq.single + 1));
2940        }
2941}
2942
2943static int wq_clamp_max_active(int max_active, unsigned int flags,
2944                               const char *name)
2945{
2946        int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
2947
2948        if (max_active < 1 || max_active > lim)
2949                printk(KERN_WARNING "workqueue: max_active %d requested for %s "
2950                       "is out of range, clamping between %d and %d\n",
2951                       max_active, name, 1, lim);
2952
2953        return clamp_val(max_active, 1, lim);
2954}
2955
2956struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2957                                               unsigned int flags,
2958                                               int max_active,
2959                                               struct lock_class_key *key,
2960                                               const char *lock_name, ...)
2961{
2962        va_list args, args1;
2963        struct workqueue_struct *wq;
2964        unsigned int cpu;
2965        size_t namelen;
2966
2967        /* determine namelen, allocate wq and format name */
2968        va_start(args, lock_name);
2969        va_copy(args1, args);
2970        namelen = vsnprintf(NULL, 0, fmt, args) + 1;
2971
2972        wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
2973        if (!wq)
2974                goto err;
2975
2976        vsnprintf(wq->name, namelen, fmt, args1);
2977        va_end(args);
2978        va_end(args1);
2979
2980        /*
2981         * Workqueues which may be used during memory reclaim should
2982         * have a rescuer to guarantee forward progress.
2983         */
2984        if (flags & WQ_MEM_RECLAIM)
2985                flags |= WQ_RESCUER;
2986
2987        /*
2988         * Unbound workqueues aren't concurrency managed and should be
2989         * dispatched to workers immediately.
2990         */
2991        if (flags & WQ_UNBOUND)
2992                flags |= WQ_HIGHPRI;
2993
2994        max_active = max_active ?: WQ_DFL_ACTIVE;
2995        max_active = wq_clamp_max_active(max_active, flags, wq->name);
2996
2997        /* init wq */
2998        wq->flags = flags;
2999        wq->saved_max_active = max_active;
3000        mutex_init(&wq->flush_mutex);
3001        atomic_set(&wq->nr_cwqs_to_flush, 0);
3002        INIT_LIST_HEAD(&wq->flusher_queue);
3003        INIT_LIST_HEAD(&wq->flusher_overflow);
3004
3005        lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3006        INIT_LIST_HEAD(&wq->list);
3007
3008        if (alloc_cwqs(wq) < 0)
3009                goto err;
3010
3011        for_each_cwq_cpu(cpu, wq) {
3012                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3013                struct global_cwq *gcwq = get_gcwq(cpu);
3014
3015                BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3016                cwq->gcwq = gcwq;
3017                cwq->wq = wq;
3018                cwq->flush_color = -1;
3019                cwq->max_active = max_active;
3020                INIT_LIST_HEAD(&cwq->delayed_works);
3021        }
3022
3023        if (flags & WQ_RESCUER) {
3024                struct worker *rescuer;
3025
3026                if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
3027                        goto err;
3028
3029                wq->rescuer = rescuer = alloc_worker();
3030                if (!rescuer)
3031                        goto err;
3032
3033                rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3034                                               wq->name);
3035                if (IS_ERR(rescuer->task))
3036                        goto err;
3037
3038                rescuer->task->flags |= PF_THREAD_BOUND;
3039                wake_up_process(rescuer->task);
3040        }
3041
3042        /*
3043         * workqueue_lock protects global freeze state and workqueues
3044         * list.  Grab it, set max_active accordingly and add the new
3045         * workqueue to workqueues list.
3046         */
3047        spin_lock(&workqueue_lock);
3048
3049        if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3050                for_each_cwq_cpu(cpu, wq)
3051                        get_cwq(cpu, wq)->max_active = 0;
3052
3053        list_add(&wq->list, &workqueues);
3054
3055        spin_unlock(&workqueue_lock);
3056
3057        return wq;
3058err:
3059        if (wq) {
3060                free_cwqs(wq);
3061                free_mayday_mask(wq->mayday_mask);
3062                kfree(wq->rescuer);
3063                kfree(wq);
3064        }
3065        return NULL;
3066}
3067EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3068
3069/**
3070 * destroy_workqueue - safely terminate a workqueue
3071 * @wq: target workqueue
3072 *
3073 * Safely destroy a workqueue. All work currently pending will be done first.
3074 */
3075void destroy_workqueue(struct workqueue_struct *wq)
3076{
3077        unsigned int cpu;
3078
3079        /* drain it before proceeding with destruction */
3080        drain_workqueue(wq);
3081
3082        /*
3083         * wq list is used to freeze wq, remove from list after
3084         * flushing is complete in case freeze races us.
3085         */
3086        spin_lock(&workqueue_lock);
3087        list_del(&wq->list);
3088        spin_unlock(&workqueue_lock);
3089
3090        /* sanity check */
3091        for_each_cwq_cpu(cpu, wq) {
3092                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3093                int i;
3094
3095                for (i = 0; i < WORK_NR_COLORS; i++)
3096                        BUG_ON(cwq->nr_in_flight[i]);
3097                BUG_ON(cwq->nr_active);
3098                BUG_ON(!list_empty(&cwq->delayed_works));
3099        }
3100
3101        if (wq->flags & WQ_RESCUER) {
3102                kthread_stop(wq->rescuer->task);
3103                free_mayday_mask(wq->mayday_mask);
3104                kfree(wq->rescuer);
3105        }
3106
3107        free_cwqs(wq);
3108        kfree(wq);
3109}
3110EXPORT_SYMBOL_GPL(destroy_workqueue);
3111
3112/**
3113 * workqueue_set_max_active - adjust max_active of a workqueue
3114 * @wq: target workqueue
3115 * @max_active: new max_active value.
3116 *
3117 * Set max_active of @wq to @max_active.
3118 *
3119 * CONTEXT:
3120 * Don't call from IRQ context.
3121 */
3122void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3123{
3124        unsigned int cpu;
3125
3126        max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3127
3128        spin_lock(&workqueue_lock);
3129
3130        wq->saved_max_active = max_active;
3131
3132        for_each_cwq_cpu(cpu, wq) {
3133                struct global_cwq *gcwq = get_gcwq(cpu);
3134
3135                spin_lock_irq(&gcwq->lock);
3136
3137                if (!(wq->flags & WQ_FREEZABLE) ||
3138                    !(gcwq->flags & GCWQ_FREEZING))
3139                        get_cwq(gcwq->cpu, wq)->max_active = max_active;
3140
3141                spin_unlock_irq(&gcwq->lock);
3142        }
3143
3144        spin_unlock(&workqueue_lock);
3145}
3146EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3147
3148/**
3149 * workqueue_congested - test whether a workqueue is congested
3150 * @cpu: CPU in question
3151 * @wq: target workqueue
3152 *
3153 * Test whether @wq's cpu workqueue for @cpu is congested.  There is
3154 * no synchronization around this function and the test result is
3155 * unreliable and only useful as advisory hints or for debugging.
3156 *
3157 * RETURNS:
3158 * %true if congested, %false otherwise.
3159 */
3160bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3161{
3162        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3163
3164        return !list_empty(&cwq->delayed_works);
3165}
3166EXPORT_SYMBOL_GPL(workqueue_congested);
3167
3168/**
3169 * work_cpu - return the last known associated cpu for @work
3170 * @work: the work of interest
3171 *
3172 * RETURNS:
3173 * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise.
3174 */
3175unsigned int work_cpu(struct work_struct *work)
3176{
3177        struct global_cwq *gcwq = get_work_gcwq(work);
3178
3179        return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3180}
3181EXPORT_SYMBOL_GPL(work_cpu);
3182
3183/**
3184 * work_busy - test whether a work is currently pending or running
3185 * @work: the work to be tested
3186 *
3187 * Test whether @work is currently pending or running.  There is no
3188 * synchronization around this function and the test result is
3189 * unreliable and only useful as advisory hints or for debugging.
3190 * Especially for reentrant wqs, the pending state might hide the
3191 * running state.
3192 *
3193 * RETURNS:
3194 * OR'd bitmask of WORK_BUSY_* bits.
3195 */
3196unsigned int work_busy(struct work_struct *work)
3197{
3198        struct global_cwq *gcwq = get_work_gcwq(work);
3199        unsigned long flags;
3200        unsigned int ret = 0;
3201
3202        if (!gcwq)
3203                return false;
3204
3205        spin_lock_irqsave(&gcwq->lock, flags);
3206
3207        if (work_pending(work))
3208                ret |= WORK_BUSY_PENDING;
3209        if (find_worker_executing_work(gcwq, work))
3210                ret |= WORK_BUSY_RUNNING;
3211
3212        spin_unlock_irqrestore(&gcwq->lock, flags);
3213
3214        return ret;
3215}
3216EXPORT_SYMBOL_GPL(work_busy);
3217
3218/*
3219 * CPU hotplug.
3220 *
3221 * There are two challenges in supporting CPU hotplug.  Firstly, there
3222 * are a lot of assumptions on strong associations among work, cwq and
3223 * gcwq which make migrating pending and scheduled works very
3224 * difficult to implement without impacting hot paths.  Secondly,
3225 * gcwqs serve mix of short, long and very long running works making
3226 * blocked draining impractical.
3227 *
3228 * This is solved by allowing a gcwq to be detached from CPU, running
3229 * it with unbound (rogue) workers and allowing it to be reattached
3230 * later if the cpu comes back online.  A separate thread is created
3231 * to govern a gcwq in such state and is called the trustee of the
3232 * gcwq.
3233 *
3234 * Trustee states and their descriptions.
3235 *
3236 * START        Command state used on startup.  On CPU_DOWN_PREPARE, a
3237 *              new trustee is started with this state.
3238 *
3239 * IN_CHARGE    Once started, trustee will enter this state after
3240 *              assuming the manager role and making all existing
3241 *              workers rogue.  DOWN_PREPARE waits for trustee to
3242 *              enter this state.  After reaching IN_CHARGE, trustee
3243 *              tries to execute the pending worklist until it's empty
3244 *              and the state is set to BUTCHER, or the state is set
3245 *              to RELEASE.
3246 *
3247 * BUTCHER      Command state which is set by the cpu callback after
3248 *              the cpu has went down.  Once this state is set trustee
3249 *              knows that there will be no new works on the worklist
3250 *              and once the worklist is empty it can proceed to
3251 *              killing idle workers.
3252 *
3253 * RELEASE      Command state which is set by the cpu callback if the
3254 *              cpu down has been canceled or it has come online
3255 *              again.  After recognizing this state, trustee stops
3256 *              trying to drain or butcher and clears ROGUE, rebinds
3257 *              all remaining workers back to the cpu and releases
3258 *              manager role.
3259 *
3260 * DONE         Trustee will enter this state after BUTCHER or RELEASE
3261 *              is complete.
3262 *
3263 *          trustee                 CPU                draining
3264 *         took over                down               complete
3265 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3266 *                        |                     |                  ^
3267 *                        | CPU is back online  v   return workers |
3268 *                         ----------------> RELEASE --------------
3269 */
3270
3271/**
3272 * trustee_wait_event_timeout - timed event wait for trustee
3273 * @cond: condition to wait for
3274 * @timeout: timeout in jiffies
3275 *
3276 * wait_event_timeout() for trustee to use.  Handles locking and
3277 * checks for RELEASE request.
3278 *
3279 * CONTEXT:
3280 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3281 * multiple times.  To be used by trustee.
3282 *
3283 * RETURNS:
3284 * Positive indicating left time if @cond is satisfied, 0 if timed
3285 * out, -1 if canceled.
3286 */
3287#define trustee_wait_event_timeout(cond, timeout) ({                    \
3288        long __ret = (timeout);                                         \
3289        while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3290               __ret) {                                                 \
3291                spin_unlock_irq(&gcwq->lock);                           \
3292                __wait_event_timeout(gcwq->trustee_wait, (cond) ||      \
3293                        (gcwq->trustee_state == TRUSTEE_RELEASE),       \
3294                        __ret);                                         \
3295                spin_lock_irq(&gcwq->lock);                             \
3296        }                                                               \
3297        gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);          \
3298})
3299
3300/**
3301 * trustee_wait_event - event wait for trustee
3302 * @cond: condition to wait for
3303 *
3304 * wait_event() for trustee to use.  Automatically handles locking and
3305 * checks for CANCEL request.
3306 *
3307 * CONTEXT:
3308 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3309 * multiple times.  To be used by trustee.
3310 *
3311 * RETURNS:
3312 * 0 if @cond is satisfied, -1 if canceled.
3313 */
3314#define trustee_wait_event(cond) ({                                     \
3315        long __ret1;                                                    \
3316        __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3317        __ret1 < 0 ? -1 : 0;                                            \
3318})
3319
3320static int __cpuinit trustee_thread(void *__gcwq)
3321{
3322        struct global_cwq *gcwq = __gcwq;
3323        struct worker *worker;
3324        struct work_struct *work;
3325        struct hlist_node *pos;
3326        long rc;
3327        int i;
3328
3329        BUG_ON(gcwq->cpu != smp_processor_id());
3330
3331        spin_lock_irq(&gcwq->lock);
3332        /*
3333         * Claim the manager position and make all workers rogue.
3334         * Trustee must be bound to the target cpu and can't be
3335         * cancelled.
3336         */
3337        BUG_ON(gcwq->cpu != smp_processor_id());
3338        rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3339        BUG_ON(rc < 0);
3340
3341        gcwq->flags |= GCWQ_MANAGING_WORKERS;
3342
3343        list_for_each_entry(worker, &gcwq->idle_list, entry)
3344                worker->flags |= WORKER_ROGUE;
3345
3346        for_each_busy_worker(worker, i, pos, gcwq)
3347                worker->flags |= WORKER_ROGUE;
3348
3349        /*
3350         * Call schedule() so that we cross rq->lock and thus can
3351         * guarantee sched callbacks see the rogue flag.  This is
3352         * necessary as scheduler callbacks may be invoked from other
3353         * cpus.
3354         */
3355        spin_unlock_irq(&gcwq->lock);
3356        schedule();
3357        spin_lock_irq(&gcwq->lock);
3358
3359        /*
3360         * Sched callbacks are disabled now.  Zap nr_running.  After
3361         * this, nr_running stays zero and need_more_worker() and
3362         * keep_working() are always true as long as the worklist is
3363         * not empty.
3364         */
3365        atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3366
3367        spin_unlock_irq(&gcwq->lock);
3368        del_timer_sync(&gcwq->idle_timer);
3369        spin_lock_irq(&gcwq->lock);
3370
3371        /*
3372         * We're now in charge.  Notify and proceed to drain.  We need
3373         * to keep the gcwq running during the whole CPU down
3374         * procedure as other cpu hotunplug callbacks may need to
3375         * flush currently running tasks.
3376         */
3377        gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3378        wake_up_all(&gcwq->trustee_wait);
3379
3380        /*
3381         * The original cpu is in the process of dying and may go away
3382         * anytime now.  When that happens, we and all workers would
3383         * be migrated to other cpus.  Try draining any left work.  We
3384         * want to get it over with ASAP - spam rescuers, wake up as
3385         * many idlers as necessary and create new ones till the
3386         * worklist is empty.  Note that if the gcwq is frozen, there
3387         * may be frozen works in freezable cwqs.  Don't declare
3388         * completion while frozen.
3389         */
3390        while (gcwq->nr_workers != gcwq->nr_idle ||
3391               gcwq->flags & GCWQ_FREEZING ||
3392               gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3393                int nr_works = 0;
3394
3395                list_for_each_entry(work, &gcwq->worklist, entry) {
3396                        send_mayday(work);
3397                        nr_works++;
3398                }
3399
3400                list_for_each_entry(worker, &gcwq->idle_list, entry) {
3401                        if (!nr_works--)
3402                                break;
3403                        wake_up_process(worker->task);
3404                }
3405
3406                if (need_to_create_worker(gcwq)) {
3407                        spin_unlock_irq(&gcwq->lock);
3408                        worker = create_worker(gcwq, false);
3409                        spin_lock_irq(&gcwq->lock);
3410                        if (worker) {
3411                                worker->flags |= WORKER_ROGUE;
3412                                start_worker(worker);
3413                        }
3414                }
3415
3416                /* give a breather */
3417                if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3418                        break;
3419        }
3420
3421        /*
3422         * Either all works have been scheduled and cpu is down, or
3423         * cpu down has already been canceled.  Wait for and butcher
3424         * all workers till we're canceled.
3425         */
3426        do {
3427                rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3428                while (!list_empty(&gcwq->idle_list))
3429                        destroy_worker(list_first_entry(&gcwq->idle_list,
3430                                                        struct worker, entry));
3431        } while (gcwq->nr_workers && rc >= 0);
3432
3433        /*
3434         * At this point, either draining has completed and no worker
3435         * is left, or cpu down has been canceled or the cpu is being
3436         * brought back up.  There shouldn't be any idle one left.
3437         * Tell the remaining busy ones to rebind once it finishes the
3438         * currently scheduled works by scheduling the rebind_work.
3439         */
3440        WARN_ON(!list_empty(&gcwq->idle_list));
3441
3442        for_each_busy_worker(worker, i, pos, gcwq) {
3443                struct work_struct *rebind_work = &worker->rebind_work;
3444                unsigned long worker_flags = worker->flags;
3445
3446                /*
3447                 * Rebind_work may race with future cpu hotplug
3448                 * operations.  Use a separate flag to mark that
3449                 * rebinding is scheduled.  The morphing should
3450                 * be atomic.
3451                 */
3452                worker_flags |= WORKER_REBIND;
3453                worker_flags &= ~WORKER_ROGUE;
3454                ACCESS_ONCE(worker->flags) = worker_flags;
3455
3456                /* queue rebind_work, wq doesn't matter, use the default one */
3457                if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3458                                     work_data_bits(rebind_work)))
3459                        continue;
3460
3461                debug_work_activate(rebind_work);
3462                insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3463                            worker->scheduled.next,
3464                            work_color_to_flags(WORK_NO_COLOR));
3465        }
3466
3467        /* relinquish manager role */
3468        gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3469
3470        /* notify completion */
3471        gcwq->trustee = NULL;
3472        gcwq->trustee_state = TRUSTEE_DONE;
3473        wake_up_all(&gcwq->trustee_wait);
3474        spin_unlock_irq(&gcwq->lock);
3475        return 0;
3476}
3477
3478/**
3479 * wait_trustee_state - wait for trustee to enter the specified state
3480 * @gcwq: gcwq the trustee of interest belongs to
3481 * @state: target state to wait for
3482 *
3483 * Wait for the trustee to reach @state.  DONE is already matched.
3484 *
3485 * CONTEXT:
3486 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3487 * multiple times.  To be used by cpu_callback.
3488 */
3489static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3490__releases(&gcwq->lock)
3491__acquires(&gcwq->lock)
3492{
3493        if (!(gcwq->trustee_state == state ||
3494              gcwq->trustee_state == TRUSTEE_DONE)) {
3495                spin_unlock_irq(&gcwq->lock);
3496                __wait_event(gcwq->trustee_wait,
3497                             gcwq->trustee_state == state ||
3498                             gcwq->trustee_state == TRUSTEE_DONE);
3499                spin_lock_irq(&gcwq->lock);
3500        }
3501}
3502
3503static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3504                                                unsigned long action,
3505                                                void *hcpu)
3506{
3507        unsigned int cpu = (unsigned long)hcpu;
3508        struct global_cwq *gcwq = get_gcwq(cpu);
3509        struct task_struct *new_trustee = NULL;
3510        struct worker *uninitialized_var(new_worker);
3511        unsigned long flags;
3512
3513        action &= ~CPU_TASKS_FROZEN;
3514
3515        switch (action) {
3516        case CPU_DOWN_PREPARE:
3517                new_trustee = kthread_create(trustee_thread, gcwq,
3518                                             "workqueue_trustee/%d\n", cpu);
3519                if (IS_ERR(new_trustee))
3520                        return notifier_from_errno(PTR_ERR(new_trustee));
3521                kthread_bind(new_trustee, cpu);
3522                /* fall through */
3523        case CPU_UP_PREPARE:
3524                BUG_ON(gcwq->first_idle);
3525                new_worker = create_worker(gcwq, false);
3526                if (!new_worker) {
3527                        if (new_trustee)
3528                                kthread_stop(new_trustee);
3529                        return NOTIFY_BAD;
3530                }
3531        }
3532
3533        /* some are called w/ irq disabled, don't disturb irq status */
3534        spin_lock_irqsave(&gcwq->lock, flags);
3535
3536        switch (action) {
3537        case CPU_DOWN_PREPARE:
3538                /* initialize trustee and tell it to acquire the gcwq */
3539                BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3540                gcwq->trustee = new_trustee;
3541                gcwq->trustee_state = TRUSTEE_START;
3542                wake_up_process(gcwq->trustee);
3543                wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3544                /* fall through */
3545        case CPU_UP_PREPARE:
3546                BUG_ON(gcwq->first_idle);
3547                gcwq->first_idle = new_worker;
3548                break;
3549
3550        case CPU_DYING:
3551                /*
3552                 * Before this, the trustee and all workers except for
3553                 * the ones which are still executing works from
3554                 * before the last CPU down must be on the cpu.  After
3555                 * this, they'll all be diasporas.
3556                 */
3557                gcwq->flags |= GCWQ_DISASSOCIATED;
3558                break;
3559
3560        case CPU_POST_DEAD:
3561                gcwq->trustee_state = TRUSTEE_BUTCHER;
3562                /* fall through */
3563        case CPU_UP_CANCELED:
3564                destroy_worker(gcwq->first_idle);
3565                gcwq->first_idle = NULL;
3566                break;
3567
3568        case CPU_DOWN_FAILED:
3569        case CPU_ONLINE:
3570                gcwq->flags &= ~GCWQ_DISASSOCIATED;
3571                if (gcwq->trustee_state != TRUSTEE_DONE) {
3572                        gcwq->trustee_state = TRUSTEE_RELEASE;
3573                        wake_up_process(gcwq->trustee);
3574                        wait_trustee_state(gcwq, TRUSTEE_DONE);
3575                }
3576
3577                /*
3578                 * Trustee is done and there might be no worker left.
3579                 * Put the first_idle in and request a real manager to
3580                 * take a look.
3581                 */
3582                spin_unlock_irq(&gcwq->lock);
3583                kthread_bind(gcwq->first_idle->task, cpu);
3584                spin_lock_irq(&gcwq->lock);
3585                gcwq->flags |= GCWQ_MANAGE_WORKERS;
3586                start_worker(gcwq->first_idle);
3587                gcwq->first_idle = NULL;
3588                break;
3589        }
3590
3591        spin_unlock_irqrestore(&gcwq->lock, flags);
3592
3593        return notifier_from_errno(0);
3594}
3595
3596/*
3597 * Workqueues should be brought up before normal priority CPU notifiers.
3598 * This will be registered high priority CPU notifier.
3599 */
3600static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3601                                               unsigned long action,
3602                                               void *hcpu)
3603{
3604        switch (action & ~CPU_TASKS_FROZEN) {
3605        case CPU_UP_PREPARE:
3606        case CPU_UP_CANCELED:
3607        case CPU_DOWN_FAILED:
3608        case CPU_ONLINE:
3609                return workqueue_cpu_callback(nfb, action, hcpu);
3610        }
3611        return NOTIFY_OK;
3612}
3613
3614/*
3615 * Workqueues should be brought down after normal priority CPU notifiers.
3616 * This will be registered as low priority CPU notifier.
3617 */
3618static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3619                                                 unsigned long action,
3620                                                 void *hcpu)
3621{
3622        switch (action & ~CPU_TASKS_FROZEN) {
3623        case CPU_DOWN_PREPARE:
3624        case CPU_DYING:
3625        case CPU_POST_DEAD:
3626                return workqueue_cpu_callback(nfb, action, hcpu);
3627        }
3628        return NOTIFY_OK;
3629}
3630
3631#ifdef CONFIG_SMP
3632
3633struct work_for_cpu {
3634        struct work_struct work;
3635        long (*fn)(void *);
3636        void *arg;
3637        long ret;
3638};
3639
3640static void work_for_cpu_fn(struct work_struct *work)
3641{
3642        struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
3643
3644        wfc->ret = wfc->fn(wfc->arg);
3645}
3646
3647/**
3648 * work_on_cpu - run a function in user context on a particular cpu
3649 * @cpu: the cpu to run on
3650 * @fn: the function to run
3651 * @arg: the function arg
3652 *
3653 * This will return the value @fn returns.
3654 * It is up to the caller to ensure that the cpu doesn't go offline.
3655 * The caller must not hold any locks which would prevent @fn from completing.
3656 */
3657long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
3658{
3659        struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3660
3661        INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
3662        schedule_work_on(cpu, &wfc.work);
3663        flush_work(&wfc.work);
3664        return wfc.ret;
3665}
3666EXPORT_SYMBOL_GPL(work_on_cpu);
3667#endif /* CONFIG_SMP */
3668
3669#ifdef CONFIG_FREEZER
3670
3671/**
3672 * freeze_workqueues_begin - begin freezing workqueues
3673 *
3674 * Start freezing workqueues.  After this function returns, all freezable
3675 * workqueues will queue new works to their frozen_works list instead of
3676 * gcwq->worklist.
3677 *
3678 * CONTEXT:
3679 * Grabs and releases workqueue_lock and gcwq->lock's.
3680 */
3681void freeze_workqueues_begin(void)
3682{
3683        unsigned int cpu;
3684
3685        spin_lock(&workqueue_lock);
3686
3687        BUG_ON(workqueue_freezing);
3688        workqueue_freezing = true;
3689
3690        for_each_gcwq_cpu(cpu) {
3691                struct global_cwq *gcwq = get_gcwq(cpu);
3692                struct workqueue_struct *wq;
3693
3694                spin_lock_irq(&gcwq->lock);
3695
3696                BUG_ON(gcwq->flags & GCWQ_FREEZING);
3697                gcwq->flags |= GCWQ_FREEZING;
3698
3699                list_for_each_entry(wq, &workqueues, list) {
3700                        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3701
3702                        if (cwq && wq->flags & WQ_FREEZABLE)
3703                                cwq->max_active = 0;
3704                }
3705
3706                spin_unlock_irq(&gcwq->lock);
3707        }
3708
3709        spin_unlock(&workqueue_lock);
3710}
3711
3712/**
3713 * freeze_workqueues_busy - are freezable workqueues still busy?
3714 *
3715 * Check whether freezing is complete.  This function must be called
3716 * between freeze_workqueues_begin() and thaw_workqueues().
3717 *
3718 * CONTEXT:
3719 * Grabs and releases workqueue_lock.
3720 *
3721 * RETURNS:
3722 * %true if some freezable workqueues are still busy.  %false if freezing
3723 * is complete.
3724 */
3725bool freeze_workqueues_busy(void)
3726{
3727        unsigned int cpu;
3728        bool busy = false;
3729
3730        spin_lock(&workqueue_lock);
3731
3732        BUG_ON(!workqueue_freezing);
3733
3734        for_each_gcwq_cpu(cpu) {
3735                struct workqueue_struct *wq;
3736                /*
3737                 * nr_active is monotonically decreasing.  It's safe
3738                 * to peek without lock.
3739                 */
3740                list_for_each_entry(wq, &workqueues, list) {
3741                        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3742
3743                        if (!cwq || !(wq->flags & WQ_FREEZABLE))
3744                                continue;
3745
3746                        BUG_ON(cwq->nr_active < 0);
3747                        if (cwq->nr_active) {
3748                                busy = true;
3749                                goto out_unlock;
3750                        }
3751                }
3752        }
3753out_unlock:
3754        spin_unlock(&workqueue_lock);
3755        return busy;
3756}
3757
3758/**
3759 * thaw_workqueues - thaw workqueues
3760 *
3761 * Thaw workqueues.  Normal queueing is restored and all collected
3762 * frozen works are transferred to their respective gcwq worklists.
3763 *
3764 * CONTEXT:
3765 * Grabs and releases workqueue_lock and gcwq->lock's.
3766 */
3767void thaw_workqueues(void)
3768{
3769        unsigned int cpu;
3770
3771        spin_lock(&workqueue_lock);
3772
3773        if (!workqueue_freezing)
3774                goto out_unlock;
3775
3776        for_each_gcwq_cpu(cpu) {
3777                struct global_cwq *gcwq = get_gcwq(cpu);
3778                struct workqueue_struct *wq;
3779
3780                spin_lock_irq(&gcwq->lock);
3781
3782                BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3783                gcwq->flags &= ~GCWQ_FREEZING;
3784
3785                list_for_each_entry(wq, &workqueues, list) {
3786                        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3787
3788                        if (!cwq || !(wq->flags & WQ_FREEZABLE))
3789                                continue;
3790
3791                        /* restore max_active and repopulate worklist */
3792                        cwq->max_active = wq->saved_max_active;
3793
3794                        while (!list_empty(&cwq->delayed_works) &&
3795                               cwq->nr_active < cwq->max_active)
3796                                cwq_activate_first_delayed(cwq);
3797                }
3798
3799                wake_up_worker(gcwq);
3800
3801                spin_unlock_irq(&gcwq->lock);
3802        }
3803
3804        workqueue_freezing = false;
3805out_unlock:
3806        spin_unlock(&workqueue_lock);
3807}
3808#endif /* CONFIG_FREEZER */
3809
3810static int __init init_workqueues(void)
3811{
3812        unsigned int cpu;
3813        int i;
3814
3815        cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3816        cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3817
3818        /* initialize gcwqs */
3819        for_each_gcwq_cpu(cpu) {
3820                struct global_cwq *gcwq = get_gcwq(cpu);
3821
3822                spin_lock_init(&gcwq->lock);
3823                INIT_LIST_HEAD(&gcwq->worklist);
3824                gcwq->cpu = cpu;
3825                gcwq->flags |= GCWQ_DISASSOCIATED;
3826
3827                INIT_LIST_HEAD(&gcwq->idle_list);
3828                for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3829                        INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3830
3831                init_timer_deferrable(&gcwq->idle_timer);
3832                gcwq->idle_timer.function = idle_worker_timeout;
3833                gcwq->idle_timer.data = (unsigned long)gcwq;
3834
3835                setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3836                            (unsigned long)gcwq);
3837
3838                ida_init(&gcwq->worker_ida);
3839
3840                gcwq->trustee_state = TRUSTEE_DONE;
3841                init_waitqueue_head(&gcwq->trustee_wait);
3842        }
3843
3844        /* create the initial worker */
3845        for_each_online_gcwq_cpu(cpu) {
3846                struct global_cwq *gcwq = get_gcwq(cpu);
3847                struct worker *worker;
3848
3849                if (cpu != WORK_CPU_UNBOUND)
3850                        gcwq->flags &= ~GCWQ_DISASSOCIATED;
3851                worker = create_worker(gcwq, true);
3852                BUG_ON(!worker);
3853                spin_lock_irq(&gcwq->lock);
3854                start_worker(worker);
3855                spin_unlock_irq(&gcwq->lock);
3856        }
3857
3858        system_wq = alloc_workqueue("events", 0, 0);
3859        system_long_wq = alloc_workqueue("events_long", 0, 0);
3860        system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3861        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3862                                            WQ_UNBOUND_MAX_ACTIVE);
3863        system_freezable_wq = alloc_workqueue("events_freezable",
3864                                              WQ_FREEZABLE, 0);
3865        system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
3866                        WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
3867        BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3868               !system_unbound_wq || !system_freezable_wq ||
3869                !system_nrt_freezable_wq);
3870        return 0;
3871}
3872early_initcall(init_workqueues);
3873
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.