linux/kernel/workqueue.c
   1/*
   2 * kernel/workqueue.c - generic async execution with shared worker pool
   3 *
   4 * Copyright (C) 2002           Ingo Molnar
   5 *
   6 *   Derived from the taskqueue/keventd code by:
   7 *     David Woodhouse <dwmw2@infradead.org>
   8 *     Andrew Morton
   9 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
  10 *     Theodore Ts'o <tytso@mit.edu>
  11 *
  12 * Made to use alloc_percpu by Christoph Lameter.
  13 *
  14 * Copyright (C) 2010           SUSE Linux Products GmbH
  15 * Copyright (C) 2010           Tejun Heo <tj@kernel.org>
  16 *
   17 * This is the generic async execution mechanism.  Work items are
  18 * executed in process context.  The worker pool is shared and
  19 * automatically managed.  There is one worker pool for each CPU and
  20 * one extra for works which are better served by workers which are
  21 * not bound to any specific CPU.
  22 *
  23 * Please read Documentation/workqueue.txt for details.
  24 */
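/*
 * Illustrative usage sketch, not part of this file: a minimal client of
 * the API described above.  my_work_fn() and kick_it() are hypothetical
 * names; everything else is the documented workqueue interface.
 *
 *	#include <linux/workqueue.h>
 *	#include <linux/printk.h>
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		pr_info("running asynchronously in process context\n");
 *	}
 *
 *	static DECLARE_WORK(my_work, my_work_fn);
 *
 *	static void kick_it(void)
 *	{
 *		// runs my_work_fn() later on a shared pool worker
 *		queue_work(system_wq, &my_work);
 *	}
 */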
  25
  26#include <linux/export.h>
  27#include <linux/kernel.h>
  28#include <linux/sched.h>
  29#include <linux/init.h>
  30#include <linux/signal.h>
  31#include <linux/completion.h>
  32#include <linux/workqueue.h>
  33#include <linux/slab.h>
  34#include <linux/cpu.h>
  35#include <linux/notifier.h>
  36#include <linux/kthread.h>
  37#include <linux/hardirq.h>
  38#include <linux/mempolicy.h>
  39#include <linux/freezer.h>
  40#include <linux/kallsyms.h>
  41#include <linux/debug_locks.h>
  42#include <linux/lockdep.h>
  43#include <linux/idr.h>
  44#include <linux/hashtable.h>
  45
  46#include "workqueue_internal.h"
  47
  48enum {
  49        /*
  50         * worker_pool flags
  51         *
   52         * A bound pool is either associated with or disassociated from its CPU.
  53         * While associated (!DISASSOCIATED), all workers are bound to the
  54         * CPU and none has %WORKER_UNBOUND set and concurrency management
  55         * is in effect.
  56         *
  57         * While DISASSOCIATED, the cpu may be offline and all workers have
  58         * %WORKER_UNBOUND set and concurrency management disabled, and may
  59         * be executing on any CPU.  The pool behaves as an unbound one.
  60         *
  61         * Note that DISASSOCIATED can be flipped only while holding
  62         * assoc_mutex to avoid changing binding state while
  63         * create_worker() is in progress.
  64         */
  65        POOL_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
  66        POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
  67        POOL_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
  68        POOL_FREEZING           = 1 << 3,       /* freeze in progress */
  69
  70        /* worker flags */
  71        WORKER_STARTED          = 1 << 0,       /* started */
  72        WORKER_DIE              = 1 << 1,       /* die die die */
  73        WORKER_IDLE             = 1 << 2,       /* is idle */
  74        WORKER_PREP             = 1 << 3,       /* preparing to run works */
  75        WORKER_CPU_INTENSIVE    = 1 << 6,       /* cpu intensive */
  76        WORKER_UNBOUND          = 1 << 7,       /* worker is unbound */
  77
  78        WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_UNBOUND |
  79                                  WORKER_CPU_INTENSIVE,
  80
  81        NR_STD_WORKER_POOLS     = 2,            /* # standard pools per cpu */
  82
  83        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */
  84
  85        MAX_IDLE_WORKERS_RATIO  = 4,            /* 1/4 of busy can be idle */
  86        IDLE_WORKER_TIMEOUT     = 300 * HZ,     /* keep idle ones for 5 mins */
  87
  88        MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
  89                                                /* call for help after 10ms
  90                                                   (min two ticks) */
  91        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
   92        CREATE_COOLDOWN         = HZ,           /* time to breathe after fail */
  93
  94        /*
   95         * Rescue workers are used only in emergencies and shared by
   96         * all cpus.  Give them nice level -20.
  97         */
  98        RESCUER_NICE_LEVEL      = -20,
  99        HIGHPRI_NICE_LEVEL      = -20,
 100};
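/*
 * Worked example for the MAYDAY_INITIAL_TIMEOUT clamp above (editorial
 * illustration): with HZ=1000, HZ / 100 is 10 ticks, i.e. the intended
 * 10ms; with HZ=100, HZ / 100 is a single tick, which falls below the
 * two-tick minimum, so 2 ticks (20ms) are used instead.
 */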
 101
 102/*
 103 * Structure fields follow one of the following exclusion rules.
 104 *
 105 * I: Modifiable by initialization/destruction paths and read-only for
 106 *    everyone else.
 107 *
 108 * P: Preemption protected.  Disabling preemption is enough and should
 109 *    only be modified and accessed from the local cpu.
 110 *
 111 * L: pool->lock protected.  Access with pool->lock held.
 112 *
 113 * X: During normal operation, modification requires pool->lock and should
 114 *    be done only from local cpu.  Either disabling preemption on local
 115 *    cpu or grabbing pool->lock is enough for read access.  If
 116 *    POOL_DISASSOCIATED is set, it's identical to L.
 117 *
 118 * F: wq->flush_mutex protected.
 119 *
 120 * W: workqueue_lock protected.
 121 */
 122
 123/* struct worker is defined in workqueue_internal.h */
 124
 125struct worker_pool {
 126        spinlock_t              lock;           /* the pool lock */
 127        unsigned int            cpu;            /* I: the associated cpu */
 128        int                     id;             /* I: pool ID */
 129        unsigned int            flags;          /* X: flags */
 130
 131        struct list_head        worklist;       /* L: list of pending works */
 132        int                     nr_workers;     /* L: total number of workers */
 133
 134        /* nr_idle includes the ones off idle_list for rebinding */
 135        int                     nr_idle;        /* L: currently idle ones */
 136
 137        struct list_head        idle_list;      /* X: list of idle workers */
 138        struct timer_list       idle_timer;     /* L: worker idle timeout */
 139        struct timer_list       mayday_timer;   /* L: SOS timer for workers */
 140
 141        /* workers are chained either in busy_hash or idle_list */
 142        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
 143                                                /* L: hash of busy workers */
 144
 145        struct mutex            assoc_mutex;    /* protect POOL_DISASSOCIATED */
 146        struct ida              worker_ida;     /* L: for worker IDs */
 147
 148        /*
 149         * The current concurrency level.  As it's likely to be accessed
 150         * from other CPUs during try_to_wake_up(), put it in a separate
 151         * cacheline.
 152         */
 153        atomic_t                nr_running ____cacheline_aligned_in_smp;
 154} ____cacheline_aligned_in_smp;
 155
 156/*
 157 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 158 * of work_struct->data are used for flags and the remaining high bits
 159 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 160 * number of flag bits.
 161 */
 162struct pool_workqueue {
 163        struct worker_pool      *pool;          /* I: the associated pool */
 164        struct workqueue_struct *wq;            /* I: the owning workqueue */
 165        int                     work_color;     /* L: current color */
 166        int                     flush_color;    /* L: flushing color */
 167        int                     nr_in_flight[WORK_NR_COLORS];
 168                                                /* L: nr of in_flight works */
 169        int                     nr_active;      /* L: nr of active works */
 170        int                     max_active;     /* L: max active works */
 171        struct list_head        delayed_works;  /* L: delayed works */
 172};
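/*
 * Sketch of the pointer/flag packing described above (illustrative, not
 * code used by this file): because each pool_workqueue is aligned to
 * 1 << WORK_STRUCT_FLAG_BITS bytes, its address has the low flag bits
 * clear, so the pointer and the flags share one word and can be
 * separated again with WORK_STRUCT_WQ_DATA_MASK.
 *
 *	unsigned long data = (unsigned long)pwq |
 *			     WORK_STRUCT_PENDING | WORK_STRUCT_PWQ;
 *	struct pool_workqueue *back =
 *		(void *)(data & WORK_STRUCT_WQ_DATA_MASK);	// back == pwq
 */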
 173
 174/*
 175 * Structure used to wait for workqueue flush.
 176 */
 177struct wq_flusher {
 178        struct list_head        list;           /* F: list of flushers */
 179        int                     flush_color;    /* F: flush color waiting for */
 180        struct completion       done;           /* flush completion */
 181};
 182
 183/*
 184 * All cpumasks are assumed to be always set on UP and thus can't be
 185 * used to determine whether there's something to be done.
 186 */
 187#ifdef CONFIG_SMP
 188typedef cpumask_var_t mayday_mask_t;
 189#define mayday_test_and_set_cpu(cpu, mask)      \
 190        cpumask_test_and_set_cpu((cpu), (mask))
 191#define mayday_clear_cpu(cpu, mask)             cpumask_clear_cpu((cpu), (mask))
 192#define for_each_mayday_cpu(cpu, mask)          for_each_cpu((cpu), (mask))
 193#define alloc_mayday_mask(maskp, gfp)           zalloc_cpumask_var((maskp), (gfp))
 194#define free_mayday_mask(mask)                  free_cpumask_var((mask))
 195#else
 196typedef unsigned long mayday_mask_t;
 197#define mayday_test_and_set_cpu(cpu, mask)      test_and_set_bit(0, &(mask))
 198#define mayday_clear_cpu(cpu, mask)             clear_bit(0, &(mask))
 199#define for_each_mayday_cpu(cpu, mask)          if ((cpu) = 0, (mask))
 200#define alloc_mayday_mask(maskp, gfp)           true
 201#define free_mayday_mask(mask)                  do { } while (0)
 202#endif
 203
 204/*
 205 * The externally visible workqueue abstraction is an array of
 206 * per-CPU workqueues:
 207 */
 208struct workqueue_struct {
 209        unsigned int            flags;          /* W: WQ_* flags */
 210        union {
 211                struct pool_workqueue __percpu          *pcpu;
 212                struct pool_workqueue                   *single;
 213                unsigned long                           v;
 214        } pool_wq;                              /* I: pwq's */
 215        struct list_head        list;           /* W: list of all workqueues */
 216
 217        struct mutex            flush_mutex;    /* protects wq flushing */
 218        int                     work_color;     /* F: current work color */
 219        int                     flush_color;    /* F: current flush color */
 220        atomic_t                nr_pwqs_to_flush; /* flush in progress */
 221        struct wq_flusher       *first_flusher; /* F: first flusher */
 222        struct list_head        flusher_queue;  /* F: flush waiters */
 223        struct list_head        flusher_overflow; /* F: flush overflow list */
 224
 225        mayday_mask_t           mayday_mask;    /* cpus requesting rescue */
 226        struct worker           *rescuer;       /* I: rescue worker */
 227
 228        int                     nr_drainers;    /* W: drain in progress */
 229        int                     saved_max_active; /* W: saved pwq max_active */
 230#ifdef CONFIG_LOCKDEP
 231        struct lockdep_map      lockdep_map;
 232#endif
 233        char                    name[];         /* I: workqueue name */
 234};
 235
 236struct workqueue_struct *system_wq __read_mostly;
 237EXPORT_SYMBOL_GPL(system_wq);
 238struct workqueue_struct *system_highpri_wq __read_mostly;
 239EXPORT_SYMBOL_GPL(system_highpri_wq);
 240struct workqueue_struct *system_long_wq __read_mostly;
 241EXPORT_SYMBOL_GPL(system_long_wq);
 242struct workqueue_struct *system_unbound_wq __read_mostly;
 243EXPORT_SYMBOL_GPL(system_unbound_wq);
 244struct workqueue_struct *system_freezable_wq __read_mostly;
 245EXPORT_SYMBOL_GPL(system_freezable_wq);
 246
 247#define CREATE_TRACE_POINTS
 248#include <trace/events/workqueue.h>
 249
 250#define for_each_std_worker_pool(pool, cpu)                             \
 251        for ((pool) = &std_worker_pools(cpu)[0];                        \
 252             (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
 253
 254#define for_each_busy_worker(worker, i, pool)                           \
 255        hash_for_each(pool->busy_hash, i, worker, hentry)
 256
 257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
 258                                unsigned int sw)
 259{
 260        if (cpu < nr_cpu_ids) {
 261                if (sw & 1) {
 262                        cpu = cpumask_next(cpu, mask);
 263                        if (cpu < nr_cpu_ids)
 264                                return cpu;
 265                }
 266                if (sw & 2)
 267                        return WORK_CPU_UNBOUND;
 268        }
 269        return WORK_CPU_END;
 270}
 271
 272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
 273                                 struct workqueue_struct *wq)
 274{
 275        return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
 276}
 277
 278/*
 279 * CPU iterators
 280 *
 281 * An extra cpu number is defined using an invalid cpu number
 282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
 283 * specific CPU.  The following iterators are similar to for_each_*_cpu()
  284 * iterators but also consider the unbound CPU.
 285 *
 286 * for_each_wq_cpu()            : possible CPUs + WORK_CPU_UNBOUND
 287 * for_each_online_wq_cpu()     : online CPUs + WORK_CPU_UNBOUND
 288 * for_each_pwq_cpu()           : possible CPUs for bound workqueues,
 289 *                                WORK_CPU_UNBOUND for unbound workqueues
 290 */
 291#define for_each_wq_cpu(cpu)                                            \
 292        for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3);           \
 293             (cpu) < WORK_CPU_END;                                      \
 294             (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
 295
 296#define for_each_online_wq_cpu(cpu)                                     \
 297        for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3);             \
 298             (cpu) < WORK_CPU_END;                                      \
 299             (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
 300
 301#define for_each_pwq_cpu(cpu, wq)                                       \
 302        for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq));       \
 303             (cpu) < WORK_CPU_END;                                      \
 304             (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
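/*
 * Usage sketch for the iterators above (illustrative): visiting every
 * pool_workqueue of a workqueue.  For a bound workqueue this walks all
 * possible CPUs; for an unbound one it yields only WORK_CPU_UNBOUND.
 * do_something_with() is a hypothetical stand-in for the per-pwq work.
 *
 *	unsigned int cpu;
 *
 *	for_each_pwq_cpu(cpu, wq) {
 *		struct pool_workqueue *pwq = get_pwq(cpu, wq);
 *
 *		if (pwq)
 *			do_something_with(pwq);
 *	}
 */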
 305
 306#ifdef CONFIG_DEBUG_OBJECTS_WORK
 307
 308static struct debug_obj_descr work_debug_descr;
 309
 310static void *work_debug_hint(void *addr)
 311{
 312        return ((struct work_struct *) addr)->func;
 313}
 314
 315/*
 316 * fixup_init is called when:
 317 * - an active object is initialized
 318 */
 319static int work_fixup_init(void *addr, enum debug_obj_state state)
 320{
 321        struct work_struct *work = addr;
 322
 323        switch (state) {
 324        case ODEBUG_STATE_ACTIVE:
 325                cancel_work_sync(work);
 326                debug_object_init(work, &work_debug_descr);
 327                return 1;
 328        default:
 329                return 0;
 330        }
 331}
 332
 333/*
 334 * fixup_activate is called when:
 335 * - an active object is activated
 336 * - an unknown object is activated (might be a statically initialized object)
 337 */
 338static int work_fixup_activate(void *addr, enum debug_obj_state state)
 339{
 340        struct work_struct *work = addr;
 341
 342        switch (state) {
 343
 344        case ODEBUG_STATE_NOTAVAILABLE:
 345                /*
 346                 * This is not really a fixup. The work struct was
 347                 * statically initialized. We just make sure that it
 348                 * is tracked in the object tracker.
 349                 */
 350                if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
 351                        debug_object_init(work, &work_debug_descr);
 352                        debug_object_activate(work, &work_debug_descr);
 353                        return 0;
 354                }
 355                WARN_ON_ONCE(1);
 356                return 0;
 357
 358        case ODEBUG_STATE_ACTIVE:
 359                WARN_ON(1);
 360
 361        default:
 362                return 0;
 363        }
 364}
 365
 366/*
 367 * fixup_free is called when:
 368 * - an active object is freed
 369 */
 370static int work_fixup_free(void *addr, enum debug_obj_state state)
 371{
 372        struct work_struct *work = addr;
 373
 374        switch (state) {
 375        case ODEBUG_STATE_ACTIVE:
 376                cancel_work_sync(work);
 377                debug_object_free(work, &work_debug_descr);
 378                return 1;
 379        default:
 380                return 0;
 381        }
 382}
 383
 384static struct debug_obj_descr work_debug_descr = {
 385        .name           = "work_struct",
 386        .debug_hint     = work_debug_hint,
 387        .fixup_init     = work_fixup_init,
 388        .fixup_activate = work_fixup_activate,
 389        .fixup_free     = work_fixup_free,
 390};
 391
 392static inline void debug_work_activate(struct work_struct *work)
 393{
 394        debug_object_activate(work, &work_debug_descr);
 395}
 396
 397static inline void debug_work_deactivate(struct work_struct *work)
 398{
 399        debug_object_deactivate(work, &work_debug_descr);
 400}
 401
 402void __init_work(struct work_struct *work, int onstack)
 403{
 404        if (onstack)
 405                debug_object_init_on_stack(work, &work_debug_descr);
 406        else
 407                debug_object_init(work, &work_debug_descr);
 408}
 409EXPORT_SYMBOL_GPL(__init_work);
 410
 411void destroy_work_on_stack(struct work_struct *work)
 412{
 413        debug_object_free(work, &work_debug_descr);
 414}
 415EXPORT_SYMBOL_GPL(destroy_work_on_stack);
 416
 417#else
 418static inline void debug_work_activate(struct work_struct *work) { }
 419static inline void debug_work_deactivate(struct work_struct *work) { }
 420#endif
 421
 422/* Serializes the accesses to the list of workqueues. */
 423static DEFINE_SPINLOCK(workqueue_lock);
 424static LIST_HEAD(workqueues);
 425static bool workqueue_freezing;         /* W: have wqs started freezing? */
 426
 427/*
 428 * The CPU and unbound standard worker pools.  The unbound ones have
 429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
 430 */
 431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 432                                     cpu_std_worker_pools);
 433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
 434
 435/* idr of all pools */
 436static DEFINE_MUTEX(worker_pool_idr_mutex);
 437static DEFINE_IDR(worker_pool_idr);
 438
 439static int worker_thread(void *__worker);
 440
 441static struct worker_pool *std_worker_pools(int cpu)
 442{
 443        if (cpu != WORK_CPU_UNBOUND)
 444                return per_cpu(cpu_std_worker_pools, cpu);
 445        else
 446                return unbound_std_worker_pools;
 447}
 448
 449static int std_worker_pool_pri(struct worker_pool *pool)
 450{
 451        return pool - std_worker_pools(pool->cpu);
 452}
 453
 454/* allocate ID and assign it to @pool */
 455static int worker_pool_assign_id(struct worker_pool *pool)
 456{
 457        int ret;
 458
 459        mutex_lock(&worker_pool_idr_mutex);
 460        ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
 461        if (ret >= 0)
 462                pool->id = ret;
 463        mutex_unlock(&worker_pool_idr_mutex);
 464
 465        return ret < 0 ? ret : 0;
 466}
 467
 468/*
 469 * Lookup worker_pool by id.  The idr currently is built during boot and
 470 * never modified.  Don't worry about locking for now.
 471 */
 472static struct worker_pool *worker_pool_by_id(int pool_id)
 473{
 474        return idr_find(&worker_pool_idr, pool_id);
 475}
 476
 477static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
 478{
 479        struct worker_pool *pools = std_worker_pools(cpu);
 480
 481        return &pools[highpri];
 482}
 483
 484static struct pool_workqueue *get_pwq(unsigned int cpu,
 485                                      struct workqueue_struct *wq)
 486{
 487        if (!(wq->flags & WQ_UNBOUND)) {
 488                if (likely(cpu < nr_cpu_ids))
 489                        return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
 490        } else if (likely(cpu == WORK_CPU_UNBOUND))
 491                return wq->pool_wq.single;
 492        return NULL;
 493}
 494
 495static unsigned int work_color_to_flags(int color)
 496{
 497        return color << WORK_STRUCT_COLOR_SHIFT;
 498}
 499
 500static int get_work_color(struct work_struct *work)
 501{
 502        return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
 503                ((1 << WORK_STRUCT_COLOR_BITS) - 1);
 504}
 505
 506static int work_next_color(int color)
 507{
 508        return (color + 1) % WORK_NR_COLORS;
 509}
 510
 511/*
  512 * While queued, %WORK_STRUCT_PWQ is set and the non-flag bits of a work's data
 513 * contain the pointer to the queued pwq.  Once execution starts, the flag
 514 * is cleared and the high bits contain OFFQ flags and pool ID.
 515 *
 516 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
 517 * and clear_work_data() can be used to set the pwq, pool or clear
 518 * work->data.  These functions should only be called while the work is
 519 * owned - ie. while the PENDING bit is set.
 520 *
 521 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 522 * corresponding to a work.  Pool is available once the work has been
 523 * queued anywhere after initialization until it is sync canceled.  pwq is
 524 * available only while the work item is queued.
 525 *
 526 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 527 * canceled.  While being canceled, a work item may have its PENDING set
 528 * but stay off timer and worklist for arbitrarily long and nobody should
 529 * try to steal the PENDING bit.
 530 */
 531static inline void set_work_data(struct work_struct *work, unsigned long data,
 532                                 unsigned long flags)
 533{
 534        BUG_ON(!work_pending(work));
 535        atomic_long_set(&work->data, data | flags | work_static(work));
 536}
 537
 538static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
 539                         unsigned long extra_flags)
 540{
 541        set_work_data(work, (unsigned long)pwq,
 542                      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
 543}
 544
 545static void set_work_pool_and_keep_pending(struct work_struct *work,
 546                                           int pool_id)
 547{
 548        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
 549                      WORK_STRUCT_PENDING);
 550}
 551
 552static void set_work_pool_and_clear_pending(struct work_struct *work,
 553                                            int pool_id)
 554{
 555        /*
 556         * The following wmb is paired with the implied mb in
 557         * test_and_set_bit(PENDING) and ensures all updates to @work made
 558         * here are visible to and precede any updates by the next PENDING
 559         * owner.
 560         */
 561        smp_wmb();
 562        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
 563}
 564
 565static void clear_work_data(struct work_struct *work)
 566{
 567        smp_wmb();      /* see set_work_pool_and_clear_pending() */
 568        set_work_data(work, WORK_STRUCT_NO_POOL, 0);
 569}
 570
 571static struct pool_workqueue *get_work_pwq(struct work_struct *work)
 572{
 573        unsigned long data = atomic_long_read(&work->data);
 574
 575        if (data & WORK_STRUCT_PWQ)
 576                return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
 577        else
 578                return NULL;
 579}
 580
 581/**
 582 * get_work_pool - return the worker_pool a given work was associated with
 583 * @work: the work item of interest
 584 *
 585 * Return the worker_pool @work was last associated with.  %NULL if none.
 586 */
 587static struct worker_pool *get_work_pool(struct work_struct *work)
 588{
 589        unsigned long data = atomic_long_read(&work->data);
 590        struct worker_pool *pool;
 591        int pool_id;
 592
 593        if (data & WORK_STRUCT_PWQ)
 594                return ((struct pool_workqueue *)
 595                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
 596
 597        pool_id = data >> WORK_OFFQ_POOL_SHIFT;
 598        if (pool_id == WORK_OFFQ_POOL_NONE)
 599                return NULL;
 600
 601        pool = worker_pool_by_id(pool_id);
 602        WARN_ON_ONCE(!pool);
 603        return pool;
 604}
 605
 606/**
 607 * get_work_pool_id - return the worker pool ID a given work is associated with
 608 * @work: the work item of interest
 609 *
 610 * Return the worker_pool ID @work was last associated with.
 611 * %WORK_OFFQ_POOL_NONE if none.
 612 */
 613static int get_work_pool_id(struct work_struct *work)
 614{
 615        unsigned long data = atomic_long_read(&work->data);
 616
 617        if (data & WORK_STRUCT_PWQ)
 618                return ((struct pool_workqueue *)
 619                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
 620
 621        return data >> WORK_OFFQ_POOL_SHIFT;
 622}
 623
 624static void mark_work_canceling(struct work_struct *work)
 625{
 626        unsigned long pool_id = get_work_pool_id(work);
 627
 628        pool_id <<= WORK_OFFQ_POOL_SHIFT;
 629        set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
 630}
 631
 632static bool work_is_canceling(struct work_struct *work)
 633{
 634        unsigned long data = atomic_long_read(&work->data);
 635
 636        return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
 637}
 638
 639/*
 640 * Policy functions.  These define the policies on how the global worker
 641 * pools are managed.  Unless noted otherwise, these functions assume that
 642 * they're being called with pool->lock held.
 643 */
 644
 645static bool __need_more_worker(struct worker_pool *pool)
 646{
 647        return !atomic_read(&pool->nr_running);
 648}
 649
 650/*
 651 * Need to wake up a worker?  Called from anything but currently
 652 * running workers.
 653 *
 654 * Note that, because unbound workers never contribute to nr_running, this
 655 * function will always return %true for unbound pools as long as the
 656 * worklist isn't empty.
 657 */
 658static bool need_more_worker(struct worker_pool *pool)
 659{
 660        return !list_empty(&pool->worklist) && __need_more_worker(pool);
 661}
 662
 663/* Can I start working?  Called from busy but !running workers. */
 664static bool may_start_working(struct worker_pool *pool)
 665{
 666        return pool->nr_idle;
 667}
 668
 669/* Do I need to keep working?  Called from currently running workers. */
 670static bool keep_working(struct worker_pool *pool)
 671{
 672        return !list_empty(&pool->worklist) &&
 673                atomic_read(&pool->nr_running) <= 1;
 674}
 675
 676/* Do we need a new worker?  Called from manager. */
 677static bool need_to_create_worker(struct worker_pool *pool)
 678{
 679        return need_more_worker(pool) && !may_start_working(pool);
 680}
 681
 682/* Do I need to be the manager? */
 683static bool need_to_manage_workers(struct worker_pool *pool)
 684{
 685        return need_to_create_worker(pool) ||
 686                (pool->flags & POOL_MANAGE_WORKERS);
 687}
 688
 689/* Do we have too many workers and should some go away? */
 690static bool too_many_workers(struct worker_pool *pool)
 691{
 692        bool managing = pool->flags & POOL_MANAGING_WORKERS;
 693        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 694        int nr_busy = pool->nr_workers - nr_idle;
 695
 696        /*
 697         * nr_idle and idle_list may disagree if idle rebinding is in
 698         * progress.  Never return %true if idle_list is empty.
 699         */
 700        if (list_empty(&pool->idle_list))
 701                return false;
 702
 703        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 704}
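/*
 * Worked example for the check above (illustration): with
 * MAX_IDLE_WORKERS_RATIO = 4, nr_workers = 14 and nr_idle = 6, nr_busy
 * is 8 and (6 - 2) * 4 = 16 >= 8, so the pool has too many workers.
 * With nr_idle = 3 instead, (3 - 2) * 4 = 4 < 11 and it does not.
 */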
 705
 706/*
 707 * Wake up functions.
 708 */
 709
 710/* Return the first worker.  Safe with preemption disabled */
 711static struct worker *first_worker(struct worker_pool *pool)
 712{
 713        if (unlikely(list_empty(&pool->idle_list)))
 714                return NULL;
 715
 716        return list_first_entry(&pool->idle_list, struct worker, entry);
 717}
 718
 719/**
 720 * wake_up_worker - wake up an idle worker
 721 * @pool: worker pool to wake worker from
 722 *
 723 * Wake up the first idle worker of @pool.
 724 *
 725 * CONTEXT:
 726 * spin_lock_irq(pool->lock).
 727 */
 728static void wake_up_worker(struct worker_pool *pool)
 729{
 730        struct worker *worker = first_worker(pool);
 731
 732        if (likely(worker))
 733                wake_up_process(worker->task);
 734}
 735
 736/**
 737 * wq_worker_waking_up - a worker is waking up
 738 * @task: task waking up
 739 * @cpu: CPU @task is waking up to
 740 *
 741 * This function is called during try_to_wake_up() when a worker is
 742 * being awoken.
 743 *
 744 * CONTEXT:
 745 * spin_lock_irq(rq->lock)
 746 */
 747void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
 748{
 749        struct worker *worker = kthread_data(task);
 750
 751        if (!(worker->flags & WORKER_NOT_RUNNING)) {
 752                WARN_ON_ONCE(worker->pool->cpu != cpu);
 753                atomic_inc(&worker->pool->nr_running);
 754        }
 755}
 756
 757/**
 758 * wq_worker_sleeping - a worker is going to sleep
 759 * @task: task going to sleep
 760 * @cpu: CPU in question, must be the current CPU number
 761 *
 762 * This function is called during schedule() when a busy worker is
  763 * going to sleep.  Another worker on the same cpu can be woken up by
  764 * returning a pointer to its task.
 765 *
 766 * CONTEXT:
 767 * spin_lock_irq(rq->lock)
 768 *
 769 * RETURNS:
 770 * Worker task on @cpu to wake up, %NULL if none.
 771 */
 772struct task_struct *wq_worker_sleeping(struct task_struct *task,
 773                                       unsigned int cpu)
 774{
 775        struct worker *worker = kthread_data(task), *to_wakeup = NULL;
 776        struct worker_pool *pool;
 777
 778        /*
 779         * Rescuers, which may not have all the fields set up like normal
 780         * workers, also reach here, let's not access anything before
 781         * checking NOT_RUNNING.
 782         */
 783        if (worker->flags & WORKER_NOT_RUNNING)
 784                return NULL;
 785
 786        pool = worker->pool;
 787
 788        /* this can only happen on the local cpu */
 789        BUG_ON(cpu != raw_smp_processor_id());
 790
 791        /*
 792         * The counterpart of the following dec_and_test, implied mb,
 793         * worklist not empty test sequence is in insert_work().
 794         * Please read comment there.
 795         *
 796         * NOT_RUNNING is clear.  This means that we're bound to and
 797         * running on the local cpu w/ rq lock held and preemption
  798 * disabled, which in turn means that nobody else could be
 799         * manipulating idle_list, so dereferencing idle_list without pool
 800         * lock is safe.
 801         */
 802        if (atomic_dec_and_test(&pool->nr_running) &&
 803            !list_empty(&pool->worklist))
 804                to_wakeup = first_worker(pool);
 805        return to_wakeup ? to_wakeup->task : NULL;
 806}
 807
 808/**
 809 * worker_set_flags - set worker flags and adjust nr_running accordingly
 810 * @worker: self
 811 * @flags: flags to set
 812 * @wakeup: wakeup an idle worker if necessary
 813 *
 814 * Set @flags in @worker->flags and adjust nr_running accordingly.  If
 815 * nr_running becomes zero and @wakeup is %true, an idle worker is
 816 * woken up.
 817 *
 818 * CONTEXT:
 819 * spin_lock_irq(pool->lock)
 820 */
 821static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 822                                    bool wakeup)
 823{
 824        struct worker_pool *pool = worker->pool;
 825
 826        WARN_ON_ONCE(worker->task != current);
 827
 828        /*
 829         * If transitioning into NOT_RUNNING, adjust nr_running and
 830         * wake up an idle worker as necessary if requested by
 831         * @wakeup.
 832         */
 833        if ((flags & WORKER_NOT_RUNNING) &&
 834            !(worker->flags & WORKER_NOT_RUNNING)) {
 835                if (wakeup) {
 836                        if (atomic_dec_and_test(&pool->nr_running) &&
 837                            !list_empty(&pool->worklist))
 838                                wake_up_worker(pool);
 839                } else
 840                        atomic_dec(&pool->nr_running);
 841        }
 842
 843        worker->flags |= flags;
 844}
 845
 846/**
 847 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 848 * @worker: self
 849 * @flags: flags to clear
 850 *
 851 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 852 *
 853 * CONTEXT:
 854 * spin_lock_irq(pool->lock)
 855 */
 856static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 857{
 858        struct worker_pool *pool = worker->pool;
 859        unsigned int oflags = worker->flags;
 860
 861        WARN_ON_ONCE(worker->task != current);
 862
 863        worker->flags &= ~flags;
 864
 865        /*
 866         * If transitioning out of NOT_RUNNING, increment nr_running.  Note
  867 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is a mask
 868         * of multiple flags, not a single flag.
 869         */
 870        if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 871                if (!(worker->flags & WORKER_NOT_RUNNING))
 872                        atomic_inc(&pool->nr_running);
 873}
 874
 875/**
 876 * find_worker_executing_work - find worker which is executing a work
 877 * @pool: pool of interest
 878 * @work: work to find worker for
 879 *
 880 * Find a worker which is executing @work on @pool by searching
 881 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 882 * to match, its current execution should match the address of @work and
 883 * its work function.  This is to avoid unwanted dependency between
 884 * unrelated work executions through a work item being recycled while still
 885 * being executed.
 886 *
 887 * This is a bit tricky.  A work item may be freed once its execution
 888 * starts and nothing prevents the freed area from being recycled for
 889 * another work item.  If the same work item address ends up being reused
 890 * before the original execution finishes, workqueue will identify the
 891 * recycled work item as currently executing and make it wait until the
 892 * current execution finishes, introducing an unwanted dependency.
 893 *
 894 * This function checks the work item address, work function and workqueue
 895 * to avoid false positives.  Note that this isn't complete as one may
 896 * construct a work function which can introduce dependency onto itself
 897 * through a recycled work item.  Well, if somebody wants to shoot oneself
 898 * in the foot that badly, there's only so much we can do, and if such
 899 * deadlock actually occurs, it should be easy to locate the culprit work
 900 * function.
 901 *
 902 * CONTEXT:
 903 * spin_lock_irq(pool->lock).
 904 *
 905 * RETURNS:
 906 * Pointer to worker which is executing @work if found, NULL
 907 * otherwise.
 908 */
 909static struct worker *find_worker_executing_work(struct worker_pool *pool,
 910                                                 struct work_struct *work)
 911{
 912        struct worker *worker;
 913
 914        hash_for_each_possible(pool->busy_hash, worker, hentry,
 915                               (unsigned long)work)
 916                if (worker->current_work == work &&
 917                    worker->current_func == work->func)
 918                        return worker;
 919
 920        return NULL;
 921}
 922
 923/**
 924 * move_linked_works - move linked works to a list
 925 * @work: start of series of works to be scheduled
 926 * @head: target list to append @work to
  927 * @nextp: out parameter for nested worklist walking
 928 *
 929 * Schedule linked works starting from @work to @head.  Work series to
 930 * be scheduled starts at @work and includes any consecutive work with
 931 * WORK_STRUCT_LINKED set in its predecessor.
 932 *
 933 * If @nextp is not NULL, it's updated to point to the next work of
 934 * the last scheduled work.  This allows move_linked_works() to be
 935 * nested inside outer list_for_each_entry_safe().
 936 *
 937 * CONTEXT:
 938 * spin_lock_irq(pool->lock).
 939 */
 940static void move_linked_works(struct work_struct *work, struct list_head *head,
 941                              struct work_struct **nextp)
 942{
 943        struct work_struct *n;
 944
 945        /*
 946         * Linked worklist will always end before the end of the list,
 947         * use NULL for list head.
 948         */
 949        list_for_each_entry_safe_from(work, n, NULL, entry) {
 950                list_move_tail(&work->entry, head);
 951                if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
 952                        break;
 953        }
 954
 955        /*
 956         * If we're already inside safe list traversal and have moved
 957         * multiple works to the scheduled queue, the next position
 958         * needs to be updated.
 959         */
 960        if (nextp)
 961                *nextp = n;
 962}
 963
 964static void pwq_activate_delayed_work(struct work_struct *work)
 965{
 966        struct pool_workqueue *pwq = get_work_pwq(work);
 967
 968        trace_workqueue_activate_work(work);
 969        move_linked_works(work, &pwq->pool->worklist, NULL);
 970        __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
 971        pwq->nr_active++;
 972}
 973
 974static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
 975{
 976        struct work_struct *work = list_first_entry(&pwq->delayed_works,
 977                                                    struct work_struct, entry);
 978
 979        pwq_activate_delayed_work(work);
 980}
 981
 982/**
 983 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
 984 * @pwq: pwq of interest
 985 * @color: color of work which left the queue
 986 *
 987 * A work either has completed or is removed from pending queue,
 988 * decrement nr_in_flight of its pwq and handle workqueue flushing.
 989 *
 990 * CONTEXT:
 991 * spin_lock_irq(pool->lock).
 992 */
 993static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
 994{
 995        /* ignore uncolored works */
 996        if (color == WORK_NO_COLOR)
 997                return;
 998
 999        pwq->nr_in_flight[color]--;
1000
1001        pwq->nr_active--;
1002        if (!list_empty(&pwq->delayed_works)) {
1003                /* one down, submit a delayed one */
1004                if (pwq->nr_active < pwq->max_active)
1005                        pwq_activate_first_delayed(pwq);
1006        }
1007
1008        /* is flush in progress and are we at the flushing tip? */
1009        if (likely(pwq->flush_color != color))
1010                return;
1011
1012        /* are there still in-flight works? */
1013        if (pwq->nr_in_flight[color])
1014                return;
1015
1016        /* this pwq is done, clear flush_color */
1017        pwq->flush_color = -1;
1018
1019        /*
1020         * If this was the last pwq, wake up the first flusher.  It
1021         * will handle the rest.
1022         */
1023        if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1024                complete(&pwq->wq->first_flusher->done);
1025}
1026
1027/**
1028 * try_to_grab_pending - steal work item from worklist and disable irq
1029 * @work: work item to steal
1030 * @is_dwork: @work is a delayed_work
1031 * @flags: place to store irq state
1032 *
1033 * Try to grab PENDING bit of @work.  This function can handle @work in any
1034 * stable state - idle, on timer or on worklist.  Return values are
1035 *
1036 *  1           if @work was pending and we successfully stole PENDING
1037 *  0           if @work was idle and we claimed PENDING
1038 *  -EAGAIN     if PENDING couldn't be grabbed at the moment, safe to busy-retry
1039 *  -ENOENT     if someone else is canceling @work, this state may persist
1040 *              for arbitrarily long
1041 *
1042 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
1043 * interrupted while holding PENDING and @work off queue, irq must be
1044 * disabled on entry.  This, combined with delayed_work->timer being
 1045 * irqsafe, ensures that we return -EAGAIN for a finite short period of time.
1046 *
1047 * On successful return, >= 0, irq is disabled and the caller is
1048 * responsible for releasing it using local_irq_restore(*@flags).
1049 *
1050 * This function is safe to call from any context including IRQ handler.
1051 */
1052static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1053                               unsigned long *flags)
1054{
1055        struct worker_pool *pool;
1056        struct pool_workqueue *pwq;
1057
1058        local_irq_save(*flags);
1059
1060        /* try to steal the timer if it exists */
1061        if (is_dwork) {
1062                struct delayed_work *dwork = to_delayed_work(work);
1063
1064                /*
1065                 * dwork->timer is irqsafe.  If del_timer() fails, it's
1066                 * guaranteed that the timer is not queued anywhere and not
1067                 * running on the local CPU.
1068                 */
1069                if (likely(del_timer(&dwork->timer)))
1070                        return 1;
1071        }
1072
1073        /* try to claim PENDING the normal way */
1074        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1075                return 0;
1076
1077        /*
1078         * The queueing is in progress, or it is already queued. Try to
1079         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1080         */
1081        pool = get_work_pool(work);
1082        if (!pool)
1083                goto fail;
1084
1085        spin_lock(&pool->lock);
1086        /*
1087         * work->data is guaranteed to point to pwq only while the work
1088         * item is queued on pwq->wq, and both updating work->data to point
1089         * to pwq on queueing and to pool on dequeueing are done under
1090         * pwq->pool->lock.  This in turn guarantees that, if work->data
1091         * points to pwq which is associated with a locked pool, the work
1092         * item is currently queued on that pool.
1093         */
1094        pwq = get_work_pwq(work);
1095        if (pwq && pwq->pool == pool) {
1096                debug_work_deactivate(work);
1097
1098                /*
1099                 * A delayed work item cannot be grabbed directly because
1100                 * it might have linked NO_COLOR work items which, if left
1101                 * on the delayed_list, will confuse pwq->nr_active
1102                 * management later on and cause stall.  Make sure the work
1103                 * item is activated before grabbing.
1104                 */
1105                if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1106                        pwq_activate_delayed_work(work);
1107
1108                list_del_init(&work->entry);
1109                pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
1110
1111                /* work->data points to pwq iff queued, point to pool */
1112                set_work_pool_and_keep_pending(work, pool->id);
1113
1114                spin_unlock(&pool->lock);
1115                return 1;
1116        }
1117        spin_unlock(&pool->lock);
1118fail:
1119        local_irq_restore(*flags);
1120        if (work_is_canceling(work))
1121                return -ENOENT;
1122        cpu_relax();
1123        return -EAGAIN;
1124}
1125
1126/**
1127 * insert_work - insert a work into a pool
1128 * @pwq: pwq @work belongs to
1129 * @work: work to insert
1130 * @head: insertion point
1131 * @extra_flags: extra WORK_STRUCT_* flags to set
1132 *
1133 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
1134 * work_struct flags.
1135 *
1136 * CONTEXT:
1137 * spin_lock_irq(pool->lock).
1138 */
1139static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1140                        struct list_head *head, unsigned int extra_flags)
1141{
1142        struct worker_pool *pool = pwq->pool;
1143
1144        /* we own @work, set data and link */
1145        set_work_pwq(work, pwq, extra_flags);
1146        list_add_tail(&work->entry, head);
1147
1148        /*
 1149         * Ensure either wq_worker_sleeping() sees the above
1150         * list_add_tail() or we see zero nr_running to avoid workers
1151         * lying around lazily while there are works to be processed.
1152         */
1153        smp_mb();
1154
1155        if (__need_more_worker(pool))
1156                wake_up_worker(pool);
1157}
1158
1159/*
1160 * Test whether @work is being queued from another work executing on the
1161 * same workqueue.
1162 */
1163static bool is_chained_work(struct workqueue_struct *wq)
1164{
1165        struct worker *worker;
1166
1167        worker = current_wq_worker();
1168        /*
 1169         * Return %true iff I'm a worker executing a work item on @wq.  If
1170         * I'm @worker, it's safe to dereference it without locking.
1171         */
1172        return worker && worker->current_pwq->wq == wq;
1173}
1174
1175static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1176                         struct work_struct *work)
1177{
1178        struct pool_workqueue *pwq;
1179        struct list_head *worklist;
1180        unsigned int work_flags;
1181        unsigned int req_cpu = cpu;
1182
1183        /*
1184         * While a work item is PENDING && off queue, a task trying to
1185         * steal the PENDING will busy-loop waiting for it to either get
1186         * queued or lose PENDING.  Grabbing PENDING and queueing should
1187         * happen with IRQ disabled.
1188         */
1189        WARN_ON_ONCE(!irqs_disabled());
1190
1191        debug_work_activate(work);
1192
1193        /* if dying, only works from the same workqueue are allowed */
1194        if (unlikely(wq->flags & WQ_DRAINING) &&
1195            WARN_ON_ONCE(!is_chained_work(wq)))
1196                return;
1197
1198        /* determine the pwq to use */
1199        if (!(wq->flags & WQ_UNBOUND)) {
1200                struct worker_pool *last_pool;
1201
1202                if (cpu == WORK_CPU_UNBOUND)
1203                        cpu = raw_smp_processor_id();
1204
1205                /*
1206                 * It's multi cpu.  If @work was previously on a different
1207                 * cpu, it might still be running there, in which case the
1208                 * work needs to be queued on that cpu to guarantee
1209                 * non-reentrancy.
1210                 */
1211                pwq = get_pwq(cpu, wq);
1212                last_pool = get_work_pool(work);
1213
1214                if (last_pool && last_pool != pwq->pool) {
1215                        struct worker *worker;
1216
1217                        spin_lock(&last_pool->lock);
1218
1219                        worker = find_worker_executing_work(last_pool, work);
1220
1221                        if (worker && worker->current_pwq->wq == wq) {
1222                                pwq = get_pwq(last_pool->cpu, wq);
1223                        } else {
1224                                /* meh... not running there, queue here */
1225                                spin_unlock(&last_pool->lock);
1226                                spin_lock(&pwq->pool->lock);
1227                        }
1228                } else {
1229                        spin_lock(&pwq->pool->lock);
1230                }
1231        } else {
1232                pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1233                spin_lock(&pwq->pool->lock);
1234        }
1235
1236        /* pwq determined, queue */
1237        trace_workqueue_queue_work(req_cpu, pwq, work);
1238
1239        if (WARN_ON(!list_empty(&work->entry))) {
1240                spin_unlock(&pwq->pool->lock);
1241                return;
1242        }
1243
1244        pwq->nr_in_flight[pwq->work_color]++;
1245        work_flags = work_color_to_flags(pwq->work_color);
1246
1247        if (likely(pwq->nr_active < pwq->max_active)) {
1248                trace_workqueue_activate_work(work);
1249                pwq->nr_active++;
1250                worklist = &pwq->pool->worklist;
1251        } else {
1252                work_flags |= WORK_STRUCT_DELAYED;
1253                worklist = &pwq->delayed_works;
1254        }
1255
1256        insert_work(pwq, work, worklist, work_flags);
1257
1258        spin_unlock(&pwq->pool->lock);
1259}
1260
1261/**
1262 * queue_work_on - queue work on specific cpu
1263 * @cpu: CPU number to execute work on
1264 * @wq: workqueue to use
1265 * @work: work to queue
1266 *
1267 * Returns %false if @work was already on a queue, %true otherwise.
1268 *
1269 * We queue the work to a specific CPU, the caller must ensure it
1270 * can't go away.
1271 */
1272bool queue_work_on(int cpu, struct workqueue_struct *wq,
1273                   struct work_struct *work)
1274{
1275        bool ret = false;
1276        unsigned long flags;
1277
1278        local_irq_save(flags);
1279
1280        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1281                __queue_work(cpu, wq, work);
1282                ret = true;
1283        }
1284
1285        local_irq_restore(flags);
1286        return ret;
1287}
1288EXPORT_SYMBOL_GPL(queue_work_on);
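/*
 * Illustrative sketch: queueing one already-initialized work item per
 * online CPU with queue_work_on().  The per_cpu_work array, its
 * initialization and queue_on_all() are assumptions of the example; the
 * get/put_online_cpus() pair keeps the CPUs from going away while we
 * pick them, per the requirement documented above.
 *
 *	static DEFINE_PER_CPU(struct work_struct, per_cpu_work);
 *
 *	static void queue_on_all(struct workqueue_struct *wq)
 *	{
 *		int cpu;
 *
 *		get_online_cpus();
 *		for_each_online_cpu(cpu)
 *			queue_work_on(cpu, wq, &per_cpu(per_cpu_work, cpu));
 *		put_online_cpus();
 *	}
 */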
1289
1290/**
1291 * queue_work - queue work on a workqueue
1292 * @wq: workqueue to use
1293 * @work: work to queue
1294 *
1295 * Returns %false if @work was already on a queue, %true otherwise.
1296 *
1297 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1298 * it can be processed by another CPU.
1299 */
1300bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1301{
1302        return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1303}
1304EXPORT_SYMBOL_GPL(queue_work);
1305
1306void delayed_work_timer_fn(unsigned long __data)
1307{
1308        struct delayed_work *dwork = (struct delayed_work *)__data;
1309
1310        /* should have been called from irqsafe timer with irq already off */
1311        __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1312}
1313EXPORT_SYMBOL(delayed_work_timer_fn);
1314
1315static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1316                                struct delayed_work *dwork, unsigned long delay)
1317{
1318        struct timer_list *timer = &dwork->timer;
1319        struct work_struct *work = &dwork->work;
1320
1321        WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1322                     timer->data != (unsigned long)dwork);
1323        WARN_ON_ONCE(timer_pending(timer));
1324        WARN_ON_ONCE(!list_empty(&work->entry));
1325
1326        /*
1327         * If @delay is 0, queue @dwork->work immediately.  This is for
1328         * both optimization and correctness.  The earliest @timer can
1329         * expire is on the closest next tick and delayed_work users depend
 1330         * on there being no such delay when @delay is 0.
1331         */
1332        if (!delay) {
1333                __queue_work(cpu, wq, &dwork->work);
1334                return;
1335        }
1336
1337        timer_stats_timer_set_start_info(&dwork->timer);
1338
1339        dwork->wq = wq;
1340        dwork->cpu = cpu;
1341        timer->expires = jiffies + delay;
1342
1343        if (unlikely(cpu != WORK_CPU_UNBOUND))
1344                add_timer_on(timer, cpu);
1345        else
1346                add_timer(timer);
1347}
1348
1349/**
1350 * queue_delayed_work_on - queue work on specific CPU after delay
1351 * @cpu: CPU number to execute work on
1352 * @wq: workqueue to use
1353 * @dwork: work to queue
1354 * @delay: number of jiffies to wait before queueing
1355 *
 1356 * Returns %false if @dwork was already on a queue, %true otherwise.  If
1357 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1358 * execution.
1359 */
1360bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1361                           struct delayed_work *dwork, unsigned long delay)
1362{
1363        struct work_struct *work = &dwork->work;
1364        bool ret = false;
1365        unsigned long flags;
1366
1367        /* read the comment in __queue_work() */
1368        local_irq_save(flags);
1369
1370        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1371                __queue_delayed_work(cpu, wq, dwork, delay);
1372                ret = true;
1373        }
1374
1375        local_irq_restore(flags);
1376        return ret;
1377}
1378EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1379
1380/**
1381 * queue_delayed_work - queue work on a workqueue after delay
1382 * @wq: workqueue to use
1383 * @dwork: delayable work to queue
1384 * @delay: number of jiffies to wait before queueing
1385 *
1386 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1387 */
1388bool queue_delayed_work(struct workqueue_struct *wq,
1389                        struct delayed_work *dwork, unsigned long delay)
1390{
1391        return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1392}
1393EXPORT_SYMBOL_GPL(queue_delayed_work);
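/*
 * Illustrative sketch of the usual periodic-poll pattern built on
 * queue_delayed_work(): the handler re-queues itself.  poll_fn() and
 * POLL_INTERVAL are hypothetical; cancel_delayed_work_sync() would be
 * used to stop the cycle.
 *
 *	static void poll_fn(struct work_struct *work);
 *	static DECLARE_DELAYED_WORK(poll_work, poll_fn);
 *
 *	static void poll_fn(struct work_struct *work)
 *	{
 *		// ... sample state, do the periodic bit of work ...
 *		queue_delayed_work(system_wq, &poll_work, POLL_INTERVAL);
 *	}
 */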
1394
1395/**
1396 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1397 * @cpu: CPU number to execute work on
1398 * @wq: workqueue to use
1399 * @dwork: work to queue
1400 * @delay: number of jiffies to wait before queueing
1401 *
1402 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
1403 * modify @dwork's timer so that it expires after @delay.  If @delay is
 1404 * zero, @dwork is guaranteed to be scheduled immediately regardless of its
1405 * current state.
1406 *
1407 * Returns %false if @dwork was idle and queued, %true if @dwork was
1408 * pending and its timer was modified.
1409 *
1410 * This function is safe to call from any context including IRQ handler.
1411 * See try_to_grab_pending() for details.
1412 */
1413bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1414                         struct delayed_work *dwork, unsigned long delay)
1415{
1416        unsigned long flags;
1417        int ret;
1418
1419        do {
1420                ret = try_to_grab_pending(&dwork->work, true, &flags);
1421        } while (unlikely(ret == -EAGAIN));
1422
1423        if (likely(ret >= 0)) {
1424                __queue_delayed_work(cpu, wq, dwork, delay);
1425                local_irq_restore(flags);
1426        }
1427
1428        /* -ENOENT from try_to_grab_pending() becomes %true */
1429        return ret;
1430}
1431EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1432
1433/**
1434 * mod_delayed_work - modify delay of or queue a delayed work
1435 * @wq: workqueue to use
1436 * @dwork: work to queue
1437 * @delay: number of jiffies to wait before queueing
1438 *
1439 * mod_delayed_work_on() on local CPU.
1440 */
1441bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1442                      unsigned long delay)
1443{
1444        return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1445}
1446EXPORT_SYMBOL_GPL(mod_delayed_work);
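/*
 * Illustrative sketch: debouncing bursty events with mod_delayed_work().
 * Every call pushes the expiry out to DEBOUNCE_DELAY from now, so
 * flush_fn() runs once, DEBOUNCE_DELAY after the last event, no matter
 * how many events arrived.  flush_fn(), note_event() and DEBOUNCE_DELAY
 * are assumptions of the example.
 *
 *	static DECLARE_DELAYED_WORK(flush_work, flush_fn);
 *
 *	static void note_event(void)
 *	{
 *		mod_delayed_work(system_wq, &flush_work, DEBOUNCE_DELAY);
 *	}
 */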
1447
1448/**
1449 * worker_enter_idle - enter idle state
1450 * @worker: worker which is entering idle state
1451 *
1452 * @worker is entering idle state.  Update stats and idle timer if
1453 * necessary.
1454 *
1455 * LOCKING:
1456 * spin_lock_irq(pool->lock).
1457 */
1458static void worker_enter_idle(struct worker *worker)
1459{
1460        struct worker_pool *pool = worker->pool;
1461
1462        BUG_ON(worker->flags & WORKER_IDLE);
1463        BUG_ON(!list_empty(&worker->entry) &&
1464               (worker->hentry.next || worker->hentry.pprev));
1465
1466        /* can't use worker_set_flags(), also called from start_worker() */
1467        worker->flags |= WORKER_IDLE;
1468        pool->nr_idle++;
1469        worker->last_active = jiffies;
1470
1471        /* idle_list is LIFO */
1472        list_add(&worker->entry, &pool->idle_list);
1473
1474        if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1475                mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1476
1477        /*
1478         * Sanity check nr_running.  Because wq_unbind_fn() releases
1479         * pool->lock between setting %WORKER_UNBOUND and zapping
1480         * nr_running, the warning may trigger spuriously.  Check iff
1481         * unbind is not in progress.
1482         */
1483        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1484                     pool->nr_workers == pool->nr_idle &&
1485                     atomic_read(&pool->nr_running));
1486}
1487
1488/**
1489 * worker_leave_idle - leave idle state
1490 * @worker: worker which is leaving idle state
1491 *
1492 * @worker is leaving idle state.  Update stats.
1493 *
1494 * LOCKING:
1495 * spin_lock_irq(pool->lock).
1496 */
1497static void worker_leave_idle(struct worker *worker)
1498{
1499        struct worker_pool *pool = worker->pool;
1500
1501        BUG_ON(!(worker->flags & WORKER_IDLE));
1502        worker_clr_flags(worker, WORKER_IDLE);
1503        pool->nr_idle--;
1504        list_del_init(&worker->entry);
1505}
1506
1507/**
1508 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
1509 * @worker: self
1510 *
1511 * Works which are scheduled while the cpu is online must at least be
1512 * scheduled to a worker which is bound to the cpu so that if they are
1513 * flushed from cpu callbacks while cpu is going down, they are
1514 * guaranteed to execute on the cpu.
1515 *
1516 * This function is to be used by rogue workers and rescuers to bind
1517 * themselves to the target cpu and may race with cpu going down or
1518 * coming online.  kthread_bind() can't be used because it may put the
1519 * worker on an already dead cpu and set_cpus_allowed_ptr() can't be used
1520 * verbatim as it's best effort and blocking and pool may be
1521 * [dis]associated in the meantime.
1522 *
1523 * This function tries set_cpus_allowed_ptr(), locks the pool and verifies the
1524 * binding against %POOL_DISASSOCIATED which is set during
1525 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1526 * enters idle state or fetches works without dropping lock, it can
1527 * guarantee the scheduling requirement described in the first paragraph.
1528 *
1529 * CONTEXT:
1530 * Might sleep.  Called without any lock but returns with pool->lock
1531 * held.
1532 *
1533 * RETURNS:
1534 * %true if the associated pool is online (@worker is successfully
1535 * bound), %false if offline.
1536 */
1537static bool worker_maybe_bind_and_lock(struct worker *worker)
1538__acquires(&pool->lock)
1539{
1540        struct worker_pool *pool = worker->pool;
1541        struct task_struct *task = worker->task;
1542
1543        while (true) {
1544                /*
1545                 * The following call may fail, succeed, or succeed
1546                 * without actually migrating the task to the cpu if
1547                 * it races with a cpu hotunplug operation.  Verify
1548                 * against POOL_DISASSOCIATED.
1549                 */
1550                if (!(pool->flags & POOL_DISASSOCIATED))
1551                        set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
1552
1553                spin_lock_irq(&pool->lock);
1554                if (pool->flags & POOL_DISASSOCIATED)
1555                        return false;
1556                if (task_cpu(task) == pool->cpu &&
1557                    cpumask_equal(&current->cpus_allowed,
1558                                  get_cpu_mask(pool->cpu)))
1559                        return true;
1560                spin_unlock_irq(&pool->lock);
1561
1562                /*
1563                 * We've raced with CPU hot[un]plug.  Give it a breather
1564                 * and retry migration.  cond_resched() is required here;
1565                 * otherwise, we might deadlock against cpu_stop trying to
1566                 * bring down the CPU on a non-preemptive kernel.
1567                 */
1568                cpu_relax();
1569                cond_resched();
1570        }
1571}
1572
1573/*
1574 * Rebind an idle @worker to its CPU.  worker_thread() will test
1575 * list_empty(@worker->entry) before leaving idle and call this function.
1576 */
1577static void idle_worker_rebind(struct worker *worker)
1578{
1579        /* CPU may go down again in between, clear UNBOUND only on success */
1580        if (worker_maybe_bind_and_lock(worker))
1581                worker_clr_flags(worker, WORKER_UNBOUND);
1582
1583        /* rebind complete, become available again */
1584        list_add(&worker->entry, &worker->pool->idle_list);
1585        spin_unlock_irq(&worker->pool->lock);
1586}
1587
1588/*
1589 * Function for @worker->rebind.work used to rebind unbound busy workers to
1590 * the associated cpu which is coming back online.  This is scheduled by
1591 * cpu up but can race with other cpu hotplug operations and may be
1592 * executed twice without intervening cpu down.
1593 */
1594static void busy_worker_rebind_fn(struct work_struct *work)
1595{
1596        struct worker *worker = container_of(work, struct worker, rebind_work);
1597
1598        if (worker_maybe_bind_and_lock(worker))
1599                worker_clr_flags(worker, WORKER_UNBOUND);
1600
1601        spin_unlock_irq(&worker->pool->lock);
1602}
1603
1604/**
1605 * rebind_workers - rebind all workers of a pool to the associated CPU
1606 * @pool: pool of interest
1607 *
1608 * @pool->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
1609 * is different for idle and busy ones.
1610 *
1611 * Idle ones will be removed from the idle_list and woken up.  They will
1612 * add themselves back after completing rebind.  This ensures that the
1613 * idle_list doesn't contain any unbound workers when re-bound busy workers
1614 * try to perform local wake-ups for concurrency management.
1615 *
1616 * Busy workers can rebind after they finish their current work items.
1617 * Queueing the rebind work item at the head of the scheduled list is
1618 * enough.  Note that nr_running will be properly bumped as busy workers
1619 * rebind.
1620 *
1621 * On return, all non-manager workers are scheduled for rebind - see
1622 * manage_workers() for the manager special case.  Any idle worker
1623 * including the manager will not appear on @idle_list until rebind is
1624 * complete, making local wake-ups safe.
1625 */
1626static void rebind_workers(struct worker_pool *pool)
1627{
1628        struct worker *worker, *n;
1629        int i;
1630
1631        lockdep_assert_held(&pool->assoc_mutex);
1632        lockdep_assert_held(&pool->lock);
1633
1634        /* dequeue and kick idle ones */
1635        list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1636                /*
1637                 * idle workers should be off @pool->idle_list until rebind
1638                 * is complete to avoid receiving premature local wake-ups.
1639                 */
1640                list_del_init(&worker->entry);
1641
1642                /*
1643                 * worker_thread() will see the above dequeuing and call
1644                 * idle_worker_rebind().
1645                 */
1646                wake_up_process(worker->task);
1647        }
1648
1649        /* rebind busy workers */
1650        for_each_busy_worker(worker, i, pool) {
1651                struct work_struct *rebind_work = &worker->rebind_work;
1652                struct workqueue_struct *wq;
1653
1654                if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1655                                     work_data_bits(rebind_work)))
1656                        continue;
1657
1658                debug_work_activate(rebind_work);
1659
1660                /*
1661                 * wq doesn't really matter but let's keep @worker->pool
1662                 * and @pwq->pool consistent for sanity.
1663                 */
1664                if (std_worker_pool_pri(worker->pool))
1665                        wq = system_highpri_wq;
1666                else
1667                        wq = system_wq;
1668
1669                insert_work(get_pwq(pool->cpu, wq), rebind_work,
1670                            worker->scheduled.next,
1671                            work_color_to_flags(WORK_NO_COLOR));
1672        }
1673}
1674
1675static struct worker *alloc_worker(void)
1676{
1677        struct worker *worker;
1678
1679        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1680        if (worker) {
1681                INIT_LIST_HEAD(&worker->entry);
1682                INIT_LIST_HEAD(&worker->scheduled);
1683                INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1684                /* on creation a worker is in !idle && prep state */
1685                worker->flags = WORKER_PREP;
1686        }
1687        return worker;
1688}
1689
1690/**
1691 * create_worker - create a new workqueue worker
1692 * @pool: pool the new worker will belong to
1693 *
1694 * Create a new worker which is bound to @pool.  The returned worker
1695 * can be started by calling start_worker() or destroyed using
1696 * destroy_worker().
1697 *
1698 * CONTEXT:
1699 * Might sleep.  Does GFP_KERNEL allocations.
1700 *
1701 * RETURNS:
1702 * Pointer to the newly created worker.
1703 */
1704static struct worker *create_worker(struct worker_pool *pool)
1705{
1706        const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1707        struct worker *worker = NULL;
1708        int id = -1;
1709
1710        spin_lock_irq(&pool->lock);
1711        while (ida_get_new(&pool->worker_ida, &id)) {
1712                spin_unlock_irq(&pool->lock);
1713                if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1714                        goto fail;
1715                spin_lock_irq(&pool->lock);
1716        }
1717        spin_unlock_irq(&pool->lock);
1718
1719        worker = alloc_worker();
1720        if (!worker)
1721                goto fail;
1722
1723        worker->pool = pool;
1724        worker->id = id;
1725
1726        if (pool->cpu != WORK_CPU_UNBOUND)
1727                worker->task = kthread_create_on_node(worker_thread,
1728                                        worker, cpu_to_node(pool->cpu),
1729                                        "kworker/%u:%d%s", pool->cpu, id, pri);
1730        else
1731                worker->task = kthread_create(worker_thread, worker,
1732                                              "kworker/u:%d%s", id, pri);
1733        if (IS_ERR(worker->task))
1734                goto fail;
1735
1736        if (std_worker_pool_pri(pool))
1737                set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1738
1739        /*
1740         * Determine CPU binding of the new worker depending on
1741         * %POOL_DISASSOCIATED.  The caller is responsible for ensuring the
1742         * flag remains stable across this function.  See the comments
1743         * above the flag definition for details.
1744         *
1745         * As an unbound worker may later become a regular one if CPU comes
1746         * online, make sure every worker has %PF_THREAD_BOUND set.
1747         */
1748        if (!(pool->flags & POOL_DISASSOCIATED)) {
1749                kthread_bind(worker->task, pool->cpu);
1750        } else {
1751                worker->task->flags |= PF_THREAD_BOUND;
1752                worker->flags |= WORKER_UNBOUND;
1753        }
1754
1755        return worker;
1756fail:
1757        if (id >= 0) {
1758                spin_lock_irq(&pool->lock);
1759                ida_remove(&pool->worker_ida, id);
1760                spin_unlock_irq(&pool->lock);
1761        }
1762        kfree(worker);
1763        return NULL;
1764}
1765
1766/**
1767 * start_worker - start a newly created worker
1768 * @worker: worker to start
1769 *
1770 * Make the pool aware of @worker and start it.
1771 *
1772 * CONTEXT:
1773 * spin_lock_irq(pool->lock).
1774 */
1775static void start_worker(struct worker *worker)
1776{
1777        worker->flags |= WORKER_STARTED;
1778        worker->pool->nr_workers++;
1779        worker_enter_idle(worker);
1780        wake_up_process(worker->task);
1781}
1782
1783/**
1784 * destroy_worker - destroy a workqueue worker
1785 * @worker: worker to be destroyed
1786 *
1787 * Destroy @worker and adjust @pool stats accordingly.
1788 *
1789 * CONTEXT:
1790 * spin_lock_irq(pool->lock) which is released and regrabbed.
1791 */
1792static void destroy_worker(struct worker *worker)
1793{
1794        struct worker_pool *pool = worker->pool;
1795        int id = worker->id;
1796
1797        /* sanity check frenzy */
1798        BUG_ON(worker->current_work);
1799        BUG_ON(!list_empty(&worker->scheduled));
1800
1801        if (worker->flags & WORKER_STARTED)
1802                pool->nr_workers--;
1803        if (worker->flags & WORKER_IDLE)
1804                pool->nr_idle--;
1805
1806        list_del_init(&worker->entry);
1807        worker->flags |= WORKER_DIE;
1808
1809        spin_unlock_irq(&pool->lock);
1810
1811        kthread_stop(worker->task);
1812        kfree(worker);
1813
1814        spin_lock_irq(&pool->lock);
1815        ida_remove(&pool->worker_ida, id);
1816}
1817
1818static void idle_worker_timeout(unsigned long __pool)
1819{
1820        struct worker_pool *pool = (void *)__pool;
1821
1822        spin_lock_irq(&pool->lock);
1823
1824        if (too_many_workers(pool)) {
1825                struct worker *worker;
1826                unsigned long expires;
1827
1828                /* idle_list is kept in LIFO order, check the last one */
1829                worker = list_entry(pool->idle_list.prev, struct worker, entry);
1830                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1831
1832                if (time_before(jiffies, expires))
1833                        mod_timer(&pool->idle_timer, expires);
1834                else {
1835                        /* it's been idle for too long, wake up manager */
1836                        pool->flags |= POOL_MANAGE_WORKERS;
1837                        wake_up_worker(pool);
1838                }
1839        }
1840
1841        spin_unlock_irq(&pool->lock);
1842}
1843
1844static bool send_mayday(struct work_struct *work)
1845{
1846        struct pool_workqueue *pwq = get_work_pwq(work);
1847        struct workqueue_struct *wq = pwq->wq;
1848        unsigned int cpu;
1849
1850        if (!(wq->flags & WQ_RESCUER))
1851                return false;
1852
1853        /* mayday mayday mayday */
1854        cpu = pwq->pool->cpu;
1855        /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1856        if (cpu == WORK_CPU_UNBOUND)
1857                cpu = 0;
1858        if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1859                wake_up_process(wq->rescuer->task);
1860        return true;
1861}
1862
1863static void pool_mayday_timeout(unsigned long __pool)
1864{
1865        struct worker_pool *pool = (void *)__pool;
1866        struct work_struct *work;
1867
1868        spin_lock_irq(&pool->lock);
1869
1870        if (need_to_create_worker(pool)) {
1871                /*
1872                 * We've been trying to create a new worker but
1873                 * haven't been successful.  We might be hitting an
1874                 * allocation deadlock.  Send distress signals to
1875                 * rescuers.
1876                 */
1877                list_for_each_entry(work, &pool->worklist, entry)
1878                        send_mayday(work);
1879        }
1880
1881        spin_unlock_irq(&pool->lock);
1882
1883        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1884}
1885
1886/**
1887 * maybe_create_worker - create a new worker if necessary
1888 * @pool: pool to create a new worker for
1889 *
1890 * Create a new worker for @pool if necessary.  @pool is guaranteed to
1891 * have at least one idle worker on return from this function.  If
1892 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1893 * sent to all rescuers with works scheduled on @pool to resolve
1894 * possible allocation deadlock.
1895 *
1896 * On return, need_to_create_worker() is guaranteed to be false and
1897 * may_start_working() true.
1898 *
1899 * LOCKING:
1900 * spin_lock_irq(pool->lock) which may be released and regrabbed
1901 * multiple times.  Does GFP_KERNEL allocations.  Called only from
1902 * manager.
1903 *
1904 * RETURNS:
1905 * false if no action was taken and pool->lock stayed locked, true
1906 * otherwise.
1907 */
1908static bool maybe_create_worker(struct worker_pool *pool)
1909__releases(&pool->lock)
1910__acquires(&pool->lock)
1911{
1912        if (!need_to_create_worker(pool))
1913                return false;
1914restart:
1915        spin_unlock_irq(&pool->lock);
1916
1917        /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1918        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1919
1920        while (true) {
1921                struct worker *worker;
1922
1923                worker = create_worker(pool);
1924                if (worker) {
1925                        del_timer_sync(&pool->mayday_timer);
1926                        spin_lock_irq(&pool->lock);
1927                        start_worker(worker);
1928                        BUG_ON(need_to_create_worker(pool));
1929                        return true;
1930                }
1931
1932                if (!need_to_create_worker(pool))
1933                        break;
1934
1935                __set_current_state(TASK_INTERRUPTIBLE);
1936                schedule_timeout(CREATE_COOLDOWN);
1937
1938                if (!need_to_create_worker(pool))
1939                        break;
1940        }
1941
1942        del_timer_sync(&pool->mayday_timer);
1943        spin_lock_irq(&pool->lock);
1944        if (need_to_create_worker(pool))
1945                goto restart;
1946        return true;
1947}
1948
1949/**
1950 * maybe_destroy_worker - destroy workers which have been idle for a while
1951 * @pool: pool to destroy workers for
1952 *
1953 * Destroy @pool workers which have been idle for longer than
1954 * IDLE_WORKER_TIMEOUT.
1955 *
1956 * LOCKING:
1957 * spin_lock_irq(pool->lock) which may be released and regrabbed
1958 * multiple times.  Called only from manager.
1959 *
1960 * RETURNS:
1961 * false if no action was taken and pool->lock stayed locked, true
1962 * otherwise.
1963 */
1964static bool maybe_destroy_workers(struct worker_pool *pool)
1965{
1966        bool ret = false;
1967
1968        while (too_many_workers(pool)) {
1969                struct worker *worker;
1970                unsigned long expires;
1971
1972                worker = list_entry(pool->idle_list.prev, struct worker, entry);
1973                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1974
1975                if (time_before(jiffies, expires)) {
1976                        mod_timer(&pool->idle_timer, expires);
1977                        break;
1978                }
1979
1980                destroy_worker(worker);
1981                ret = true;
1982        }
1983
1984        return ret;
1985}
1986
1987/**
1988 * manage_workers - manage worker pool
1989 * @worker: self
1990 *
1991 * Assume the manager role and manage the worker pool @worker belongs
1992 * to.  At any given time, there can be only zero or one manager per
1993 * pool.  The exclusion is handled automatically by this function.
1994 *
1995 * The caller can safely start processing works on false return.  On
1996 * true return, it's guaranteed that need_to_create_worker() is false
1997 * and may_start_working() is true.
1998 *
1999 * CONTEXT:
2000 * spin_lock_irq(pool->lock) which may be released and regrabbed
2001 * multiple times.  Does GFP_KERNEL allocations.
2002 *
2003 * RETURNS:
2004 * %false if the pool didn't need management and the caller can safely
2005 * start processing works; %true if pool->lock was released and regrabbed.
2006 */
2007static bool manage_workers(struct worker *worker)
2008{
2009        struct worker_pool *pool = worker->pool;
2010        bool ret = false;
2011
2012        if (pool->flags & POOL_MANAGING_WORKERS)
2013                return ret;
2014
2015        pool->flags |= POOL_MANAGING_WORKERS;
2016
2017        /*
2018         * To simplify both worker management and CPU hotplug, hold off
2019         * management while hotplug is in progress.  CPU hotplug path can't
2020         * grab %POOL_MANAGING_WORKERS to achieve this because that can
2021         * lead to idle worker depletion (all become busy thinking someone
2022         * else is managing) which in turn can result in deadlock under
2023         * extreme circumstances.  Use @pool->assoc_mutex to synchronize
2024         * manager against CPU hotplug.
2025         *
2026         * assoc_mutex would always be free unless CPU hotplug is in
2027         * progress.  trylock first without dropping @pool->lock.
2028         */
2029        if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2030                spin_unlock_irq(&pool->lock);
2031                mutex_lock(&pool->assoc_mutex);
2032                /*
2033                 * CPU hotplug could have happened while we were waiting
2034                 * for assoc_mutex.  Hotplug itself can't handle us
2035                 * because manager isn't either on idle or busy list, and
2036                 * @pool's state and ours could have deviated.
2037                 *
2038                 * As hotplug is now excluded via assoc_mutex, we can
2039                 * simply try to bind.  It will succeed or fail depending
2040                 * on @pool's current state.  Try it and adjust
2041                 * %WORKER_UNBOUND accordingly.
2042                 */
2043                if (worker_maybe_bind_and_lock(worker))
2044                        worker->flags &= ~WORKER_UNBOUND;
2045                else
2046                        worker->flags |= WORKER_UNBOUND;
2047
2048                ret = true;
2049        }
2050
2051        pool->flags &= ~POOL_MANAGE_WORKERS;
2052
2053        /*
2054         * Destroy and then create so that may_start_working() is true
2055         * on return.
2056         */
2057        ret |= maybe_destroy_workers(pool);
2058        ret |= maybe_create_worker(pool);
2059
2060        pool->flags &= ~POOL_MANAGING_WORKERS;
2061        mutex_unlock(&pool->assoc_mutex);
2062        return ret;
2063}
2064
2065/**
2066 * process_one_work - process single work
2067 * @worker: self
2068 * @work: work to process
2069 *
2070 * Process @work.  This function contains all the logic necessary to
2071 * process a single work including synchronization against and
2072 * interaction with other workers on the same cpu, queueing and
2073 * flushing.  As long as context requirement is met, any worker can
2074 * call this function to process a work.
2075 *
2076 * CONTEXT:
2077 * spin_lock_irq(pool->lock) which is released and regrabbed.
2078 */
2079static void process_one_work(struct worker *worker, struct work_struct *work)
2080__releases(&pool->lock)
2081__acquires(&pool->lock)
2082{
2083        struct pool_workqueue *pwq = get_work_pwq(work);
2084        struct worker_pool *pool = worker->pool;
2085        bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2086        int work_color;
2087        struct worker *collision;
2088#ifdef CONFIG_LOCKDEP
2089        /*
2090         * It is permissible to free the struct work_struct from
2091         * inside the function that is called from it, this we need to
2092         * take into account for lockdep too.  To avoid bogus "held
2093         * lock freed" warnings as well as problems when looking into
2094         * work->lockdep_map, make a copy and use that here.
2095         */
2096        struct lockdep_map lockdep_map;
2097
2098        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2099#endif
2100        /*
2101         * Ensure we're on the correct CPU.  DISASSOCIATED test is
2102         * necessary to avoid spurious warnings from rescuers servicing the
2103         * unbound or a disassociated pool.
2104         */
2105        WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2106                     !(pool->flags & POOL_DISASSOCIATED) &&
2107                     raw_smp_processor_id() != pool->cpu);
2108
2109        /*
2110         * A single work shouldn't be executed concurrently by
2111         * multiple workers on a single cpu.  Check whether anyone is
2112         * already processing the work.  If so, defer the work to the
2113         * currently executing one.
2114         */
2115        collision = find_worker_executing_work(pool, work);
2116        if (unlikely(collision)) {
2117                move_linked_works(work, &collision->scheduled, NULL);
2118                return;
2119        }
2120
2121        /* claim and dequeue */
2122        debug_work_deactivate(work);
2123        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
2124        worker->current_work = work;
2125        worker->current_func = work->func;
2126        worker->current_pwq = pwq;
2127        work_color = get_work_color(work);
2128
2129        list_del_init(&work->entry);
2130
2131        /*
2132         * CPU intensive works don't participate in concurrency
2133         * management.  They're the scheduler's responsibility.
2134         */
2135        if (unlikely(cpu_intensive))
2136                worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2137
2138        /*
2139         * Unbound pool isn't concurrency managed and work items should be
2140         * executed ASAP.  Wake up another worker if necessary.
2141         */
2142        if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2143                wake_up_worker(pool);
2144
2145        /*
2146         * Record the last pool and clear PENDING which should be the last
2147         * update to @work.  Also, do this inside @pool->lock so that
2148         * PENDING and queued state changes happen together while IRQ is
2149         * disabled.
2150         */
2151        set_work_pool_and_clear_pending(work, pool->id);
2152
2153        spin_unlock_irq(&pool->lock);
2154
2155        lock_map_acquire_read(&pwq->wq->lockdep_map);
2156        lock_map_acquire(&lockdep_map);
2157        trace_workqueue_execute_start(work);
2158        worker->current_func(work);
2159        /*
2160         * While we must be careful to not use "work" after this, the trace
2161         * point will only record its address.
2162         */
2163        trace_workqueue_execute_end(work);
2164        lock_map_release(&lockdep_map);
2165        lock_map_release(&pwq->wq->lockdep_map);
2166
2167        if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2168                pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2169                       "     last function: %pf\n",
2170                       current->comm, preempt_count(), task_pid_nr(current),
2171                       worker->current_func);
2172                debug_show_held_locks(current);
2173                dump_stack();
2174        }
2175
2176        spin_lock_irq(&pool->lock);
2177
2178        /* clear cpu intensive status */
2179        if (unlikely(cpu_intensive))
2180                worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2181
2182        /* we're done with it, release */
2183        hash_del(&worker->hentry);
2184        worker->current_work = NULL;
2185        worker->current_func = NULL;
2186        worker->current_pwq = NULL;
2187        pwq_dec_nr_in_flight(pwq, work_color);
2188}
2189
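/*
 * Illustrative usage sketch; the example_* names are hypothetical: a work
 * item that frees its own containing structure from inside its handler -
 * the case the lockdep_map copy in process_one_work() exists to support.
 */
struct example_request {
	struct work_struct	work;
	int			payload;
};

static void example_request_fn(struct work_struct *work)
{
	struct example_request *req =
		container_of(work, struct example_request, work);

	/* ... act on req->payload ... */
	kfree(req);	/* freeing the work_struct from its own handler is fine */
}

static int example_submit(int payload)
{
	struct example_request *req = kmalloc(sizeof(*req), GFP_KERNEL);

	if (!req)
		return -ENOMEM;
	req->payload = payload;
	INIT_WORK(&req->work, example_request_fn);
	schedule_work(&req->work);
	return 0;
}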
2190/**
2191 * process_scheduled_works - process scheduled works
2192 * @worker: self
2193 *
2194 * Process all scheduled works.  Please note that the scheduled list
2195 * may change while processing a work, so this function repeatedly
2196 * fetches a work from the top and executes it.
2197 *
2198 * CONTEXT:
2199 * spin_lock_irq(pool->lock) which may be released and regrabbed
2200 * multiple times.
2201 */
2202static void process_scheduled_works(struct worker *worker)
2203{
2204        while (!list_empty(&worker->scheduled)) {
2205                struct work_struct *work = list_first_entry(&worker->scheduled,
2206                                                struct work_struct, entry);
2207                process_one_work(worker, work);
2208        }
2209}
2210
2211/**
2212 * worker_thread - the worker thread function
2213 * @__worker: self
2214 *
2215 * The worker thread function.  There are NR_STD_WORKER_POOLS dynamic pools
2216 * of these per CPU.  These workers process all works regardless of
2217 * their specific target workqueue.  The only exception is works which
2218 * belong to workqueues with a rescuer which will be explained in
2219 * rescuer_thread().
2220 */
2221static int worker_thread(void *__worker)
2222{
2223        struct worker *worker = __worker;
2224        struct worker_pool *pool = worker->pool;
2225
2226        /* tell the scheduler that this is a workqueue worker */
2227        worker->task->flags |= PF_WQ_WORKER;
2228woke_up:
2229        spin_lock_irq(&pool->lock);
2230
2231        /* we are off idle list if destruction or rebind is requested */
2232        if (unlikely(list_empty(&worker->entry))) {
2233                spin_unlock_irq(&pool->lock);
2234
2235                /* if DIE is set, destruction is requested */
2236                if (worker->flags & WORKER_DIE) {
2237                        worker->task->flags &= ~PF_WQ_WORKER;
2238                        return 0;
2239                }
2240
2241                /* otherwise, rebind */
2242                idle_worker_rebind(worker);
2243                goto woke_up;
2244        }
2245
2246        worker_leave_idle(worker);
2247recheck:
2248        /* no more worker necessary? */
2249        if (!need_more_worker(pool))
2250                goto sleep;
2251
2252        /* do we need to manage? */
2253        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
2254                goto recheck;
2255
2256        /*
2257         * ->scheduled list can only be filled while a worker is
2258         * preparing to process a work or actually processing it.
2259         * Make sure nobody diddled with it while I was sleeping.
2260         */
2261        BUG_ON(!list_empty(&worker->scheduled));
2262
2263        /*
2264         * When control reaches this point, we're guaranteed to have
2265         * at least one idle worker or that someone else has already
2266         * assumed the manager role.
2267         */
2268        worker_clr_flags(worker, WORKER_PREP);
2269
2270        do {
2271                struct work_struct *work =
2272                        list_first_entry(&pool->worklist,
2273                                         struct work_struct, entry);
2274
2275                if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
2276                        /* optimization path, not strictly necessary */
2277                        process_one_work(worker, work);
2278                        if (unlikely(!list_empty(&worker->scheduled)))
2279                                process_scheduled_works(worker);
2280                } else {
2281                        move_linked_works(work, &worker->scheduled, NULL);
2282                        process_scheduled_works(worker);
2283                }
2284        } while (keep_working(pool));
2285
2286        worker_set_flags(worker, WORKER_PREP, false);
2287sleep:
2288        if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
2289                goto recheck;
2290
2291        /*
2292         * pool->lock is held and there's no work to process and no need to
2293         * manage, sleep.  Workers are woken up only while holding
2294         * pool->lock or from local cpu, so setting the current state
2295         * before releasing pool->lock is enough to prevent losing any
2296         * event.
2297         */
2298        worker_enter_idle(worker);
2299        __set_current_state(TASK_INTERRUPTIBLE);
2300        spin_unlock_irq(&pool->lock);
2301        schedule();
2302        goto woke_up;
2303}
2304
2305/**
2306 * rescuer_thread - the rescuer thread function
2307 * @__rescuer: self
2308 *
2309 * Workqueue rescuer thread function.  There's one rescuer for each
2310 * workqueue which has WQ_RESCUER set.
2311 *
2312 * Regular work processing on a pool may block trying to create a new
2313 * worker, which uses a GFP_KERNEL allocation that has a slight chance of
2314 * developing into a deadlock if some works currently on the same queue
2315 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
2316 * the problem rescuer solves.
2317 *
2318 * When such a condition is possible, the pool summons rescuers of all
2319 * workqueues which have works queued on the pool and lets them process
2320 * those works so that forward progress can be guaranteed.
2321 *
2322 * This should happen rarely.
2323 */
2324static int rescuer_thread(void *__rescuer)
2325{
2326        struct worker *rescuer = __rescuer;
2327        struct workqueue_struct *wq = rescuer->rescue_wq;
2328        struct list_head *scheduled = &rescuer->scheduled;
2329        bool is_unbound = wq->flags & WQ_UNBOUND;
2330        unsigned int cpu;
2331
2332        set_user_nice(current, RESCUER_NICE_LEVEL);
2333
2334        /*
2335         * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
2336         * doesn't participate in concurrency management.
2337         */
2338        rescuer->task->flags |= PF_WQ_WORKER;
2339repeat:
2340        set_current_state(TASK_INTERRUPTIBLE);
2341
2342        if (kthread_should_stop()) {
2343                __set_current_state(TASK_RUNNING);
2344                rescuer->task->flags &= ~PF_WQ_WORKER;
2345                return 0;
2346        }
2347
2348        /*
2349         * See whether any cpu is asking for help.  Unbound
2350         * workqueues use cpu 0 in mayday_mask for WORK_CPU_UNBOUND.
2351         */
2352        for_each_mayday_cpu(cpu, wq->mayday_mask) {
2353                unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2354                struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2355                struct worker_pool *pool = pwq->pool;
2356                struct work_struct *work, *n;
2357
2358                __set_current_state(TASK_RUNNING);
2359                mayday_clear_cpu(cpu, wq->mayday_mask);
2360
2361                /* migrate to the target cpu if possible */
2362                rescuer->pool = pool;
2363                worker_maybe_bind_and_lock(rescuer);
2364
2365                /*
2366                 * Slurp in all works issued via this workqueue and
2367                 * process'em.
2368                 */
2369                BUG_ON(!list_empty(&rescuer->scheduled));
2370                list_for_each_entry_safe(work, n, &pool->worklist, entry)
2371                        if (get_work_pwq(work) == pwq)
2372                                move_linked_works(work, scheduled, &n);
2373
2374                process_scheduled_works(rescuer);
2375
2376                /*
2377                 * Leave this pool.  If keep_working() is %true, notify a
2378                 * regular worker; otherwise, we end up with 0 concurrency
2379                 * and stalling the execution.
2380                 */
2381                if (keep_working(pool))
2382                        wake_up_worker(pool);
2383
2384                spin_unlock_irq(&pool->lock);
2385        }
2386
2387        /* rescuers should never participate in concurrency management */
2388        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2389        schedule();
2390        goto repeat;
2391}
2392
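/*
 * Illustrative usage sketch; the example_* names are hypothetical: a
 * driver whose work items sit on the memory-reclaim path asks for a
 * rescuer by passing WQ_MEM_RECLAIM to alloc_workqueue(), which is what
 * sets WQ_RESCUER in this version of the code.
 */
static struct workqueue_struct *example_io_wq;

static int __init example_io_init(void)
{
	example_io_wq = alloc_workqueue("example_io", WQ_MEM_RECLAIM, 1);
	if (!example_io_wq)
		return -ENOMEM;
	return 0;
}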
2393struct wq_barrier {
2394        struct work_struct      work;
2395        struct completion       done;
2396};
2397
2398static void wq_barrier_func(struct work_struct *work)
2399{
2400        struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2401        complete(&barr->done);
2402}
2403
2404/**
2405 * insert_wq_barrier - insert a barrier work
2406 * @pwq: pwq to insert barrier into
2407 * @barr: wq_barrier to insert
2408 * @target: target work to attach @barr to
2409 * @worker: worker currently executing @target, NULL if @target is not executing
2410 *
2411 * @barr is linked to @target such that @barr is completed only after
2412 * @target finishes execution.  Please note that the ordering
2413 * guarantee is observed only with respect to @target and on the local
2414 * cpu.
2415 *
2416 * Currently, a queued barrier can't be canceled.  This is because
2417 * try_to_grab_pending() can't determine whether the work to be
2418 * grabbed is at the head of the queue and thus can't clear LINKED
2419 * flag of the previous work while there must be a valid next work
2420 * after a work with LINKED flag set.
2421 *
2422 * Note that when @worker is non-NULL, @target may be modified
2423 * underneath us, so we can't reliably determine pwq from @target.
2424 *
2425 * CONTEXT:
2426 * spin_lock_irq(pool->lock).
2427 */
2428static void insert_wq_barrier(struct pool_workqueue *pwq,
2429                              struct wq_barrier *barr,
2430                              struct work_struct *target, struct worker *worker)
2431{
2432        struct list_head *head;
2433        unsigned int linked = 0;
2434
2435        /*
2436         * debugobject calls are safe here even with pool->lock locked
2437         * as we know for sure that this will not trigger any of the
2438         * checks and call back into the fixup functions where we
2439         * might deadlock.
2440         */
2441        INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2442        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2443        init_completion(&barr->done);
2444
2445        /*
2446         * If @target is currently being executed, schedule the
2447         * barrier to the worker; otherwise, put it after @target.
2448         */
2449        if (worker)
2450                head = worker->scheduled.next;
2451        else {
2452                unsigned long *bits = work_data_bits(target);
2453
2454                head = target->entry.next;
2455                /* there can already be other linked works, inherit and set */
2456                linked = *bits & WORK_STRUCT_LINKED;
2457                __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2458        }
2459
2460        debug_work_activate(&barr->work);
2461        insert_work(pwq, &barr->work, head,
2462                    work_color_to_flags(WORK_NO_COLOR) | linked);
2463}
2464
2465/**
2466 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2467 * @wq: workqueue being flushed
2468 * @flush_color: new flush color, < 0 for no-op
2469 * @work_color: new work color, < 0 for no-op
2470 *
2471 * Prepare pwqs for workqueue flushing.
2472 *
2473 * If @flush_color is non-negative, flush_color on all pwqs should be
2474 * -1.  If no pwq has in-flight commands at the specified color, all
2475 * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
2476 * has in flight commands, its pwq->flush_color is set to
2477 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2478 * wakeup logic is armed and %true is returned.
2479 *
2480 * The caller should have initialized @wq->first_flusher prior to
2481 * calling this function with non-negative @flush_color.  If
2482 * @flush_color is negative, no flush color update is done and %false
2483 * is returned.
2484 *
2485 * If @work_color is non-negative, all pwqs should have the same
2486 * work_color which is previous to @work_color and all will be
2487 * advanced to @work_color.
2488 *
2489 * CONTEXT:
2490 * mutex_lock(wq->flush_mutex).
2491 *
2492 * RETURNS:
2493 * %true if @flush_color >= 0 and there's something to flush.  %false
2494 * otherwise.
2495 */
2496static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2497                                      int flush_color, int work_color)
2498{
2499        bool wait = false;
2500        unsigned int cpu;
2501
2502        if (flush_color >= 0) {
2503                BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
2504                atomic_set(&wq->nr_pwqs_to_flush, 1);
2505        }
2506
2507        for_each_pwq_cpu(cpu, wq) {
2508                struct pool_workqueue *pwq = get_pwq(cpu, wq);
2509                struct worker_pool *pool = pwq->pool;
2510
2511                spin_lock_irq(&pool->lock);
2512
2513                if (flush_color >= 0) {
2514                        BUG_ON(pwq->flush_color != -1);
2515
2516                        if (pwq->nr_in_flight[flush_color]) {
2517                                pwq->flush_color = flush_color;
2518                                atomic_inc(&wq->nr_pwqs_to_flush);
2519                                wait = true;
2520                        }
2521                }
2522
2523                if (work_color >= 0) {
2524                        BUG_ON(work_color != work_next_color(pwq->work_color));
2525                        pwq->work_color = work_color;
2526                }
2527
2528                spin_unlock_irq(&pool->lock);
2529        }
2530
2531        if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2532                complete(&wq->first_flusher->done);
2533
2534        return wait;
2535}
2536
2537/**
2538 * flush_workqueue - ensure that any scheduled work has run to completion.
2539 * @wq: workqueue to flush
2540 *
2541 * Forces execution of the workqueue and blocks until its completion.
2542 * This is typically used in driver shutdown handlers.
2543 *
2544 * We sleep until all works which were queued on entry have been handled,
2545 * but we are not livelocked by new incoming ones.
2546 */
2547void flush_workqueue(struct workqueue_struct *wq)
2548{
2549        struct wq_flusher this_flusher = {
2550                .list = LIST_HEAD_INIT(this_flusher.list),
2551                .flush_color = -1,
2552                .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2553        };
2554        int next_color;
2555
2556        lock_map_acquire(&wq->lockdep_map);
2557        lock_map_release(&wq->lockdep_map);
2558
2559        mutex_lock(&wq->flush_mutex);
2560
2561        /*
2562         * Start-to-wait phase
2563         */
2564        next_color = work_next_color(wq->work_color);
2565
2566        if (next_color != wq->flush_color) {
2567                /*
2568                 * Color space is not full.  The current work_color
2569                 * becomes our flush_color and work_color is advanced
2570                 * by one.
2571                 */
2572                BUG_ON(!list_empty(&wq->flusher_overflow));
2573                this_flusher.flush_color = wq->work_color;
2574                wq->work_color = next_color;
2575
2576                if (!wq->first_flusher) {
2577                        /* no flush in progress, become the first flusher */
2578                        BUG_ON(wq->flush_color != this_flusher.flush_color);
2579
2580                        wq->first_flusher = &this_flusher;
2581
2582                        if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2583                                                       wq->work_color)) {
2584                                /* nothing to flush, done */
2585                                wq->flush_color = next_color;
2586                                wq->first_flusher = NULL;
2587                                goto out_unlock;
2588                        }
2589                } else {
2590                        /* wait in queue */
2591                        BUG_ON(wq->flush_color == this_flusher.flush_color);
2592                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
2593                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2594                }
2595        } else {
2596                /*
2597                 * Oops, color space is full, wait on overflow queue.
2598                 * The next flush completion will assign us
2599                 * flush_color and transfer to flusher_queue.
2600                 */
2601                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2602        }
2603
2604        mutex_unlock(&wq->flush_mutex);
2605
2606        wait_for_completion(&this_flusher.done);
2607
2608        /*
2609         * Wake-up-and-cascade phase
2610         *
2611         * First flushers are responsible for cascading flushes and
2612         * handling overflow.  Non-first flushers can simply return.
2613         */
2614        if (wq->first_flusher != &this_flusher)
2615                return;
2616
2617        mutex_lock(&wq->flush_mutex);
2618
2619        /* we might have raced, check again with mutex held */
2620        if (wq->first_flusher != &this_flusher)
2621                goto out_unlock;
2622
2623        wq->first_flusher = NULL;
2624
2625        BUG_ON(!list_empty(&this_flusher.list));
2626        BUG_ON(wq->flush_color != this_flusher.flush_color);
2627
2628        while (true) {
2629                struct wq_flusher *next, *tmp;
2630
2631                /* complete all the flushers sharing the current flush color */
2632                list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2633                        if (next->flush_color != wq->flush_color)
2634                                break;
2635                        list_del_init(&next->list);
2636                        complete(&next->done);
2637                }
2638
2639                BUG_ON(!list_empty(&wq->flusher_overflow) &&
2640                       wq->flush_color != work_next_color(wq->work_color));
2641
2642                /* this flush_color is finished, advance by one */
2643                wq->flush_color = work_next_color(wq->flush_color);
2644
2645                /* one color has been freed, handle overflow queue */
2646                if (!list_empty(&wq->flusher_overflow)) {
2647                        /*
2648                         * Assign the same color to all overflowed
2649                         * flushers, advance work_color and append to
2650                         * flusher_queue.  This is the start-to-wait
2651                         * phase for these overflowed flushers.
2652                         */
2653                        list_for_each_entry(tmp, &wq->flusher_overflow, list)
2654                                tmp->flush_color = wq->work_color;
2655
2656                        wq->work_color = work_next_color(wq->work_color);
2657
2658                        list_splice_tail_init(&wq->flusher_overflow,
2659                                              &wq->flusher_queue);
2660                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2661                }
2662
2663                if (list_empty(&wq->flusher_queue)) {
2664                        BUG_ON(wq->flush_color != wq->work_color);
2665                        break;
2666                }
2667
2668                /*
2669                 * Need to flush more colors.  Make the next flusher
2670                 * the new first flusher and arm pwqs.
2671                 */
2672                BUG_ON(wq->flush_color == wq->work_color);
2673                BUG_ON(wq->flush_color != next->flush_color);
2674
2675                list_del_init(&next->list);
2676                wq->first_flusher = next;
2677
2678                if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2679                        break;
2680
2681                /*
2682                 * Meh... this color is already done, clear first
2683                 * flusher and repeat cascading.
2684                 */
2685                wq->first_flusher = NULL;
2686        }
2687
2688out_unlock:
2689        mutex_unlock(&wq->flush_mutex);
2690}
2691EXPORT_SYMBOL_GPL(flush_workqueue);
2692
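/*
 * Illustrative usage sketch; the example_* names are hypothetical: the
 * typical driver-shutdown pattern referred to above - stop queueing new
 * work, then flush before tearing down the state the work items touch.
 */
struct example_dev {
	struct workqueue_struct	*wq;
	bool			stopping;	/* producers check this before queueing */
};

static void example_shutdown(struct example_dev *dev)
{
	dev->stopping = true;		/* keep new work from being queued */
	flush_workqueue(dev->wq);	/* wait for everything already queued */
	destroy_workqueue(dev->wq);
}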
2693/**
2694 * drain_workqueue - drain a workqueue
2695 * @wq: workqueue to drain
2696 *
2697 * Wait until the workqueue becomes empty.  While draining is in progress,
2698 * only chain queueing is allowed.  IOW, only currently pending or running
2699 * work items on @wq can queue further work items on it.  @wq is flushed
2700 * repeatedly until it becomes empty.  The number of flushes is determined
2701 * by the depth of chaining and should be relatively short.  Whine if it
2702 * takes too long.
2703 */
2704void drain_workqueue(struct workqueue_struct *wq)
2705{
2706        unsigned int flush_cnt = 0;
2707        unsigned int cpu;
2708
2709        /*
2710         * __queue_work() needs to test whether there are drainers; it is much
2711         * hotter than drain_workqueue() and already looks at @wq->flags.
2712         * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2713         */
2714        spin_lock(&workqueue_lock);
2715        if (!wq->nr_drainers++)
2716                wq->flags |= WQ_DRAINING;
2717        spin_unlock(&workqueue_lock);
2718reflush:
2719        flush_workqueue(wq);
2720
2721        for_each_pwq_cpu(cpu, wq) {
2722                struct pool_workqueue *pwq = get_pwq(cpu, wq);
2723                bool drained;
2724
2725                spin_lock_irq(&pwq->pool->lock);
2726                drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
2727                spin_unlock_irq(&pwq->pool->lock);
2728
2729                if (drained)
2730                        continue;
2731
2732                if (++flush_cnt == 10 ||
2733                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2734                        pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
2735                                wq->name, flush_cnt);
2736                goto reflush;
2737        }
2738
2739        spin_lock(&workqueue_lock);
2740        if (!--wq->nr_drainers)
2741                wq->flags &= ~WQ_DRAINING;
2742        spin_unlock(&workqueue_lock);
2743}
2744EXPORT_SYMBOL_GPL(drain_workqueue);
2745
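/*
 * Illustrative usage sketch; the example_* names are hypothetical: the
 * "chain queueing" that drain_workqueue() still allows - a running work
 * item queues a follow-up on the same workqueue, and the drain loop keeps
 * flushing until the chain dies out.
 */
static struct workqueue_struct *example_chain_wq;

static void example_step2_fn(struct work_struct *work)
{
	/* final step, queues nothing further, so the drain can finish */
}
static DECLARE_WORK(example_step2, example_step2_fn);

static void example_step1_fn(struct work_struct *work)
{
	/* allowed while example_chain_wq drains: we're already running on it */
	queue_work(example_chain_wq, &example_step2);
}
static DECLARE_WORK(example_step1, example_step1_fn);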
2746static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2747{
2748        struct worker *worker = NULL;
2749        struct worker_pool *pool;
2750        struct pool_workqueue *pwq;
2751
2752        might_sleep();
2753        pool = get_work_pool(work);
2754        if (!pool)
2755                return false;
2756
2757        spin_lock_irq(&pool->lock);
2758        /* see the comment in try_to_grab_pending() with the same code */
2759        pwq = get_work_pwq(work);
2760        if (pwq) {
2761                if (unlikely(pwq->pool != pool))
2762                        goto already_gone;
2763        } else {
2764                worker = find_worker_executing_work(pool, work);
2765                if (!worker)
2766                        goto already_gone;
2767                pwq = worker->current_pwq;
2768        }
2769
2770        insert_wq_barrier(pwq, barr, work, worker);
2771        spin_unlock_irq(&pool->lock);
2772
2773        /*
2774         * If @max_active is 1 or rescuer is in use, flushing another work
2775         * item on the same workqueue may lead to deadlock.  Make sure the
2776         * flusher is not running on the same workqueue by verifying write
2777         * access.
2778         */
2779        if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
2780                lock_map_acquire(&pwq->wq->lockdep_map);
2781        else
2782                lock_map_acquire_read(&pwq->wq->lockdep_map);
2783        lock_map_release(&pwq->wq->lockdep_map);
2784
2785        return true;
2786already_gone:
2787        spin_unlock_irq(&pool->lock);
2788        return false;
2789}
2790
2791/**
2792 * flush_work - wait for a work to finish executing the last queueing instance
2793 * @work: the work to flush
2794 *
2795 * Wait until @work has finished execution.  @work is guaranteed to be idle
2796 * on return if it hasn't been requeued since flush started.
2797 *
2798 * RETURNS:
2799 * %true if flush_work() waited for the work to finish execution,
2800 * %false if it was already idle.
2801 */
2802bool flush_work(struct work_struct *work)
2803{
2804        struct wq_barrier barr;
2805
2806        lock_map_acquire(&work->lockdep_map);
2807        lock_map_release(&work->lockdep_map);
2808
2809        if (start_flush_work(work, &barr)) {
2810                wait_for_completion(&barr.done);
2811                destroy_work_on_stack(&barr.work);
2812                return true;
2813        } else {
2814                return false;
2815        }
2816}
2817EXPORT_SYMBOL_GPL(flush_work);
2818
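/*
 * Illustrative usage sketch; the example_* names are hypothetical: using
 * flush_work() to make sure an asynchronous refresh has finished before
 * its result is read.
 */
static struct example_stats {
	struct work_struct	refresh_work;
	u64			value;
} example_stats;

static u64 example_read_stats(void)
{
	/* wait for any in-flight refresh; a no-op if it was already idle */
	flush_work(&example_stats.refresh_work);
	return example_stats.value;
}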
2819static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2820{
2821        unsigned long flags;
2822        int ret;
2823
2824        do {
2825                ret = try_to_grab_pending(work, is_dwork, &flags);
2826                /*
2827                 * If someone else is canceling, wait for the same event it
2828                 * would be waiting for before retrying.
2829                 */
2830                if (unlikely(ret == -ENOENT))
2831                        flush_work(work);
2832        } while (unlikely(ret < 0));
2833
2834        /* tell other tasks trying to grab @work to back off */
2835        mark_work_canceling(work);
2836        local_irq_restore(flags);
2837
2838        flush_work(work);
2839        clear_work_data(work);
2840        return ret;
2841}
2842
2843/**
2844 * cancel_work_sync - cancel a work and wait for it to finish
2845 * @work: the work to cancel
2846 *
2847 * Cancel @work and wait for its execution to finish.  This function
2848 * can be used even if the work re-queues itself or migrates to
2849 * another workqueue.  On return from this function, @work is
2850 * guaranteed to be not pending or executing on any CPU.
2851 *
2852 * cancel_work_sync(&delayed_work->work) must not be used for
2853 * delayed_work's.  Use cancel_delayed_work_sync() instead.
2854 *
2855 * The caller must ensure that the workqueue on which @work was last
2856 * queued can't be destroyed before this function returns.
2857 *
2858 * RETURNS:
2859 * %true if @work was pending, %false otherwise.
2860 */
2861bool cancel_work_sync(struct work_struct *work)
2862{
2863        return __cancel_work_timer(work, false);
2864}
2865EXPORT_SYMBOL_GPL(cancel_work_sync);
2866
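/*
 * Illustrative usage sketch; the example_* names are hypothetical: device
 * removal cancelling outstanding work before the data it operates on is
 * freed.
 */
struct example_client {
	struct work_struct	update_work;
	void			*state;
};

static void example_client_remove(struct example_client *client)
{
	/*
	 * After this returns, update_work is neither pending nor running
	 * anywhere, even if it used to requeue itself.
	 */
	cancel_work_sync(&client->update_work);
	kfree(client->state);
	kfree(client);
}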
2867/**
2868 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2869 * @dwork: the delayed work to flush
2870 *
2871 * Delayed timer is cancelled and the pending work is queued for
2872 * immediate execution.  Like flush_work(), this function only
2873 * considers the last queueing instance of @dwork.
2874 *
2875 * RETURNS:
2876 * %true if flush_work() waited for the work to finish execution,
2877 * %false if it was already idle.
2878 */
2879bool flush_delayed_work(struct delayed_work *dwork)
2880{
2881        local_irq_disable();
2882        if (del_timer_sync(&dwork->timer))
2883                __queue_work(dwork->cpu, dwork->wq, &dwork->work);
2884        local_irq_enable();
2885        return flush_work(&dwork->work);
2886}
2887EXPORT_SYMBOL(flush_delayed_work);
2888
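/*
 * Illustrative usage sketch; the example_* names are hypothetical: a
 * deferred commit that normally runs after a delay but is forced to run
 * immediately, and waited for, on an explicit sync.
 */
static void example_commit_fn(struct work_struct *work)
{
	/* write the accumulated state back to the device */
}
static DECLARE_DELAYED_WORK(example_commit_dwork, example_commit_fn);

static void example_sync(void)
{
	/* cancel the timer, queue the commit now and wait for it */
	flush_delayed_work(&example_commit_dwork);
}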
2889/**
2890 * cancel_delayed_work - cancel a delayed work
2891 * @dwork: delayed_work to cancel
2892 *
2893 * Kill off a pending delayed_work.  Returns %true if @dwork was pending
2894 * and canceled; %false if it wasn't pending.  Note that the work callback
2895 * function may still be running on return, unless this returns %true and the
2896 * work doesn't re-arm itself.  Explicitly flush or use
2897 * cancel_delayed_work_sync() to wait on it.
2898 *
2899 * This function is safe to call from any context including IRQ handler.
2900 */
2901bool cancel_delayed_work(struct delayed_work *dwork)
2902{
2903        unsigned long flags;
2904        int ret;
2905
2906        do {
2907                ret = try_to_grab_pending(&dwork->work, true, &flags);
2908        } while (unlikely(ret == -EAGAIN));
2909
2910        if (unlikely(ret < 0))
2911                return false;
2912
2913        set_work_pool_and_clear_pending(&dwork->work,
2914                                        get_work_pool_id(&dwork->work));
2915        local_irq_restore(flags);
2916        return ret;
2917}
2918EXPORT_SYMBOL(cancel_delayed_work);
2919
2920/**
2921 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2922 * @dwork: the delayed work to cancel
2923 *
2924 * This is cancel_work_sync() for delayed works.
2925 *
2926 * RETURNS:
2927 * %true if @dwork was pending, %false otherwise.
2928 */
2929bool cancel_delayed_work_sync(struct delayed_work *dwork)
2930{
2931        return __cancel_work_timer(&dwork->work, true);
2932}
2933EXPORT_SYMBOL(cancel_delayed_work_sync);
2934
2935/**
2936 * schedule_work_on - put work task on a specific cpu
2937 * @cpu: cpu to put the work task on
2938 * @work: job to be done
2939 *
2940 * This puts a job on a specific cpu.
2941 */
2942bool schedule_work_on(int cpu, struct work_struct *work)
2943{
2944        return queue_work_on(cpu, system_wq, work);
2945}
2946EXPORT_SYMBOL(schedule_work_on);
2947
2948/**
2949 * schedule_work - put work task in global workqueue
2950 * @work: job to be done
2951 *
2952 * Returns %false if @work was already on the kernel-global workqueue and
2953 * %true otherwise.
2954 *
2955 * This puts a job in the kernel-global workqueue if it was not already
2956 * queued and leaves it in the same position on the kernel-global
2957 * workqueue otherwise.
2958 */
2959bool schedule_work(struct work_struct *work)
2960{
2961        return queue_work(system_wq, work);
2962}
2963EXPORT_SYMBOL(schedule_work);
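/*
 * Illustrative sketch (editorial addition): the classic bottom-half
 * pattern - atomic context kicks a work item, the heavy lifting runs
 * later in process context on the shared worker pool.  All "my_*" names
 * are hypothetical.
 */
struct my_irq_dev {
        struct work_struct work;
};

static void my_irq_work_fn(struct work_struct *work)
{
        struct my_irq_dev *dev = container_of(work, struct my_irq_dev, work);

        /* may sleep, allocate memory, take mutexes, etc. */
        (void)dev;
}

static void my_irq_dev_init(struct my_irq_dev *dev)
{
        INIT_WORK(&dev->work, my_irq_work_fn);
}

/* callable from hardirq/softirq context */
static void my_irq_dev_kick(struct my_irq_dev *dev)
{
        schedule_work(&dev->work);      /* returns %false if already pending */
}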
2964
2965/**
2966 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2967 * @cpu: cpu to use
2968 * @dwork: job to be done
2969 * @delay: number of jiffies to wait
2970 *
2971 * After waiting for a given time this puts a job in the kernel-global
2972 * workqueue on the specified CPU.
2973 */
2974bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2975                              unsigned long delay)
2976{
2977        return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2978}
2979EXPORT_SYMBOL(schedule_delayed_work_on);
2980
2981/**
2982 * schedule_delayed_work - put work task in global workqueue after delay
2983 * @dwork: job to be done
2984 * @delay: number of jiffies to wait or 0 for immediate execution
2985 *
2986 * After waiting for a given time this puts a job in the kernel-global
2987 * workqueue.
2988 */
2989bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
2990{
2991        return queue_delayed_work(system_wq, dwork, delay);
2992}
2993EXPORT_SYMBOL(schedule_delayed_work);
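/*
 * Illustrative sketch (editorial addition): a self-rearming poller on the
 * kernel-global workqueue.  Names are hypothetical; to_delayed_work() and
 * msecs_to_jiffies() are the real helpers from <linux/workqueue.h> and
 * <linux/jiffies.h>.
 */
struct my_poller {
        struct delayed_work dwork;
        bool stop;
};

static void my_poller_fn(struct work_struct *work)
{
        struct my_poller *p = container_of(to_delayed_work(work),
                                           struct my_poller, dwork);

        /* ... sample hardware, update statistics ... */

        if (!p->stop)
                schedule_delayed_work(&p->dwork, msecs_to_jiffies(1000));
}

static void my_poller_start(struct my_poller *p)
{
        p->stop = false;
        INIT_DELAYED_WORK(&p->dwork, my_poller_fn);
        schedule_delayed_work(&p->dwork, msecs_to_jiffies(1000));
}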
2994
2995/**
2996 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2997 * @func: the function to call
2998 *
2999 * schedule_on_each_cpu() executes @func on each online CPU using the
3000 * system workqueue and blocks until all CPUs have completed.
3001 * schedule_on_each_cpu() is very slow.
3002 *
3003 * RETURNS:
3004 * 0 on success, -errno on failure.
3005 */
3006int schedule_on_each_cpu(work_func_t func)
3007{
3008        int cpu;
3009        struct work_struct __percpu *works;
3010
3011        works = alloc_percpu(struct work_struct);
3012        if (!works)
3013                return -ENOMEM;
3014
3015        get_online_cpus();
3016
3017        for_each_online_cpu(cpu) {
3018                struct work_struct *work = per_cpu_ptr(works, cpu);
3019
3020                INIT_WORK(work, func);
3021                schedule_work_on(cpu, work);
3022        }
3023
3024        for_each_online_cpu(cpu)
3025                flush_work(per_cpu_ptr(works, cpu));
3026
3027        put_online_cpus();
3028        free_percpu(works);
3029        return 0;
3030}
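/*
 * Illustrative sketch (editorial addition): draining a per-CPU cache on
 * every online CPU.  "my_drain_local_cache" is hypothetical; the callback
 * runs once on each CPU and schedule_on_each_cpu() returns only after all
 * of them have finished.
 */
static void my_drain_local_cache(struct work_struct *work)
{
        /* executes on a worker bound to one particular online CPU */
}

static int my_drain_all_caches(void)
{
        return schedule_on_each_cpu(my_drain_local_cache);      /* may sleep */
}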
3031
3032/**
3033 * flush_scheduled_work - ensure that any scheduled work has run to completion.
3034 *
3035 * Forces execution of the kernel-global workqueue and blocks until its
3036 * completion.
3037 *
3038 * Think twice before calling this function!  It's very easy to get into
3039 * trouble if you don't take great care.  Either of the following situations
3040 * will lead to deadlock:
3041 *
3042 *      One of the work items currently on the workqueue needs to acquire
3043 *      a lock held by your code or its caller.
3044 *
3045 *      Your code is running in the context of a work routine.
3046 *
3047 * They will be detected by lockdep when they occur, but the first might not
3048 * occur very often.  It depends on what work items are on the workqueue and
3049 * what locks they need, which you have no control over.
3050 *
3051 * In most situations flushing the entire workqueue is overkill; you merely
3052 * need to know that a particular work item isn't queued and isn't running.
3053 * In such cases you should use cancel_delayed_work_sync() or
3054 * cancel_work_sync() instead.
3055 */
3056void flush_scheduled_work(void)
3057{
3058        flush_workqueue(system_wq);
3059}
3060EXPORT_SYMBOL(flush_scheduled_work);
3061
3062/**
3063 * execute_in_process_context - reliably execute the routine with user context
3064 * @fn:         the function to execute
3065 * @ew:         guaranteed storage for the execute work structure (must
3066 *              be available when the work executes)
3067 *
3068 * Executes the function immediately if process context is available,
3069 * otherwise schedules the function for delayed execution.
3070 *
3071 * Returns:     0 - function was executed
3072 *              1 - function was scheduled for execution
3073 */
3074int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3075{
3076        if (!in_interrupt()) {
3077                fn(&ew->work);
3078                return 0;
3079        }
3080
3081        INIT_WORK(&ew->work, fn);
3082        schedule_work(&ew->work);
3083
3084        return 1;
3085}
3086EXPORT_SYMBOL_GPL(execute_in_process_context);
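/*
 * Illustrative sketch (editorial addition): releasing an object whose last
 * reference may be dropped from interrupt context.  The execute_work
 * storage must outlive the deferred call, so it is embedded in the object.
 * "my_obj" and its helpers are hypothetical.
 */
struct my_obj {
        struct execute_work ew;
        /* ... payload ... */
};

static void my_obj_release(struct work_struct *work)
{
        struct my_obj *obj = container_of(work, struct my_obj, ew.work);

        kfree(obj);
}

static void my_obj_free(struct my_obj *obj)
{
        /* frees immediately in process context, defers from IRQ context */
        execute_in_process_context(my_obj_release, &obj->ew);
}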
3087
3088int keventd_up(void)
3089{
3090        return system_wq != NULL;
3091}
3092
3093static int alloc_pwqs(struct workqueue_struct *wq)
3094{
3095        /*
3096         * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
3097         * Make sure that the alignment isn't lower than that of
3098         * unsigned long long.
3099         */
3100        const size_t size = sizeof(struct pool_workqueue);
3101        const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3102                                   __alignof__(unsigned long long));
3103
3104        if (!(wq->flags & WQ_UNBOUND))
3105                wq->pool_wq.pcpu = __alloc_percpu(size, align);
3106        else {
3107                void *ptr;
3108
3109                /*
3110                 * Allocate enough room to align pwq and put an extra
3111                 * pointer at the end pointing back to the originally
3112                 * allocated pointer which will be used for free.
3113                 */
3114                ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
3115                if (ptr) {
3116                        wq->pool_wq.single = PTR_ALIGN(ptr, align);
3117                        *(void **)(wq->pool_wq.single + 1) = ptr;
3118                }
3119        }
3120
3121        /* just in case, make sure it's actually aligned */
3122        BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
3123        return wq->pool_wq.v ? 0 : -ENOMEM;
3124}
3125
3126static void free_pwqs(struct workqueue_struct *wq)
3127{
3128        if (!(wq->flags & WQ_UNBOUND))
3129                free_percpu(wq->pool_wq.pcpu);
3130        else if (wq->pool_wq.single) {
3131                /* the pointer to free is stored right after the pwq */
3132                kfree(*(void **)(wq->pool_wq.single + 1));
3133        }
3134}
3135
3136static int wq_clamp_max_active(int max_active, unsigned int flags,
3137                               const char *name)
3138{
3139        int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
3140
3141        if (max_active < 1 || max_active > lim)
3142                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
3143                        max_active, name, 1, lim);
3144
3145        return clamp_val(max_active, 1, lim);
3146}
3147
3148struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3149                                               unsigned int flags,
3150                                               int max_active,
3151                                               struct lock_class_key *key,
3152                                               const char *lock_name, ...)
3153{
3154        va_list args, args1;
3155        struct workqueue_struct *wq;
3156        unsigned int cpu;
3157        size_t namelen;
3158
3159        /* determine namelen, allocate wq and format name */
3160        va_start(args, lock_name);
3161        va_copy(args1, args);
3162        namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3163
3164        wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
3165        if (!wq)
3166                goto err;
3167
3168        vsnprintf(wq->name, namelen, fmt, args1);
3169        va_end(args);
3170        va_end(args1);
3171
3172        /*
3173         * Workqueues which may be used during memory reclaim should
3174         * have a rescuer to guarantee forward progress.
3175         */
3176        if (flags & WQ_MEM_RECLAIM)
3177                flags |= WQ_RESCUER;
3178
3179        max_active = max_active ?: WQ_DFL_ACTIVE;
3180        max_active = wq_clamp_max_active(max_active, flags, wq->name);
3181
3182        /* init wq */
3183        wq->flags = flags;
3184        wq->saved_max_active = max_active;
3185        mutex_init(&wq->flush_mutex);
3186        atomic_set(&wq->nr_pwqs_to_flush, 0);
3187        INIT_LIST_HEAD(&wq->flusher_queue);
3188        INIT_LIST_HEAD(&wq->flusher_overflow);
3189
3190        lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3191        INIT_LIST_HEAD(&wq->list);
3192
3193        if (alloc_pwqs(wq) < 0)
3194                goto err;
3195
3196        for_each_pwq_cpu(cpu, wq) {
3197                struct pool_workqueue *pwq = get_pwq(cpu, wq);
3198
3199                BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3200                pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3201                pwq->wq = wq;
3202                pwq->flush_color = -1;
3203                pwq->max_active = max_active;
3204                INIT_LIST_HEAD(&pwq->delayed_works);
3205        }
3206
3207        if (flags & WQ_RESCUER) {
3208                struct worker *rescuer;
3209
3210                if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
3211                        goto err;
3212
3213                wq->rescuer = rescuer = alloc_worker();
3214                if (!rescuer)
3215                        goto err;
3216
3217                rescuer->rescue_wq = wq;
3218                rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3219                                               wq->name);
3220                if (IS_ERR(rescuer->task))
3221                        goto err;
3222
3223                rescuer->task->flags |= PF_THREAD_BOUND;
3224                wake_up_process(rescuer->task);
3225        }
3226
3227        /*
3228         * workqueue_lock protects global freeze state and workqueues
3229         * list.  Grab it, set max_active accordingly and add the new
3230         * workqueue to workqueues list.
3231         */
3232        spin_lock(&workqueue_lock);
3233
3234        if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3235                for_each_pwq_cpu(cpu, wq)
3236                        get_pwq(cpu, wq)->max_active = 0;
3237
3238        list_add(&wq->list, &workqueues);
3239
3240        spin_unlock(&workqueue_lock);
3241
3242        return wq;
3243err:
3244        if (wq) {
3245                free_pwqs(wq);
3246                free_mayday_mask(wq->mayday_mask);
3247                kfree(wq->rescuer);
3248                kfree(wq);
3249        }
3250        return NULL;
3251}
3252EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
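/*
 * Illustrative sketch (editorial addition): callers normally reach this
 * function through the alloc_workqueue() wrapper.  A driver sitting on
 * the memory-reclaim path asks for a rescuer with WQ_MEM_RECLAIM; the
 * name "my_reclaim_wq" and max_active of 1 are hypothetical choices.
 */
static struct workqueue_struct *my_reclaim_wq;

static int __init my_reclaim_wq_init(void)
{
        my_reclaim_wq = alloc_workqueue("my_reclaim_wq", WQ_MEM_RECLAIM, 1);
        if (!my_reclaim_wq)
                return -ENOMEM;
        return 0;
}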
3253
3254/**
3255 * destroy_workqueue - safely terminate a workqueue
3256 * @wq: target workqueue
3257 *
3258 * Safely destroy a workqueue. All work currently pending will be done first.
3259 */
3260void destroy_workqueue(struct workqueue_struct *wq)
3261{
3262        unsigned int cpu;
3263
3264        /* drain it before proceeding with destruction */
3265        drain_workqueue(wq);
3266
3267        /*
3268         * wq list is used to freeze wq, remove from list after
3269         * flushing is complete in case freeze races us.
3270         */
3271        spin_lock(&workqueue_lock);
3272        list_del(&wq->list);
3273        spin_unlock(&workqueue_lock);
3274
3275        /* sanity check */
3276        for_each_pwq_cpu(cpu, wq) {
3277                struct pool_workqueue *pwq = get_pwq(cpu, wq);
3278                int i;
3279
3280                for (i = 0; i < WORK_NR_COLORS; i++)
3281                        BUG_ON(pwq->nr_in_flight[i]);
3282                BUG_ON(pwq->nr_active);
3283                BUG_ON(!list_empty(&pwq->delayed_works));
3284        }
3285
3286        if (wq->flags & WQ_RESCUER) {
3287                kthread_stop(wq->rescuer->task);
3288                free_mayday_mask(wq->mayday_mask);
3289                kfree(wq->rescuer);
3290        }
3291
3292        free_pwqs(wq);
3293        kfree(wq);
3294}
3295EXPORT_SYMBOL_GPL(destroy_workqueue);
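/*
 * Illustrative sketch (editorial addition): teardown order matters - stop
 * queueing new items, cancel or flush what is outstanding, then destroy
 * the workqueue.  All "my_drv_*" names are hypothetical.
 */
static struct workqueue_struct *my_drv_wq;

static void my_drv_work_fn(struct work_struct *work)
{
        /* ... */
}

static DECLARE_WORK(my_drv_work, my_drv_work_fn);

static void my_drv_exit(void)
{
        cancel_work_sync(&my_drv_work); /* neither pending nor running now */
        destroy_workqueue(my_drv_wq);   /* drains remaining items and frees */
}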
3296
3297/**
3298 * pwq_set_max_active - adjust max_active of a pwq
3299 * @pwq: target pool_workqueue
3300 * @max_active: new max_active value.
3301 *
3302 * Set @pwq->max_active to @max_active and activate delayed works if
3303 * increased.
3304 *
3305 * CONTEXT:
3306 * spin_lock_irq(pool->lock).
3307 */
3308static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3309{
3310        pwq->max_active = max_active;
3311
3312        while (!list_empty(&pwq->delayed_works) &&
3313               pwq->nr_active < pwq->max_active)
3314                pwq_activate_first_delayed(pwq);
3315}
3316
3317/**
3318 * workqueue_set_max_active - adjust max_active of a workqueue
3319 * @wq: target workqueue
3320 * @max_active: new max_active value.
3321 *
3322 * Set max_active of @wq to @max_active.
3323 *
3324 * CONTEXT:
3325 * Don't call from IRQ context.
3326 */
3327void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3328{
3329        unsigned int cpu;
3330
3331        max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3332
3333        spin_lock(&workqueue_lock);
3334
3335        wq->saved_max_active = max_active;
3336
3337        for_each_pwq_cpu(cpu, wq) {
3338                struct pool_workqueue *pwq = get_pwq(cpu, wq);
3339                struct worker_pool *pool = pwq->pool;
3340
3341                spin_lock_irq(&pool->lock);
3342
3343                if (!(wq->flags & WQ_FREEZABLE) ||
3344                    !(pool->flags & POOL_FREEZING))
3345                        pwq_set_max_active(pwq, max_active);
3346
3347                spin_unlock_irq(&pool->lock);
3348        }
3349
3350        spin_unlock(&workqueue_lock);
3351}
3352EXPORT_SYMBOL_GPL(workqueue_set_max_active);
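/*
 * Illustrative sketch (editorial addition): adjusting concurrency at
 * runtime, e.g. from a hypothetical sysfs store handler.  Out-of-range
 * values are clamped (with a warning) by wq_clamp_max_active() above.
 */
static void my_set_wq_concurrency(struct workqueue_struct *wq, int n)
{
        workqueue_set_max_active(wq, n);
}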
3353
3354/**
3355 * workqueue_congested - test whether a workqueue is congested
3356 * @cpu: CPU in question
3357 * @wq: target workqueue
3358 *
3359 * Test whether @wq's cpu workqueue for @cpu is congested.  There is
3360 * no synchronization around this function and the test result is
3361 * unreliable and only useful as advisory hints or for debugging.
3362 *
3363 * RETURNS:
3364 * %true if congested, %false otherwise.
3365 */
3366bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3367{
3368        struct pool_workqueue *pwq = get_pwq(cpu, wq);
3369
3370        return !list_empty(&pwq->delayed_works);
3371}
3372EXPORT_SYMBOL_GPL(workqueue_congested);
3373
3374/**
3375 * work_busy - test whether a work is currently pending or running
3376 * @work: the work to be tested
3377 *
3378 * Test whether @work is currently pending or running.  There is no
3379 * synchronization around this function and the test result is
3380 * unreliable and only useful as advisory hints or for debugging.
3381 *
3382 * RETURNS:
3383 * OR'd bitmask of WORK_BUSY_* bits.
3384 */
3385unsigned int work_busy(struct work_struct *work)
3386{
3387        struct worker_pool *pool = get_work_pool(work);
3388        unsigned long flags;
3389        unsigned int ret = 0;
3390
3391        if (work_pending(work))
3392                ret |= WORK_BUSY_PENDING;
3393
3394        if (pool) {
3395                spin_lock_irqsave(&pool->lock, flags);
3396                if (find_worker_executing_work(pool, work))
3397                        ret |= WORK_BUSY_RUNNING;
3398                spin_unlock_irqrestore(&pool->lock, flags);
3399        }
3400
3401        return ret;
3402}
3403EXPORT_SYMBOL_GPL(work_busy);
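/*
 * Illustrative sketch (editorial addition): work_busy() is advisory only,
 * so it belongs in debug output, never in synchronization decisions.
 * "my_debug_report" is hypothetical.
 */
static void my_debug_report(struct work_struct *work)
{
        unsigned int busy = work_busy(work);

        pr_info("work %p:%s%s\n", work,
                (busy & WORK_BUSY_PENDING) ? " PENDING" : "",
                (busy & WORK_BUSY_RUNNING) ? " RUNNING" : "");
}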
3404
3405/*
3406 * CPU hotplug.
3407 *
3408 * There are two challenges in supporting CPU hotplug.  Firstly, there
3409 * are a lot of assumptions on strong associations among work, pwq and
3410 * pool which make migrating pending and scheduled works very
3411 * difficult to implement without impacting hot paths.  Secondly,
3412 * worker pools serve a mix of short, long and very long running works, making
3413 * blocked draining impractical.
3414 *
3415 * This is solved by allowing the pools to be disassociated from the CPU,
3416 * running as unbound ones, and allowing them to be reattached later if the
3417 * cpu comes back online.
3418 */
3419
3420static void wq_unbind_fn(struct work_struct *work)
3421{
3422        int cpu = smp_processor_id();
3423        struct worker_pool *pool;
3424        struct worker *worker;
3425        int i;
3426
3427        for_each_std_worker_pool(pool, cpu) {
3428                BUG_ON(cpu != smp_processor_id());
3429
3430                mutex_lock(&pool->assoc_mutex);
3431                spin_lock_irq(&pool->lock);
3432
3433                /*
3434                 * We've claimed all manager positions.  Make all workers
3435                 * unbound and set DISASSOCIATED.  Before this, all workers
3436                 * except for the ones which are still executing works from
3437                 * before the last CPU down must be on the cpu.  After
3438                 * this, they may become diasporas.
3439                 */
3440                list_for_each_entry(worker, &pool->idle_list, entry)
3441                        worker->flags |= WORKER_UNBOUND;
3442
3443                for_each_busy_worker(worker, i, pool)
3444                        worker->flags |= WORKER_UNBOUND;
3445
3446                pool->flags |= POOL_DISASSOCIATED;
3447
3448                spin_unlock_irq(&pool->lock);
3449                mutex_unlock(&pool->assoc_mutex);
3450
3451                /*
3452                 * Call schedule() so that we cross rq->lock and thus can
3453                 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
3454                 * This is necessary as scheduler callbacks may be invoked
3455                 * from other cpus.
3456                 */
3457                schedule();
3458
3459                /*
3460                 * Sched callbacks are disabled now.  Zap nr_running.
3461                 * After this, nr_running stays zero and need_more_worker()
3462                 * and keep_working() are always true as long as the
3463                 * worklist is not empty.  This pool now behaves as an
3464                 * unbound (in terms of concurrency management) pool which
3465                 * is served by workers tied to the pool.
3466                 */
3467                atomic_set(&pool->nr_running, 0);
3468
3469                /*
3470                 * With concurrency management just turned off, a busy
3471                 * worker blocking could lead to lengthy stalls.  Kick off
3472                 * unbound chain execution of currently pending work items.
3473                 */
3474                spin_lock_irq(&pool->lock);
3475                wake_up_worker(pool);
3476                spin_unlock_irq(&pool->lock);
3477        }
3478}
3479
3480/*
3481 * Workqueues should be brought up before normal priority CPU notifiers.
3482 * This will be registered high priority CPU notifier.
3483 */
3484static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3485                                               unsigned long action,
3486                                               void *hcpu)
3487{
3488        unsigned int cpu = (unsigned long)hcpu;
3489        struct worker_pool *pool;
3490
3491        switch (action & ~CPU_TASKS_FROZEN) {
3492        case CPU_UP_PREPARE:
3493                for_each_std_worker_pool(pool, cpu) {
3494                        struct worker *worker;
3495
3496                        if (pool->nr_workers)
3497                                continue;
3498
3499                        worker = create_worker(pool);
3500                        if (!worker)
3501                                return NOTIFY_BAD;
3502
3503                        spin_lock_irq(&pool->lock);
3504                        start_worker(worker);
3505                        spin_unlock_irq(&pool->lock);
3506                }
3507                break;
3508
3509        case CPU_DOWN_FAILED:
3510        case CPU_ONLINE:
3511                for_each_std_worker_pool(pool, cpu) {
3512                        mutex_lock(&pool->assoc_mutex);
3513                        spin_lock_irq(&pool->lock);
3514
3515                        pool->flags &= ~POOL_DISASSOCIATED;
3516                        rebind_workers(pool);
3517
3518                        spin_unlock_irq(&pool->lock);
3519                        mutex_unlock(&pool->assoc_mutex);
3520                }
3521                break;
3522        }
3523        return NOTIFY_OK;
3524}
3525
3526/*
3527 * Workqueues should be brought down after normal priority CPU notifiers.
3528 * This will be registered as low priority CPU notifier.
3529 */
3530static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3531                                                 unsigned long action,
3532                                                 void *hcpu)
3533{
3534        unsigned int cpu = (unsigned long)hcpu;
3535        struct work_struct unbind_work;
3536
3537        switch (action & ~CPU_TASKS_FROZEN) {
3538        case CPU_DOWN_PREPARE:
3539                /* unbinding should happen on the local CPU */
3540                INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3541                queue_work_on(cpu, system_highpri_wq, &unbind_work);
3542                flush_work(&unbind_work);
3543                break;
3544        }
3545        return NOTIFY_OK;
3546}
3547
3548#ifdef CONFIG_SMP
3549
3550struct work_for_cpu {
3551        struct work_struct work;
3552        long (*fn)(void *);
3553        void *arg;
3554        long ret;
3555};
3556
3557static void work_for_cpu_fn(struct work_struct *work)
3558{
3559        struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
3560
3561        wfc->ret = wfc->fn(wfc->arg);
3562}
3563
3564/**
3565 * work_on_cpu - run a function in user context on a particular cpu
3566 * @cpu: the cpu to run on
3567 * @fn: the function to run
3568 * @arg: the function arg
3569 *
3570 * This will return the value @fn returns.
3571 * It is up to the caller to ensure that the cpu doesn't go offline.
3572 * The caller must not hold any locks which would prevent @fn from completing.
3573 */
3574long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
3575{
3576        struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3577
3578        INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
3579        schedule_work_on(cpu, &wfc.work);
3580        flush_work(&wfc.work);
3581        return wfc.ret;
3582}
3583EXPORT_SYMBOL_GPL(work_on_cpu);
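/*
 * Illustrative sketch (editorial addition): reading a per-CPU resource
 * that must be accessed from the target CPU, with the caller pinning the
 * CPU online around the call.  "my_read_counter" is hypothetical.
 */
static long my_read_counter(void *arg)
{
        /* executes on the requested CPU, in process context */
        return 0;
}

static long my_read_counter_on(unsigned int cpu)
{
        long ret;

        get_online_cpus();              /* keep @cpu from going offline */
        ret = work_on_cpu(cpu, my_read_counter, NULL);
        put_online_cpus();
        return ret;
}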
3584#endif /* CONFIG_SMP */
3585
3586#ifdef CONFIG_FREEZER
3587
3588/**
3589 * freeze_workqueues_begin - begin freezing workqueues
3590 *
3591 * Start freezing workqueues.  After this function returns, all freezable
3592 * workqueues will queue new works to their frozen_works list instead of
3593 * pool->worklist.
3594 *
3595 * CONTEXT:
3596 * Grabs and releases workqueue_lock and pool->lock's.
3597 */
3598void freeze_workqueues_begin(void)
3599{
3600        unsigned int cpu;
3601
3602        spin_lock(&workqueue_lock);
3603
3604        BUG_ON(workqueue_freezing);
3605        workqueue_freezing = true;
3606
3607        for_each_wq_cpu(cpu) {
3608                struct worker_pool *pool;
3609                struct workqueue_struct *wq;
3610
3611                for_each_std_worker_pool(pool, cpu) {
3612                        spin_lock_irq(&pool->lock);
3613
3614                        WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3615                        pool->flags |= POOL_FREEZING;
3616
3617                        list_for_each_entry(wq, &workqueues, list) {
3618                                struct pool_workqueue *pwq = get_pwq(cpu, wq);
3619
3620                                if (pwq && pwq->pool == pool &&
3621                                    (wq->flags & WQ_FREEZABLE))
3622                                        pwq->max_active = 0;
3623                        }
3624
3625                        spin_unlock_irq(&pool->lock);
3626                }
3627        }
3628
3629        spin_unlock(&workqueue_lock);
3630}
3631
3632/**
3633 * freeze_workqueues_busy - are freezable workqueues still busy?
3634 *
3635 * Check whether freezing is complete.  This function must be called
3636 * between freeze_workqueues_begin() and thaw_workqueues().
3637 *
3638 * CONTEXT:
3639 * Grabs and releases workqueue_lock.
3640 *
3641 * RETURNS:
3642 * %true if some freezable workqueues are still busy.  %false if freezing
3643 * is complete.
3644 */
3645bool freeze_workqueues_busy(void)
3646{
3647        unsigned int cpu;
3648        bool busy = false;
3649
3650        spin_lock(&workqueue_lock);
3651
3652        BUG_ON(!workqueue_freezing);
3653
3654        for_each_wq_cpu(cpu) {
3655                struct workqueue_struct *wq;
3656                /*
3657                 * nr_active is monotonically decreasing.  It's safe
3658                 * to peek without lock.
3659                 */
3660                list_for_each_entry(wq, &workqueues, list) {
3661                        struct pool_workqueue *pwq = get_pwq(cpu, wq);
3662
3663                        if (!pwq || !(wq->flags & WQ_FREEZABLE))
3664                                continue;
3665
3666                        BUG_ON(pwq->nr_active < 0);
3667                        if (pwq->nr_active) {
3668                                busy = true;
3669                                goto out_unlock;
3670                        }
3671                }
3672        }
3673out_unlock:
3674        spin_unlock(&workqueue_lock);
3675        return busy;
3676}
3677
3678/**
3679 * thaw_workqueues - thaw workqueues
3680 *
3681 * Thaw workqueues.  Normal queueing is restored and all collected
3682 * frozen works are transferred to their respective pool worklists.
3683 *
3684 * CONTEXT:
3685 * Grabs and releases workqueue_lock and pool->lock's.
3686 */
3687void thaw_workqueues(void)
3688{
3689        unsigned int cpu;
3690
3691        spin_lock(&workqueue_lock);
3692
3693        if (!workqueue_freezing)
3694                goto out_unlock;
3695
3696        for_each_wq_cpu(cpu) {
3697                struct worker_pool *pool;
3698                struct workqueue_struct *wq;
3699
3700                for_each_std_worker_pool(pool, cpu) {
3701                        spin_lock_irq(&pool->lock);
3702
3703                        WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
3704                        pool->flags &= ~POOL_FREEZING;
3705
3706                        list_for_each_entry(wq, &workqueues, list) {
3707                                struct pool_workqueue *pwq = get_pwq(cpu, wq);
3708
3709                                if (!pwq || pwq->pool != pool ||
3710                                    !(wq->flags & WQ_FREEZABLE))
3711                                        continue;
3712
3713                                /* restore max_active and repopulate worklist */
3714                                pwq_set_max_active(pwq, wq->saved_max_active);
3715                        }
3716
3717                        wake_up_worker(pool);
3718
3719                        spin_unlock_irq(&pool->lock);
3720                }
3721        }
3722
3723        workqueue_freezing = false;
3724out_unlock:
3725        spin_unlock(&workqueue_lock);
3726}
3727#endif /* CONFIG_FREEZER */
3728
3729static int __init init_workqueues(void)
3730{
3731        unsigned int cpu;
3732
3733        /* make sure we have enough bits for OFFQ pool ID */
3734        BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3735                     WORK_CPU_END * NR_STD_WORKER_POOLS);
3736
3737        cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3738        hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3739
3740        /* initialize CPU pools */
3741        for_each_wq_cpu(cpu) {
3742                struct worker_pool *pool;
3743
3744                for_each_std_worker_pool(pool, cpu) {
3745                        spin_lock_init(&pool->lock);
3746                        pool->cpu = cpu;
3747                        pool->flags |= POOL_DISASSOCIATED;
3748                        INIT_LIST_HEAD(&pool->worklist);
3749                        INIT_LIST_HEAD(&pool->idle_list);
3750                        hash_init(pool->busy_hash);
3751
3752                        init_timer_deferrable(&pool->idle_timer);
3753                        pool->idle_timer.function = idle_worker_timeout;
3754                        pool->idle_timer.data = (unsigned long)pool;
3755
3756                        setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3757                                    (unsigned long)pool);
3758
3759                        mutex_init(&pool->assoc_mutex);
3760                        ida_init(&pool->worker_ida);
3761
3762                        /* alloc pool ID */
3763                        BUG_ON(worker_pool_assign_id(pool));
3764                }
3765        }
3766
3767        /* create the initial worker */
3768        for_each_online_wq_cpu(cpu) {
3769                struct worker_pool *pool;
3770
3771                for_each_std_worker_pool(pool, cpu) {
3772                        struct worker *worker;
3773
3774                        if (cpu != WORK_CPU_UNBOUND)
3775                                pool->flags &= ~POOL_DISASSOCIATED;
3776
3777                        worker = create_worker(pool);
3778                        BUG_ON(!worker);
3779                        spin_lock_irq(&pool->lock);
3780                        start_worker(worker);
3781                        spin_unlock_irq(&pool->lock);
3782                }
3783        }
3784
3785        system_wq = alloc_workqueue("events", 0, 0);
3786        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
3787        system_long_wq = alloc_workqueue("events_long", 0, 0);
3788        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3789                                            WQ_UNBOUND_MAX_ACTIVE);
3790        system_freezable_wq = alloc_workqueue("events_freezable",
3791                                              WQ_FREEZABLE, 0);
3792        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
3793               !system_unbound_wq || !system_freezable_wq);
3794        return 0;
3795}
3796early_initcall(init_workqueues);
3797