/*-
* Copyright (c) 2014, 2018 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell and Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Thread pools.
*
 * A thread pool is a collection of worker threads, idle or running
 * jobs, together with a dispatcher thread that does not run jobs but
 * can be given jobs to assign to a worker thread. Unlike
 * kthread_create, scheduling a job in a thread pool does not allocate
 * or even sleep at all, except perhaps on an adaptive lock. Jobs
 * reuse threads, so they do not incur the expense of creating and
 * destroying kthreads unless there is so little work that the idle
 * threads time out and exit.
*
 * A per-CPU thread pool (threadpool_percpu) is a collection of thread
 * pools, one for each CPU and bound to that CPU. For each priority
 * level in use, there is one shared unbound thread pool (i.e., a pool
 * of threads not bound to any CPU) and one shared per-CPU thread pool.
*
* To use the unbound thread pool at priority pri, call
* threadpool_get(&pool, pri). When you're done, call
* threadpool_put(pool, pri).
*
* To use the per-CPU thread pools at priority pri, call
* threadpool_percpu_get(&pool_percpu, pri), and then use the thread
* pool returned by threadpool_percpu_ref(pool_percpu) for the current
* CPU, or by threadpool_percpu_ref_remote(pool_percpu, ci) for another
* CPU. When you're done, call threadpool_percpu_put(pool_percpu,
* pri).
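 *
 * A rough usage sketch (illustrative only: example_job, example_lock,
 * example_job_fn, and the error/pri variables are hypothetical names,
 * and the authoritative locking rules are in threadpool(9)):
 *
 *        static kmutex_t example_lock;
 *        static struct threadpool_job example_job;
 *
 *        static void
 *        example_job_fn(struct threadpool_job *job)
 *        {
 *                ...do the work...
 *                mutex_enter(&example_lock);
 *                threadpool_job_done(job);
 *                mutex_exit(&example_lock);
 *        }
 *
 *        struct threadpool *pool;
 *        error = threadpool_get(&pool, pri);
 *        if (error)
 *                return error;
 *        mutex_init(&example_lock, MUTEX_DEFAULT, IPL_NONE);
 *        threadpool_job_init(&example_job, example_job_fn, &example_lock,
 *            "example");
 *
 *        mutex_enter(&example_lock);
 *        threadpool_schedule_job(pool, &example_job);
 *        mutex_exit(&example_lock);
 *
 *        ...
 *
 *        mutex_enter(&example_lock);
 *        threadpool_cancel_job(pool, &example_job);
 *        mutex_exit(&example_lock);
 *        threadpool_job_destroy(&example_job);
 *        threadpool_put(pool, pri);
 *
 * The per-CPU pools are used the same way, except that the pool itself
 * comes from threadpool_percpu_get() followed by threadpool_percpu_ref()
 * or threadpool_percpu_ref_remote(), and is released with
 * threadpool_percpu_put().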
*
* +--MACHINE-----------------------------------------------------+
* | +--CPU 0---------+ +--CPU 1---------+ +--CPU n---------+ |
* | | <dispatcher 0> | | <dispatcher 1> | ... | <dispatcher n> | |
* | | <idle 0a> | | <running 1a> | ... | <idle na> | |
* | | <running 0b> | | <running 1b> | ... | <idle nb> | |
* | | . | | . | ... | . | |
* | | . | | . | ... | . | |
* | | . | | . | ... | . | |
* | +----------------+ +----------------+ +----------------+ |
* | +--unbound-----------+ |
* | | <dispatcher n+1> | |
* | | <idle (n+1)a> | |
* | | <running (n+1)b> | |
* | +--------------------+ |
* +--------------------------------------------------------------+
*
* XXX Why one dispatcher per CPU? I did that originally to avoid
* touching remote CPUs' memory when scheduling a job, but that still
* requires interprocessor synchronization. Perhaps we could get by
* with a single dispatcher thread, at the expense of another pointer
* in struct threadpool_job to identify the CPU on which it must run in
* order for the dispatcher to schedule it correctly.
*/
/* Mark the pool dying and wait for threads to commit suicide. */
mutex_spin_enter(&pool->tp_lock);
KASSERT(TAILQ_EMPTY(&pool->tp_jobs));
pool->tp_flags |= THREADPOOL_DYING;
cv_broadcast(&pool->tp_dispatcher.tpt_cv);
TAILQ_FOREACH(thread, &pool->tp_idle_threads, tpt_entry)
cv_broadcast(&thread->tpt_cv);
while (0 < pool->tp_refcnt) {
SDT_PROBE2(sdt, kernel, threadpool, destroy__wait,
pool, pool->tp_refcnt);
cv_wait(&pool->tp_dispatcher.tpt_cv, &pool->tp_lock);
}
mutex_spin_exit(&pool->tp_lock);
/*
* As long as xcalls are blocked -- e.g., by kpreempt_disable
* -- the percpu object will not be swapped and destroyed. We
* can't write to it, because the data may have already been
* moved to a new buffer, but we can safely read from it.
*/
kpreempt_disable();
poolp = percpu_getptr_remote(pool_percpu->tpp_percpu, ci);
pool = *poolp;
kpreempt_enable();
return pool;
}
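/*
 * For the current CPU, the same guarantee can be had more simply with
 * percpu_getref()/percpu_putref(), which disable and re-enable
 * preemption around the access. A rough sketch of such a reader
 * (assuming the same tpp_percpu layout as above):
 *
 *        poolp = percpu_getref(pool_percpu->tpp_percpu);
 *        pool = *poolp;
 *        percpu_putref(pool_percpu->tpp_percpu);
 */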
static int
threadpool_percpu_create(struct threadpool_percpu **pool_percpup, pri_t pri)
{
struct threadpool_percpu *pool_percpu;
bool ok = true;
/*
* Verify that all of the CPUs were initialized.
*
* XXX What to do if we add CPU hotplug?
*/
percpu_foreach(pool_percpu->tpp_percpu, &threadpool_percpu_ok, &ok);
if (!ok)
goto fail;
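/*
 * A minimal sketch of what the threadpool_percpu_ok() callback used
 * above might look like (assumed reconstruction, not necessarily the
 * exact implementation): clear the flag if any CPU's pool pointer was
 * never filled in.
 *
 *        static void
 *        threadpool_percpu_ok(void *vpoolp, void *vokp, struct cpu_info *ci)
 *        {
 *                struct threadpool **const poolp = vpoolp;
 *                bool *const okp = vokp;
 *
 *                if (*poolp == NULL)
 *                        *okp = false;
 *        }
 */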
/*
* We can safely read this field; it's only modified right before
* we call the job work function, and we are only preserving it
* to use here; no one cares if it contains junk afterward.
*/
lwp_lock(curlwp);
curlwp->l_name = job->job_thread->tpt_lwp_savedname;
lwp_unlock(curlwp);
/*
* Inline the work of threadpool_job_rele(); the job is already
* locked, the most likely scenario (XXXJRT only scenario?) is
* that we're dropping the last reference (the one taken in
* threadpool_schedule_job()), and we always do the cv_broadcast()
* anyway.
*/
KASSERT(0 < atomic_load_relaxed(&job->job_refcnt));
unsigned int refcnt __diagused = atomic_dec_uint_nv(&job->job_refcnt);
KASSERT(refcnt != UINT_MAX);
cv_broadcast(&job->job_cv);
job->job_thread = NULL;
}
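/*
 * For reference, the non-inlined threadpool_job_rele() presumably does
 * the equivalent under job->job_lock, broadcasting only when the last
 * reference is dropped (sketch, not the verbatim code):
 *
 *        KASSERT(mutex_owned(job->job_lock));
 *        refcnt = atomic_dec_uint_nv(&job->job_refcnt);
 *        KASSERT(refcnt != UINT_MAX);
 *        if (refcnt == 0)
 *                cv_broadcast(&job->job_cv);
 */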
SDT_PROBE2(sdt, kernel, threadpool, schedule__job, pool, job);
/*
* If the job's already running, let it keep running. The job
* is guaranteed by the interlock not to end early -- if it had
* ended early, threadpool_job_done would have set job_thread
* to NULL under the interlock.
*/
if (__predict_true(job->job_thread != NULL)) {
SDT_PROBE2(sdt, kernel, threadpool, schedule__job__running,
pool, job);
return;
}
threadpool_job_hold(job);
/* Otherwise, try to assign a thread to the job. */
mutex_spin_enter(&pool->tp_lock);
if (__predict_false(TAILQ_EMPTY(&pool->tp_idle_threads))) {
/* Nobody's idle. Give it to the dispatcher. */
SDT_PROBE2(sdt, kernel, threadpool, schedule__job__dispatcher,
pool, job);
job->job_thread = &pool->tp_dispatcher;
TAILQ_INSERT_TAIL(&pool->tp_jobs, job, job_entry);
} else {
/* Assign it to the first idle thread. */
job->job_thread = TAILQ_FIRST(&pool->tp_idle_threads);
SDT_PROBE3(sdt, kernel, threadpool, schedule__job__thread,
pool, job, job->job_thread->tpt_lwp);
TAILQ_REMOVE(&pool->tp_idle_threads, job->job_thread,
tpt_entry);
job->job_thread->tpt_job = job;
}
/* Notify whomever we gave it to, dispatcher or idle thread. */
KASSERT(job->job_thread != NULL);
cv_broadcast(&job->job_thread->tpt_cv);
mutex_spin_exit(&pool->tp_lock);
}
/*
* XXXJRT This fails (albeit safely) when all of the following
* are true:
*
* => "pool" is something other than what the job was
* scheduled on. This can legitimately occur if,
* for example, a job is percpu-scheduled on CPU0
* and then CPU1 attempts to cancel it without taking
 * a remote pool reference (which might happen by
 * "luck of the draw").
*
* => "job" is not yet running, but is assigned to the
* dispatcher.
*
 * When this happens, this code concludes that the job is
 * already running. The failure mode is that the caller is
 * told the job is running and thus has to wait. The
 * dispatcher will eventually get to it and the job will
 * proceed as if it had already been running.
*/
if (job->job_thread == NULL) {
/* Nothing to do. Guaranteed not running. */
return true;
} else if (job->job_thread == &pool->tp_dispatcher) {
/* Take it off the list to guarantee it won't run. */
job->job_thread = NULL;
mutex_spin_enter(&pool->tp_lock);
TAILQ_REMOVE(&pool->tp_jobs, job, job_entry);
mutex_spin_exit(&pool->tp_lock);
threadpool_job_rele(job);
return true;
} else {
/* Too late -- already running. */
return false;
}
}
/*
* We may sleep here, but we can't ASSERT_SLEEPABLE() because
* the job lock (used to interlock the cv_wait()) may in fact
* legitimately be a spin lock, so the assertion would fire
* as a false-positive.
*/
KASSERT(mutex_owned(job->job_lock));
if (threadpool_cancel_job_async(pool, job))
return;
/* Already running. Wait for it to complete. */
while (job->job_thread != NULL)
cv_wait(&job->job_cv, job->job_lock);
}
/* Thread pool dispatcher thread */
static void __dead
threadpool_dispatcher_thread(void *arg)
{
struct threadpool_thread *const dispatcher = arg;
struct threadpool *const pool = dispatcher->tpt_pool;
struct lwp *lwp = NULL;
int ktflags;
char suffix[16];
int error;
for (;;) {
/* Wait until there's a job. */
while (TAILQ_EMPTY(&pool->tp_jobs)) {
if (ISSET(pool->tp_flags, THREADPOOL_DYING)) {
SDT_PROBE1(sdt, kernel, threadpool,
dispatcher__dying, pool);
break;
}
cv_wait(&dispatcher->tpt_cv, &pool->tp_lock);
}
if (__predict_false(TAILQ_EMPTY(&pool->tp_jobs)))
break;
/* If there are no threads, we'll have to try to start one. */
if (TAILQ_EMPTY(&pool->tp_idle_threads)) {
SDT_PROBE1(sdt, kernel, threadpool, dispatcher__spawn,
pool);
threadpool_hold(pool);
mutex_spin_exit(&pool->tp_lock);
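/*
 * With the pool lock dropped, a new struct threadpool_thread is
 * presumably allocated from threadpool_thread_pc and a worker
 * kthread is created for it, leaving the new thread in "thread",
 * its LWP in "lwp", and the kthread_create() result in "error",
 * before the pool lock is retaken below.
 */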
mutex_spin_enter(&pool->tp_lock);
if (error) {
pool_cache_put(threadpool_thread_pc, thread);
threadpool_rele(pool);
/* XXX What to do to wait for memory? */
(void)kpause("thrdplcr", false, hz,
&pool->tp_lock);
continue;
}
/*
* New kthread now owns the reference to the pool
* taken above.
*/
KASSERT(lwp != NULL);
TAILQ_INSERT_TAIL(&pool->tp_idle_threads, thread,
tpt_entry);
thread->tpt_lwp = lwp;
lwp = NULL;
cv_broadcast(&thread->tpt_cv);
continue;
}
/* There are idle threads, so try giving one a job. */
struct threadpool_job *const job = TAILQ_FIRST(&pool->tp_jobs);
/*
* Take an extra reference on the job temporarily so that
* it won't disappear on us while we have both locks dropped.
*/
threadpool_job_hold(job);
mutex_spin_exit(&pool->tp_lock);
mutex_enter(job->job_lock);
/* If the job was cancelled, we'll no longer be its thread. */
if (__predict_true(job->job_thread == dispatcher)) {
mutex_spin_enter(&pool->tp_lock);
TAILQ_REMOVE(&pool->tp_jobs, job, job_entry);
if (__predict_false(
TAILQ_EMPTY(&pool->tp_idle_threads))) {
/*
* Someone else snagged the thread
* first. We'll have to try again.
*/
SDT_PROBE2(sdt, kernel, threadpool,
dispatcher__race, pool, job);
TAILQ_INSERT_HEAD(&pool->tp_jobs, job,
job_entry);
} else {
/*
* Assign the job to the thread and
* wake the thread so it starts work.
*/
struct threadpool_thread *const thread =
TAILQ_FIRST(&pool->tp_idle_threads);
/* Wait until we're initialized and on the queue. */
mutex_spin_enter(&pool->tp_lock);
while (thread->tpt_lwp == NULL)
cv_wait(&thread->tpt_cv, &pool->tp_lock);
/* Set our lwp name to reflect what job we're doing. */
lwp_lock(curlwp);
char *const lwp_name __diagused = curlwp->l_name;
thread->tpt_lwp_savedname = curlwp->l_name;
curlwp->l_name = job->job_name;
lwp_unlock(curlwp);
mutex_spin_exit(&pool->tp_lock);
SDT_PROBE2(sdt, kernel, threadpool, thread__job, pool, job);
/* Run the job. */
(*job->job_fn)(job);
/* lwp name restored in threadpool_job_done(). */
KASSERTMSG((curlwp->l_name == lwp_name),
"someone forgot to call threadpool_job_done()!");
/*
 * We can compare pointers, but we can no longer dereference
* job after this because threadpool_job_done() drops the
* last reference on the job while the job is locked.
*/