/* $NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $ */
/*
* Copyright (c) 2010, 2011 Antti Kantee. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* needed in slowpath */
struct rumpuser_mtx *rcpu_mtx;
struct rumpuser_cv *rcpu_cv;
int rcpu_wanted;
/* offset 20 (P=4) or 36 (P=8) here */
/*
* Some stats. Not really that necessary, but we should
* have room. Note that these overflow quite fast, so need
* to be collected often.
*/
unsigned int rcpu_fastpath;
unsigned int rcpu_slowpath;
unsigned int rcpu_migrated;
/* offset 32 (P=4) or 48 (P=8) */
int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];
/*
* Keep some stats.
*
* Keeping track of these is not really critical for speed, unless
* stats happen to be on a different cache line (CACHE_LINE_SIZE is
* really just a coarse estimate), so default for the performant case
* (i.e. no stats).
*/
/*
 * Scheduler statistics hooks.
 *
 * Each macro expands to exactly one void expression, so the call site
 * supplies the terminating semicolon and the macros can be used as the
 * sole statement of an unbraced if/else.  The argument is parenthesized
 * so that any pointer-valued expression may be passed.
 *
 * Without RUMPSCHED_STATS the macros do nothing and do not evaluate
 * their argument.
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu)	((void)(rcpu)->rcpu_fastpath++)
#define SCHED_SLOWPATH(rcpu)	((void)(rcpu)->rcpu_slowpath++)
#define SCHED_MIGRATED(rcpu)	((void)(rcpu)->rcpu_migrated++)
#else
#define SCHED_FASTPATH(rcpu)	((void)0)
#define SCHED_SLOWPATH(rcpu)	((void)0)
#define SCHED_MIGRATED(rcpu)	((void)0)
#endif
rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
rumpuser_cv_init(&lwp0cv);
for (i = 0; i < numcpu; i++) {
if (i == 0) {
ci = &rump_bootcpu;
} else {
ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
ci->ci_index = i;
}
/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context,
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is
 * a 1:1 mapping between the lwp and rump kernel cpu.
 *
 * Takes no arguments and returns nothing.
 */
void
rump_schedule(void)
{
	struct lwp *l;

	if (__predict_true((l = curlwp) != NULL)) {
		struct proc *p = l->l_proc;

		/* Common case: the host thread already carries an lwp. */
		rump_schedule_cpu(l);

		/*
		 * If the lwp's credential pointer differs from the one
		 * of its process, replace it with what kauth_cred_hold()
		 * returns for p_cred (done under p_lock) and hand the old
		 * pointer to kauth_cred_free() after dropping the lock.
		 */
		if (l->l_cred != p->p_cred) {
			kauth_cred_t oc = l->l_cred;

			mutex_enter(p->p_lock);
			l->l_cred = kauth_cred_hold(p->p_cred);
			mutex_exit(p->p_lock);
			kauth_cred_free(oc);
		}
	} else {
		/*
		 * If there is no dedicated lwp, allocate a temp one and
		 * set it to be free'd upon unschedule().  Use lwp0 context
		 * for reserving the necessary resources.  Don't optimize
		 * for this case -- anyone who cares about performance will
		 * start a real thread.
		 */
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}
/*
* Schedule a rump kernel CPU for lwp l when the caller has no
* interlock to pass: simply forwards to rump_schedule_cpu_interlock()
* with a NULL interlock.
*/
void
rump_schedule_cpu(struct lwp *l)
{
rump_schedule_cpu_interlock(l, NULL);
}
/*
* Schedule a CPU. This optimizes for the case where we schedule
* the same thread often, and we have nCPU >= nFrequently-Running-Thread
* (where CPU is virtual rump cpu, not host CPU).
*
* l: lwp to place on a CPU. l->l_target_cpu must be non-NULL;
* the CPU it names is the one tried first.
* interlock: NULL, or a mutex the caller already holds. It is only
* compared against the target CPU's rcpu_mtx: when equal,
* the fastpath releases it and the slowpath skips its own
* initial acquisition of that mutex.
*
* The slowpath loop exits with rcpu_mtx held and releases it right
* after the loop.
*/
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
struct rumpcpu *rcpu;
struct cpu_info *ci;
void *old;
bool domigrate;
/* lwps with LP_BOUND set are never migrated in the loop below */
bool bound = l->l_pflag & LP_BOUND;
l->l_stat = LSRUN;
/*
* First, try fastpath: if we were the previous user of the
* CPU, everything is in order cachewise and we can just
* proceed to use it.
*
* If we are a different thread (i.e. CAS fails), we must go
* through a memory barrier to ensure we get a truthful
* view of the world.
*/
KASSERT(l->l_target_cpu != NULL);
rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
/* Claim the CPU only if rcpu_prevlwp still names us; it becomes BUSY. */
if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
/* Caller held this CPU's mutex as interlock: drop it on its behalf. */
if (interlock == rcpu->rcpu_mtx)
rumpuser_mutex_exit(rcpu->rcpu_mtx);
SCHED_FASTPATH(rcpu);
/* jones, you're the man */
/*
* NOTE(review): no `fastlane:` label is visible anywhere in this
* function as shown -- confirm the jump target was not lost.
*/
goto fastlane;
}
/*
* Else, it's the slowpath for us. First, determine if we
* can migrate.
*/
/* With a single CPU (ncpu == 1) migration is never attempted. */
if (ncpu == 1)
domigrate = false;
else
domigrate = true;
/* Take lock. This acts as a load barrier too. */
if (interlock != rcpu->rcpu_mtx)
rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
for (;;) {
/* Counted once per loop iteration, not once per call. */
SCHED_SLOWPATH(rcpu);
/* Unconditionally mark the CPU wanted; `old` is what was there before. */
old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);
/* CPU is free? */
if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
/* Turn our WANTED marker into BUSY; success means the CPU is ours. */
if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
break;
}
}
/*
* Do we want to migrate once?
* This may need a slightly better algorithm, or we
* might cache pingpong eternally for non-frequent
* threads.
*/
if (domigrate && !bound) {
/* Clearing domigrate limits this to one migration per call. */
domigrate = false;
SCHED_MIGRATED(rcpu);
/* Swap locks: release the old CPU's mutex, take the new CPU's. */
rumpuser_mutex_exit(rcpu->rcpu_mtx);
rcpu = getnextcpu();
rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
continue;
}
/* Want CPU, wait until it's released and retry */
/*
* rcpu_wanted is raised for the duration of the wait on rcpu_cv;
* it is modified here with rcpu_mtx held.
*/
rcpu->rcpu_wanted++;
rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
rcpu->rcpu_wanted--;
}
rumpuser_mutex_exit(rcpu->rcpu_mtx);
/*
* No interrupts, so ci_curlwp === cpu_onproc.
* Okay, we could make an attempt to not set cpu_onproc
* in the case that an interrupt is scheduled immediately
* after a user proc, but leave that for later.
*/
/*
* NOTE(review): `ci` is not assigned anywhere in the visible body
* before this dereference -- confirm its initialization was not lost.
*/
ci->ci_curlwp = ci->ci_onproc = l;
}
/*
* Check special conditions:
* 1) do we need to free the lwp which just unscheduled?
* (locking order: lwp0, cpu)
* 2) do we want to clear curlwp for the current host thread
*/
if (__predict_false(l->l_flag & LW_WEXIT)) {
lwp0busy();
/* Now that we have lwp0, we can schedule a CPU again */
rump_schedule_cpu(l);
/* switch to lwp0. this frees the old thread */
KASSERT(l->l_flag & LW_WEXIT);
rump_lwproc_switch(&lwp0);
/*
* Make sure all stores are seen before the CPU release. This
* is relevant only in the non-fastpath scheduling case, but
* we don't know here if that's going to happen, so need to
* expect the worst.
*
* If the scheduler interlock was requested by the caller, we
* need to obtain it before we release the CPU. Otherwise, we risk a
* race condition where another thread is scheduled onto the
* rump kernel CPU before our current thread can
* grab the interlock.
*/
if (interlock == rcpu->rcpu_mtx)
rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
else
membar_release(); /* XXX what does this pair with? */
/* Release the CPU. */
old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);
/* No waiters? No problems. We're outta here. */
if (old == RCPULWP_BUSY) {
return;
}
KASSERT(old == RCPULWP_WANTED);
/*
* Ok, things weren't so snappy.
*
* Snailpath: take lock and signal anyone waiting for this CPU.
*/
if (interlock != rcpu->rcpu_mtx)
rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
if (rcpu->rcpu_wanted)
rumpuser_cv_broadcast(rcpu->rcpu_cv);
if (interlock != rcpu->rcpu_mtx)
rumpuser_mutex_exit(rcpu->rcpu_mtx);
}
/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
struct lwp *l = curlwp;
int nlocks;
/*
* There is no kernel thread preemption in rump currently. But call
* the implementing macros anyway in case they grow some side-effects
* down the road.
*/
void
kpreempt_disable(void)
{