/*      $NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $        */

/*
* Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>
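
/*
 * Scheduler model in a nutshell: a host thread must be scheduled
 * onto a virtual rump CPU (struct rumpcpu) before it may run rump
 * kernel code.  The fastpath grabs the CPU with a single CAS on
 * rcpu_prevlwp when the thread was also the CPU's previous user;
 * otherwise the slowpath takes rcpu_mtx, optionally migrates once to
 * another CPU, and may sleep on rcpu_cv until the CPU is released.
 */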

static struct rumpcpu {
       /* needed in fastpath */
       struct cpu_info *rcpu_ci;
       void *rcpu_prevlwp;

       /* needed in slowpath */
       struct rumpuser_mtx *rcpu_mtx;
       struct rumpuser_cv *rcpu_cv;
       int rcpu_wanted;

       /* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so they
	 * need to be collected often.
	 */
       unsigned int rcpu_fastpath;
       unsigned int rcpu_slowpath;
       unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 48 (P=8) */

       int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

       return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;
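
/*
 * Special values for rcpu_prevlwp.  Any other value is the lwp which
 * last used the CPU (the fastpath key).  RCPULWP_BUSY marks the CPU
 * as currently taken; RCPULWP_WANTED additionally indicates that a
 * thread may be waiting for it to be released.
 */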

#define RCPULWP_BUSY    ((void *)-1)
#define RCPULWP_WANTED  ((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * the stats happen to be on a different cache line (CACHE_LINE_SIZE
 * is really just a coarse estimate), so default to the performant
 * case (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

struct cpu_info *
cpu_lookup(u_int index)
{

       return rcpu_storage[index].rcpu_ci;
}
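
/*
 * getnextcpu: pick the next virtual CPU to try when migrating, in a
 * simple global round-robin fashion.
 */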

static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	/* reset the counter well before it can wrap around */
	if (__predict_false(newcpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

       return &rcpu_storage[newcpu];
}
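
/*
 * rump_cpus_bootstrap: clamp the requested CPU count to MAXCPUS and
 * attach the boot CPU; the remaining virtual CPUs are brought up
 * later during rump kernel initialization.
 */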

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
       int num = *nump;

       if (num > MAXCPUS) {
               aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
                   "available (adjusted)\n", num, MAXCPUS);
               num = MAXCPUS;
       }

       cpu_setmodel("rumpcore (virtual)");

       mi_cpu_init();

       /* attach first cpu for bootstrap */
       rump_cpu_attach(&rump_bootcpu);
       ncpu = 1;
       *nump = num;
}

void
rump_scheduler_init(int numcpu)
{
       struct rumpcpu *rcpu;
       struct cpu_info *ci;
       int i;

       rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
       rumpuser_cv_init(&lwp0cv);
       for (i = 0; i < numcpu; i++) {
               if (i == 0) {
                       ci = &rump_bootcpu;
               } else {
                       ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
                       ci->ci_index = i;
               }

               rcpu = &rcpu_storage[i];
               rcpu->rcpu_ci = ci;
               rcpu->rcpu_wanted = 0;
               rumpuser_cv_init(&rcpu->rcpu_cv);
               rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

               ci->ci_schedstate.spc_mutex =
                   mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
               ci->ci_schedstate.spc_flags = SPCF_RUNNING;
       }

       mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

void
rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
{
       struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);

       rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
       rumpuser_cv_signal(cv);
       rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/*
* condvar ops using scheduler lock as the rumpuser interlock.
*/
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
       struct lwp *l = curlwp;
       struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

       /* mutex will be taken and released in cpu schedule/unschedule */
       rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
       struct lwp *l = curlwp;
       struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

       /* mutex will be taken and released in cpu schedule/unschedule */
       return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
           ts->tv_sec, ts->tv_nsec);
}
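
/*
 * lwp0 is a single, shared bootstrap context.  Callers which need it
 * (e.g. for creating a temporary lwp or tearing one down) must first
 * reserve it with lwp0busy() and release it with lwp0rele().
 */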

static void
lwp0busy(void)
{

       /* busy lwp0 */
       KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
       rumpuser_mutex_enter_nowrap(lwp0mtx);
       while (lwp0isbusy)
               rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
       lwp0isbusy = true;
       rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

       rumpuser_mutex_enter_nowrap(lwp0mtx);
       KASSERT(lwp0isbusy == true);
       lwp0isbusy = false;
       rumpuser_cv_signal(lwp0cv);
       rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp
 * context, i.e. ensure that curlwp != NULL.  Also, ensure that there
 * is a 1:1 mapping between the lwp and a rump kernel virtual CPU.
 */
void
rump_schedule()
{
       struct lwp *l;

       /*
        * If there is no dedicated lwp, allocate a temp one and
        * set it to be free'd upon unschedule().  Use lwp0 context
        * for reserving the necessary resources.  Don't optimize
        * for this case -- anyone who cares about performance will
        * start a real thread.
        */
       if (__predict_true((l = curlwp) != NULL)) {
               struct proc *p = l->l_proc;
               rump_schedule_cpu(l);
               if (l->l_cred != p->p_cred) {
                       kauth_cred_t oc = l->l_cred;
                       mutex_enter(p->p_lock);
                       l->l_cred = kauth_cred_hold(p->p_cred);
                       mutex_exit(p->p_lock);
                       kauth_cred_free(oc);
               }
       } else {
               lwp0busy();

               /* schedule cpu and use lwp0 */
               rump_schedule_cpu(&lwp0);
               rump_lwproc_curlwp_set(&lwp0);

               /* allocate thread, switch to it, and release lwp0 */
               l = rump__lwproc_alloclwp(initproc);
               rump_lwproc_switch(l);
               lwp0rele();

               /*
                * mark new thread dead-on-unschedule.  this
                * means that we'll be running with l_refcnt == 0.
                * relax, it's fine.
                */
               rump_lwproc_releaselwp();
       }
}
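
/*
 * Sketch of the intended calling pattern for host threads entering
 * the rump kernel: rump_schedule() guarantees a valid curlwp and a
 * virtual CPU, rump_unschedule() gives them back.
 *
 *	rump_schedule();
 *	... run rump kernel code with a valid curlwp ...
 *	rump_unschedule();
 */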

void
rump_schedule_cpu(struct lwp *l)
{

       rump_schedule_cpu_interlock(l, NULL);
}

/*
* Schedule a CPU.  This optimizes for the case where we schedule
* the same thread often, and we have nCPU >= nFrequently-Running-Thread
* (where CPU is virtual rump cpu, not host CPU).
*/
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
       struct rumpcpu *rcpu;
       struct cpu_info *ci;
       void *old;
       bool domigrate;
       bool bound = l->l_pflag & LP_BOUND;

       l->l_stat = LSRUN;

       /*
        * First, try fastpath: if we were the previous user of the
        * CPU, everything is in order cachewise and we can just
        * proceed to use it.
        *
        * If we are a different thread (i.e. CAS fails), we must go
        * through a memory barrier to ensure we get a truthful
        * view of the world.
        */

       KASSERT(l->l_target_cpu != NULL);
       rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
       if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
               if (interlock == rcpu->rcpu_mtx)
                       rumpuser_mutex_exit(rcpu->rcpu_mtx);
               SCHED_FASTPATH(rcpu);
               /* jones, you're the man */
               goto fastlane;
       }

       /*
        * Else, it's the slowpath for us.  First, determine if we
        * can migrate.
        */
       if (ncpu == 1)
               domigrate = false;
       else
               domigrate = true;

       /* Take lock.  This acts as a load barrier too. */
       if (interlock != rcpu->rcpu_mtx)
               rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

       for (;;) {
               SCHED_SLOWPATH(rcpu);
               old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

               /* CPU is free? */
               if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
                       if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
                           RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
                               break;
                       }
               }

               /*
                * Do we want to migrate once?
                * This may need a slightly better algorithm, or we
                * might cache pingpong eternally for non-frequent
                * threads.
                */
               if (domigrate && !bound) {
                       domigrate = false;
                       SCHED_MIGRATED(rcpu);
                       rumpuser_mutex_exit(rcpu->rcpu_mtx);
                       rcpu = getnextcpu();
                       rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
                       continue;
               }

		/* Want CPU, wait until it's released and retry */
               rcpu->rcpu_wanted++;
               rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
               rcpu->rcpu_wanted--;
       }
       rumpuser_mutex_exit(rcpu->rcpu_mtx);

fastlane:
       ci = rcpu->rcpu_ci;
       l->l_cpu = l->l_target_cpu = ci;
       l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
       l->l_ru.ru_nvcsw++;
       l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === ci_onproc.
	 * Okay, we could make an attempt to not set ci_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
       ci->ci_curlwp = ci->ci_onproc = l;
}
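
/*
 * rump_unschedule: undo rump_schedule().  Release the virtual CPU
 * and, if the lwp was marked for exit or for clearing, tear down or
 * detach the temporary lwp context.
 */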

void
rump_unschedule()
{
       struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
       int nlock;

       KERNEL_UNLOCK_ALL(l, &nlock);
       KASSERT(nlock == 0);
#endif

       KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
       rump_unschedule_cpu(l);
       l->l_mutex = &unruntime_lock;
       l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread?
	 */
       if (__predict_false(l->l_flag & LW_WEXIT)) {
               lwp0busy();

               /* Now that we have lwp0, we can schedule a CPU again */
               rump_schedule_cpu(l);

               /* switch to lwp0.  this frees the old thread */
               KASSERT(l->l_flag & LW_WEXIT);
               rump_lwproc_switch(&lwp0);

               /* release lwp0 */
               rump_unschedule_cpu(&lwp0);
               lwp0.l_mutex = &unruntime_lock;
               lwp0.l_pflag &= ~LP_RUNNING;
               lwp0rele();
               rump_lwproc_curlwp_clear(&lwp0);

       } else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
               rump_lwproc_curlwp_clear(l);
               l->l_flag &= ~LW_RUMP_CLEAR;
       }
}

void
rump_unschedule_cpu(struct lwp *l)
{

       rump_unschedule_cpu_interlock(l, NULL);
}
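
/*
 * Release the virtual CPU.  Unless the lwp is itself a soft interrupt
 * thread, run pending soft interrupts first, while the CPU is still
 * held.
 */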

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

       if ((l->l_pflag & LP_INTR) == 0)
               rump_softint_run(l->l_cpu);
       rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
       struct rumpcpu *rcpu;
       struct cpu_info *ci;
       void *old;

       ci = l->l_cpu;
       ci->ci_curlwp = ci->ci_onproc = NULL;
       rcpu = cpuinfo_to_rumpcpu(ci);

       KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so we need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we
	 * risk a race condition where another thread is scheduled
	 * onto the rump kernel CPU before our current thread can
	 * grab the interlock.
	 */
       if (interlock == rcpu->rcpu_mtx)
               rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
       else
               membar_release(); /* XXX what does this pair with? */

       /* Release the CPU. */
       old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

       /* No waiters?  No problems.  We're outta here. */
       if (old == RCPULWP_BUSY) {
               return;
       }

       KASSERT(old == RCPULWP_WANTED);

       /*
        * Ok, things weren't so snappy.
        *
        * Snailpath: take lock and signal anyone waiting for this CPU.
        */

       if (interlock != rcpu->rcpu_mtx)
               rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
       if (rcpu->rcpu_wanted)
               rumpuser_cv_broadcast(rcpu->rcpu_cv);
       if (interlock != rcpu->rcpu_mtx)
               rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
       struct lwp *l = curlwp;
       int nlocks;

       KERNEL_UNLOCK_ALL(l, &nlocks);
       rump_unschedule_cpu(l);
       rump_schedule_cpu(l);
       KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

       yield();
}

bool
kpreempt(uintptr_t where)
{

       return false;
}

/*
* There is no kernel thread preemption in rump currently.  But call
* the implementing macros anyway in case they grow some side-effects
* down the road.
*/
void
kpreempt_disable(void)
{

       KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

       KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
       const lwp_t *l = curlwp;

       return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
           (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
       /* XXX: emulate cpu_kpreempt_disabled() */
       return true;
}
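
/*
 * The rest are stubs for the standard scheduler interface: the rump
 * scheduler has no runqueues, priorities or preemption, so these
 * hooks either do nothing or map onto the minimal rump equivalents
 * (sched_dequeue is not supported and panics).
 */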

void
suspendsched(void)
{

       /*
        * Could wait until everyone is out and block further entries,
        * but skip that for now.
        */
}

void
sched_nice(struct proc *p, int level)
{

       /* nothing to do for now */
}

void
setrunnable(struct lwp *l)
{

       sched_enqueue(l);
}

void
sched_enqueue(struct lwp *l)
{

       rump_thread_allow(l);
}

void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{

}

void
sched_resched_lwp(struct lwp *l, bool unlock)
{

}

void
sched_dequeue(struct lwp *l)
{

       panic("sched_dequeue not implemented");
}

void
preempt_point(void)
{

}

bool
preempt_needed(void)
{

       return false;
}