/* $NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $ */

/* $NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $ */

/*
* Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/

/*
* System calls relating to the scheduler.
*
* Lock order:
*
* cpu_lock ->
* proc_lock ->
* proc_t::p_lock ->
* lwp_t::lwp_lock
*
* TODO:
* - Handle pthread_setschedprio() as defined by POSIX;
*/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/unistd.h>

static struct sysctllog *sched_sysctl_log;
static kauth_listener_t sched_listener;

/*
* Convert user priority or the in-kernel priority or convert the current
* priority to the appropriate range according to the policy change.
*/
static pri_t
convert_pri(lwp_t *l, int policy, pri_t pri)
{

/* Convert user priority to the in-kernel */
if (pri != PRI_NONE) {
/* Only for real-time threads */
KASSERT(pri >= SCHED_PRI_MIN);
KASSERT(pri <= SCHED_PRI_MAX);
KASSERT(policy != SCHED_OTHER);
return PRI_USER_RT + pri;
}

/* Neither policy, nor priority change */
if (l->l_class == policy)
return l->l_priority;

/* Time-sharing -> real-time */
if (l->l_class == SCHED_OTHER) {
KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
return PRI_USER_RT;
}

/* Real-time -> time-sharing */
if (policy == SCHED_OTHER) {
KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
/*
* this is a bit arbitrary because the priority is dynamic
* for SCHED_OTHER threads and will likely be changed by
* the scheduler soon anyway.
*/
return l->l_priority - PRI_USER_RT;
}

/* Real-time -> real-time */
return l->l_priority;
}

int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
const struct sched_param *params)
{
struct proc *p;
struct lwp *t;
pri_t pri;
u_int lcnt;
int error;

error = 0;

pri = params->sched_priority;

/* If no parameters specified, just return (this should not happen) */
if (pri == PRI_NONE && policy == SCHED_NONE)
return 0;

/* Validate scheduling class */
if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
return EINVAL;

/* Validate priority */
if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
return EINVAL;

if (pid != 0) {
/* Find the process */
mutex_enter(&proc_lock);
p = proc_find(pid);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
/* Disallow modification of system processes */
if ((p->p_flag & PK_SYSTEM) != 0) {
mutex_exit(p->p_lock);
return EPERM;
}
} else {
/* Use the calling process */
p = curlwp->l_proc;
mutex_enter(p->p_lock);
}

/* Find the LWP(s) */
lcnt = 0;
LIST_FOREACH(t, &p->p_lwps, l_sibling) {
pri_t kpri;
int lpolicy;

if (lid && lid != t->l_lid)
continue;

lcnt++;
lwp_lock(t);
lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

/* Disallow setting of priority for SCHED_OTHER threads */
if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
lwp_unlock(t);
error = EINVAL;
break;
}

/* Convert priority, if needed */
kpri = convert_pri(t, lpolicy, pri);

/* Check the permission */
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
KAUTH_ARG(kpri));
if (error) {
lwp_unlock(t);
break;
}

/* Set the scheduling class, change the priority */
t->l_class = lpolicy;
lwp_changepri(t, kpri);
lwp_unlock(t);
}
mutex_exit(p->p_lock);
return (lcnt == 0) ? ESRCH : error;
}

/*
* Set scheduling parameters.
*/
int
sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(lwpid_t) lid;
syscallarg(int) policy;
syscallarg(const struct sched_param *) params;
} */
struct sched_param params;
int error;

/* Get the parameters from the user-space */
error = copyin(SCARG(uap, params), &params, sizeof(params));
if (error)
goto out;

error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
SCARG(uap, policy), &params);
out:
return error;
}

/*
* do_sched_getparam:
*
* if lid=0, returns the parameter of the first LWP in the process.
*/
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
struct sched_param *params)
{
struct sched_param lparams;
struct lwp *t;
int error, lpolicy;

if (pid < 0 || lid < 0)
return EINVAL;

t = lwp_find2(pid, lid); /* acquire p_lock */
if (t == NULL)
return ESRCH;

/* Check the permission */
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
if (error != 0) {
mutex_exit(t->l_proc->p_lock);
return error;
}

lwp_lock(t);
lparams.sched_priority = t->l_priority;
lpolicy = t->l_class;
lwp_unlock(t);
mutex_exit(t->l_proc->p_lock);

/*
* convert to the user-visible priority value.
* it's an inversion of convert_pri().
*
* the SCHED_OTHER case is a bit arbitrary given that
* - we don't allow setting the priority.
* - the priority is dynamic.
*/
switch (lpolicy) {
case SCHED_OTHER:
lparams.sched_priority -= PRI_USER;
break;
case SCHED_RR:
case SCHED_FIFO:
lparams.sched_priority -= PRI_USER_RT;
break;
}

if (policy != NULL)
*policy = lpolicy;

if (params != NULL)
*params = lparams;

return error;
}

/*
* Get scheduling parameters.
*/
int
sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(lwpid_t) lid;
syscallarg(int *) policy;
syscallarg(struct sched_param *) params;
} */
struct sched_param params;
int error, policy;

error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
&params);
if (error)
goto out;

error = copyout(&params, SCARG(uap, params), sizeof(params));
if (error == 0 && SCARG(uap, policy) != NULL)
error = copyout(&policy, SCARG(uap, policy), sizeof(int));
out:
return error;
}

/*
* Allocate the CPU set, and get it from userspace.
*/
static int
genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
{
kcpuset_t *kset;
int error;

kcpuset_create(&kset, true);
error = kcpuset_copyin(sset, kset, size);
if (error) {
kcpuset_unuse(kset, NULL);
} else {
*dset = kset;
}
return error;
}

/*
* Set affinity.
*/
int
sys__sched_setaffinity(struct lwp *l,
const struct sys__sched_setaffinity_args *uap, register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(lwpid_t) lid;
syscallarg(size_t) size;
syscallarg(const cpuset_t *) cpuset;
} */
kcpuset_t *kcset, *kcpulst = NULL;
struct cpu_info *ici, *ci;
struct proc *p;
struct lwp *t;
CPU_INFO_ITERATOR cii;
bool alloff;
lwpid_t lid;
u_int lcnt;
int error;

error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
if (error)
return error;

/*
* Traverse _each_ CPU to:
* - Check that CPUs in the mask have no assigned processor set.
* - Check that at least one CPU from the mask is online.
* - Find the first target CPU to migrate.
*
* To avoid the race with CPU online/offline calls and processor sets,
* cpu_lock will be locked for the entire operation.
*/
ci = NULL;
alloff = false;
mutex_enter(&cpu_lock);
for (CPU_INFO_FOREACH(cii, ici)) {
struct schedstate_percpu *ispc;

if (!kcpuset_isset(kcset, cpu_index(ici))) {
continue;
}

ispc = &ici->ci_schedstate;
/* Check that CPU is not in the processor-set */
if (ispc->spc_psid != PS_NONE) {
error = EPERM;
goto out;
}
/* Skip offline CPUs */
if (ispc->spc_flags & SPCF_OFFLINE) {
alloff = true;
continue;
}
/* Target CPU to migrate */
if (ci == NULL) {
ci = ici;
}
}
if (ci == NULL) {
if (alloff) {
/* All CPUs in the set are offline */
error = EPERM;
goto out;
}
/* Empty set */
kcpuset_unuse(kcset, &kcpulst);
kcset = NULL;
}

if (SCARG(uap, pid) != 0) {
/* Find the process */
mutex_enter(&proc_lock);
p = proc_find(SCARG(uap, pid));
if (p == NULL) {
mutex_exit(&proc_lock);
error = ESRCH;
goto out;
}
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
/* Disallow modification of system processes. */
if ((p->p_flag & PK_SYSTEM) != 0) {
mutex_exit(p->p_lock);
error = EPERM;
goto out;
}
} else {
/* Use the calling process */
p = l->l_proc;
mutex_enter(p->p_lock);
}

/*
* Check the permission.
*/
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
if (error != 0) {
mutex_exit(p->p_lock);
goto out;
}

/* Iterate through LWP(s). */
lcnt = 0;
lid = SCARG(uap, lid);
LIST_FOREACH(t, &p->p_lwps, l_sibling) {
if (lid && lid != t->l_lid) {
continue;
}
lwp_lock(t);
/* No affinity for zombie LWPs. */
if (t->l_stat == LSZOMB) {
lwp_unlock(t);
continue;
}
/* First, release existing affinity, if any. */
if (t->l_affinity) {
kcpuset_unuse(t->l_affinity, &kcpulst);
}
if (kcset) {
/*
* Hold a reference on affinity mask, assign mask to
* LWP and migrate it to another CPU (unlocks LWP).
*/
kcpuset_use(kcset);
t->l_affinity = kcset;
lwp_migrate(t, ci);
} else {
/* Old affinity mask is released, just clear. */
t->l_affinity = NULL;
lwp_unlock(t);
}
lcnt++;
}
mutex_exit(p->p_lock);
if (lcnt == 0) {
error = ESRCH;
}
out:
mutex_exit(&cpu_lock);

/*
* Drop the initial reference (LWPs, if any, have the ownership now),
* and destroy whatever is in the G/C list, if filled.
*/
if (kcset) {
kcpuset_unuse(kcset, &kcpulst);
}
if (kcpulst) {
kcpuset_destroy(kcpulst);
}
return error;
}

/*
* Get affinity.
*/
int
sys__sched_getaffinity(struct lwp *l,
const struct sys__sched_getaffinity_args *uap, register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(lwpid_t) lid;
syscallarg(size_t) size;
syscallarg(cpuset_t *) cpuset;
} */
struct lwp *t;
kcpuset_t *kcset;
int error;

if (SCARG(uap, pid) < 0 || SCARG(uap, lid) < 0)
return EINVAL;

error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
if (error)
return error;

/* Locks the LWP */
t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
if (t == NULL) {
error = ESRCH;
goto out;
}
/* Check the permission */
if (kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
mutex_exit(t->l_proc->p_lock);
error = EPERM;
goto out;
}
lwp_lock(t);
if (t->l_affinity) {
kcpuset_copy(kcset, t->l_affinity);
} else {
kcpuset_zero(kcset);
}
lwp_unlock(t);
mutex_exit(t->l_proc->p_lock);

error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
out:
kcpuset_unuse(kcset, NULL);
return error;
}

/*
* Priority protection for PTHREAD_PRIO_PROTECT. This is a weak
* analogue of priority inheritance: temp raise the priority
* of the caller when accessing a protected resource.
*/
int
sys__sched_protect(struct lwp *l,
const struct sys__sched_protect_args *uap, register_t *retval)
{
/* {
syscallarg(int) priority;
syscallarg(int *) opriority;
} */
int error;
pri_t pri;

KASSERT(l->l_inheritedprio == -1);
KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio);

pri = SCARG(uap, priority);
error = 0;
lwp_lock(l);
if (pri == -1) {
/* back out priority changes */
switch(l->l_protectdepth) {
case 0:
error = EINVAL;
break;
case 1:
l->l_protectdepth = 0;
l->l_protectprio = -1;
l->l_auxprio = -1;
break;
default:
l->l_protectdepth--;
break;
}
} else if (pri < 0) {
/* Just retrieve the current value, for debugging */
if (l->l_protectprio == -1)
error = ENOENT;
else
*retval = l->l_protectprio - PRI_USER_RT;
} else if (__predict_false(pri < SCHED_PRI_MIN ||
pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) {
/* must fail if existing priority is higher */
error = EPERM;
} else {
/* play along but make no changes if not a realtime LWP. */
l->l_protectdepth++;
pri += PRI_USER_RT;
if (__predict_true(l->l_class != SCHED_OTHER &&
pri > l->l_protectprio)) {
l->l_protectprio = pri;
l->l_auxprio = pri;
}
}
lwp_unlock(l);

return error;
}

/*
* Yield.
*/
int
sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

yield();
return 0;
}

/*
* Sysctl nodes and initialization.
*/
static void
sysctl_sched_setup(struct sysctllog **clog)
{
const struct sysctlnode *node = NULL;

sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
CTLTYPE_INT, "posix_sched",
SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
"Process Scheduling option to which the "
"system attempts to conform"),
NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sched",
SYSCTL_DESCR("Scheduler options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);

if (node == NULL)
return;

sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
CTLTYPE_INT, "pri_min",
SYSCTL_DESCR("Minimal POSIX real-time priority"),
NULL, SCHED_PRI_MIN, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
CTLTYPE_INT, "pri_max",
SYSCTL_DESCR("Maximal POSIX real-time priority"),
NULL, SCHED_PRI_MAX, NULL, 0,
CTL_CREATE, CTL_EOL);
}

static int
sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;

result = KAUTH_RESULT_DEFER;
p = arg0;

switch (action) {
case KAUTH_PROCESS_SCHEDULER_GETPARAM:
if (kauth_cred_uidmatch(cred, p->p_cred))
result = KAUTH_RESULT_ALLOW;
break;

case KAUTH_PROCESS_SCHEDULER_SETPARAM:
if (kauth_cred_uidmatch(cred, p->p_cred)) {
struct lwp *l;
int policy;
pri_t priority;

l = arg1;
policy = (int)(unsigned long)arg2;
priority = (pri_t)(unsigned long)arg3;

if ((policy == l->l_class ||
(policy != SCHED_FIFO && policy != SCHED_RR)) &&
priority <= l->l_priority)
result = KAUTH_RESULT_ALLOW;
}

break;

case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
result = KAUTH_RESULT_ALLOW;
break;

case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
/* Privileged; we let the secmodel handle this. */
break;

default:
break;
}

return result;
}

void
sched_init(void)
{

sysctl_sched_setup(&sched_sysctl_log);

sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
sched_listener_cb, NULL);
}