/*      $NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $       */

/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $");

#ifdef _KERNEL_OPT
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include "opt_ktrace.h"
#include "opt_ptrace.h"
#include "opt_dtrace.h"
#endif

/* XXX To get syscall prototypes. */
#define SYSVSHM
#define SYSVSEM
#define SYSVMSG

#include <sys/param.h>
#include <sys/module.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/systm.h>
#include <sys/xcall.h>
#include <sys/ktrace.h>
#include <sys/ptrace.h>

int
sys_nomodule(struct lwp *l, const void *v, register_t *retval)
{
#ifdef MODULAR

       const struct sysent *sy;
       const struct emul *em;
       const struct sc_autoload *auto_list;
       u_int code;

       /*
        * Restart the syscall if we interrupted a module unload that
        * failed.  Acquiring kernconfig_lock delays us until any unload
        * has been completed or rolled back.
        */
       kernconfig_lock();
       sy = l->l_sysent;
       if (sy->sy_call != sys_nomodule) {
               kernconfig_unlock();
               return ERESTART;
       }
       /*
        * Try to autoload a module to satisfy the request.  If it
        * works, retry the request.
        */
       em = l->l_proc->p_emul;
       code = sy - em->e_sysent;

       if ((auto_list = em->e_sc_autoload) != NULL)
               for (; auto_list->al_code > 0; auto_list++) {
                       if (auto_list->al_code != code) {
                               continue;
                       }
                       if (module_autoload(auto_list->al_module,
                           MODULE_CLASS_ANY) != 0 ||
                           sy->sy_call == sys_nomodule) {
                               break;
                       }
                       kernconfig_unlock();
                       return ERESTART;
               }
       kernconfig_unlock();
#endif  /* MODULAR */

       return sys_nosys(l, v, retval);
}

int
syscall_establish(const struct emul *em, const struct syscall_package *sp)
{
       struct sysent *sy;
       int i;

       KASSERT(kernconfig_is_held());

       if (em == NULL) {
               em = &emul_netbsd;
       }
       sy = em->e_sysent;

       /*
        * Ensure that all preconditions are valid, since this is
        * an all or nothing deal.  Once a system call is entered,
        * it can become busy and we could be unable to remove it
        * on error.
        */
       for (i = 0; sp[i].sp_call != NULL; i++) {
               if (sp[i].sp_code >= SYS_NSYSENT)
                       return EINVAL;
               if (sy[sp[i].sp_code].sy_call != sys_nomodule &&
                   sy[sp[i].sp_code].sy_call != sys_nosys) {
#ifdef DIAGNOSTIC
                       printf("syscall %d is busy\n", sp[i].sp_code);
#endif
                       return EBUSY;
               }
       }
       /* Everything looks good, patch them in. */
       for (i = 0; sp[i].sp_call != NULL; i++) {
               sy[sp[i].sp_code].sy_call = sp[i].sp_call;
       }

       return 0;
}

int
syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
{
       struct sysent *sy;
       const uint32_t *sb;
       lwp_t *l;
       int i;

       KASSERT(kernconfig_is_held());

       if (em == NULL) {
               em = &emul_netbsd;
       }
       sy = em->e_sysent;
       sb = em->e_nomodbits;

       /*
        * First, patch the system calls to sys_nomodule or sys_nosys
        * to gate further activity.
        */
       for (i = 0; sp[i].sp_call != NULL; i++) {
               KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
               sy[sp[i].sp_code].sy_call =
                   sb[sp[i].sp_code / 32] & (1 << (sp[i].sp_code % 32)) ?
                     sys_nomodule : sys_nosys;
       }

       /*
        * Run a cross call to cycle through all CPUs.  This does two
        * things: lock activity provides a barrier and makes our update
        * of sy_call visible to all CPUs, and upon return we can be sure
        * that we see pertinent values of l_sysent posted by remote CPUs.
        */
       xc_barrier(0);

       /*
        * Now it's safe to check l_sysent.  Run through all LWPs and see
        * if anyone is still using the system call.
        */
       for (i = 0; sp[i].sp_call != NULL; i++) {
               mutex_enter(&proc_lock);
               LIST_FOREACH(l, &alllwp, l_list) {
                       if (l->l_sysent == &sy[sp[i].sp_code]) {
                               break;
                       }
               }
               mutex_exit(&proc_lock);
               if (l == NULL) {
                       continue;
               }
               /*
                * We lose: one or more calls are still in use.  Put back
                * the old entrypoints and act like nothing happened.
                * When we drop kernconfig_lock, any system calls held in
                * sys_nomodule() will be restarted.
                */
               for (i = 0; sp[i].sp_call != NULL; i++) {
                       sy[sp[i].sp_code].sy_call = sp[i].sp_call;
               }
               return EBUSY;
       }

       return 0;
}

/*
* Return true if system call tracing is enabled for the specified process.
*/
bool
trace_is_enabled(struct proc *p)
{
#ifdef SYSCALL_DEBUG
       return (true);
#endif
#ifdef KTRACE
       if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET)))
               return (true);
#endif
#ifdef PTRACE
       if (ISSET(p->p_slflag, PSL_SYSCALL))
               return (true);
#endif

       return (false);
}

/*
* Start trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just before
* a system call is actually executed.
*/
int
trace_enter(register_t code, const struct sysent *sy, const void *args)
{
       int error = 0;
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
       struct proc *p = curlwp->l_proc;
#endif

#ifdef KDTRACE_HOOKS
       if (sy->sy_entry) {
               struct emul *e = p->p_emul;
               if (e->e_dtrace_syscall)
                       (*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args,
                           NULL, 0);
       }
#endif

#ifdef SYSCALL_DEBUG
       scdebug_call(code, args);
#endif /* SYSCALL_DEBUG */

       ktrsyscall(code, args, sy->sy_narg);

#ifdef PTRACE
       if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
           (PSL_SYSCALL|PSL_TRACED)) {
               proc_stoptrace(TRAP_SCE, code, args, NULL, 0);
               if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
                       /* tracer will emulate syscall for us */
                       error = EJUSTRETURN;
               }
       }
#endif
       return error;
}

/*
* End trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just after
* a system call finishes.
* MD caller guarantees the passed 'code' is within the supported
* system call number range for emulation the process runs under.
*/
void
trace_exit(register_t code, const struct sysent *sy, const void *args,
   register_t rval[], int error)
{
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
       struct proc *p = curlwp->l_proc;
#endif

#ifdef KDTRACE_HOOKS
       if (sy->sy_return) {
               struct emul *e = p->p_emul;
               if (e->e_dtrace_syscall)
                       (*p->p_emul->e_dtrace_syscall)(sy->sy_return, code, sy,
                           args, rval, error);
       }
#endif

#ifdef SYSCALL_DEBUG
       scdebug_ret(code, error, rval);
#endif /* SYSCALL_DEBUG */

       ktrsysret(code, error, rval);

#ifdef PTRACE
       if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
           (PSL_SYSCALL|PSL_TRACED)) {
               proc_stoptrace(TRAP_SCX, code, args, rval, error);
       }
       CLR(p->p_slflag, PSL_SYSCALLEMU);
#endif
}