/* Copyright (c) 1999, 2006, 2007 The NetBSD Foundation, Inc. */

/* $NetBSD: sysv_msg.c,v 1.76 2019/10/04 23:20:22 kamil Exp $ */

/*-
* Copyright (c) 1999, 2006, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

/*
* Implementation of SVID messages
*
* Author: Daniel Boulet
*
* Copyright 1993 Daniel Boulet and RTMX Inc.
*
* This system call was implemented by Daniel Boulet under contract from RTMX.
*
* Redistribution and use in source forms, with and without modification,
* are permitted provided that this entire comment appears intact.
*
* Redistribution in binary form may occur without any restrictions.
* Obviously, it would be nice if you gave credit where credit is due
* but requiring it would be too onerous.
*
* This software is provided ``AS IS'' without any warranties of any kind.
*/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_msg.c,v 1.76 2019/10/04 23:20:22 kamil Exp $");

#ifdef _KERNEL_OPT
#include "opt_sysv.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/msg.h>
#include <sys/sysctl.h>
#include <sys/mount.h> /* XXX for <sys/syscallargs.h> */
#include <sys/syscallargs.h>
#include <sys/kauth.h>

/*
 * Debug tracing.  MSG_PRINTF() takes a fully parenthesised printf argument
 * list and expands to a printf call only when MSG_DEBUG_OK is defined.
 * MSG_DEBUG_OK is explicitly undefined below, so as written every
 * MSG_PRINTF() in this file expands to nothing.
 */
#define MSG_DEBUG
#undef MSG_DEBUG_OK

#ifdef MSG_DEBUG_OK
#define MSG_PRINTF(a) printf a
#else
#define MSG_PRINTF(a)
#endif

/*
 * Subsystem state.  msgpool, msgmaps, msghdrs and msqs all point into one
 * wired allocation made by msginit() (and replaced by msgrealloc()).
 */
static int nfree_msgmaps; /* # of free map entries */
static short free_msgmaps; /* head of linked list of free map entries; -1 terminates the list */
static struct __msg *free_msghdrs; /* list of free msg headers */
static char *msgpool; /* MSGMAX byte long msg buffer pool */
static struct msgmap *msgmaps; /* MSGSEG msgmap structures */
static struct __msg *msghdrs; /* MSGTQL msg headers */

kmsq_t *msqs; /* MSGMNI msqid_ds struct's */
kmutex_t msgmutex; /* subsystem lock */

static u_int msg_waiters = 0; /* number of msgsnd/msgrcv callers asleep on a queue condvar */
static bool msg_realloc_state; /* true while msgrealloc() is replacing the tables */
static kcondvar_t msg_realloc_cv; /* signalled when sleepers drain and when reallocation ends */

static void msg_freehdr(struct __msg *);

extern int kern_has_sysvmsg;

SYSCTL_SETUP_PROTO(sysctl_ipc_msg_setup);

/*
 * msginit: allocate and initialise all message-queue tables.
 *
 * Validates the configured limits in msginfo, makes one wired, zeroed
 * allocation that holds the message pool, the segment map, the message
 * headers and the queue descriptors, threads the free lists and
 * initialises the subsystem mutex and condition variables.
 *
 * Returns 0 on success, EINVAL if a limit in msginfo is unusable, or
 * ENOMEM if the wired memory cannot be allocated.
 */
int
msginit(void)
{
	int i, sz;
	vaddr_t v;

	/*
	 * msginfo.msgssz should be a power of two for efficiency reasons.
	 * Only the powers of two from 8 up to and including 1024 are
	 * accepted here.
	 */
	i = 8;
	while (i < 1024 && i != msginfo.msgssz)
		i <<= 1;
	if (i != msginfo.msgssz) {
		printf("msginfo.msgssz = %d, not a small power of 2\n",
		    msginfo.msgssz);
		return EINVAL;
	}

	/* Segment indices are stored in a short (see free_msgmaps). */
	if (msginfo.msgseg > 32767) {
		printf("msginfo.msgseg = %d > 32767\n", msginfo.msgseg);
		return EINVAL;
	}

	/*
	 * The free-list setup below terminates each list at index
	 * (count - 1); a count below 1 would write before the array.
	 */
	if (msginfo.msgseg < 1 || msginfo.msgtql < 1) {
		printf("sysv_msg: msgseg = %d, msgtql = %d, must be positive\n",
		    msginfo.msgseg, msginfo.msgtql);
		return EINVAL;
	}

	/* Allocate the wired memory for our structures */
	sz = ALIGN(msginfo.msgmax) +
	    ALIGN(msginfo.msgseg * sizeof(struct msgmap)) +
	    ALIGN(msginfo.msgtql * sizeof(struct __msg)) +
	    ALIGN(msginfo.msgmni * sizeof(kmsq_t));
	sz = round_page(sz);
	v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
	if (v == 0) {
		printf("sysv_msg: cannot allocate memory\n");
		return ENOMEM;
	}

	/* Carve the allocation up: pool, segment map, headers, queues. */
	msgpool = (void *)v;
	msgmaps = (void *)((uintptr_t)msgpool + ALIGN(msginfo.msgmax));
	msghdrs = (void *)((uintptr_t)msgmaps +
	    ALIGN(msginfo.msgseg * sizeof(struct msgmap)));
	msqs = (void *)((uintptr_t)msghdrs +
	    ALIGN(msginfo.msgtql * sizeof(struct __msg)));

	/* Chain every segment onto the free list; -1 ends the list. */
	for (i = 0; i < (msginfo.msgseg - 1); i++)
		msgmaps[i].next = i + 1;
	msgmaps[msginfo.msgseg - 1].next = -1;

	free_msgmaps = 0;
	nfree_msgmaps = msginfo.msgseg;

	/* Chain every message header onto the free list. */
	for (i = 0; i < (msginfo.msgtql - 1); i++) {
		msghdrs[i].msg_type = 0;
		msghdrs[i].msg_next = &msghdrs[i + 1];
	}
	i = msginfo.msgtql - 1;
	msghdrs[i].msg_type = 0;
	msghdrs[i].msg_next = NULL;
	free_msghdrs = &msghdrs[0];

	for (i = 0; i < msginfo.msgmni; i++) {
		cv_init(&msqs[i].msq_cv, "msgwait");
		/* Implies entry is available */
		msqs[i].msq_u.msg_qbytes = 0;
		/* Reset to a known value */
		msqs[i].msq_u.msg_perm._seq = 0;
	}

	mutex_init(&msgmutex, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&msg_realloc_cv, "msgrealc");
	msg_realloc_state = false;

	kern_has_sysvmsg = 1;

	return 0;
}

/*
 * msgfini: tear the message subsystem down.
 *
 * Returns 1 without touching anything if any queue descriptor is still
 * allocated (msg_qbytes != 0).  Otherwise destroys the per-queue condition
 * variables, frees the wired memory, destroys the subsystem condvar and
 * mutex, clears kern_has_sysvmsg and returns 0.
 */
int
msgfini(void)
{
	vaddr_t base = (vaddr_t)msgpool;
	int slot, nbytes;

	mutex_enter(&msgmutex);

	/* Any queue still in use vetoes the shutdown. */
	slot = 0;
	while (slot < msginfo.msgmni) {
		if (msqs[slot].msq_u.msg_qbytes != 0) {
			mutex_exit(&msgmutex);
			return 1; /* queue not available, prevent unload! */
		}
		slot++;
	}

	/* Nothing is in use: release the per-queue condvars ... */
	slot = 0;
	while (slot < msginfo.msgmni) {
		cv_destroy(&msqs[slot].msq_cv);
		slot++;
	}

	/* ... and the single allocation that backs every table. */
	nbytes = ALIGN(msginfo.msgmax) +
	    ALIGN(msginfo.msgseg * sizeof(struct msgmap)) +
	    ALIGN(msginfo.msgtql * sizeof(struct __msg)) +
	    ALIGN(msginfo.msgmni * sizeof(kmsq_t));
	nbytes = round_page(nbytes);
	uvm_km_free(kernel_map, base, nbytes, UVM_KMF_WIRED);

	cv_destroy(&msg_realloc_cv);
	mutex_exit(&msgmutex);
	mutex_destroy(&msgmutex);

	kern_has_sysvmsg = 0;

	return 0;
}

static int
msgrealloc(int newmsgmni, int newmsgseg)
{
struct msgmap *new_msgmaps;
struct __msg *new_msghdrs, *new_free_msghdrs;
char *old_msgpool, *new_msgpool;
kmsq_t *new_msqs;
vaddr_t v;
int i, sz, msqid, newmsgmax, new_nfree_msgmaps;
short new_free_msgmaps;

if (newmsgmni < 1 || newmsgseg < 1)
return EINVAL;

/* Allocate the wired memory for our structures */
newmsgmax = msginfo.msgssz * newmsgseg;
sz = ALIGN(newmsgmax) +
ALIGN(newmsgseg * sizeof(struct msgmap)) +
ALIGN(msginfo.msgtql * sizeof(struct __msg)) +
ALIGN(newmsgmni * sizeof(kmsq_t));
sz = round_page(sz);
v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
if (v == 0)
return ENOMEM;

mutex_enter(&msgmutex);
if (msg_realloc_state) {
mutex_exit(&msgmutex);
uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED);
return EBUSY;
}
msg_realloc_state = true;
if (msg_waiters) {
/*
* Mark reallocation state, wake-up all waiters,
* and wait while they will all exit.
*/
for (i = 0; i < msginfo.msgmni; i++)
cv_broadcast(&msqs[i].msq_cv);
while (msg_waiters)
cv_wait(&msg_realloc_cv, &msgmutex);
}
old_msgpool = msgpool;

/* We cannot reallocate less memory than we use */
i = 0;
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
struct msqid_ds *mptr;
kmsq_t *msq;

msq = &msqs[msqid];
mptr = &msq->msq_u;
if (mptr->msg_qbytes || (mptr->msg_perm.mode & MSG_LOCKED))
i = msqid;
}
if (i >= newmsgmni || (msginfo.msgseg - nfree_msgmaps) > newmsgseg) {
mutex_exit(&msgmutex);
uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED);
return EBUSY;
}

new_msgpool = (void *)v;
new_msgmaps = (void *)((uintptr_t)new_msgpool + ALIGN(newmsgmax));
new_msghdrs = (void *)((uintptr_t)new_msgmaps +
ALIGN(newmsgseg * sizeof(struct msgmap)));
new_msqs = (void *)((uintptr_t)new_msghdrs +
ALIGN(msginfo.msgtql * sizeof(struct __msg)));

/* Initialize the structures */
for (i = 0; i < (newmsgseg - 1); i++)
new_msgmaps[i].next = i + 1;
new_msgmaps[newmsgseg - 1].next = -1;
new_free_msgmaps = 0;
new_nfree_msgmaps = newmsgseg;

for (i = 0; i < (msginfo.msgtql - 1); i++) {
new_msghdrs[i].msg_type = 0;
new_msghdrs[i].msg_next = &new_msghdrs[i + 1];
}
i = msginfo.msgtql - 1;
new_msghdrs[i].msg_type = 0;
new_msghdrs[i].msg_next = NULL;
new_free_msghdrs = &new_msghdrs[0];

for (i = 0; i < newmsgmni; i++) {
new_msqs[i].msq_u.msg_qbytes = 0;
new_msqs[i].msq_u.msg_perm._seq = 0;
cv_init(&new_msqs[i].msq_cv, "msgwait");
}

/*
* Copy all message queue identifiers, message headers and buffer
* pools to the new memory location.
*/
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
struct __msg *nmsghdr, *msghdr, *pmsghdr;
struct msqid_ds *nmptr, *mptr;
kmsq_t *nmsq, *msq;

msq = &msqs[msqid];
mptr = &msq->msq_u;

if (mptr->msg_qbytes == 0 &&
(mptr->msg_perm.mode & MSG_LOCKED) == 0)
continue;

nmsq = &new_msqs[msqid];
nmptr = &nmsq->msq_u;
memcpy(nmptr, mptr, sizeof(struct msqid_ds));

/*
* Go through the message headers, and copy each one
* by taking the new ones, and thus defragmenting.
*/
nmsghdr = pmsghdr = NULL;
msghdr = mptr->_msg_first;
while (msghdr) {
short nnext = 0, next;
u_short msgsz, segcnt;

/* Take an entry from the new list of free msghdrs */
nmsghdr = new_free_msghdrs;
KASSERT(nmsghdr != NULL);
new_free_msghdrs = nmsghdr->msg_next;

nmsghdr->msg_next = NULL;
if (pmsghdr) {
pmsghdr->msg_next = nmsghdr;
} else {
nmptr->_msg_first = nmsghdr;
pmsghdr = nmsghdr;
}
nmsghdr->msg_ts = msghdr->msg_ts;
nmsghdr->msg_spot = -1;

/* Compute the amount of segments and reserve them */
msgsz = msghdr->msg_ts;
segcnt = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
if (segcnt == 0)
continue;
while (segcnt--) {
nnext = new_free_msgmaps;
new_free_msgmaps = new_msgmaps[nnext].next;
new_nfree_msgmaps--;
new_msgmaps[nnext].next = nmsghdr->msg_spot;
nmsghdr->msg_spot = nnext;
}

/* Copy all segments */
KASSERT(nnext == nmsghdr->msg_spot);
next = msghdr->msg_spot;
while (msgsz > 0) {
size_t tlen;

if (msgsz >= msginfo.msgssz) {
tlen = msginfo.msgssz;
msgsz -= msginfo.msgssz;
} else {
tlen = msgsz;
msgsz = 0;
}

/* Copy the message buffer */
memcpy(&new_msgpool[nnext * msginfo.msgssz],
&msgpool[next * msginfo.msgssz], tlen);

/* Next entry of the map */
nnext = msgmaps[nnext].next;
next = msgmaps[next].next;
}

/* Next message header */
msghdr = msghdr->msg_next;
}
nmptr->_msg_last = nmsghdr;
}
KASSERT((msginfo.msgseg - nfree_msgmaps) ==
(newmsgseg - new_nfree_msgmaps));

sz = ALIGN(msginfo.msgmax) +
ALIGN(msginfo.msgseg * sizeof(struct msgmap)) +
ALIGN(msginfo.msgtql * sizeof(struct __msg)) +
ALIGN(msginfo.msgmni * sizeof(kmsq_t));
sz = round_page(sz);

for (i = 0; i < msginfo.msgmni; i++)
cv_destroy(&msqs[i].msq_cv);

/* Set the pointers and update the new values */
msgpool = new_msgpool;
msgmaps = new_msgmaps;
msghdrs = new_msghdrs;
msqs = new_msqs;

free_msghdrs = new_free_msghdrs;
free_msgmaps = new_free_msgmaps;
nfree_msgmaps = new_nfree_msgmaps;
msginfo.msgmni = newmsgmni;
msginfo.msgseg = newmsgseg;
msginfo.msgmax = newmsgmax;

/* Reallocation completed - notify all waiters, if any */
msg_realloc_state = false;
cv_broadcast(&msg_realloc_cv);
mutex_exit(&msgmutex);

uvm_km_free(kernel_map, (vaddr_t)old_msgpool, sz, UVM_KMF_WIRED);
return 0;
}

/*
 * msg_freehdr: release a message header and every segment it still owns.
 *
 * Pops segments off the header's chain one at a time, pushing each onto
 * the free segment list, until msg_ts has been counted down to zero; then
 * pushes the header itself onto the free header list.
 * The caller must hold msgmutex.
 */
static void
msg_freehdr(struct __msg *msghdr)
{

	KASSERT(mutex_owned(&msgmutex));

	while (msghdr->msg_ts > 0) {
		const short seg = msghdr->msg_spot;

		KASSERT(seg >= 0);
		KASSERT(seg < msginfo.msgseg);

		/* Unlink the head segment and give it back to the free list. */
		msghdr->msg_spot = msgmaps[seg].next;
		msgmaps[seg].next = free_msgmaps;
		free_msgmaps = seg;
		nfree_msgmaps++;

		/* One segment's worth of payload is now accounted for. */
		if (msghdr->msg_ts < msginfo.msgssz)
			msghdr->msg_ts = 0;
		else
			msghdr->msg_ts -= msginfo.msgssz;
	}
	KASSERT(msghdr->msg_spot == -1);

	msghdr->msg_next = free_msghdrs;
	free_msghdrs = msghdr;
}

/*
 * sys___msgctl50: msgctl system call entry point.
 *
 * Copies the user's msqid_ds in for IPC_SET, hands the request to
 * msgctl1(), and copies the result back out for a successful IPC_STAT.
 * Returns 0 or an errno value.
 */
int
sys___msgctl50(struct lwp *l, const struct sys___msgctl50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) msqid;
		syscallarg(int) cmd;
		syscallarg(struct msqid_ds *) buf;
	} */
	struct msqid_ds msqbuf;
	struct msqid_ds *kbuf;
	int cmd, error;

	cmd = SCARG(uap, cmd);

	if (cmd == IPC_SET) {
		error = copyin(SCARG(uap, buf), &msqbuf, sizeof(msqbuf));
		if (error)
			return (error);
	}

	/* Only IPC_SET and IPC_STAT exchange a msqid_ds with msgctl1(). */
	if (cmd == IPC_SET || cmd == IPC_STAT)
		kbuf = &msqbuf;
	else
		kbuf = NULL;

	error = msgctl1(l, SCARG(uap, msqid), cmd, kbuf);
	if (error != 0 || cmd != IPC_STAT)
		return (error);

	error = copyout(&msqbuf, SCARG(uap, buf), sizeof(msqbuf));
	return (error);
}

int
msgctl1(struct lwp *l, int msqid, int cmd, struct msqid_ds *msqbuf)
{
kauth_cred_t cred = l->l_cred;
struct msqid_ds *msqptr;
kmsq_t *msq;
int error = 0, ix;

MSG_PRINTF(("call to msgctl1(%d, %d)\n", msqid, cmd));

ix = IPCID_TO_IX(msqid);

mutex_enter(&msgmutex);

if (ix < 0 || ix >= msginfo.msgmni) {
MSG_PRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", ix,
msginfo.msgmni));
error = EINVAL;
goto unlock;
}

msq = &msqs[ix];
msqptr = &msq->msq_u;

if (msqptr->msg_qbytes == 0) {
MSG_PRINTF(("no such msqid\n"));
error = EINVAL;
goto unlock;
}
if (msqptr->msg_perm._seq != IPCID_TO_SEQ(msqid)) {
MSG_PRINTF(("wrong sequence number\n"));
error = EINVAL;
goto unlock;
}

switch (cmd) {
case IPC_RMID:
{
struct __msg *msghdr;
if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_M)) != 0)
break;
/* Free the message headers */
msghdr = msqptr->_msg_first;
while (msghdr != NULL) {
struct __msg *msghdr_tmp;

/* Free the segments of each message */
msqptr->_msg_cbytes -= msghdr->msg_ts;
msqptr->msg_qnum--;
msghdr_tmp = msghdr;
msghdr = msghdr->msg_next;
msg_freehdr(msghdr_tmp);
}
KASSERT(msqptr->_msg_cbytes == 0);
KASSERT(msqptr->msg_qnum == 0);

/* Mark it as free */
msqptr->msg_qbytes = 0;
cv_broadcast(&msq->msq_cv);
}
break;

case IPC_SET:
if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_M)))
break;
if (msqbuf->msg_qbytes > msqptr->msg_qbytes &&
kauth_authorize_system(cred, KAUTH_SYSTEM_SYSVIPC,
KAUTH_REQ_SYSTEM_SYSVIPC_MSGQ_OVERSIZE,
KAUTH_ARG(msqbuf->msg_qbytes),
KAUTH_ARG(msqptr->msg_qbytes), NULL) != 0) {
error = EPERM;
break;
}
if (msqbuf->msg_qbytes > msginfo.msgmnb) {
MSG_PRINTF(("can't increase msg_qbytes beyond %d "
"(truncating)\n", msginfo.msgmnb));
/* silently restrict qbytes to system limit */
msqbuf->msg_qbytes = msginfo.msgmnb;
}
if (msqbuf->msg_qbytes == 0) {
MSG_PRINTF(("can't reduce msg_qbytes to 0\n"));
error = EINVAL; /* XXX non-standard errno! */
break;
}
msqptr->msg_perm.uid = msqbuf->msg_perm.uid;
msqptr->msg_perm.gid = msqbuf->msg_perm.gid;
msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) |
(msqbuf->msg_perm.mode & 0777);
msqptr->msg_qbytes = msqbuf->msg_qbytes;
msqptr->msg_ctime = time_second;
break;

case IPC_STAT:
if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_R))) {
MSG_PRINTF(("requester doesn't have read access\n"));
break;
}
memset(msqbuf, 0, sizeof *msqbuf);
msqbuf->msg_perm = msqptr->msg_perm;
msqbuf->msg_perm.mode &= 0777;
msqbuf->msg_qnum = msqptr->msg_qnum;
msqbuf->msg_qbytes = msqptr->msg_qbytes;
msqbuf->msg_lspid = msqptr->msg_lspid;
msqbuf->msg_lrpid = msqptr->msg_lrpid;
msqbuf->msg_stime = msqptr->msg_stime;
msqbuf->msg_rtime = msqptr->msg_rtime;
msqbuf->msg_ctime = msqptr->msg_ctime;
break;

default:
MSG_PRINTF(("invalid command %d\n", cmd));
error = EINVAL;
break;
}

unlock:
mutex_exit(&msgmutex);
return (error);
}

/*
 * sys_msgget: msgget system call.
 *
 * For a key other than IPC_PRIVATE, looks for an allocated queue with
 * that key and, if found, checks IPC_CREAT|IPC_EXCL and the requested
 * access bits.  Otherwise, when the key is IPC_PRIVATE or IPC_CREAT is
 * set, claims the first descriptor that is neither allocated nor locked
 * and initialises it.  On success *retval receives the IPC identifier.
 *
 * Returns 0, EEXIST, ENOSPC, ENOENT, or the error from ipcperm().
 */
int
sys_msgget(struct lwp *l, const struct sys_msgget_args *uap, register_t *retval)
{
	/* {
		syscallarg(key_t) key;
		syscallarg(int) msgflg;
	} */
	kauth_cred_t cred = l->l_cred;
	struct msqid_ds *qds = NULL;
	int key = SCARG(uap, key);
	int msgflg = SCARG(uap, msgflg);
	int slot, error = 0;

	mutex_enter(&msgmutex);

	MSG_PRINTF(("msgget(0x%x, 0%o)\n", key, msgflg));

	if (key != IPC_PRIVATE) {
		/* Search the allocated descriptors for this key. */
		for (slot = 0; slot < msginfo.msgmni; slot++) {
			qds = &msqs[slot].msq_u;
			if (qds->msg_qbytes != 0 &&
			    qds->msg_perm._key == key)
				break;
		}
		if (slot < msginfo.msgmni) {
			MSG_PRINTF(("found public key\n"));
			if ((msgflg & IPC_CREAT) != 0 &&
			    (msgflg & IPC_EXCL) != 0) {
				MSG_PRINTF(("not exclusive\n"));
				error = EEXIST;
				goto out;
			}
			error = ipcperm(cred, &qds->msg_perm, msgflg & 0700);
			if (error != 0) {
				MSG_PRINTF(("requester doesn't have 0%o access\n",
				    msgflg & 0700));
				goto out;
			}
			goto found;
		}
	}

	MSG_PRINTF(("need to allocate the msqid_ds\n"));
	if (key != IPC_PRIVATE && (msgflg & IPC_CREAT) == 0) {
		MSG_PRINTF(("didn't find it and wasn't asked to create it\n"));
		error = ENOENT;
		goto out;
	}

	/*
	 * Look for an unallocated and unlocked msqid_ds.
	 * msqid_ds's can be locked by msgsnd or msgrcv while
	 * they are copying the message in/out.  We can't
	 * re-use the entry until they release it.
	 */
	slot = 0;
	while (slot < msginfo.msgmni) {
		qds = &msqs[slot].msq_u;
		if (qds->msg_qbytes == 0 &&
		    (qds->msg_perm.mode & MSG_LOCKED) == 0)
			break;
		slot++;
	}
	if (slot == msginfo.msgmni) {
		MSG_PRINTF(("no more msqid_ds's available\n"));
		error = ENOSPC;
		goto out;
	}

	MSG_PRINTF(("msqid %d is available\n", slot));

	/* Ownership and permissions come from the creator. */
	qds->msg_perm._key = key;
	qds->msg_perm.cuid = kauth_cred_geteuid(cred);
	qds->msg_perm.uid = kauth_cred_geteuid(cred);
	qds->msg_perm.cgid = kauth_cred_getegid(cred);
	qds->msg_perm.gid = kauth_cred_getegid(cred);
	qds->msg_perm.mode = (msgflg & 0777);
	/* Make sure that the returned msqid is unique */
	qds->msg_perm._seq++;

	/* Empty queue; a non-zero msg_qbytes marks it allocated. */
	qds->_msg_first = NULL;
	qds->_msg_last = NULL;
	qds->_msg_cbytes = 0;
	qds->msg_qnum = 0;
	qds->msg_qbytes = msginfo.msgmnb;
	qds->msg_lspid = 0;
	qds->msg_lrpid = 0;
	qds->msg_stime = 0;
	qds->msg_rtime = 0;
	qds->msg_ctime = time_second;

found:
	/* Construct the unique msqid */
	*retval = IXSEQ_TO_IPCID(slot, qds->msg_perm);

out:
	mutex_exit(&msgmutex);
	return (error);
}

/*
 * sys_msgsnd: msgsnd system call entry point.
 *
 * Unpacks the syscall arguments and calls msgsnd1() with a message type
 * of sizeof(long) bytes that is fetched with copyin().
 */
int
sys_msgsnd(struct lwp *l, const struct sys_msgsnd_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) msqid;
		syscallarg(const void *) msgp;
		syscallarg(size_t) msgsz;
		syscallarg(int) msgflg;
	} */
	int msqid = SCARG(uap, msqid);
	const char *msgp = SCARG(uap, msgp);
	size_t msgsz = SCARG(uap, msgsz);
	int msgflg = SCARG(uap, msgflg);

	return msgsnd1(l, msqid, msgp, msgsz, msgflg, sizeof(long), copyin);
}

/*
 * msgsnd1: append one message to a queue.
 *
 * l          - calling lwp (credentials and pid are taken from it).
 * msqidr     - IPC identifier of the queue.
 * user_msgp  - user buffer: typesz bytes of message type followed by
 *              msgsz bytes of message body.
 * msgsz      - body length in bytes.
 * msgflg     - IPC_NOWAIT makes the call fail with EAGAIN instead of
 *              sleeping when resources are short.
 * typesz     - size of the message type as laid out in the user buffer.
 * fetch_type - routine used to copy the message type in.
 *
 * The queue is flagged MSG_LOCKED while msgmutex is dropped around the
 * copy-in calls.  If a reallocation starts while this call is asleep,
 * the call is restarted from the top once the reallocation is over.
 *
 * Returns 0 or an errno value (EINVAL, EAGAIN, EINTR, EIDRM, or the
 * error from ipcperm() or the copy-in routines).
 */
int
msgsnd1(struct lwp *l, int msqidr, const char *user_msgp, size_t msgsz,
    int msgflg, size_t typesz, copyin_t fetch_type)
{
	int segs_needed, error = 0, msqid;
	kauth_cred_t cred = l->l_cred;
	struct msqid_ds *msqptr;
	struct __msg *msghdr;
	kmsq_t *msq;
	short next;

	MSG_PRINTF(("call to msgsnd(%d, %p, %lld, %d)\n", msqidr,
	    user_msgp, (long long)msgsz, msgflg));

	if ((ssize_t)msgsz < 0)
		return EINVAL;

restart:
	msqid = IPCID_TO_IX(msqidr);

	mutex_enter(&msgmutex);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(msg_realloc_state))
		cv_wait(&msg_realloc_cv, &msgmutex);

	if (msqid < 0 || msqid >= msginfo.msgmni) {
		MSG_PRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
		    msginfo.msgmni));
		error = EINVAL;
		goto unlock;
	}

	msq = &msqs[msqid];
	msqptr = &msq->msq_u;

	if (msqptr->msg_qbytes == 0) {
		MSG_PRINTF(("no such message queue id\n"));
		error = EINVAL;
		goto unlock;
	}
	if (msqptr->msg_perm._seq != IPCID_TO_SEQ(msqidr)) {
		MSG_PRINTF(("wrong sequence number\n"));
		error = EINVAL;
		goto unlock;
	}

	if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_W))) {
		MSG_PRINTF(("requester doesn't have write access\n"));
		goto unlock;
	}

	segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
	MSG_PRINTF(("msgsz=%lld, msgssz=%d, segs_needed=%d\n",
	    (long long)msgsz, msginfo.msgssz, segs_needed));
	for (;;) {
		int need_more_resources = 0;

		/*
		 * check msgsz [cannot be negative since it is unsigned]
		 * (inside this loop in case msg_qbytes changes while we sleep)
		 */

		if (msgsz > msqptr->msg_qbytes) {
			MSG_PRINTF(("msgsz > msqptr->msg_qbytes\n"));
			error = EINVAL;
			goto unlock;
		}

		if (msqptr->msg_perm.mode & MSG_LOCKED) {
			MSG_PRINTF(("msqid is locked\n"));
			need_more_resources = 1;
		}
		if (msgsz + msqptr->_msg_cbytes > msqptr->msg_qbytes) {
			MSG_PRINTF(("msgsz + msg_cbytes > msg_qbytes\n"));
			need_more_resources = 1;
		}
		if (segs_needed > nfree_msgmaps) {
			MSG_PRINTF(("segs_needed > nfree_msgmaps\n"));
			need_more_resources = 1;
		}
		if (free_msghdrs == NULL) {
			MSG_PRINTF(("no more msghdrs\n"));
			need_more_resources = 1;
		}

		if (need_more_resources) {
			int we_own_it;

			if ((msgflg & IPC_NOWAIT) != 0) {
				MSG_PRINTF(("need more resources but caller "
				    "doesn't want to wait\n"));
				error = EAGAIN;
				goto unlock;
			}

			if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) {
				MSG_PRINTF(("we don't own the msqid_ds\n"));
				we_own_it = 0;
			} else {
				/* Force later arrivals to wait for our
				   request */
				MSG_PRINTF(("we own the msqid_ds\n"));
				msqptr->msg_perm.mode |= MSG_LOCKED;
				we_own_it = 1;
			}

			msg_waiters++;
			MSG_PRINTF(("goodnight\n"));
			error = cv_wait_sig(&msq->msq_cv, &msgmutex);
			MSG_PRINTF(("good morning, error=%d\n", error));
			msg_waiters--;

			if (we_own_it)
				msqptr->msg_perm.mode &= ~MSG_LOCKED;

			/*
			 * In case of such state, notify reallocator and
			 * restart the call.
			 */
			if (msg_realloc_state) {
				cv_broadcast(&msg_realloc_cv);
				mutex_exit(&msgmutex);
				goto restart;
			}

			if (error != 0) {
				MSG_PRINTF(("msgsnd: interrupted system "
				    "call\n"));
				error = EINTR;
				goto unlock;
			}

			/*
			 * Make sure that the msq queue still exists.
			 * The sequence number is rechecked as well (as
			 * msgrcv1() does): a sleeper that did not own the
			 * lock may wake after the slot has been removed
			 * and handed out again by msgget.
			 */

			if (msqptr->msg_qbytes == 0 ||
			    msqptr->msg_perm._seq != IPCID_TO_SEQ(msqidr)) {
				MSG_PRINTF(("msqid deleted\n"));
				error = EIDRM;
				goto unlock;
			}
		} else {
			MSG_PRINTF(("got all the resources that we need\n"));
			break;
		}
	}

	/*
	 * We have the resources that we need.
	 * Make sure!
	 */

	KASSERT((msqptr->msg_perm.mode & MSG_LOCKED) == 0);
	KASSERT(segs_needed <= nfree_msgmaps);
	KASSERT(msgsz + msqptr->_msg_cbytes <= msqptr->msg_qbytes);
	KASSERT(free_msghdrs != NULL);

	/*
	 * Re-lock the msqid_ds in case we page-fault when copying in the
	 * message
	 */

	KASSERT((msqptr->msg_perm.mode & MSG_LOCKED) == 0);
	msqptr->msg_perm.mode |= MSG_LOCKED;

	/*
	 * Allocate a message header
	 */

	msghdr = free_msghdrs;
	free_msghdrs = msghdr->msg_next;
	msghdr->msg_spot = -1;
	msghdr->msg_ts = msgsz;

	/*
	 * Allocate space for the message.  Each segment is pushed onto
	 * the front of the header's chain, which ends in -1.
	 */

	while (segs_needed > 0) {
		KASSERT(nfree_msgmaps > 0);
		KASSERT(free_msgmaps != -1);
		KASSERT(free_msgmaps < msginfo.msgseg);

		next = free_msgmaps;
		MSG_PRINTF(("allocating segment %d to message\n", next));
		free_msgmaps = msgmaps[next].next;
		nfree_msgmaps--;
		msgmaps[next].next = msghdr->msg_spot;
		msghdr->msg_spot = next;
		segs_needed--;
	}

	/*
	 * Copy in the message type.  msgmutex is dropped around the copy;
	 * MSG_LOCKED keeps other senders and msgget off this descriptor.
	 */
	mutex_exit(&msgmutex);
	error = (*fetch_type)(user_msgp, &msghdr->msg_type, typesz);
	mutex_enter(&msgmutex);
	if (error != 0) {
		MSG_PRINTF(("error %d copying the message type\n", error));
		msg_freehdr(msghdr);
		msqptr->msg_perm.mode &= ~MSG_LOCKED;
		cv_broadcast(&msq->msq_cv);
		goto unlock;
	}
	user_msgp += typesz;

	/*
	 * Validate the message type
	 */

	if (msghdr->msg_type < 1) {
		msg_freehdr(msghdr);
		msqptr->msg_perm.mode &= ~MSG_LOCKED;
		cv_broadcast(&msq->msq_cv);
		MSG_PRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
		error = EINVAL;
		goto unlock;
	}

	/*
	 * Copy in the message body, one segment at a time.
	 */

	next = msghdr->msg_spot;
	while (msgsz > 0) {
		size_t tlen;
		KASSERT(next > -1);
		KASSERT(next < msginfo.msgseg);

		if (msgsz > msginfo.msgssz)
			tlen = msginfo.msgssz;
		else
			tlen = msgsz;
		mutex_exit(&msgmutex);
		error = copyin(user_msgp, &msgpool[next * msginfo.msgssz], tlen);
		mutex_enter(&msgmutex);
		if (error != 0) {
			MSG_PRINTF(("error %d copying in message segment\n",
			    error));
			msg_freehdr(msghdr);
			msqptr->msg_perm.mode &= ~MSG_LOCKED;
			cv_broadcast(&msq->msq_cv);
			goto unlock;
		}
		msgsz -= tlen;
		user_msgp += tlen;
		next = msgmaps[next].next;
	}
	KASSERT(next == -1);

	/*
	 * We've got the message.  Unlock the msqid_ds.
	 */

	msqptr->msg_perm.mode &= ~MSG_LOCKED;

	/*
	 * Make sure that the msqid_ds is still allocated.
	 */

	if (msqptr->msg_qbytes == 0) {
		msg_freehdr(msghdr);
		cv_broadcast(&msq->msq_cv);
		error = EIDRM;
		goto unlock;
	}

	/*
	 * Put the message into the queue
	 */

	if (msqptr->_msg_first == NULL) {
		msqptr->_msg_first = msghdr;
		msqptr->_msg_last = msghdr;
	} else {
		msqptr->_msg_last->msg_next = msghdr;
		msqptr->_msg_last = msghdr;
	}
	msqptr->_msg_last->msg_next = NULL;

	msqptr->_msg_cbytes += msghdr->msg_ts;
	msqptr->msg_qnum++;
	msqptr->msg_lspid = l->l_proc->p_pid;
	msqptr->msg_stime = time_second;

	/* Wake receivers, and senders that were waiting on the lock. */
	cv_broadcast(&msq->msq_cv);

unlock:
	mutex_exit(&msgmutex);
	return error;
}

/*
 * sys_msgrcv: msgrcv system call entry point.
 *
 * Unpacks the syscall arguments and calls msgrcv1() with a message type
 * of sizeof(long) bytes that is stored with copyout().
 */
int
sys_msgrcv(struct lwp *l, const struct sys_msgrcv_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) msqid;
		syscallarg(void *) msgp;
		syscallarg(size_t) msgsz;
		syscallarg(long) msgtyp;
		syscallarg(int) msgflg;
	} */
	int msqid = SCARG(uap, msqid);
	char *msgp = SCARG(uap, msgp);
	size_t msgsz = SCARG(uap, msgsz);
	long msgtyp = SCARG(uap, msgtyp);
	int msgflg = SCARG(uap, msgflg);

	return msgrcv1(l, msqid, msgp, msgsz, msgtyp, msgflg,
	    sizeof(long), copyout, retval);
}

/*
 * msgrcv1: remove one message from a queue and copy it to the user.
 *
 * l          - calling lwp (credentials and pid are taken from it).
 * msqidr     - IPC identifier of the queue.
 * user_msgp  - user buffer: receives typesz bytes of message type
 *              followed by the message body.
 * msgsz      - capacity of the body part of the user buffer.
 * msgtyp     - 0 selects the first message; otherwise see the matching
 *              loop below.
 * msgflg     - IPC_NOWAIT returns ENOMSG instead of sleeping;
 *              MSG_NOERROR truncates an oversized message instead of
 *              failing with E2BIG.
 * typesz     - size of the message type as laid out in the user buffer.
 * put_type   - routine used to copy the message type out.
 * retval     - on success receives the number of body bytes copied out.
 *
 * The selected message is unlinked from the queue before msgmutex is
 * dropped for the copy-out calls; if a copy-out fails the message is
 * freed, not put back.  If a reallocation starts while this call is
 * asleep, the call is restarted once the reallocation is over.
 *
 * Returns 0 or an errno value (EINVAL, E2BIG, ENOMSG, EINTR, EIDRM, or
 * the error from ipcperm() or the copy-out routines).
 */
int
msgrcv1(struct lwp *l, int msqidr, char *user_msgp, size_t msgsz, long msgtyp,
    int msgflg, size_t typesz, copyout_t put_type, register_t *retval)
{
	size_t len;
	kauth_cred_t cred = l->l_cred;
	struct msqid_ds *msqptr;
	struct __msg *msghdr;
	int error = 0, msqid;
	kmsq_t *msq;
	short next;

	MSG_PRINTF(("call to msgrcv(%d, %p, %lld, %ld, %d)\n", msqidr,
	    user_msgp, (long long)msgsz, msgtyp, msgflg));

	if ((ssize_t)msgsz < 0)
		return EINVAL;

restart:
	msqid = IPCID_TO_IX(msqidr);

	mutex_enter(&msgmutex);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(msg_realloc_state))
		cv_wait(&msg_realloc_cv, &msgmutex);

	if (msqid < 0 || msqid >= msginfo.msgmni) {
		MSG_PRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
		    msginfo.msgmni));
		error = EINVAL;
		goto unlock;
	}

	msq = &msqs[msqid];
	msqptr = &msq->msq_u;

	if (msqptr->msg_qbytes == 0) {
		MSG_PRINTF(("no such message queue id\n"));
		error = EINVAL;
		goto unlock;
	}
	if (msqptr->msg_perm._seq != IPCID_TO_SEQ(msqidr)) {
		MSG_PRINTF(("wrong sequence number\n"));
		error = EINVAL;
		goto unlock;
	}

	if ((error = ipcperm(cred, &msqptr->msg_perm, IPC_R))) {
		MSG_PRINTF(("requester doesn't have read access\n"));
		goto unlock;
	}

	msghdr = NULL;
	while (msghdr == NULL) {
		if (msgtyp == 0) {
			/* Any type will do: take the head of the queue. */
			msghdr = msqptr->_msg_first;
			if (msghdr != NULL) {
				if (msgsz < msghdr->msg_ts &&
				    (msgflg & MSG_NOERROR) == 0) {
					MSG_PRINTF(("first msg on the queue "
					    "is too big (want %lld, got %d)\n",
					    (long long)msgsz, msghdr->msg_ts));
					error = E2BIG;
					goto unlock;
				}
				if (msqptr->_msg_first == msqptr->_msg_last) {
					msqptr->_msg_first = NULL;
					msqptr->_msg_last = NULL;
				} else {
					msqptr->_msg_first = msghdr->msg_next;
					KASSERT(msqptr->_msg_first != NULL);
				}
			}
		} else {
			struct __msg *previous;
			struct __msg **prev;

			/*
			 * Walk the queue in order; prev points at the link
			 * that refers to the current message so it can be
			 * unlinked in place.
			 * NOTE(review): for a negative msgtyp this takes the
			 * first message in queue order whose type qualifies,
			 * not the one with the lowest type - confirm against
			 * the msgrcv specification whether that is intended.
			 */
			for (previous = NULL, prev = &msqptr->_msg_first;
			    (msghdr = *prev) != NULL;
			    previous = msghdr, prev = &msghdr->msg_next) {
				/*
				 * Is this message's type an exact match or is
				 * this message's type less than or equal to
				 * the absolute value of a negative msgtyp?
				 * Note that the second half of this test can
				 * NEVER be true if msgtyp is positive since
				 * msg_type is always positive!
				 * LONG_MIN is tested separately so that it is
				 * never negated.
				 */

				if (msgtyp != msghdr->msg_type &&
				    msgtyp != LONG_MIN &&
				    msghdr->msg_type > -msgtyp)
					continue;

				MSG_PRINTF(("found message type %ld, requested %ld\n",
				    msghdr->msg_type, msgtyp));
				if (msgsz < msghdr->msg_ts &&
				    (msgflg & MSG_NOERROR) == 0) {
					MSG_PRINTF(("requested message on the queue "
					    "is too big (want %lld, got %d)\n",
					    (long long)msgsz, msghdr->msg_ts));
					error = E2BIG;
					goto unlock;
				}
				*prev = msghdr->msg_next;
				/* Only the tail needs _msg_last fixed up. */
				if (msghdr != msqptr->_msg_last)
					break;
				if (previous == NULL) {
					KASSERT(prev == &msqptr->_msg_first);
					msqptr->_msg_first = NULL;
					msqptr->_msg_last = NULL;
				} else {
					KASSERT(prev != &msqptr->_msg_first);
					msqptr->_msg_last = previous;
				}
				break;
			}
		}

		/*
		 * We've either extracted the msghdr for the appropriate
		 * message or there isn't one.
		 * If there is one then bail out of this loop.
		 */
		if (msghdr != NULL)
			break;

		/*
		 * Hmph!  No message found.  Does the user want to wait?
		 */

		if ((msgflg & IPC_NOWAIT) != 0) {
			MSG_PRINTF(("no appropriate message found (msgtyp=%ld)\n",
			    msgtyp));
			error = ENOMSG;
			goto unlock;
		}

		/*
		 * Wait for something to happen
		 */

		msg_waiters++;
		MSG_PRINTF(("msgrcv: goodnight\n"));
		error = cv_wait_sig(&msq->msq_cv, &msgmutex);
		MSG_PRINTF(("msgrcv: good morning (error=%d)\n", error));
		msg_waiters--;

		/*
		 * In case of such state, notify reallocator and
		 * restart the call.
		 */
		if (msg_realloc_state) {
			cv_broadcast(&msg_realloc_cv);
			mutex_exit(&msgmutex);
			goto restart;
		}

		if (error != 0) {
			MSG_PRINTF(("msgrcv: interrupted system call\n"));
			error = EINTR;
			goto unlock;
		}

		/*
		 * Make sure that the msq queue still exists
		 */

		if (msqptr->msg_qbytes == 0 ||
		    msqptr->msg_perm._seq != IPCID_TO_SEQ(msqidr)) {
			MSG_PRINTF(("msqid deleted\n"));
			error = EIDRM;
			goto unlock;
		}
	}

	/*
	 * Return the message to the user.
	 *
	 * First, do the bookkeeping (before we risk being interrupted).
	 */

	msqptr->_msg_cbytes -= msghdr->msg_ts;
	msqptr->msg_qnum--;
	msqptr->msg_lrpid = l->l_proc->p_pid;
	msqptr->msg_rtime = time_second;

	/*
	 * Make msgsz the actual amount that we'll be returning.
	 * Note that this effectively truncates the message if it is too long
	 * (since msgsz is never increased).
	 */

	MSG_PRINTF(("found a message, msgsz=%lld, msg_ts=%d\n",
	    (long long)msgsz, msghdr->msg_ts));
	if (msgsz > msghdr->msg_ts)
		msgsz = msghdr->msg_ts;

	/*
	 * Return the type to the user.  msgmutex is dropped around the
	 * copy; the message is already off the queue at this point.
	 */
	mutex_exit(&msgmutex);
	error = (*put_type)(&msghdr->msg_type, user_msgp, typesz);
	mutex_enter(&msgmutex);
	if (error != 0) {
		MSG_PRINTF(("error (%d) copying out message type\n", error));
		msg_freehdr(msghdr);
		cv_broadcast(&msq->msq_cv);
		goto unlock;
	}
	user_msgp += typesz;

	/*
	 * Return the segments to the user
	 */

	next = msghdr->msg_spot;
	for (len = 0; len < msgsz; len += msginfo.msgssz) {
		size_t tlen;
		KASSERT(next > -1);
		KASSERT(next < msginfo.msgseg);

		if (msgsz - len > msginfo.msgssz)
			tlen = msginfo.msgssz;
		else
			tlen = msgsz - len;
		mutex_exit(&msgmutex);
		error = copyout(&msgpool[next * msginfo.msgssz],
		    user_msgp, tlen);
		mutex_enter(&msgmutex);
		if (error != 0) {
			MSG_PRINTF(("error (%d) copying out message segment\n",
			    error));
			msg_freehdr(msghdr);
			cv_broadcast(&msq->msq_cv);
			goto unlock;
		}
		user_msgp += tlen;
		next = msgmaps[next].next;
	}

	/*
	 * Done, return the actual number of bytes copied out.
	 * Freeing the header releases segments and queue space, so wake
	 * anyone sleeping on this queue.
	 */

	msg_freehdr(msghdr);
	cv_broadcast(&msq->msq_cv);
	*retval = msgsz;

unlock:
	mutex_exit(&msgmutex);
	return error;
}

/*
* Sysctl initialization and nodes.
*/

/*
 * sysctl_ipc_msgmni: sysctl handler for the number of queue identifiers.
 *
 * Runs sysctl_lookup() against a local copy of the current value; when a
 * new value was supplied (newp != NULL), resizes the tables through
 * msgrealloc() with the sysctl lock released.  Returns 0 or an errno.
 */
static int
sysctl_ipc_msgmni(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int newsize, error;

	/* Point a private copy of the node at a scratch integer. */
	newsize = msginfo.msgmni;
	node = *rnode;
	node.sysctl_data = &newsize;

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error != 0)
		return error;
	if (newp == NULL)
		return error;

	sysctl_unlock();
	error = msgrealloc(newsize, msginfo.msgseg);
	sysctl_relock();

	return error;
}

/*
 * sysctl_ipc_msgseg: sysctl handler for the number of message segments.
 *
 * Runs sysctl_lookup() against a local copy of the current value; when a
 * new value was supplied (newp != NULL), resizes the tables through
 * msgrealloc() with the sysctl lock released.  Returns 0 or an errno.
 */
static int
sysctl_ipc_msgseg(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int newsize, error;

	/* Point a private copy of the node at a scratch integer. */
	newsize = msginfo.msgseg;
	node = *rnode;
	node.sysctl_data = &newsize;

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error != 0)
		return error;
	if (newp == NULL)
		return error;

	sysctl_unlock();
	error = msgrealloc(msginfo.msgmni, newsize);
	sysctl_relock();

	return error;
}

/*
 * sysctl_ipc_msg_setup: create the kern.ipc node and, beneath it, the
 * read-write integer nodes "msgmni" and "msgseg", which are served by
 * sysctl_ipc_msgmni() and sysctl_ipc_msgseg().  Nothing is created below
 * kern.ipc if that node could not be obtained.
 */
SYSCTL_SETUP(sysctl_ipc_msg_setup, "sysctl kern.ipc subtree setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "ipc",
	    SYSCTL_DESCR("SysV IPC options"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, KERN_SYSVIPC, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "msgmni",
	    SYSCTL_DESCR("Max number of message queue identifiers"),
	    sysctl_ipc_msgmni, 0, &msginfo.msgmni, 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "msgseg",
	    SYSCTL_DESCR("Max number of message segments"),
	    sysctl_ipc_msgseg, 0, &msginfo.msgseg, 0,
	    CTL_CREATE, CTL_EOL);
}