/*-
* Copyright (c) 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Theodore Preduta.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_inotify.c,v 1.8 2025/06/27 21:36:23 andvar Exp $");
/*
 * inotify(2).  This interface allows the user to get file system
 * events whose order, unlike with kqueue(2), is strictly preserved.
 * While nice, the API has enough gotchas that we don't want to add
 * native entry points for it.  They are:
*
* - Because data is returned via read(2), this API is prone to
* unaligned memory accesses. There is a note in the Linux man page
* that says the name field of struct linux_inotify_event *can* be
* used for alignment purposes. In practice, even Linux doesn't
* always do this, so for simplicity, we don't ever do this.
*/
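/*
 * For reference, userland consumes events as a packed byte stream of
 * variable-length records, roughly as in this illustrative sketch
 * (handle_event() is a stand-in, not a real function):
 *
 *	char buf[4096];
 *	ssize_t len = read(ifd, buf, sizeof(buf));
 *	char *p = buf;
 *	while (p < buf + len) {
 *		struct inotify_event *ie = (struct inotify_event *)p;
 *		handle_event(ie);
 *		p += sizeof(*ie) + ie->len;
 *	}
 *
 * Nothing forces sizeof(*ie) + ie->len to be a multiple of the
 * alignment of struct inotify_event, so the cast above is where the
 * unaligned accesses come from.
 */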
struct inotifyfd {
int ifd_kqfd; /* kqueue fd used by this inotify */
/* instance */
struct selinfo ifd_sel; /* for EVFILT_READ by epoll */
kmutex_t ifd_lock; /* lock for ifd_sel, ifd_wds and */
/* ifd_nwds */
struct inotify_dir_entries **ifd_wds;
/* keeps track of watch descriptors */
/* for directories: snapshot of the */
/* directory state */
/* for files: an inotify_dir_entries */
/* with ide_count == 0 */
	size_t ifd_nwds;	/* number of slots in ifd_wds, i.e. */
				/* one greater than the largest */
				/* watch descriptor it can store */
TAILQ_HEAD(, inotify_entry) ifd_qhead; /* queue of pending events */
size_t ifd_qcount; /* number of pending events */
kcondvar_t ifd_qcv; /* condvar for blocking reads */
kmutex_t ifd_qlock; /* lock for ifd_q* and interlock */
/* for ifd_qcv */
};
/*
* Register the custom kfilter for inotify.
*/
int
linux_inotify_init(void)
{
return kfilter_register(inotify_filtname, &inotify_filtops,
&inotify_filtid);
}
/*
* Unregister the custom kfilter for inotify.
*/
int
linux_inotify_fini(void)
{
return kfilter_unregister(inotify_filtname);
}
/*
* Copyin callback used by kevent. This copies already converted
* filters from kernel memory to the kevent internal kernel memory.
* Hence the memcpy instead of copyin.
*/
static int
inotify_kev_fetch_changes(void *ctx, const struct kevent *changelist,
struct kevent *changes, size_t index, int n)
{
memcpy(changes, changelist + index, n * sizeof(*changes));
return 0;
}
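/*
 * (This is installed as keo_fetch_changes in the struct kevent_ops
 * used when registering a watch; see the k_ops initializer in
 * linux_sys_inotify_add_watch() below.)
 */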
/*
* Initialize a new inotify fd.
*/
static int
do_inotify_init(struct lwp *l, register_t *retval, int flags)
{
file_t *fp;
int error, fd;
struct proc *p = l->l_proc;
struct inotifyfd *ifd;
struct sys_kqueue1_args kqa;
#ifndef __aarch64__
/*
* inotify_init(2). Initialize a new inotify fd with flags=0.
*/
int
linux_sys_inotify_init(struct lwp *l, const void *v, register_t *retval)
{
return do_inotify_init(l, retval, 0);
}
#endif
/*
 * inotify_init1(2).  Initialize a new inotify fd with the given flags.
*/
int
linux_sys_inotify_init1(struct lwp *l,
const struct linux_sys_inotify_init1_args *uap, register_t *retval)
{
/* {
syscallarg(int) flags;
} */
/*
* Convert inotify mask to the fflags of an equivalent kevent.
*/
static uint32_t
inotify_mask_to_kevent_fflags(uint32_t mask, enum vtype type)
{
const struct inotify_kevent_mask_pair *type_inotify_to_kevent;
uint32_t fflags;
size_t i, type_inotify_to_kevent_len;
switch (type) {
case VREG:
case VDIR:
case VLNK:
break;
default:
return 0;
}
/* flags that all watches could have */
fflags = NOTE_DELETE|NOTE_REVOKE;
for (i = 0; i < common_inotify_to_kevent_len; i++)
if (mask & common_inotify_to_kevent[i].inotify)
fflags |= common_inotify_to_kevent[i].kevent;
/* flags that depend on type */
switch (type) {
case VREG:
type_inotify_to_kevent = vreg_inotify_to_kevent;
type_inotify_to_kevent_len = vreg_inotify_to_kevent_len;
break;
case VDIR:
type_inotify_to_kevent = vdir_inotify_to_kevent;
type_inotify_to_kevent_len = vdir_inotify_to_kevent_len;
break;
default:
type_inotify_to_kevent_len = 0;
break;
}
for (i = 0; i < type_inotify_to_kevent_len; i++)
if (mask & type_inotify_to_kevent[i].inotify)
fflags |= type_inotify_to_kevent[i].kevent;
return fflags;
}
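/*
 * For example, assuming the (elided) mapping tables pair
 * LINUX_IN_MODIFY with NOTE_WRITE, then for a VREG watch
 * inotify_mask_to_kevent_fflags(LINUX_IN_MODIFY, VREG) yields
 * NOTE_DELETE|NOTE_REVOKE|NOTE_WRITE: NOTE_DELETE and NOTE_REVOKE are
 * always requested, whatever the mask, so that removal/revocation of
 * the watched file is always seen and the watch can be torn down.
 */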
/*
 * inotify_add_watch(2).  Open a fd for pathname (as directed by mask),
 * track it, and add an equivalent kqueue event for it in
* ifd->ifd_kqfd.
*/
int
linux_sys_inotify_add_watch(struct lwp *l,
const struct linux_sys_inotify_add_watch_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const char *) pathname;
syscallarg(uint32_t) mask;
} */
int wd, i, error = 0;
file_t *fp, *wp, *cur_fp;
struct inotifyfd *ifd;
struct inotify_dir_entries **new_wds;
struct knote *kn, *tmpkn;
struct sys_open_args oa;
struct kevent kev;
struct vnode *wvp;
namei_simple_flags_t sflags;
struct kevent_ops k_ops = {
.keo_private = NULL,
.keo_fetch_timeout = NULL,
.keo_fetch_changes = inotify_kev_fetch_changes,
.keo_put_events = NULL,
};
const int fd = SCARG(uap, fd);
const uint32_t mask = SCARG(uap, mask);
if (mask & ~LINUX_IN_ADD_KNOWN)
return EINVAL;
fp = fd_getfile(fd);
if (fp == NULL)
return EBADF;
if (fp->f_ops != &inotify_fileops) {
/* not an inotify fd */
error = EBADF;
goto leave0;
}
/* Check to see if we already have a descriptor to wd's file. */
wd = -1;
for (i = 0; i < ifd->ifd_nwds; i++) {
if (ifd->ifd_wds[i] != NULL) {
cur_fp = fd_getfile(i);
if (cur_fp == NULL) {
DPRINTF(("%s: wd=%d was closed externally\n",
__func__, i));
error = EBADF;
goto leave1;
}
if (cur_fp->f_type != DTYPE_VNODE) {
DPRINTF(("%s: wd=%d was replaced "
"with a non-vnode\n", __func__, i));
error = EBADF;
}
if (error == 0 && cur_fp->f_vnode == wvp)
wd = i;
fd_putfile(i);
if (error != 0)
goto leave1;
if (wd != -1)
break;
}
}
if (wd == -1) {
/*
* If we do not have a descriptor to wd's file, we
* need to open the watch descriptor.
*/
SCARG(&oa, path) = SCARG(uap, pathname);
SCARG(&oa, mode) = 0;
SCARG(&oa, flags) = O_RDONLY;
if (mask & LINUX_IN_DONT_FOLLOW)
SCARG(&oa, flags) |= O_NOFOLLOW;
if (mask & LINUX_IN_ONLYDIR)
SCARG(&oa, flags) |= O_DIRECTORY;
ifd->ifd_wds[wd] = get_inotify_dir_entries(wd, true);
}
} else {
/*
* If we do have a descriptor to wd's file, try to edit
* the relevant knote.
*/
if (mask & LINUX_IN_MASK_CREATE) {
error = EEXIST;
goto leave1;
}
wp = fd_getfile(wd);
if (wp == NULL) {
DPRINTF(("%s: wd=%d was closed externally "
"(race, probably)\n", __func__, wd));
error = EBADF;
goto leave1;
}
if (wp->f_type != DTYPE_VNODE) {
DPRINTF(("%s: wd=%d was replace with a non-vnode "
"(race, probably)\n", __func__, wd));
error = EBADF;
goto leave2;
}
/*
* XXX We are forced to find the appropriate knote
* manually because we cannot create a custom f_touch
* function for inotify_filtops. See filter_touch()
* in kern_event.c for details.
*/
SLIST_FOREACH_SAFE(kn, &wp->f_vnode->v_klist->vk_klist,
kn_selnext, tmpkn) {
if (kn->kn_fop == &inotify_filtops
&& ifd == kn->kn_kevent.udata) {
mutex_enter(&kn->kn_kq->kq_lock);
if (mask & LINUX_IN_MASK_ADD)
kn->kn_sfflags |= kev.fflags;
else
kn->kn_sfflags = kev.fflags;
wp->f_vnode->v_klist->vk_interest |=
kn->kn_sfflags;
mutex_exit(&kn->kn_kq->kq_lock);
}
}
/*
* Needs to be set so that we get the same event handling as
* EVFILT_VNODE. Otherwise we don't get any events.
*
* A consequence of this is that modifications/removals of
* this knote need to specify EVFILT_VNODE rather than
* inotify_filtid.
*/
kn->kn_filter = EVFILT_VNODE;
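	/*
	 * For instance, removing this watch's knote later would be done
	 * with a kevent initialized roughly as follows (sketch):
	 *
	 *	EV_SET(&kev, wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
	 *
	 * and not with filter = inotify_filtid.
	 */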
/*
* Like vn_readdir(), but with vnode locking only if needs_lock is
* true (to avoid double locking in some situations).
*/
static int
inotify_readdir(file_t *fp, struct dirent *dep, int *done, bool needs_lock)
{
struct vnode *vp;
struct iovec iov;
struct uio uio;
int error, eofflag;
if (needs_lock)
vn_lock(vp, LK_SHARED | LK_RETRY);
else
/*
* XXX We need to temporarily drop v_interlock because
* it may be temporarily acquired by biowait().
*/
mutex_exit(vp->v_interlock);
KASSERT(!mutex_owned(vp->v_interlock));
error = VOP_READDIR(vp, &uio, fp->f_cred, &eofflag, NULL, NULL);
if (needs_lock)
VOP_UNLOCK(vp);
else
mutex_enter(vp->v_interlock);
/*
* Create (and allocate) an appropriate inotify_dir_entries struct for wd to be
 * used on ifd_wds of inotifyfd.  If the entries of a directory fail to be
 * read, NULL is returned.  needs_lock indicates whether the vnode's lock
 * still needs to be taken (i.e. it is not already owned by the caller).
*/
static struct inotify_dir_entries *
get_inotify_dir_entries(int wd, bool needs_lock)
{
struct dirent de;
struct dirent *currdep;
struct inotify_dir_entries *idep = NULL;
file_t *wp;
int done, error;
size_t i, decount;
wp = fd_getfile(wd);
if (wp == NULL)
return NULL;
if (wp->f_type != DTYPE_VNODE)
goto leave;
/* for non-directories, we have 0 entries. */
if (wp->f_vnode->v_type != VDIR) {
idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(0), KM_SLEEP);
goto leave;
}
mutex_enter(&wp->f_lock);
wp->f_offset = 0;
mutex_exit(&wp->f_lock);
decount = 0;
for (;;) {
error = inotify_readdir(wp, &de, &done, needs_lock);
if (error != 0)
goto leave;
if (done == 0)
break;
/*
* XXX Because we are not watching the entire
* file system, the only time we know for sure
* that the event is a LINUX_IN_MOVED_FROM/
* LINUX_IN_MOVED_TO is when the move happens
 * within a single directory... i.e. the number
* of directory entries has not changed.
*
* Otherwise all we can say for sure is that
* something was created/deleted. So we issue a
* LINUX_IN_CREATE/LINUX_IN_DELETE.
*/
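	/*
	 * Concretely: if the old snapshot held {a, b} and the rescan
	 * yields {a, c} (same count), it is treated as a rename and we
	 * issue LINUX_IN_MOVED_FROM for b plus LINUX_IN_MOVED_TO for c;
	 * if the rescan yields only {a}, the most we can claim is
	 * LINUX_IN_DELETE for b.
	 */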
ino_t changed = new_idep->ide_entries[new_idep->ide_count - 1].fileno;
/* Find the deleted entry. */
for (i = 0; i < old_idep->ide_count; i++)
if (old_idep->ide_entries[i].fileno == changed)
break;
KASSERT(i != old_idep->ide_count);
/*
 * Convert the flags and fflags of an EVFILT_VNODE kevent to some
 * number of inotify events.
*/
static int
kevent_to_inotify(struct inotifyfd *ifd, int wd, enum vtype wtype,
uint32_t flags, uint32_t fflags, struct inotify_entry *buf,
size_t *nbuf)
{
struct stat st;
file_t *wp;
size_t i;
int error = 0;
for (i = 0; i < common_kevent_to_inotify_len; i++)
if (fflags & common_kevent_to_inotify[i].kevent)
do_kevent_to_inotify(wd,
common_kevent_to_inotify[i].inotify, 0, buf, nbuf,
NULL);
if (wtype == VREG) {
for (i = 0; i < vreg_kevent_to_inotify_len; i++)
if (fflags & vreg_kevent_to_inotify[i].kevent)
do_kevent_to_inotify(wd,
vreg_kevent_to_inotify[i].inotify, 0,
buf, nbuf, NULL);
} else if (wtype == VDIR) {
for (i = 0; i < *nbuf; i++)
if (buf[i].ie_event.mask &
(LINUX_IN_ACCESS|LINUX_IN_ATTRIB
|LINUX_IN_CLOSE|LINUX_IN_OPEN))
buf[i].ie_event.mask |= LINUX_IN_ISDIR;
/* Need to disambiguate the possible NOTE_WRITEs. */
if (fflags & NOTE_WRITE)
handle_write(ifd, wd, buf, nbuf);
}
/*
 * Need to check if wd actually has a link count of 0 to issue a
* LINUX_IN_DELETE_SELF.
*/
if (fflags & NOTE_DELETE) {
wp = fd_getfile(wd);
KASSERT(wp != NULL);
KASSERT(wp->f_type == DTYPE_VNODE);
vn_stat(wp->f_vnode, &st);
fd_putfile(wd);
/* LINUX_IN_IGNORED must be the last event issued for wd. */
if ((flags & EV_ONESHOT) || (fflags & (NOTE_REVOKE|NOTE_DELETE))) {
do_kevent_to_inotify(wd, LINUX_IN_IGNORED, 0, buf, nbuf, NULL);
/*
* XXX in theory we could call inotify_close_wd(ifd, wd) but if
* we get here we must already be holding v_interlock for
* wd... so we can't.
*
* For simplicity we do nothing, and so wd will only be closed
* when the inotify fd is closed.
*/
}
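	/*
	 * (LINUX_IN_IGNORED is how the Linux API tells userland that a
	 * wd is dead and its bookkeeping can be dropped, hence the
	 * ordering requirement above.)
	 */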
return error;
}
/*
* Handle an event. Unlike EVFILT_VNODE, we translate the event to a
* linux_inotify_event and put it in our own custom queue.
*/
static int
inotify_filt_event(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
struct inotifyfd *ifd;
struct inotify_entry *cur_ie;
size_t nbuf, i;
uint32_t status;
struct inotify_entry buf[LINUX_INOTIFY_MAX_FROM_KEVENT];
/*
* If KN_WILLDETACH is set then
* 1. kn->kn_kevent.udata has already been trashed with a
* struct lwp *, so we don't have access to a real ifd
* anymore, and
 * 2. we're about to detach anyway, so we don't really care
* about the events.
* (Also because of this we need to get ifd under the same
* lock as kn->kn_status.)
*/
mutex_enter(&kn->kn_kq->kq_lock);
status = kn->kn_status;
ifd = kn->kn_kevent.udata;
mutex_exit(&kn->kn_kq->kq_lock);
if (status & KN_WILLDETACH)
return 0;
/*
* If we don't care about the NOTEs in hint, we don't generate
* any events.
*/
hint &= kn->kn_sfflags;
if (hint == 0)
return 0;
/*
 * early out: there's no point even translating the event if we
 * have nowhere to put it (and a LINUX_IN_Q_OVERFLOW has
 * already been added).
*/
if (ifd->ifd_qcount >= LINUX_INOTIFY_MAX_QUEUED)
goto leave;
nbuf = 0;
(void)kevent_to_inotify(ifd, kn->kn_id, vp->v_type, kn->kn_flags,
hint, buf, &nbuf);
for (i = 0; i < nbuf && ifd->ifd_qcount < LINUX_INOTIFY_MAX_QUEUED-1;
i++) {
cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
memcpy(cur_ie, &buf[i], sizeof(*cur_ie));
TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
ifd->ifd_qcount++;
}
	/* handle early overflow by adding an overflow event to the end */
if (i != nbuf) {
nbuf = 0;
cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
do_kevent_to_inotify(-1, LINUX_IN_Q_OVERFLOW, 0,
cur_ie, &nbuf, NULL);
/*
* Read inotify events from the queue.
*/
static int
inotify_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct inotify_entry *cur_iep;
size_t cur_size, nread;
int error = 0;
struct inotifyfd *ifd = fp->f_data;
mutex_enter(&ifd->ifd_qlock);
if (ifd->ifd_qcount == 0) {
if (fp->f_flag & O_NONBLOCK) {
error = EAGAIN;
goto leave;
}
while (ifd->ifd_qcount == 0) {
/* wait until there is an event to read */
error = cv_wait_sig(&ifd->ifd_qcv, &ifd->ifd_qlock);
if (error != 0) {
error = EINTR;
goto leave;
}
}
}
leave:
/* Wake up the next reader, if the queue is not empty. */
if (ifd->ifd_qcount > 0)
cv_signal(&ifd->ifd_qcv);
mutex_exit(&ifd->ifd_qlock);
return error;
}
/*
* Close all the file descriptors associated with fp.
*/
static int
inotify_close(file_t *fp)
{
int error;
size_t i;
file_t *kqfp;
struct inotifyfd *ifd = fp->f_data;
for (i = 0; i < ifd->ifd_nwds; i++) {
if (ifd->ifd_wds[i] != NULL) {
error = inotify_close_wd(ifd, i);
if (error != 0)
return error;
}
}
	/* the reference we need to hold is to ifd->ifd_kqfd's file */
kqfp = fd_getfile(ifd->ifd_kqfd);
if (kqfp == NULL) {
DPRINTF(("%s: kqfp=%d is already closed\n", __func__,
ifd->ifd_kqfd));
} else {
error = fd_close(ifd->ifd_kqfd);
if (error != 0)
return error;
}
/*
* Check if there are pending read events.
*/
static int
inotify_poll(file_t *fp, int events)
{
int revents;
struct inotifyfd *ifd = fp->f_data;
revents = 0;
if (events & (POLLIN|POLLRDNORM)) {
mutex_enter(&ifd->ifd_qlock);
if (ifd->ifd_qcount > 0)
revents |= events & (POLLIN|POLLRDNORM);
mutex_exit(&ifd->ifd_qlock);
}
return revents;
}
/*
* Attach EVFILT_READ to the inotify instance in fp.
*
* This is so you can watch inotify with epoll. No other kqueue
* filter needs to be supported.
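 *
 * Illustrative userland usage that this enables (sketch, standard
 * Linux APIs):
 *
 *	int ifd = inotify_init1(0);
 *	int epfd = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = ifd };
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, ifd, &ev);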
*/
static int
inotify_kqfilter(file_t *fp, struct knote *kn)
{
struct inotifyfd *ifd = fp->f_data;
/*
* Handle EVFILT_READ events. Note that nothing is put in kn_data.
*/
static int
inotify_read_filt_event(struct knote *kn, long hint)
{
struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
if (hint != 0) {
KASSERT(mutex_owned(&ifd->ifd_lock));
KASSERT(mutex_owned(&ifd->ifd_qlock));
KASSERT(hint == NOTE_LOWAT);