/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran,
* by Marshall Kirk McKusick and Greg Ganger at the University of Michigan.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
(flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
/* XXXUBC this doesn't look at flags or slp* */
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, 0, 0, flushflags);
if (error) {
return error;
}
if (flags & V_SAVE) {
error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
if (error)
return error;
KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
}
	mutex_enter(&bufcache_lock);
restart:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return error;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return error;
}
/*
* XXX Since there are no node locks for NFS, I believe
* there is a slight chance that a delayed write will
* occur while sleeping just above, so check for it.
*/
if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
printf("buffer still DELWRI\n");
#endif
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
VOP_BWRITE(bp->b_vp, bp);
mutex_enter(&bufcache_lock);
goto restart;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	mutex_exit(&bufcache_lock);

	return 0;
}
/*
* Destroy any in core blocks past the truncation length.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	mutex_enter(&bufcache_lock);
restart:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return error;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return error;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
mutex_exit(&bufcache_lock);
return 0;
}
/*
* Flush all dirty buffers from a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vflushbuf(struct vnode *vp, int flags)
{
struct buf *bp, *nbp;
int error, pflags;
	bool dirty, sync;

	/* Initialize sync and pflags from the caller's flags before use. */
	sync = (flags & FSYNC_WAIT) != 0;
	pflags = PGO_CLEANIT | PGO_ALLPAGES |
	    (sync ? PGO_SYNCIO : 0) |
	    ((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	(void) VOP_PUTPAGES(vp, 0, 0, pflags);

loop:
	mutex_enter(&bufcache_lock);
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if ((bp->b_cflags & BC_BUSY))
continue;
if ((bp->b_oflags & BO_DELWRI) == 0)
panic("vflushbuf: not dirty, bp %p", bp);
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
/*
* Wait for I/O associated with indirect blocks to complete,
* since there is no way to quickly wait for them below.
*/
if (bp->b_vp == vp || !sync)
(void) bawrite(bp);
else {
error = bwrite(bp);
if (error)
return error;
}
goto loop;
}
	mutex_exit(&bufcache_lock);

	if (!sync)
		return 0;

	/* Wait for all outstanding I/O, then recheck for dirty buffers. */
	mutex_enter(vp->v_interlock);
	while (vp->v_numoutput != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
	dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
	mutex_exit(vp->v_interlock);

	if (dirty) {
vprint("vflushbuf: dirty", vp);
goto loop;
}
return 0;
}
/*
* Create a vnode for a block device.
* Used for root filesystem and swap areas.
* Also used for memory file system special devices.
*/
int
bdevvp(dev_t dev, vnode_t **vpp)
{
	struct vattr va;

	vattr_null(&va);
	va.va_type = VBLK;
	va.va_rdev = dev;

	return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
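/*
 * Usage sketch (illustrative): boot code creates a vnode for the root
 * device this way before mounting the root file system:
 *
 *	vnode_t *rootvp;
 *
 *	if (bdevvp(rootdev, &rootvp) != 0)
 *		panic("cannot create root device vnode");
 */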
/*
* Reassign a buffer from one vnode list to another.
* The list reassignment must be within the same vnode.
* Used to assign file specific control information
* (indirect blocks) to the list to which they belong.
*/
void
reassignbuf(struct buf *bp, struct vnode *vp)
{
struct buflists *listheadp;
int delayx;
/*
* Revoke all the vnodes corresponding to the specified minor number
* range (endpoints inclusive) of the specified major.
*/
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
vnode_t *vp;
dev_t dev;
int mn;
for (mn = minl; mn <= minh; mn++) {
dev = makedev(maj, mn);
/*
* Notify anyone trying to get at this device that it
* has been detached, and then revoke it.
*/
switch (type) {
case VBLK:
bdev_detached(dev);
break;
case VCHR:
cdev_detached(dev);
break;
default:
panic("invalid specnode type: %d", type);
}
/*
* Passing 0 as flags, instead of VDEAD_NOWAIT, means
* spec_node_lookup_by_dev will wait for vnodes it
* finds concurrently being revoked before returning.
*/
while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) {
VOP_REVOKE(vp, REVOKEALL);
vrele(vp);
}
}
}
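/*
 * Usage sketch (illustrative, with a hypothetical driver): a detach
 * routine revokes every open instance of its device nodes before the
 * driver goes away, e.g. for character-device major "cmaj" and units
 * 0..count-1 (endpoints inclusive):
 *
 *	vdevgone(cmaj, 0, count - 1, VCHR);
 */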
/*
* The filesystem synchronizer mechanism - syncer.
*
* It is useful to delay writes of file data and filesystem metadata for
* a certain amount of time so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To implement this,
* vnodes are appended to a "workitem" queue.
*
 * Most pending metadata should not wait for more than ten seconds; thus
 * metadata is delayed only about a third the time that file data is
 * delayed (metadelay).  Similarly, directory updates are more critical,
 * so they are delayed only about half the time that file data is
 * delayed (dirdelay).
*
* There are SYNCER_MAXDELAY queues that are processed in a round-robin
 * manner at a rate of one each second (driven off the filesystem syncer
* thread). The syncer_delayno variable indicates the next queue that is
* to be processed. Items that need to be processed soon are placed in
* this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of e.g. fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
*
 * The flag VI_ONWORKLST indicates that the vnode is currently on a queue.
*/
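/*
 * Worked example (illustrative): if syncer_delayno is currently 7 and a
 * vnode should be written out in 15 seconds, it is placed on the queue
 * 15 slots ahead, i.e. slot (7 + 15) modulo the number of queues.  The
 * syncer thread advances one queue per second, so it reaches that slot
 * after 15 passes.
 */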
/*
* Defines and variables for the syncer process.
*/
int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
time_t syncdelay = 30; /* max time to delay syncing data */
time_t filedelay = 30; /* time to delay syncing files */
time_t dirdelay = 15; /* time to delay syncing directories */
time_t metadelay = 10; /* time to delay syncing metadata */
time_t lockdelay = 1; /* time to delay if locking fails */
static kmutex_t syncer_data_lock; /* short term lock on data structs */
static int syncer_delayno = 0;
static long syncer_last;
static synclist_t * syncer_workitem_pending;
/*
* Return delay factor appropriate for the given file system. For
* WAPBL we use the sync vnode to burst out metadata updates: sync
* those file systems more frequently.
*/
static inline int
sync_delay(struct mount *mp)
{

	return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
}
/*
* Add an item to the syncer work queue.
*/
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
synclist_t *slp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(&syncer_data_lock));
if (vp->v_iflag & VI_ONWORKLST) {
/*
* Remove in order to adjust the position of the vnode.
* Note: called from sched_sync(), which will not hold
* interlock, therefore we cannot modify v_iflag here.
*/
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_REMOVE(slp, vip, vi_synclist);
} else {
KASSERT(mutex_owned(vp->v_interlock));
vp->v_iflag |= VI_ONWORKLST;
	}

	/*
	 * delayx is in units of seconds; clamp it to fit within the
	 * wheel and compute the slot at which this vnode will be
	 * processed.
	 */
	if (delayx > syncer_maxdelay - 2)
		delayx = syncer_maxdelay - 2;
	vip->vi_synclist_slot = (syncer_delayno + delayx) % syncer_last;

	slp = &syncer_workitem_pending[vip->vi_synclist_slot];
	TAILQ_INSERT_TAIL(slp, vip, vi_synclist);
}
/*
* We attempt to scatter the mount points on the list
* so that they will go off at evenly distributed times
* even if all the filesystems are mounted at once.
*/
/*
* XXX The vnode may have been recycled, in which
* case it may have a new identity.
*/
vi = TAILQ_FIRST(slp);
if (vi != NULL && VIMPL_TO_VNODE(vi) == vp) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*
* Try again sooner rather than later if
* we were unable to lock the vnode. Lock
* failure should not prevent us from doing
* the sync "soon".
*
* If we locked it yet arrive here, it's
* likely that lazy sync is in progress and
* so the vnode still has dirty metadata.
* syncdelay is mainly to get this vnode out
* of the way so we do not consider it again
* "soon" in this loop, so the delay time is
* not critical as long as it is not "soon".
* While write-back strategy is the file
* system's domain, we expect write-back to
* occur no later than syncdelay seconds
* into the future.
*/
delayx = synced ? syncdelay : lockdelay;
oslot = vi->vi_synclist_slot;
vn_syncer_add1(vp, delayx);
nslot = vi->vi_synclist_slot;
SDT_PROBE4(vfs, syncer, worklist,
vnode__update,
vp, delayx, oslot, nslot);
}
}
/*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
if (endtime == starttime) {
kpause("syncer", false, hz, &syncer_data_lock);
}
mutex_exit(&syncer_data_lock);
}
}
/*
* Print out a description of a vnode.
*/
void
vprint(const char *label, struct vnode *vp)
{
if (label != NULL)
printf("%s: ", label);
vprint_common(vp, "\t", printf);
if (vp->v_data != NULL) {
printf("\t");
VOP_PRINT(vp);
}
}
/*
* Given a file system name, look up the vfsops for that
* file system, or return NULL if file system isn't present
* in the kernel.
*/
struct vfsops *
vfs_getopsbyname(const char *name)
{
struct vfsops *v;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (strcmp(v->vfs_name, name) == 0)
break;
}
if (v != NULL)
v->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	return v;
}
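/*
 * Usage sketch (illustrative): a successful lookup takes a reference
 * on the operations vector, so the caller drops it with vfs_delref()
 * when done:
 *
 *	struct vfsops *v;
 *
 *	if ((v = vfs_getopsbyname("ffs")) == NULL)
 *		return ENODEV;
 *	...
 *	vfs_delref(v);
 */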
/*
* Knob to control the precision of file timestamps:
*
* 0 = seconds only; nanoseconds zeroed.
* 1 = seconds and nanoseconds, accurate within 1/HZ.
* 2 = seconds and nanoseconds, truncated to microseconds.
* >=3 = seconds and nanoseconds, maximum precision.
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
int vfs_timestamp_precision __read_mostly = TSP_NSEC;

void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (vfs_timestamp_precision) {
case TSP_SEC:
tsp->tv_sec = time_second;
tsp->tv_nsec = 0;
break;
case TSP_HZ:
getnanotime(tsp);
break;
case TSP_USEC:
microtime(&tv);
TIMEVAL_TO_TIMESPEC(&tv, tsp);
break;
case TSP_NSEC:
default:
nanotime(tsp);
break;
}
}
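/*
 * Example (illustrative): given a raw time of 1234567890.123456789s,
 * TSP_SEC stores 1234567890.000000000, TSP_USEC truncates to
 * 1234567890.123456000, and TSP_NSEC keeps full nanosecond precision.
 */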
/*
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
* returns nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
*/
int
vfs_unixify_accmode(accmode_t *accmode)
{
/*
* There is no way to specify explicit "deny" rule using
* file mode or POSIX.1e ACLs.
*/
if (*accmode & VEXPLICIT_DENY) {
*accmode = 0;
return 0;
}
/*
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
*/
if (*accmode & (VDELETE_CHILD | VDELETE))
return SET_ERROR(EPERM);
/*
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
*/
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
return 0;
}
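/*
 * Expected caller pattern (sketch):
 *
 *	error = vfs_unixify_accmode(&accmode);
 *	if (error != 0)
 *		return error;
 *
 * The caller then continues its usual access checks with the remaining
 * accmode bits, which may now be zero.
 */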
time_t rootfstime; /* recorded root fs time, if known */
void
setrootfstime(time_t t)
{

	rootfstime = t;
}
int
VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
{
int mpsafe = mp->mnt_iflag & IMNT_MPSAFE;
int error;
/*
* Note: The first time through, the vfs_mount function may set
* IMNT_MPSAFE, so we have to cache it on entry in order to
* avoid leaking a kernel lock.
*
* XXX Maybe the MPSAFE bit should be set in struct vfsops and
* not in struct mount.
*/
	if (!mpsafe) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
	if (!mpsafe) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_START(struct mount *mp, int a)
{
int error;