/*
* Copyright (c) 1992, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vnops.c 8.33 (Berkeley) 7/31/95
*/
/*
* Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vnops.c 8.33 (Berkeley) 7/31/95
*/
/*
 * Forward declarations for the union file system vnode operations.
 *
 * Every operation takes a single untyped pointer.  The definitions
 * visible in this file cast it to the operation-specific argument
 * structure (for example, union_access() treats it as a
 * struct vop_access_args * and union_read() as a
 * struct vop_read_args *).
 *
 * The int result is an errno-style status: the visible definitions
 * return 0 on success and values such as EROFS or EISDIR on failure.
 *
 * NOTE(review): presumably these are the entries installed in the
 * union vnode operations vector (union_vnodeop_p) -- confirm against
 * the table definition, which is not part of this excerpt.
 */
int union_parsepath(void *);
int union_lookup(void *);
int union_create(void *);
int union_whiteout(void *);
int union_mknod(void *);
int union_open(void *);
int union_close(void *);
int union_access(void *);
int union_getattr(void *);
int union_setattr(void *);
int union_read(void *);
int union_write(void *);
int union_ioctl(void *);
int union_poll(void *);
int union_revoke(void *);
int union_mmap(void *);
int union_fsync(void *);
int union_seek(void *);
int union_remove(void *);
int union_link(void *);
int union_rename(void *);
int union_mkdir(void *);
int union_rmdir(void *);
int union_symlink(void *);
int union_readdir(void *);
int union_readlink(void *);
int union_abortop(void *);
int union_inactive(void *);
int union_reclaim(void *);
int union_lock(void *);
int union_unlock(void *);
int union_bmap(void *);
int union_print(void *);
int union_islocked(void *);
int union_pathconf(void *);
int union_advlock(void *);
int union_strategy(void *);
int union_bwrite(void *);
int union_getpages(void *);
int union_putpages(void *);
int union_kqfilter(void *);
if (upper == 0 && lower == 0) {
panic("%s: missing both layers", __func__);
}
/*
* If they're different, use the larger one. This is not a
* comprehensive solution, but it's sufficient for the
* non-default cases of parsepath that currently exist.
*/
*ap->a_retval = MAX(upper, lower);
return 0;
}
static int
union_lookup1(struct vnode *udvp, struct vnode **dvpp, struct vnode **vpp,
struct componentname *cnp)
{
int error;
struct vnode *tdvp;
struct vnode *dvp;
struct mount *mp;
dvp = *dvpp;
/*
* If stepping up the directory tree, check for going
* back across the mount point, in which case do what
* lookup would do by stepping back down the mount
* hierarchy.
*/
if (cnp->cn_flags & ISDOTDOT) {
while ((dvp != udvp) && (dvp->v_vflag & VV_ROOT)) {
/*
* Don't do the NOCROSSMOUNT check
* at this level. By definition,
* union fs deals with namespaces, not
* filesystems.
*/
tdvp = dvp;
*dvpp = dvp = dvp->v_mount->mnt_vnodecovered;
VOP_UNLOCK(tdvp);
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
}
}
error = VOP_LOOKUP(dvp, &tdvp, cnp);
if (error)
return (error);
if (dvp != tdvp) {
if (cnp->cn_flags & ISDOTDOT)
VOP_UNLOCK(dvp);
error = vn_lock(tdvp, LK_EXCLUSIVE);
if (cnp->cn_flags & ISDOTDOT)
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
if (error) {
vrele(tdvp);
return error;
}
dvp = tdvp;
}
/*
* Lastly check if the current node is a mount point in
* which case walk up the mount hierarchy making sure not to
* bump into the root of the mount tree (i.e. dvp != udvp).
*/
while (dvp != udvp && (dvp->v_type == VDIR) &&
(mp = dvp->v_mountedhere)) {
if (vfs_busy(mp))
continue;
vput(dvp);
error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdvp);
vfs_unbusy(mp);
if (error) {
return (error);
}
dvp = tdvp;
}
/*
* do the lookup in the upper level.
* if that level consumes additional pathnames,
* then assume that something special is going
* on and just return that vnode.
*/
if (upperdvp != NULLVP) {
uerror = union_lookup1(um->um_uppervp, &upperdvp,
&uppervp, cnp);
if (uerror == ENOENT || uerror == EJUSTRETURN) {
if (cnp->cn_flags & ISWHITEOUT) {
iswhiteout = 1;
} else if (lowerdvp != NULLVP) {
lerror = VOP_GETATTR(upperdvp, &va,
cnp->cn_cred);
if (lerror == 0 && (va.va_flags & OPAQUE))
iswhiteout = 1;
}
}
} else {
uerror = ENOENT;
}
/*
* in a similar way to the upper layer, do the lookup
* in the lower layer. this time, if there is some
* component magic going on, then vput whatever we got
* back from the upper layer and return the lower vnode
* instead.
*/
if (lowerdvp != NULLVP && !iswhiteout) {
int nameiop;
vn_lock(lowerdvp, LK_EXCLUSIVE | LK_RETRY);
/*
* Only do a LOOKUP on the bottom node, since
* we won't be making changes to it anyway.
*/
nameiop = cnp->cn_nameiop;
cnp->cn_nameiop = LOOKUP;
if (um->um_op == UNMNT_BELOW) {
saved_cred = cnp->cn_cred;
cnp->cn_cred = um->um_cred;
}
/*
* we shouldn't have to worry about locking interactions
* between the lower layer and our union layer (w.r.t.
* `..' processing) because we don't futz with lowervp
* locks in the union-node instantiation code path.
*/
lerror = union_lookup1(um->um_lowervp, &lowerdvp,
&lowervp, cnp);
if (um->um_op == UNMNT_BELOW)
cnp->cn_cred = saved_cred;
cnp->cn_nameiop = nameiop;
/*
* EJUSTRETURN is used by underlying filesystems to indicate that
* a directory modification op was started successfully.
* This will only happen in the upper layer, since
* the lower layer only does LOOKUPs.
* If this union is mounted read-only, bounce it now.
*/
/*
* at this point, we have uerror and lerror indicating
* possible errors with the lookups in the upper and lower
* layers. additionally, uppervp and lowervp are (locked)
* references to existing vnodes in the upper and lower layers.
*
* there are now three cases to consider.
* 1. if both layers returned an error, then return whatever
* error the upper layer generated.
*
* 2. if the top layer failed and the bottom layer succeeded
* then two subcases occur.
* a. the bottom vnode is not a directory, in which
* case just return a new union vnode referencing
* an empty top layer and the existing bottom layer.
* b. the bottom vnode is a directory, in which case
* create a new directory in the top-level and
* continue as in case 3.
*
* 3. if the top layer succeeded then return a new union
* vnode referencing whatever the new top layer and
* whatever the bottom layer returned.
*/
*ap->a_vpp = NULLVP;
/* case 1. */
if ((uerror != 0) && (lerror != 0)) {
return (uerror);
}
/* case 2. */
if (uerror != 0 /* && (lerror == 0) */ ) {
if (lowervp->v_type == VDIR) { /* case 2b. */
/*
* We may be racing another process to make the
* upper-level shadow directory. Be careful with
* locks/etc!
* If we have to create a shadow directory and want
* to commit the node we have to restart the lookup
* to get the componentname right.
*/
if (upperdvp) {
VOP_UNLOCK(upperdvp);
uerror = union_mkshadow(um, upperdvp, cnp,
&uppervp);
vn_lock(upperdvp, LK_EXCLUSIVE | LK_RETRY);
if (uerror == 0 && cnp->cn_nameiop != LOOKUP) {
vrele(uppervp);
if (lowervp != NULLVP)
vput(lowervp);
goto start;
}
}
if (uerror) {
if (lowervp != NULLVP) {
vput(lowervp);
lowervp = NULLVP;
}
return (uerror);
}
}
} else { /* uerror == 0 */
if (uppervp != upperdvp)
VOP_UNLOCK(uppervp);
}
/*
* If there is an existing upper vp then simply open that.
*/
tvp = un->un_uppervp;
if (tvp == NULLVP) {
/*
* If the lower vnode is being opened for writing, then
* copy the file contents to the upper vnode and open that,
* otherwise can simply open the lower vnode.
*/
tvp = un->un_lowervp;
if ((ap->a_mode & FWRITE) && (tvp->v_type == VREG)) {
error = union_copyup(un, (mode&O_TRUNC) == 0, cred, l);
if (error == 0)
error = VOP_OPEN(un->un_uppervp, mode, cred);
if (error == 0) {
mutex_enter(un->un_uppervp->v_interlock);
un->un_uppervp->v_writecount++;
mutex_exit(un->un_uppervp->v_interlock);
}
return (error);
}
/*
* Just open the lower vnode, but check for nodev mount flag
*/
if ((tvp->v_type == VBLK || tvp->v_type == VCHR) &&
(ap->a_vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
un->un_openl++;
vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(tvp, mode, cred);
VOP_UNLOCK(tvp);
return (error);
}
/*
* Just open the upper vnode, checking for nodev mount flag first
*/
if ((tvp->v_type == VBLK || tvp->v_type == VCHR) &&
(ap->a_vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
/*
* Check access permission on the union vnode.
* The access check being enforced is to check
* against both the underlying vnode, and any
* copied vnode. This ensures that no additional
* file permissions are given away simply because
* the user caused an implicit file copy.
*/
int
union_access(void *v)
{
struct vop_access_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
accmode_t a_accmode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct union_node *un = VTOUNION(vp);
int error = EACCES;
struct union_mount *um = MOUNTTOUNIONMOUNT(vp->v_mount);
/*
* Disallow write attempts on read-only file systems;
* unless the file is a socket, fifo, or a block or
* character device resident on the file system.
*/
if (ap->a_accmode & VWRITE) {
switch (vp->v_type) {
case VDIR:
case VLNK:
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
break;
case VBAD:
case VBLK:
case VCHR:
case VSOCK:
case VFIFO:
case VNON:
default:
break;
}
}
/*
* Copy up to prevent checking (and failing) against
* underlying file system mounted read only.
* Check for read access first to prevent implicit
* copy of inaccessible underlying vnode.
*/
if (un->un_uppervp == NULLVP &&
(un->un_lowervp->v_type == VREG) &&
(ap->a_accmode & VWRITE)) {
vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_ACCESS(un->un_lowervp, VREAD, ap->a_cred);
VOP_UNLOCK(un->un_lowervp);
if (error == 0)
error = union_copyup(un, 1, ap->a_cred, curlwp);
if (error)
return error;
}
/*
* We handle getattr only to change the fsid and
* track object sizes
*/
int
union_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
} */ *ap = v;
int error;
struct union_node *un = VTOUNION(ap->a_vp);
struct vnode *vp = un->un_uppervp;
struct vattr *vap;
struct vattr va;
/*
* Some programs walk the filesystem hierarchy by counting
* links to directories to avoid stat'ing all the time.
* This means the link count on directories needs to be "correct".
* The only way to do that is to call getattr on both layers
* and fix up the link count. The link count will not necessarily
* be accurate but will be large enough to defeat the tree walkers.
*
* To make life more interesting, some filesystems don't keep
* track of link counts in the expected way, and return a
* link count of `1' for those directories; if either of the
* component directories returns a link count of `1', we return a 1.
*/
if (!size_only && (vp->v_mount->mnt_flag & MNT_RDONLY))
return (EROFS);
if (vap->va_size != VNOVAL) {
switch (vp->v_type) {
case VDIR:
return (EISDIR);
case VCHR:
case VBLK:
case VSOCK:
case VFIFO:
break;
case VREG:
case VLNK:
default:
/*
* Disallow write attempts if the filesystem is
* mounted read-only.
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
}
}
/*
* Handle case of truncating lower object to zero size,
* by creating a zero length upper object. This is to
* handle the case of open with O_TRUNC and O_CREAT.
*/
if ((un->un_uppervp == NULLVP) &&
/* assert(un->un_lowervp != NULLVP) */
(un->un_lowervp->v_type == VREG)) {
error = union_copyup(un, (vap->va_size != 0),
ap->a_cred, curlwp);
if (error)
return (error);
}
/*
* Try to set attributes in upper layer, ignore size change to zero
* for devices to handle O_TRUNC and return read-only filesystem error
* otherwise.
*/
if (un->un_uppervp != NULLVP) {
error = VOP_SETATTR(un->un_uppervp, vap, ap->a_cred);
if ((error == 0) && (vap->va_size != VNOVAL)) {
mutex_enter(&un->un_lock);
union_newsize(ap->a_vp, vap->va_size, VNOVAL);
}
} else {
KASSERT(un->un_lowervp != NULLVP);
if (NODE_IS_SPECIAL(un->un_lowervp)) {
if (size_only &&
(vap->va_size == 0 || vap->va_size == VNOVAL))
error = 0;
else
error = EROFS;
} else {
error = EROFS;
}
}
return (error);
}
int
union_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
int error;
struct vnode *vp = OTHERVP(ap->a_vp);
int dolock = (vp == LOWERVP(ap->a_vp));
if (dolock)
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_cred);
if (dolock)
VOP_UNLOCK(vp);
/*
* XXX
* perhaps the size of the underlying object has changed under
* our feet. take advantage of the offset information present
* in the uio structure.
*/
if (error == 0) {
struct union_node *un = VTOUNION(ap->a_vp);
off_t cur = ap->a_uio->uio_offset;
off_t usz = VNOVAL, lsz = VNOVAL;
mutex_enter(&un->un_lock);
if (vp == un->un_uppervp) {
if (cur > un->un_uppersz)
usz = cur;
} else {
if (cur > un->un_lowersz)
lsz = cur;
}
/*
* If vinvalbuf is calling us, it's a "shallow fsync" -- don't
* bother syncing the underlying vnodes, since (a) they'll be
* fsync'ed when reclaimed and (b) we could deadlock if
* they're locked; otherwise, pass it through to the
* underlying layer.
*/
if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) {
error = spec_fsync(v);
if (error)
return error;
}
if (ap->a_flags & FSYNC_RECLAIM)
return 0;
targetvp = OTHERVP(ap->a_vp);
if (targetvp != NULLVP) {
int dolock = (targetvp == LOWERVP(ap->a_vp));
if (dolock)
vn_lock(targetvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_FSYNC(targetvp, ap->a_cred, ap->a_flags,
ap->a_offlo, ap->a_offhi);
if (dolock)
VOP_UNLOCK(targetvp);
}
/*
* Needs to be copied before we can link it.
*/
vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
if (droplock)
VOP_UNLOCK(dun->un_uppervp);
error = union_copyup(un, 1, cnp->cn_cred, curlwp);
if (droplock) {
vn_lock(dun->un_uppervp,
LK_EXCLUSIVE | LK_RETRY);
/*
* During copyup, we dropped the lock on the
* dir and invalidated any saved namei lookup
* state for the directory we'll be entering
* the link in. We need to re-run the lookup
* in that directory to reset any state needed
* for VOP_LINK.
* Call relookup on the union-layer to reset
* the state.
*/
vp = NULLVP;
if (dun->un_uppervp == NULLVP)
panic("union: null upperdvp?");
error = relookup(ap->a_dvp, &vp, ap->a_cnp, 0);
if (error) {
VOP_UNLOCK(ap->a_vp);
return EROFS; /* ? */
}
if (vp != NULLVP) {
/*
* The name we want to create has
* mysteriously appeared (a race?)
*/
error = EEXIST;
VOP_UNLOCK(ap->a_vp);
vput(vp);
return (error);
}
}
VOP_UNLOCK(ap->a_vp);
}
vp = un->un_uppervp;
}
dvp = dun->un_uppervp;
if (dvp == NULLVP)
error = EROFS;
/*
* Account for VOP_RENAME to vrele all nodes.
* Note: VOP_RENAME will unlock tdvp.
*/
if (fdvp->v_op == union_vnodeop_p) { /* always true */
struct union_node *un = VTOUNION(fdvp);
if (un->un_uppervp == NULLVP) {
/*
* this should never happen in normal
* operation but might if there was
* a problem creating the top-level shadow
* directory.
*/
error = EXDEV;
goto bad;
}
fdvp = un->un_uppervp;
vref(fdvp);
}
if (fvp->v_op == union_vnodeop_p) { /* always true */
struct union_node *un = VTOUNION(fvp);
if (un->un_uppervp == NULLVP) {
/* XXX: should do a copyup */
error = EXDEV;
goto bad;
}
if (un->un_lowervp != NULLVP)
ap->a_fcnp->cn_flags |= DOWHITEOUT;
fvp = un->un_uppervp;
vref(fvp);
}
if (tdvp->v_op == union_vnodeop_p) {
struct union_node *un = VTOUNION(tdvp);
if (un->un_uppervp == NULLVP) {
/*
* this should never happen in normal
* operation but might if there was
* a problem creating the top-level shadow
* directory.
*/
error = EXDEV;
goto bad;
}
/*
* union_readdir works in concert with getdirentries and
* readdir(3) to provide a list of entries in the unioned
* directories. getdirentries is responsible for walking
* down the union stack. readdir(3) is responsible for
* eliminating duplicate names from the returned data stream.
*/
int
union_readdir(void *v)
{
struct vop_readdir_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
struct uio *a_uio;
kauth_cred_t a_cred;
int *a_eofflag;
u_long *a_cookies;
int a_ncookies;
} */ *ap = v;
struct union_node *un = VTOUNION(ap->a_vp);
struct vnode *vp;
int dolock, error;
/*
* Do nothing (and _don't_ bypass).
* Wait to vrele lowervp until reclaim,
* so that until then our union_node is in the
* cache and reusable.
*
* NEEDSWORK: Someday, consider inactive'ing
* the lowervp and then trying to reactivate it
* with capabilities (v_id)
* like they do in the name lookup cache code.
* That's too much work for now.
*/
if (un->un_dircache != 0) {
for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
vrele(*vpp);
free(un->un_dircache, M_TEMP);
un->un_dircache = 0;
}
/*
* We watch either the upper layer file (if it already exists),
* or the lower layer one. If only the lower layer file exists
* at this moment, we will keep watching that lower layer file
* even if an upper layer file is created later on.
*/
if (UPPERVP(ap->a_vp))
error = VOP_KQFILTER(UPPERVP(ap->a_vp), ap->a_kn);
else if (LOWERVP(ap->a_vp))
error = VOP_KQFILTER(LOWERVP(ap->a_vp), ap->a_kn);
else {
/* panic? */
error = EOPNOTSUPP;
}