/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
if (trans_mount == NULL) {
trans_mount = vp->v_mount;
fstrans_start(trans_mount);
/*
* check if this vnode is still valid.
*/
mutex_enter(vp->v_interlock);
error = vdead_check(vp, 0);
mutex_exit(vp->v_interlock);
if (error)
goto out_err_free;
/*
* XXX: This assumes that we come here only via
* the mmio path
*/
if (blockalloc && need_wapbl) {
error = WAPBL_BEGIN(trans_mount);
if (error)
goto out_err_free;
holds_wapbl = true;
}
}
/*
* hold g_glock to prevent a race with truncate.
*
* check if our idea of v_size is still valid.
*/
KASSERT(!glocked || genfs_node_wrlocked(vp));
if (!glocked) {
if (blockalloc) {
genfs_node_wrlock(vp);
} else {
genfs_node_rdlock(vp);
}
}
rw_enter(uobj->vmobjlock, RW_WRITER);
if (vp->v_size < origvsize) {
if (!glocked) {
genfs_node_unlock(vp);
}
if (pgs != pgs_onstack)
kmem_free(pgs, pgs_size);
goto startover;
}
/*
* mark the page DIRTY.
* otherwise another thread can do putpages and pull
* our vnode from syncer's queue before our caller does
* ubc_release. note that putpages won't see CLEAN
* pages even if they are BUSY.
*/
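/*
 * Illustrative sketch of the step described above (the original code is
 * elided here; "memwrite" stands for the caller's intent to write through
 * the mapping and is an assumed local): with the object lock held, move
 * each returned page to the DIRTY state.
 */
if (memwrite) {
	for (i = 0; i < npages; i++) {
		uvm_pagemarkdirty(pgs[ridx + i], UVM_PAGE_STATUS_DIRTY);
	}
}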
/*
* if the pages are already resident, just return them.
*/
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[ridx + i];
if ((pg->flags & PG_FAKE) ||
(blockalloc && (pg->flags & PG_RDONLY) != 0)) {
break;
}
}
if (i == npages) {
if (!glocked) {
genfs_node_unlock(vp);
}
UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
npages += ridx;
goto out;
}
/*
* the page wasn't resident and we're not overwriting,
* so we're going to have to do some i/o.
* find any additional pages needed to cover the expanded range.
*/
npages = (endoffset - startoffset) >> PAGE_SHIFT;
if (startoffset != origoffset || npages != orignmempages) {
int npgs;
/*
* we need to avoid deadlocks caused by locking
* additional pages at lower offsets than pages we
* already have locked. unlock them all and start over.
*/
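/*
 * Sketch of the step described above (the original code is elided here
 * and goes on to re-find pages over the whole expanded range): drop
 * every page we already hold so the range can be (re)locked in
 * ascending offset order; uvm_page_unbusy() wakes any waiters.
 */
uvm_page_unbusy(&pgs[ridx], orignmempages);
memset(pgs, 0, pgs_size);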
/*
* we're almost done! release the pages...
* for errors, we free the pages.
* otherwise we activate them and mark them as valid and clean.
* also, unbusy pages that were not actually requested.
*/
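/*
 * Illustrative sketch of the release step described above (the original
 * code is elided here and also frees pages on error): with the object
 * lock held, put each page on the active queue, then clear PG_BUSY on
 * everything in one go.
 */
if (!error) {
	for (i = 0; i < npages; i++) {
		struct vm_page *pg = pgs[i];

		if (pg == NULL)
			continue;
		uvm_pagelock(pg);
		uvm_pageactivate(pg);
		uvm_pageunlock(pg);
	}
}
uvm_page_unbusy(pgs, npages);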
/*
* if EOF is in the middle of the range, zero the part past EOF.
* skip over pages which are not PG_FAKE since in that case they have
* valid data that we need to preserve.
*/
tailstart = bytes;
while (tailbytes > 0) {
const int len = PAGE_SIZE - (tailstart & PAGE_MASK);
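/*
 * Sketch of the elided loop body: only PG_FAKE pages (pages without
 * valid data) are zeroed past EOF; "kva" is assumed to be the pager
 * mapping of pgs[] established earlier.
 */
KASSERT(len <= tailbytes);
if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) {
	memset((void *)(kva + tailstart), 0, len);
}
tailstart += len;
tailbytes -= len;
}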
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
/* Remove the mapping (make KVA available as soon as possible) */
uvm_pagermapout(kva, npages);
/*
* if we encountered a hole then we have to do a little more work.
* for read faults, we marked the page PG_RDONLY so that future
* write accesses to the page will fault again.
* for write faults, we must make sure that the backing store for
* the page is completely allocated while the pages are locked.
*/
if (!error && sawhole && blockalloc) {
error = GOP_ALLOC(vp, startoffset,
npages << PAGE_SHIFT, 0, cred);
UVMHIST_LOG(ubchist, "gop_alloc off 0x%jx/0x%jx -> %jd",
startoffset, npages << PAGE_SHIFT, error,0);
if (!error) {
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
/*
* generic VM putpages routine.
* Write the given range of pages to backing store.
*
* => "offhi == 0" means flush all pages at or after "offlo".
* => object should be locked by caller. we return with the
* object unlocked.
* => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
* thus, a caller might want to unlock higher level resources
* (e.g. vm_map) before calling flush.
* => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block
* => if PGO_ALLPAGES is set, then all pages in the object will be processed.
*
* note on "cleaning" object and PG_BUSY pages:
* this routine is holding the lock on the object. the only time
* that it can run into a PG_BUSY page that it does not own is if
* some other process has started I/O on the page (e.g. either
* a pagein, or a pageout). if the PG_BUSY page is being paged
* in, then it can not be dirty (!UVM_PAGE_STATUS_CLEAN) because no
* one has had a chance to modify it yet. if the PG_BUSY page is
* being paged out then it means that someone else has already started
* cleaning the page for us (how nice!). in this case, if we
* have syncio specified, then after we make our pass through the
* object we need to wait for the other PG_BUSY pages to clear
* off (i.e. we need to do an iosync). also note that once a
* page is PG_BUSY it must stay in its object until it is un-busyed.
*/
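/*
 * Example usage of the contract described above (illustrative only;
 * "some_vp" and "err" are hypothetical): synchronously clean every page
 * of a vnode.  The caller locks the object; the routine returns with it
 * unlocked.
 */
rw_enter(some_vp->v_uobj.vmobjlock, RW_WRITER);
err = VOP_PUTPAGES(some_vp, 0, 0, PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO);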
int
genfs_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ * const ap = v;

return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi,
    ap->a_flags, NULL);
}
int
genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff,
int origflags, struct vm_page **busypg)
{
struct uvm_object * const uobj = &vp->v_uobj;
krwlock_t * const slock = uobj->vmobjlock;
off_t nextoff;
int i, error, npages, nback;
int freeflag;
/*
* This array is larger than it should be so that its size is constant.
* The right size is MAXPAGES.
*/
struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE];
#define MAXPAGES (MAXPHYS / PAGE_SIZE)
struct vm_page *pg, *tpg;
struct uvm_page_array a;
bool wasclean, needs_clean;
bool async = (origflags & PGO_SYNCIO) == 0;
bool pagedaemon = curlwp == uvm.pagedaemon_lwp;
struct mount *trans_mp;
int flags;
bool modified; /* if we write out any pages */
bool holds_wapbl;
bool cleanall; /* try to pull off from the syncer's list */
bool onworklst;
bool nodirty;
const bool dirtyonly = (origflags & (PGO_DEACTIVATE|PGO_FREE)) == 0;
/*
* if !dirtyonly, iterate over all resident pages in the range.
*
* if dirtyonly, only possibly dirty pages are interesting.
* however, if we are asked to sync for integrity, we should
* wait on pages being written back by other threads as well.
*/
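/*
 * Sketch of the initialization this comment describes (the original
 * statement is elided; the three-argument uvm_page_array_init() form is
 * an assumption): a dirty-only pass fills the array with dirty pages,
 * and a synchronous pass also picks up pages under writeback.
 */
uvm_page_array_init(&a, uobj, dirtyonly ?
    (UVM_PAGE_ARRAY_FILL_DIRTY |
    (!async ? UVM_PAGE_ARRAY_FILL_WRITEBACK : 0)) : 0);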
if (preempt_needed()) {
nextoff = pg->offset; /* visit this page again */
rw_exit(slock);
preempt();
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
/*
* if the current page is busy, wait for it to become unbusy.
*/
if ((pg->flags & PG_BUSY) != 0) {
UVMHIST_LOG(ubchist, "busy %#jx", (uintptr_t)pg,
0, 0, 0);
if ((pg->flags & (PG_RELEASED|PG_PAGEOUT)) != 0
&& (flags & PGO_BUSYFAIL) != 0) {
UVMHIST_LOG(ubchist, "busyfail %#jx",
(uintptr_t)pg, 0, 0, 0);
error = EDEADLK;
if (busypg != NULL)
*busypg = pg;
break;
}
if (pagedaemon) {
/*
* someone has taken the page while we
* dropped the lock for fstrans_start.
*/
break;
}
/*
* don't bother to wait on others' activities
* unless we are asked to sync for integrity.
*/
if (!async && (flags & PGO_RECLAIM) == 0) {
wasclean = false;
nextoff = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
continue;
}
nextoff = pg->offset; /* visit this page again */
uvm_pagewait(pg, slock, "genput");
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
/*
* if we're cleaning, build a cluster.
* the cluster will consist of pages which are currently dirty.
* if not cleaning, just operate on the one page.
*/
/*
* then look forward to fill in the remaining space in
* the array of pages.
*
* pass our cached array of pages so that hopefully
* uvn_findpages can find some good pages in it.
* the array a was filled above with one of the
* following sets of flags:
* 0
* UVM_PAGE_ARRAY_FILL_DIRTY
* UVM_PAGE_ARRAY_FILL_DIRTY|WRITEBACK
*
* XXX this is fragile but it'll work: the array
* was earlier filled sparsely, but UFP_DIRTYONLY
* implies dense. see corresponding comment in
* uvn_findpages().
*/
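/*
 * Sketch of the elided call (the exact uvn_findpages() prototype and
 * the "nfound" local are assumptions): gather up to the remaining array
 * slots' worth of dirty pages starting right after the current one,
 * reusing the cached array "a".
 */
unsigned int nfound = MAXPAGES - nback - 1;

uvn_findpages(uobj, pg->offset + PAGE_SIZE, &nfound, &pgs[nback + 1],
    &a, UFP_NOWAIT | UFP_NOALLOC | UFP_DIRTYONLY);
npages = nback + 1 + nfound;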
/*
* start the i/o.
*/
rw_exit(slock);
error = GOP_WRITE(vp, pgs, npages, flags);
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
if (error) {
break;
}
}
}
uvm_page_array_fini(&a);
/*
* update ctime/mtime if the modification we started writing out might
* be from mmap'ed write.
*
* this is necessary when an application keeps a file mmaped and
* repeatedly modifies it via the window. note that, because we
* don't always write-protect pages when cleaning, such modifications
* might not involve any page faults.
*/
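/*
 * Sketch of the elided update (the original condition also checks that
 * the vnode actually has writable user mappings): let the file system
 * record a pending mtime/ctime update.
 */
if (modified) {
	GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
}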
/* Wait for output to complete. */
rw_exit(slock);
if (!wasclean && !async && vp->v_numoutput != 0) {
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, vp->v_interlock);
}
onworklst = (vp->v_iflag & VI_ONWORKLST) != 0;
mutex_exit(vp->v_interlock);
if ((flags & PGO_RECLAIM) != 0 && onworklst) {
/*
* in the case of PGO_RECLAIM, ensure to make the vnode clean.
* retrying is not a big deal because, in many cases,
* uobj->uo_npages is already 0 here.
*/
rw_enter(slock, RW_WRITER);
goto retry;
}
if (trans_mp) {
if (holds_wapbl)
WAPBL_END(trans_mp);
fstrans_done(trans_mp);
}
return (error);
}
/*
* Default putrange method for file systems that do not care
* how many pages are given to one GOP_WRITE() call.
*/
void
genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
{
*lop = 0;
*hip = 0;
}
int
genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
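/*
 * Sketch of the elided body (genfs_do_io() is the backend described
 * later in this file; the mapin flag names are assumptions): map the
 * busy pages read-only into kernel virtual space and hand the extent
 * to the common write path.
 */
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
    UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
len = (size_t)npages << PAGE_SHIFT;

error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, uvm_aio_aiodone);
return error;
}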
/*
* genfs_gop_write_rwmap:
*
* a variant of genfs_gop_write. it's used by UDF for its directory buffers.
* this maps pages with PROT_WRITE so that VOP_STRATEGY can modify
* the contents before writing them out to the underlying storage.
*/
int
genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
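/*
 * Sketch of the elided body (mapin flag names are assumptions): same as
 * genfs_gop_write() above, except that the pager mapping is established
 * writable so the strategy routine may edit the buffer contents first.
 */
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
    UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
len = (size_t)npages << PAGE_SHIFT;

error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE, uvm_aio_aiodone);
return error;
}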
/*
* Backend routine for doing I/O to vnode pages. Pages are already locked
* and mapped into kernel memory. Here we just look up the underlying
* device block addresses and call the strategy routine.
*/
bp = NULL;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
int run;
daddr_t lbn, blkno;
struct vnode *devvp;
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
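/*
 * Sketch of the elided lookup (error handling simplified): translate
 * the file offset to a device block address and learn how many
 * contiguous blocks follow it.
 */
lbn = offset >> vp->v_mount->mnt_fs_bshift;
error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
if (error) {
	/* skip the rest of the top-level i/o */
	break;
}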
/*
* Process a uio using direct I/O. If we reach a part of the request
* which cannot be processed in this fashion for some reason, just return.
* The caller must handle some additional part of the request using
* buffered I/O before trying direct I/O again.
*/
#ifdef DIAGNOSTIC
if ((ioflag & IO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
/*
* We only support direct I/O to user space for now.
*/
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
return;
}
/*
* If the vnode is mapped, we would need to get the getpages lock
* to stabilize the bmap, but then we would get into trouble while
* locking the pages if the pages belong to this same vnode (or a
* multi-vnode cascade to the same effect). Just fall back to
* buffered I/O if the vnode is mapped to avoid this mess.
*/
if (vp->v_vflag & VV_MAPPED) {
return;
}
if (need_wapbl) {
error = WAPBL_BEGIN(vp->v_mount);
if (error)
return;
}
/*
* Do as much of the uio as possible with direct I/O.
*/
vs = uio->uio_vmspace;
while (uio->uio_resid) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
va = (vaddr_t)iov->iov_base;
len = MIN(iov->iov_len, genfs_maxdio);
len &= ~mask;
/*
* If the next chunk is smaller than DEV_BSIZE or extends past
* the current EOF, then fall back to buffered I/O.
*/
if (len == 0 || uio->uio_offset + len > vp->v_size) {
break;
}
/*
* Check alignment. The file offset must be at least
* sector-aligned. The exact constraint on memory alignment
* is very hardware-dependent, but requiring sector-aligned
* addresses there too is safe.
*/
if (uio->uio_offset & mask || va & mask) {
break;
}
error = genfs_do_directio(vs, va, len, vp, uio->uio_offset,
uio->uio_rw);
if (error) {
break;
}
iov->iov_base = (char *)iov->iov_base + len;
iov->iov_len -= len;
uio->uio_offset += len;
uio->uio_resid -= len;
}
if (need_wapbl)
WAPBL_END(vp->v_mount);
}
/*
* Iodone routine for direct I/O. We don't do much here since the request is
* always synchronous, so the caller will do most of the work after biowait().
*/
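/*
 * Sketch of such a handler ("example_dio_iodone" is a hypothetical name
 * and the buffer fields used are assumptions): mark the buffer done and
 * wake the thread sleeping in biowait().
 */
static void
example_dio_iodone(struct buf *bp)
{

	KASSERT((bp->b_flags & B_ASYNC) == 0);
	mutex_enter(bp->b_objlock);
	bp->b_oflags |= BO_DONE;
	cv_broadcast(&bp->b_done);
	mutex_exit(bp->b_objlock);
}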
/*
* For writes, verify that this range of the file already has fully
* allocated backing store. If there are any holes, just punt and
* make the caller take the buffered write path.
*/
if (rw == UIO_WRITE) {
daddr_t lbn, elbn, blkno;
int bsize, bshift, run;
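/*
 * Sketch of the elided check ("off", "len", and "error" are assumed to
 * be the enclosing helper's offset, length, and status variables): walk
 * the range block by block and punt as soon as VOP_BMAP() reports an
 * unallocated block.
 */
bshift = vp->v_mount->mnt_fs_bshift;
bsize = 1 << bshift;
elbn = (off + len + bsize - 1) >> bshift;
for (lbn = off >> bshift; lbn < elbn; lbn += run + 1) {
	error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
	if (error)
		return error;
	if (blkno == (daddr_t)-1)
		return -1;	/* hole: caller falls back to buffered I/O */
}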