From 86420bc1e30673fcf0486d8386a7282dc6469a81 Mon Sep 17 00:00:00 2001
From: alc
Date: Sun, 8 Jan 2012 20:09:26 +0000
Subject: [PATCH 158/175] Correct an error of omission in the implementation
 of the truncation operation on POSIX shared memory objects and tmpfs.

Previously, neither of these modules correctly handled the case in which
the new size of the object or file was not a multiple of the page size.
Specifically, they did not handle partial page truncation of data stored
on swap.  As a result, stale data might later be returned to an
application.

Interestingly, a data inconsistency was less likely to occur under tmpfs
than POSIX shared memory objects.  The reason is that a different mistake
by the tmpfs truncation operation helped avoid a data inconsistency.  If
the data was still resident in memory in a PG_CACHED page, then the tmpfs
truncation operation would reactivate that page, zero the truncated
portion, and leave the page pinned in memory.  More precisely, the
benevolent error was that the truncation operation didn't add the
reactivated page to any of the paging queues, effectively pinning the
page.  This page would remain pinned until the file was destroyed or the
page was read or written.  With this change, the page is now added to the
inactive queue.

Discussed with:	jhb
Reviewed by:	kib (an earlier version)
MFC after:	3 weeks

git-svn-id: http://svn.freebsd.org/base/head@229821 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f
(cherry picked from commit 546cb80aa1ce89675ac9c84cc57a68ac957f8624)
Signed-off-by: Xin Li
---
 sys/fs/tmpfs/tmpfs_subr.c |  78 +++++++++++++++++++++++++---------
 sys/kern/uipc_shm.c       | 101 ++++++++++++++++++++++++---------------------
 2 files changed, 113 insertions(+), 66 deletions(-)

diff --git a/sys/fs/tmpfs/tmpfs_subr.c b/sys/fs/tmpfs/tmpfs_subr.c
index e9324cf..e733f19 100644
--- a/sys/fs/tmpfs/tmpfs_subr.c
+++ b/sys/fs/tmpfs/tmpfs_subr.c
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 
@@ -886,10 +887,10 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize)
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 	vm_object_t uobj;
-	vm_page_t m;
-	vm_pindex_t newpages, oldpages;
+	vm_page_t m, ma[1];
+	vm_pindex_t idx, newpages, oldpages;
 	off_t oldsize;
-	size_t zerolen;
+	int base, rv;
 
 	MPASS(vp->v_type == VREG);
 	MPASS(newsize >= 0);
@@ -912,15 +913,57 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize)
 	    newpages - oldpages > TMPFS_PAGES_AVAIL(tmp))
 		return (ENOSPC);
 
-	TMPFS_LOCK(tmp);
-	tmp->tm_pages_used += (newpages - oldpages);
-	TMPFS_UNLOCK(tmp);
-
-	node->tn_size = newsize;
-	vnode_pager_setsize(vp, newsize);
 	VM_OBJECT_LOCK(uobj);
 	if (newsize < oldsize) {
 		/*
+		 * Zero the truncated part of the last page.
+		 */
+		base = newsize & PAGE_MASK;
+		if (base != 0) {
+			idx = OFF_TO_IDX(newsize);
+retry:
+			m = vm_page_lookup(uobj, idx);
+			if (m != NULL) {
+				if ((m->oflags & VPO_BUSY) != 0 ||
+				    m->busy != 0) {
+					vm_page_sleep(m, "tmfssz");
+					goto retry;
+				}
+			} else if (vm_pager_has_page(uobj, idx, NULL, NULL)) {
+				m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL);
+				if (m == NULL) {
+					VM_OBJECT_UNLOCK(uobj);
+					VM_WAIT;
+					VM_OBJECT_LOCK(uobj);
+					goto retry;
+				} else if (m->valid != VM_PAGE_BITS_ALL) {
+					ma[0] = m;
+					rv = vm_pager_get_pages(uobj, ma, 1, 0);
+					m = vm_page_lookup(uobj, idx);
+				} else
+					/* A cached page was reactivated. */
+					rv = VM_PAGER_OK;
+				vm_page_lock(m);
+				if (rv == VM_PAGER_OK) {
+					vm_page_deactivate(m);
+					vm_page_unlock(m);
+					vm_page_wakeup(m);
+				} else {
+					vm_page_free(m);
+					vm_page_unlock(m);
+					VM_OBJECT_UNLOCK(uobj);
+					return (EIO);
+				}
+			}
+			if (m != NULL) {
+				pmap_zero_page_area(m, base, PAGE_SIZE - base);
+				MPASS(m->valid == VM_PAGE_BITS_ALL);
+				vm_page_dirty(m);
+				vm_pager_page_unswapped(m);
+			}
+		}
+
+		/*
 		 * Release any swap space and free any whole pages.
 		 */
 		if (newpages < oldpages) {
@@ -928,19 +971,16 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize)
 			    newpages);
 			vm_object_page_remove(uobj, newpages, 0, 0);
 		}
-
-		/*
-		 * Zero the truncated part of the last page.
-		 */
-		zerolen = round_page(newsize) - newsize;
-		if (zerolen > 0) {
-			m = vm_page_grab(uobj, OFF_TO_IDX(newsize),
-			    VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
-			pmap_zero_page_area(m, PAGE_SIZE - zerolen, zerolen);
-		}
 	}
 	uobj->size = newpages;
 	VM_OBJECT_UNLOCK(uobj);
+
+	TMPFS_LOCK(tmp);
+	tmp->tm_pages_used += (newpages - oldpages);
+	TMPFS_UNLOCK(tmp);
+
+	node->tn_size = newsize;
+	vnode_pager_setsize(vp, newsize);
 	return (0);
 }
 
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 4d064b5..ce7c722 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -39,13 +39,6 @@
  *
  * (3) Resource limits?  Does this need its own resource limits or are the
  *     existing limits in mmap(2) sufficient?
- *
- * (4) Partial page truncation.  vnode_pager_setsize() will zero any parts
- *     of a partially mapped page as a result of ftruncate(2)/truncate(2).
- *     We can do the same (with the same pmap evil), but do we need to
- *     worry about the bits on disk if the page is swapped out or will the
- *     swapper zero the parts of a page that are invalid if the page is
- *     swapped back in for us?
  */
 
 #include <sys/cdefs.h>
@@ -86,6 +79,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
@@ -253,9 +247,10 @@ static int
 shm_dotruncate(struct shmfd *shmfd, off_t length)
 {
 	vm_object_t object;
-	vm_page_t m;
-	vm_pindex_t nobjsize;
+	vm_page_t m, ma[1];
+	vm_pindex_t idx, nobjsize;
 	vm_ooffset_t delta;
+	int base, rv;
 
 	object = shmfd->shm_object;
 	VM_OBJECT_LOCK(object);
@@ -275,6 +270,56 @@ shm_dotruncate(struct shmfd *shmfd, off_t length)
 			VM_OBJECT_UNLOCK(object);
 			return (EBUSY);
 		}
+
+		/*
+		 * Zero the truncated part of the last page.
+		 */
+		base = length & PAGE_MASK;
+		if (base != 0) {
+			idx = OFF_TO_IDX(length);
+retry:
+			m = vm_page_lookup(object, idx);
+			if (m != NULL) {
+				if ((m->oflags & VPO_BUSY) != 0 ||
+				    m->busy != 0) {
+					vm_page_sleep(m, "shmtrc");
+					goto retry;
+				}
+			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
+				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
+				if (m == NULL) {
+					VM_OBJECT_UNLOCK(object);
+					VM_WAIT;
+					VM_OBJECT_LOCK(object);
+					goto retry;
+				} else if (m->valid != VM_PAGE_BITS_ALL) {
+					ma[0] = m;
+					rv = vm_pager_get_pages(object, ma, 1,
+					    0);
+					m = vm_page_lookup(object, idx);
+				} else
+					/* A cached page was reactivated. */
+					rv = VM_PAGER_OK;
+				vm_page_lock(m);
+				if (rv == VM_PAGER_OK) {
+					vm_page_deactivate(m);
+					vm_page_unlock(m);
+					vm_page_wakeup(m);
+				} else {
+					vm_page_free(m);
+					vm_page_unlock(m);
+					VM_OBJECT_UNLOCK(object);
+					return (EIO);
+				}
+			}
+			if (m != NULL) {
+				pmap_zero_page_area(m, base, PAGE_SIZE - base);
+				KASSERT(m->valid == VM_PAGE_BITS_ALL,
+				    ("shm_dotruncate: page %p is invalid", m));
+				vm_page_dirty(m);
+				vm_pager_page_unswapped(m);
+			}
+		}
 		delta = ptoa(object->size - nobjsize);
 
 		/* Toss in memory pages. */
@@ -289,45 +334,7 @@
 
 		/* Free the swap accounted for shm */
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
-
-		/*
-		 * If the last page is partially mapped, then zero out
-		 * the garbage at the end of the page.  See comments
-		 * in vnode_pager_setsize() for more details.
-		 *
-		 * XXXJHB: This handles in memory pages, but what about
-		 * a page swapped out to disk?
-		 */
-		if ((length & PAGE_MASK) &&
-		    (m = vm_page_lookup(object, OFF_TO_IDX(length))) != NULL &&
-		    m->valid != 0) {
-			int base = (int)length & PAGE_MASK;
-			int size = PAGE_SIZE - base;
-
-			pmap_zero_page_area(m, base, size);
-
-			/*
-			 * Update the valid bits to reflect the blocks that
-			 * have been zeroed.  Some of these valid bits may
-			 * have already been set.
-			 */
-			vm_page_set_valid_range(m, base, size);
-
-			/*
-			 * Round "base" to the next block boundary so that the
-			 * dirty bit for a partially zeroed block is not
-			 * cleared.
-			 */
-			base = roundup2(base, DEV_BSIZE);
-
-			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
-		} else if ((length & PAGE_MASK) &&
-		    __predict_false(object->cache != NULL)) {
-			vm_page_cache_free(object, OFF_TO_IDX(length),
-			    nobjsize);
-		}
 	} else {
-		/* Attempt to reserve the swap */
 		delta = ptoa(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred)) {
-- 
1.7.9.4
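
As an illustrative sketch (not part of the patch): the program below encodes
the invariant this change restores for POSIX shared memory objects.  Shrinking
an object to a length that is not a multiple of the page size and then growing
it again must yield zeroes in the regrown tail, never stale bytes.  A
pre-patch kernel returns stale data only when the last page's contents had
been paged out to swap between the two ftruncate(2) calls; the sketch does not
force such a swap-out, so it checks the invariant rather than reliably
reproducing the bug.  shm_open(2) with SHM_ANON is FreeBSD-specific, and the
0xa5 fill pattern is arbitrary.

#include <sys/types.h>
#include <sys/mman.h>

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	off_t full = 2 * psz;		/* two pages */
	off_t part = psz + psz / 2;	/* deliberately not page-aligned */
	off_t i;
	char *p;
	int fd;

	/* Anonymous POSIX shared memory object; sized via shm_dotruncate(). */
	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	if (fd == -1)
		err(1, "shm_open");
	if (ftruncate(fd, full) == -1)
		err(1, "ftruncate(grow)");

	/* Dirty every byte so stale contents would be detectable later. */
	p = mmap(NULL, full, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0xa5, full);
	if (munmap(p, full) == -1)
		err(1, "munmap");

	/*
	 * Shrink to mid-page, then grow back.  With this patch applied,
	 * the truncation zeroes the cut-off part of the last page even
	 * if its only copy lives on swap, so the regrown range must
	 * read back as zeroes.
	 */
	if (ftruncate(fd, part) == -1)
		err(1, "ftruncate(shrink)");
	if (ftruncate(fd, full) == -1)
		err(1, "ftruncate(regrow)");

	p = mmap(NULL, full, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	for (i = part; i < full; i++)
		if (p[i] != 0)
			errx(1, "stale byte at offset %jd", (intmax_t)i);
	return (0);
}

A tmpfs variant would apply the same ftruncate(2) sequence to a file on a
tmpfs mount, exercising tmpfs_reg_resize() instead of shm_dotruncate().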