ocfs2: zero tail of sparse files on truncate

Since we don't zero on extend anymore, truncate needs to be fixed up to zero the part of a file between i_size and the end of its cluster. Otherwise a subsequent extend could expose bad data. This introduces a new helper, which can be used in ocfs2_write(). Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
2007-02-16 11:46:50 -08:00 · 2007-02-16 11:46:50 -08:00 · 60b11392f1
commit 60b11392f1
parent 25baf2da14
7 changed files with 328 additions and 25 deletions
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@ -34,6 +35,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
 #include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@ -3342,6 +3344,228 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	return status;
 }
 /*
 * walk_page_buffers() callback used by ocfs2_zero_cluster_pages() when
 * ocfs2_should_order_data() is false for the inode: flag the freshly
 * zeroed buffer uptodate and dirty. The journal handle is not used.
 * Always returns 0.
 */
 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	int status = 0;
 	/* The buffer now holds valid (zeroed) data that must reach disk. */
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
 	return status;
 }
 /*
 * walk_page_buffers() callback used by ocfs2_zero_cluster_pages() when
 * ocfs2_should_order_data() is true for the inode: flag the freshly
 * zeroed buffer uptodate and dirty, then hand it to
 * ocfs2_journal_dirty_data() on the given handle.
 *
 * Returns whatever ocfs2_journal_dirty_data() returns.
 */
 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	int status;
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
 	status = ocfs2_journal_dirty_data(handle, bh);
 	return status;
 }
 /*
 * Zero the tail of an allocated cluster, from isize up to the end of the
 * cluster, in the page cache and dirty the buffers that were zeroed.
 *
 * @inode:    inode being truncated; its superblock must have sparse
 *            allocation enabled (enforced with BUG_ON)
 * @isize:    new file size; only its offset within the first page is used
 * @pages:    pages covering the cluster tail; they are expected to arrive
 *            locked and referenced, because every one of them is unlocked,
 *            marked accessed and released before this function returns
 * @numpages: number of valid entries in @pages (0 is allowed)
 * @phys:     physical block number handed to ocfs2_map_page_blocks()
 * @handle:   journal handle passed through to the per-buffer callbacks
 *
 * Failures from the block mapping and the buffer walks are logged with
 * mlog_errno() and are not reported to the caller.
 */
 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
 				     struct page **pages, int numpages,
 				     u64 phys, handle_t *handle)
 {
 	int i, ret, partial;
 	void *kaddr;
 	struct page *page;
 	unsigned int from, to = PAGE_CACHE_SIZE;
 	struct super_block *sb = inode->i_sb;
 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
 	if (numpages == 0)
 		goto out;
 	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
 	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
 		/*
 		 * Since 'from' has been capped to a value below page
 		 * size, this calculation won't be able to overflow
 		 * 'to'
 		 */
 		to = ocfs2_align_bytes_to_clusters(sb, from);
 		/*
 		 * The truncate tail in this case should never contain
 		 * more than one page at maximum. The loop below also
 		 * assumes this.
 		 */
 		BUG_ON(numpages != 1);
 	}
 	for(i = 0; i < numpages; i++) {
 		page = pages[i];
 		/*
 		 * walk_page_buffers() only ever sets 'partial', it never
 		 * clears it. Start every page from a clean state so that
 		 * a partially uptodate page does not prevent the pages
 		 * after it from being marked uptodate.
 		 */
 		partial = 0;
 		BUG_ON(from > PAGE_CACHE_SIZE);
 		BUG_ON(to > PAGE_CACHE_SIZE);
 		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
 		if (ret)
 			mlog_errno(ret);
 		kaddr = kmap_atomic(page, KM_USER0);
 		memset(kaddr + from, 0, to - from);
 		kunmap_atomic(kaddr, KM_USER0);
 		/*
 		 * Need to set the buffers we zero'd into uptodate
 		 * here if they aren't - ocfs2_map_page_blocks()
 		 * might've skipped some
 		 */
 		if (ocfs2_should_order_data(inode)) {
 			ret = walk_page_buffers(handle,
 						page_buffers(page),
 						from, to, &partial,
 						ocfs2_ordered_zero_func);
 			if (ret < 0)
 				mlog_errno(ret);
 		} else {
 			ret = walk_page_buffers(handle, page_buffers(page),
 						from, to, &partial,
 						ocfs2_writeback_zero_func);
 			if (ret < 0)
 				mlog_errno(ret);
 		}
 		/* Only a page with no stale buffers may be marked uptodate. */
 		if (!partial)
 			SetPageUptodate(page);
 		flush_dcache_page(page);
 		/*
 		 * Every page after the 1st one should be completely zero'd.
 		 */
 		from = 0;
 	}
 out:
 	/* Drop the lock and the reference on every page handed to us. */
 	if (pages) {
 		for (i = 0; i < numpages; i++) {
 			page = pages[i];
 			unlock_page(page);
 			mark_page_accessed(page);
 			page_cache_release(page);
 		}
 	}
 }
 /*
 * Collect the page cache pages that cover the range from isize to the end
 * of the cluster containing isize.
 *
 * @inode: inode being truncated (sparse allocation must be enabled)
 * @isize: new file size
 * @pages: caller-provided array that receives the grabbed pages
 * @num:   set to the number of pages stored in @pages; 0 on any failure,
 *         when isize is cluster aligned, or when the tail is a hole
 * @phys:  receives the block number that ocfs2_extent_map_get_blocks()
 *         reports for the block containing isize
 *
 * Returns 0 on success, the error from ocfs2_extent_map_get_blocks(), or
 * -ENOMEM when a page cannot be grabbed. On failure every page grabbed so
 * far is unlocked and released again.
 */
 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
 				int *num, u64 *phys)
 {
 	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned int csize = OCFS2_SB(sb)->s_clustersize;
 	unsigned long index;
 	u64 cluster_end_bytes, end_index;
 	int grabbed = 0, status = 0;
 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
 	/* Already cluster aligned: there is no tail, so no pages are needed. */
 	if (!(isize & (csize - 1)))
 		goto done;
 	status = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
 					     phys, NULL);
 	if (status) {
 		mlog_errno(status);
 		goto done;
 	}
 	/* A zero block number means the tail is a hole - nothing to zero. */
 	if (!*phys)
 		goto done;
 	cluster_end_bytes = ocfs2_align_bytes_to_clusters(sb, isize);
 	end_index = cluster_end_bytes >> PAGE_CACHE_SHIFT;
 	index = isize >> PAGE_CACHE_SHIFT;
 	/* At least one page is always grabbed, even when end_index == index. */
 	for (;;) {
 		pages[grabbed] = grab_cache_page(mapping, index);
 		if (!pages[grabbed]) {
 			status = -ENOMEM;
 			mlog_errno(status);
 			goto done;
 		}
 		grabbed++;
 		index++;
 		if (!(index < end_index))
 			break;
 	}
 done:
 	if (status != 0) {
 		int i;
 		/* Undo: hand back whatever was grabbed before the failure. */
 		for (i = 0; pages && i < grabbed; i++) {
 			if (!pages[i])
 				continue;
 			unlock_page(pages[i]);
 			page_cache_release(pages[i]);
 		}
 		grabbed = 0;
 	}
 	*num = grabbed;
 	return status;
 }
 /*
 * Zero the range between the new i_size and the end of its allocated
 * cluster, so that a later extend cannot expose stale, nonzero data.
 *
 * This has to run before i_size is updated on the inode: otherwise
 * block_write_full_page() will skip writeout of pages past i_size.
 * That is why the size is passed in as new_i_size.
 *
 * Returns 0 when there is nothing to do or on success, -ENOMEM if the
 * page array cannot be allocated, or the error reported by
 * ocfs2_grab_eof_pages() / filemap_fdatawrite().
 */
 int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
 				 u64 new_i_size)
 {
 	struct super_block *sb = inode->i_sb;
 	struct page **tail_pages;
 	int status, nr_pages;
 	u64 phys;
 	/*
 	 * Without sparse file support every extend zeroes, so there
 	 * is nothing to clean up here.
 	 */
 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
 		return 0;
 	/* Room for one full cluster worth of page pointers. */
 	tail_pages = kcalloc(ocfs2_pages_per_cluster(sb),
 			     sizeof(struct page *), GFP_NOFS);
 	if (!tail_pages) {
 		status = -ENOMEM;
 		mlog_errno(status);
 		return status;
 	}
 	status = ocfs2_grab_eof_pages(inode, new_i_size, tail_pages,
 				      &nr_pages, &phys);
 	if (status) {
 		mlog_errno(status);
 		goto free_pages;
 	}
 	/*
 	 * No pages means the new size is cluster aligned or the tail
 	 * is a hole - nothing more to do.
 	 */
 	if (nr_pages != 0) {
 		ocfs2_zero_cluster_pages(inode, new_i_size, tail_pages,
 					 nr_pages, phys, handle);
 		/*
 		 * Kick off writeout of the zeroed pages without waiting;
 		 * the truncate_inode_pages() call later will wait on
 		 * them for us.
 		 */
 		status = filemap_fdatawrite(inode->i_mapping);
 		if (status)
 			mlog_errno(status);
 	}
 free_pages:
 	kfree(tail_pages);
 	return status;
 }
 /*
 * It is expected, that by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@ -71,6 +71,8 @@ struct ocfs2_truncate_context {
 	struct buffer_head *tc_last_eb_bh;
 };
 int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
 				 u64 new_i_size);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *fe_bh,
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@ -308,13 +308,13 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 * functionality yet, but IMHO it's better to cut and paste the whole
 * thing so we can avoid introducing our own bugs (and easily pick up
 * their fixes when they happen) --Mark */
-static int walk_page_buffers(	handle_t *handle,
+int walk_page_buffers(	handle_t *handle,
-				struct buffer_head *head,
+			struct buffer_head *head,
-				unsigned from,
+			unsigned from,
-				unsigned to,
+			unsigned to,
-				int *partial,
+			int *partial,
-				int (*fn)(	handle_t *handle,
+			int (*fn)(	handle_t *handle,
-						struct buffer_head *bh))
+					struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
@ -654,9 +654,9 @@ static void ocfs2_clear_page_regions(struct page *page,
 *
 * This will also skip zeroing, which is handled externally.
 */
-static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
-				 struct inode *inode, unsigned int from,
+			  struct inode *inode, unsigned int from,
-				 unsigned int to, int new)
+			  unsigned int to, int new)
 {
 	int ret = 0;
 	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 		 * Ignore blocks outside of our i/o range -
 		 * they may belong to unallocated clusters.
 		 */
-		if (block_start >= to ||
+		if (block_start >= to || block_end <= from) {
 		    (block_start + bsize) <= from) {
 			if (PageUptodate(page))
 				set_buffer_uptodate(bh);
 			continue;
@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 	u64 v_blkno, p_blkno;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 	unsigned long index, start;
 	struct page **cpages;
@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
-	 * non-allocating write, or any writes where cluster size is
+	 * non allocating write, we just change the one
-	 * less than page size, we only need one page. Otherwise,
+	 * page. Otherwise, we'll need a whole clusters worth.
 	 * allocating writes of cluster size larger than page size
 	 * need cluster size pages.
 	 */
-	if (new && !wc->w_large_pages)
+	if (new)
-		numpages = (1 << cbits) / PAGE_SIZE;
+		numpages = ocfs2_pages_per_cluster(inode->i_sb);
 	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
 	if (!cpages) {
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 unsigned from,
 							 unsigned to);
 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 			  struct inode *inode, unsigned int from,
 			  unsigned int to, int new);
 int walk_page_buffers(	handle_t *handle,
 			struct buffer_head *head,
 			unsigned from,
 			unsigned to,
 			int *partial,
 			int (*fn)(	handle_t *handle,
 					struct buffer_head *bh));
 struct ocfs2_write_ctxt;
 typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
 				u64 *, unsigned int *, unsigned int *);
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 {
 	int status;
 	handle_t *handle;
 	struct ocfs2_dinode *di;
 	mlog_entry_void();
@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 	}
-	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+	status = ocfs2_journal_access(handle, inode, fe_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_commit;
 	}
 	/*
 	 * Do this before setting i_size.
 	 */
 	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
 	if (status) {
 		mlog_errno(status);
 		goto out_commit;
 	}
 	i_size_write(inode, new_i_size);
 	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	di = (struct ocfs2_dinode *) fe_bh->b_data;
 	di->i_size = cpu_to_le64(new_i_size);
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0)
 		mlog_errno(status);
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
 	mlog_exit(status);
 	return status;
 }
@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 		mlog_errno(status);
 		goto bail;
 	}
 	ocfs2_data_unlock(inode, 1);
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
 	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 	/* TODO: orphan dir cleanup here. */
 bail_unlock_data:
 	ocfs2_data_unlock(inode, 1);
 bail:
 	mlog_exit(status);
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 	int status = 0;
 	struct ocfs2_truncate_context *tc = NULL;
 	struct ocfs2_dinode *fe;
 	handle_t *handle = NULL;
 	mlog_entry_void();
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 	if (fe->i_clusters) {
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
 			mlog_errno(status);
 			goto out;
 		}
 		status = ocfs2_journal_access(handle, inode, fe_bh,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out;
 		}
 		i_size_write(inode, 0);
 		status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out;
 		}
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
 		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 		if (status < 0) {
 			mlog_errno(status);
@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 			goto out;
 		}
 	}
 out:
 out:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 	mlog_exit(status);
 	return status;
 }
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc
 	return index;
 }
 /*
 * Number of page cache pages needed to cover one cluster: 1 when the
 * cluster is no larger than a page, otherwise cluster size / page size.
 */
 static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
 {
 	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
 	/* A cluster that fits inside a single page still occupies one page. */
 	if (cbits <= PAGE_CACHE_SHIFT)
 		return 1;
 	return 1 << (cbits - PAGE_CACHE_SHIFT);
 }
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit