@@ -335,6 +335,27 @@
es->s_free_blocks_count =
cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
}
+
+ /* @@@ This prevents newly-allocated data from being
+ * freed and then reallocated within the same
+ * transaction.
+ *
+ * Ideally we would want to allow that to happen, but to
+ * do so requires making journal_forget() capable of
+ * revoking the queued write of a data block, which
+ * implies blocking on the journal lock. *forget()
+ * cannot block due to truncate races.
+ *
+ * Eventually we can fix this by making journal_forget()
+ * return a status indicating whether or not it was able
+ * to revoke the buffer. On successful revoke, it is
+ * safe not to set the allocation bit in the committed
+ * bitmap, because we know that there is no outstanding
+ * activity on the buffer any more and so it is safe to
+ * reallocate it.
+ */
+ J_ASSERT(bh->b_committed_data != NULL);
+ ext3_set_bit (bit + i, bh->b_committed_data);
}
+/* For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+ * prevents deletes from freeing up the block for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes.
+ *
+ * nr is the bit index to test within the bitmap held by bh.
+ *
+ * Returns 1 if bit nr is clear in bh->b_data and either there is no
+ * b_committed_data copy or the bit is clear there too; returns 0
+ * otherwise.
+ */
+static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+{
+ if (ext3_test_bit(nr, bh->b_data)) /* in use in the current bitmap */
+ return 0;
+ if (!bh->b_committed_data) /* no committed copy to check against */
+ return 1;
+ return !ext3_test_bit(nr, bh->b_committed_data); /* free only if the committed copy agrees */
+}
+
/*
* ext3_new_block uses a goal block to assist allocation. If the goal is
* free, or there is a free block within 32 blocks of the goal, that block
@@ -362,9 +407,11 @@
* each block group the search first looks for an entire free byte in the block
* bitmap, and then for any free bit if that fails.
*/
-int ext3_new_block (handle_t *handle,
- const struct inode * inode, unsigned long goal,
- u32 * prealloc_count, u32 * prealloc_block, int * err)
+struct buffer_head * ext3_new_block (handle_t *handle,
+ const struct inode * inode,
+ unsigned long goal,
+ u32 * prealloc_count,
+ u32 * prealloc_block, int * err)
{
struct buffer_head * bh;
struct buffer_head * bh2;
@@ -381,7 +428,7 @@
sb = inode->i_sb;
if (!sb) {
printk ("ext3_new_block: nonexistent device");
- return 0;
+ return NULL;
}
- /* @@@ This will eventually have to be a data-style operation,
- not metadata */
- mark_buffer_uptodate(bh, 1);
- journal_get_write_access(handle, bh);
memset(bh->b_data, 0, sb->s_blocksize);
- journal_dirty_metadata(handle, bh);
- brelse (bh);
+ mark_buffer_uptodate(bh, 1);
+ /* Don't mark it dirty --- the caller has to decide whether the new
+ * block needs to be journaled as data or metadata. */
--- linux-2.2.17.ext3-0.0.4a/fs/ext3/file.c.~1~ Thu Sep 7 14:26:44 2000
+++ linux-2.2.17.ext3-0.0.4a/fs/ext3/file.c Thu Sep 14 16:15:49 2000
@@ -165,6 +165,7 @@
int i, buffercount, write_error, new_buffer;
unsigned long limit;
handle_t *handle;
+ int will_journal_data;
/* POSIX: mtime/ctime may not change for 0 count */
if (!count)
@@ -210,12 +211,21 @@
offset = pos & (sb->s_blocksize - 1);
c = sb->s_blocksize - offset;
+ /* Record this now so that we don't get confused if the user
+ * changes the flag half-way through! */
+ will_journal_data = ext3_should_journal_data(inode);
+
/* How large a transaction might we need? We can always
* underestimate and grow later for really large writes */
- needed = (count >> EXT3_BLOCK_SIZE_BITS(sb)) + 1;
- if (needed > EXT3_MAX_TRANS_DATA)
- needed = EXT3_MAX_TRANS_DATA;
+ if (will_journal_data) {
+ needed = (count >> EXT3_BLOCK_SIZE_BITS(sb)) + 1;
+ if (needed > EXT3_MAX_TRANS_DATA)
+ needed = EXT3_MAX_TRANS_DATA;
+ } else
+ needed = 0;
+
handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
+
/* Check for overflow.. */
@@ -335,12 +345,17 @@
if (new_buffer) {
set_bit(BH_Lock, &bh->b_state);
- journal_get_create_access(handle, bh);
+ if (will_journal_data)
+ journal_get_create_access(handle, bh);
c -= copy_from_user (bh->b_data + offset, buf, c);
if (c != sb->s_blocksize) {
+ /* On getting an EFAULT mid-copy, we end
+ * up throwing away the whole block.
+ * Too bad. */
c = 0;
unlock_buffer(bh);
- journal_release_buffer(handle, bh);
+ if (will_journal_data)
+ journal_release_buffer(handle, bh);
brelse(bh);
if (!written)
written = -EFAULT;
@@ -353,14 +368,14 @@
ll_rw_block (READ, 1, &bh);
wait_on_buffer (bh);
if (!buffer_uptodate(bh)) {
- journal_release_buffer(handle, bh);
brelse (bh);
if (!written)
written = -EIO;
break;
}
}
- journal_get_write_access(handle, bh);
+ if (will_journal_data)
+ journal_get_write_access(handle, bh);
c -= copy_from_user (bh->b_data + offset, buf, c);
}
if (!c) {
@@ -370,7 +385,10 @@
written = -EFAULT;
break;
}
- journal_dirty_metadata(handle, bh);
+ if (will_journal_data)
+ journal_dirty_metadata(handle, bh);
+ else
+ journal_dirty_data(handle, bh);
update_vm_cache(inode, pos, bh->b_data + offset, c);
pos += c;
written += c;
--- linux-2.2.17.ext3-0.0.4a/fs/ext3/inode.c.~1~ Fri Sep 8 14:12:17 2000
+++ linux-2.2.17.ext3-0.0.4a/fs/ext3/inode.c Wed Sep 13 18:43:27 2000
@@ -141,8 +141,9 @@
#endif
}
-static int ext3_alloc_block (handle_t *handle, struct inode * inode,
- unsigned long goal, int * err)
+static struct buffer_head * ext3_alloc_block (handle_t *handle,
+ struct inode * inode,
+ unsigned long goal, int * err)
{
#ifdef EXT3FS_DEBUG
static unsigned long alloc_hits = 0, alloc_attempts = 0;
@@ -299,10 +300,11 @@
ext3_debug ("goal = %d.\n", goal);
- tmp = ext3_alloc_block (handle, inode, goal, err);
- if (!tmp)
+ result = ext3_alloc_block (handle, inode, goal, err);
+ if (!result)
return NULL;
- result = getblk (inode->i_dev, tmp, inode->i_sb->s_blocksize);
+ tmp = result->b_blocknr;
+
if (*p) {
ext3_free_blocks (handle, inode, tmp, 1);
brelse (result);
@@ -370,12 +372,13 @@
if (!goal)
goal = bh->b_blocknr;
}
- tmp = ext3_alloc_block (handle, inode, goal, err);
- if (!tmp) {
+ result = ext3_alloc_block (handle, inode, goal, err);
+ if (!result) { /* allocation failed: still drop our reference on bh */
brelse (bh);
return NULL;
}
- result = getblk (bh->b_dev, tmp, blocksize);
+ tmp = result->b_blocknr;
+
journal_get_write_access(handle, bh);
if (le32_to_cpu(*p)) {
/* @@@ Major danger here: we are using up more and more
--- linux-2.2.17.ext3-0.0.4a/fs/ext3/truncate.c.~1~ Thu Sep 7 14:26:44 2000
+++ linux-2.2.17.ext3-0.0.4a/fs/ext3/truncate.c Thu Sep 14 16:16:50 2000
@@ -103,16 +103,6 @@
*/
-static inline void ext3_bforget(struct buffer_head *buf)
-{
- if (buf) {
- J_ASSERT(buf->b_cp_transaction == NULL);
- J_ASSERT(buf->b_jlist == BJ_None);
- J_ASSERT(!test_bit(BH_JWrite, &buf->b_state));
- __bforget(buf);
- }
-}
-
/*
* The journaling doesn't have to break the rules above, as long as we
* do a journal_get_write_access() on the appropriate indirect blocks
--- linux-2.2.17.ext3-0.0.4a/fs/jfs/commit.c.~1~ Thu Sep 7 14:26:44 2000
+++ linux-2.2.17.ext3-0.0.4a/fs/jfs/commit.c Mon Sep 11 15:41:28 2000
@@ -138,7 +138,6 @@
if (bh) do {
if (buffer_dirty(bh) && !buffer_locked(bh)) {
- set_bit(BH_JWrite, &bh->b_state);
wbuf[bufs++] = bh;
}
bh = bh->b_tnext;
@@ -162,7 +161,6 @@
bh = commit_transaction->t_datalist;
if (bh) do {
- clear_bit(BH_JWrite, &bh->b_state);
if (buffer_locked(bh)) {
unlock_journal(journal);
wait_on_buffer(bh);
--- linux-2.2.17.ext3-0.0.4a/fs/jfs/journal.c.~1~ Thu Sep 7 14:26:44 2000
+++ linux-2.2.17.ext3-0.0.4a/fs/jfs/journal.c Mon Sep 11 16:05:07 2000
@@ -340,16 +340,26 @@
/* The call to lock_buffer() above should be the only place we ever lock
- * a buffer which is being journaled (ignoring the checkpoint lists). */
+ * a buffer which is being journaled (ignoring the checkpoint lists).
+ *
+ * @@@ This is heavily dependent on the big kernel lock in 2.2! */
/* We are not allowed to forget the dirty status on any buffer which is
--- linux-2.2.17.ext3-0.0.4a/fs/jfs/transaction.c.~1~ Thu Sep 7 14:26:44 2000
+++ linux-2.2.17.ext3-0.0.4a/fs/jfs/transaction.c Thu Sep 14 16:13:53 2000
@@ -491,6 +491,10 @@
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
+ /* The buffer may already belong to this transaction due to
+ pre-zeroing in the filesystem's new_block code */
+ J_ASSERT (bh->b_transaction == transaction || bh->b_transaction == NULL);
+
J_ASSERT (buffer_locked(bh));
lock_journal(journal);
@@ -511,7 +515,7 @@
* rewindable consequences
*
* Sometimes there is a need to distinguish between metadata which has
- * been committed to disk and that which has not. The ext2fs code uses
+ * been committed to disk and that which has not. The ext3fs code uses
* this for freeing and allocating space: we have to make sure that we
* do not reuse freed space until the deallocation has been committed,
* since if we overwrote that space we would make the delete
@@ -664,13 +668,11 @@
/*
* journal_release_buffer: undo a get_write_access without any buffer
- * updates, if the transaction decided in the end that it didn't need
- * access.
+ * updates, if the update decided in the end that it didn't need access.
*
* journal_get_write_access() can block, so it is quite possible for a
* journaling component to decide after the write access is returned
- * that global state has changed and the update is no longer required.
- */
+ * that global state has changed and the update is no longer required. */
void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
{
@@ -682,7 +684,8 @@
* transaction, then it is safe to release it. In all other
* cases, just leave the buffer as it is. */
/* symlink.c */
extern struct inode_operations ext3_symlink_inode_operations;
+
+/* Decide whether data written to this inode should be journaled.
+ *
+ * Currently a stub: it always returns 0 and does not look at inode.
+ *
+ * @@@ Fix this in the future to allow data-journaling to be re-enabled
+ * per-inode or per-filesystem */
+static inline int ext3_should_journal_data(struct inode *inode)
+{
+ return 0; /* data journaling is unconditionally off for now */
+}