+++ linux-2.2.17pre9.ext3-0.0.2e/fs/buffer.c Thu Jun 29 20:41:30 2000

--- linux-2.2.17pre9.ext3-0.0.2e/fs/buffer.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/buffer.c Thu Jun 29 20:41:30 2000
@@ -234,9 +234,9 @@
bh->b_count++;
next->b_count++;
bh->b_flushtime = 0;
- ll_rw_block(WRITE, 1, &bh);
J_ASSERT(!bh->b_transaction);
J_ASSERT(bh->b_jlist == 0);
+ ll_rw_block(WRITE, 1, &bh);
bh->b_count--;
next->b_count--;
retry = 1;
@@ -1516,7 +1516,7 @@
if (buffer_locked(p)) {
if (wait)
__wait_on_buffer(p);
- } else if (buffer_dirty(p))
+ } else if (buffer_dirty(p) && !p->b_jlist)
ll_rw_block(WRITE, 1, &p);
} while (tmp != bh);

@@ -1790,9 +1790,9 @@
#ifdef DEBUG
if(nlist != BUF_DIRTY) ncount++;
#endif
- ll_rw_block(WRITE, 1, &bh);
J_ASSERT(!bh->b_transaction);
J_ASSERT(bh->b_jlist == 0);
+ ll_rw_block(WRITE, 1, &bh);
bh->b_count--;
next->b_count--;
}
@@ -1951,6 +1951,8 @@
bh->b_count++;
ndirty++;
bh->b_flushtime = 0;
+ J_ASSERT(!bh->b_transaction);
+ J_ASSERT(bh->b_jlist == 0);
if (major == LOOP_MAJOR) {
ll_rw_block(wrta_cmd,1, &bh);
wrta_cmd = WRITEA;
@@ -1959,8 +1961,6 @@
}
else
ll_rw_block(WRITE, 1, &bh);
- J_ASSERT(!bh->b_transaction);
- J_ASSERT(bh->b_jlist == 0);
#ifdef DEBUG
if(nlist != BUF_DIRTY) ncount++;
#endif
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/balloc.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/balloc.c Thu Jun 29 21:29:39 2000
@@ -214,7 +214,7 @@
*/
if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 &&
sb->u.ext3_sb.s_block_bitmap_number[0] == block_group &&
- sb->u.ext3_sb.s_block_bitmap[block_group]) {
+ sb->u.ext3_sb.s_block_bitmap[0]) {
return 0;
}
/*
@@ -611,6 +611,8 @@
unlock_super (sb);
return 0;
}
+ if (!buffer_uptodate(bh))
+ wait_on_buffer(bh);

/* @@@ This will eventually have to be a data-style operation,
not metadata */
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/dir.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/dir.c Thu Jun 29 17:36:49 2000
@@ -196,15 +196,18 @@
* version stamp to detect whether or
* not the directory has been modified
* during the copy operation.
+ * AV: It can't be modified, but it fscking
+ * can be seeked by another process that shares
+ * the descriptor.
*/
- unsigned long version = inode->i_version;
+ unsigned long version = filp->f_version;

error = filldir(dirent, de->name,
de->name_len,
filp->f_pos, le32_to_cpu(de->inode));
if (error)
break;
- if (version != inode->i_version)
+ if (version != filp->f_version)
goto revalidate;
stored ++;
}
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/file.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/file.c Wed Jul 5 14:50:32 2000
@@ -216,8 +216,7 @@
needed = (count >> EXT3_BLOCK_SIZE_BITS(sb)) + 1;
if (needed > EXT3_MAX_TRANS_DATA)
needed = EXT3_MAX_TRANS_DATA;
- handle = journal_start(EXT3_JOURNAL(inode),
- EXT3_DATA_TRANS_BLOCKS + needed);
+ handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);

/* Check for overflow.. */

@@ -297,14 +296,13 @@
if (journal_extend(handle, needed)) {
/* Couldn't extend: OK, commit the current
* transaction and start a new one. */
- if (pos > inode->i_size)
- inode->i_size = pos;
+ if (pos > inode->u.ext3_i.i_disksize)
+ inode->u.ext3_i.i_disksize = pos;
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
ext3_mark_inode_dirty(handle, inode);
- journal_stop(handle);
- handle = journal_start(EXT3_JOURNAL(inode),
- EXT3_DATA_TRANS_BLOCKS
- + needed);
+ ext3_journal_stop(handle, inode);
+ handle = ext3_journal_start
+ (inode, EXT3_DATA_TRANS_BLOCKS + needed);
}
}

@@ -417,13 +415,16 @@
if (filp->f_flags & O_SYNC)
handle->h_sync = 1;

- if (pos > inode->i_size)
+ if (pos > inode->i_size) {
inode->i_size = pos;
+ inode->u.ext3_i.i_disksize = pos;
+ }
+
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
*ppos = pos;
ext3_mark_inode_dirty(handle, inode);
error_out:
- journal_stop(handle);
+ ext3_journal_stop(handle, inode);
return written;
}

--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/fsync.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/fsync.c Tue Jul 4 15:00:23 2000
@@ -46,10 +46,21 @@
if (!bh)
return 0;
if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
- brelse (bh);
- return -1;
+ /* There can be a parallel read(2) that started read-I/O
+ on the buffer so we can't assume that there's been
+		   an I/O error without first waiting for I/O completion. */
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ {
+ brelse (bh);
+ return -1;
+ }
}
if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
+ if (wait)
+ /* when we return from fsync all the blocks
+ must be _just_ stored on disk */
+ wait_on_buffer(bh);
brelse (bh);
return 0;
}
@@ -262,7 +273,7 @@
struct inode *inode = dentry->d_inode;
handle_t *handle;

- handle = journal_start(EXT3_JOURNAL(inode), 1); /* @@@ Error? */
+ handle = ext3_journal_start(inode, 1); /* @@@ Error? */
handle->h_sync = 1;

if (S_ISLNK(inode->i_mode) && !(inode->i_blocks))
@@ -289,6 +300,6 @@

skip:
err |= ext3_mark_inode_dirty (handle, inode);
- journal_stop(handle);
+ ext3_journal_stop(handle, inode);
return err ? -EIO : 0;
}
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ialloc.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ialloc.c Fri Jun 30 11:06:36 2000
@@ -270,21 +270,6 @@
}

/*
- * This function increments the inode version number
- *
- * This may be used one day by the NFS server
- */
-static void inc_inode_version (struct inode * inode,
- struct ext3_group_desc *gdp,
- int mode)
-{
- inode->u.ext3_i.i_version++;
- mark_inode_dirty(inode);
-
- return;
-}
-
-/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -497,13 +482,15 @@
inode->u.ext3_i.i_file_acl = 0;
inode->u.ext3_i.i_dir_acl = 0;
inode->u.ext3_i.i_dtime = 0;
+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
inode->u.ext3_i.i_block_group = i;
inode->i_op = NULL;
if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
inode->i_flags |= MS_SYNCHRONOUS;
insert_inode_hash(inode);
+ inode->i_generation = inode_generation_count++;
+ inode->u.ext3_i.i_version = inode->i_generation;
ext3_mark_inode_dirty(handle, inode);
- inc_inode_version (inode, gdp, mode);

unlock_super (sb);
if(DQUOT_ALLOC_INODE(sb, inode)) {
@@ -516,6 +503,51 @@
ext3_debug ("allocating inode %lu\n", inode->i_ino);

*err = 0;
+ return inode;
+}
+
+/* Validate an on-disk orphan entry: returns the inode, or NULL if rejected */
+struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
+{
+ ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
+ unsigned long block_group;
+ unsigned long bit;
+ int bitmap_nr;
+ struct buffer_head *bh;
+ struct inode *inode = NULL;
+
+ /* Error cases - e2fsck has already cleaned up for us */
+ if (ino > max_ino) {
+ ext3_warning(sb, __FUNCTION__,
+ "bad orphan ino %ld! e2fsck was run?\n", ino);
+ return NULL;
+ }
+
+ block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); /* group holding ino (numbers are 1-based) */
+ bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); /* bit index within that group's bitmap */
+ if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
+ !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
+ ext3_warning(sb, __FUNCTION__,
+ "inode bitmap error for orphan %ld\n", ino);
+ return NULL;
+ }
+
+ /* Having the inode bit set should be a 100% indicator that this
+ * is a valid orphan (no e2fsck run on fs). Orphans also include
+ * inodes that were being truncated, so we can't check i_nlink==0.
+ */
+ if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
+ is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { /* NEXT_ORPHAN() is the i_dtime field */
+ ext3_warning(sb, __FUNCTION__,
+ "bad orphan inode %ld! e2fsck was run?\n", ino);
+
+ /* Avoid freeing blocks if we got a bad deleted inode */
+ if (inode && inode->i_nlink == 0)
+ inode->i_blocks = 0;
+ iput(inode); /* NOTE(review): inode is still NULL if the bitmap test or iget() failed; relies on iput() accepting NULL - confirm */
+ return NULL;
+ }
+
return inode;
}

--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/inode.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/inode.c Wed Jul 5 14:48:31 2000
@@ -42,6 +42,40 @@
}

/*
+ * ext3_orphan_del() takes a cleaned-up inode off the orphan list (a no-op
+ * if it is not on it) and repoints the on-disk link that referenced it. */
+void ext3_orphan_del(handle_t *handle, struct inode *inode)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+	struct list_head *prev;
+	ino_t ino_next;
+
+	lock_super(inode->i_sb);
+	if (list_empty(&inode->u.ext3_i.i_orphan))
+		goto out_unlock;
+	prev = inode->u.ext3_i.i_orphan.prev;	/* stable: super is locked */
+	ino_next = NEXT_ORPHAN(inode);
+	jfs_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+	list_del(&inode->u.ext3_i.i_orphan);
+	INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); /* list_empty() again */
+	if (prev == &sbi->s_orphan) {
+		jfs_debug(4, "superblock will point to %ld\n", ino_next);
+		journal_get_write_access(handle, sbi->s_sbh);
+		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+		journal_dirty_metadata(handle, sbi->s_sbh);
+	} else {
+		struct inode *i_prev =
+			list_entry(prev, struct inode, u.ext3_i.i_orphan);
+		jfs_debug(4, "orphan inode %ld will point to %ld\n",
+			  i_prev->i_ino, ino_next);
+		NEXT_ORPHAN(i_prev) = ino_next;
+		ext3_mark_inode_dirty(handle, i_prev);
+	}
+out_unlock:
+	unlock_super(inode->i_sb);
+}
+
+/*
* Called at the last iput() if i_nlink is zero.
*/
void ext3_delete_inode (struct inode * inode)
@@ -51,19 +85,24 @@
if (inode->i_ino == EXT3_ACL_IDX_INO ||
inode->i_ino == EXT3_ACL_DATA_INO)
return;
- inode->u.ext3_i.i_dtime = CURRENT_TIME;

- handle = journal_start(EXT3_JOURNAL(inode),
- EXT3_DELETE_TRANS_BLOCKS);
+ /* When we delete an inode, we increment its i_version. If it
+ is ever read in from disk again, it will have a different
+ i_version. */
+ inode->u.ext3_i.i_version++;
+
+ handle = ext3_journal_start(inode, EXT3_DELETE_TRANS_BLOCKS);

if (IS_SYNC(inode))
handle->h_sync = 1;
- ext3_mark_inode_dirty(handle, inode);
inode->i_size = 0;
if (inode->i_blocks)
ext3_truncate (inode);
- ext3_free_inode (handle, inode);
- journal_stop(handle);
+ ext3_orphan_del(handle, inode);
+ inode->u.ext3_i.i_dtime = CURRENT_TIME;
+ ext3_mark_inode_dirty(handle, inode);
+ ext3_free_inode(handle, inode);
+ ext3_journal_stop(handle, inode);
}

#define inode_bmap(inode, nr) ((inode)->u.ext3_i.i_data[(nr)])
@@ -129,6 +168,8 @@
"cannot get block %lu", result);
return 0;
}
+ if (!buffer_uptodate(bh))
+ wait_on_buffer(bh);
/* @@@ Once we start journaling data separately, this
needs to become dependent on the type of inode we
are allocating inside */
@@ -579,10 +620,13 @@
<< 32;
#endif
}
+ inode->u.ext3_i.i_disksize = inode->i_size;
inode->u.ext3_i.i_version = le32_to_cpu(iloc.raw_inode->i_version);
+ inode->i_generation = inode->u.ext3_i.i_version;
inode->u.ext3_i.i_block_group = iloc.block_group;
inode->u.ext3_i.i_next_alloc_block = 0;
inode->u.ext3_i.i_next_alloc_goal = 0;
+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
if (inode->u.ext3_i.i_prealloc_count)
ext3_error (inode->i_sb, "ext3_read_inode",
"New inode has non-zero prealloc count!");
@@ -656,7 +700,7 @@
raw_inode->i_uid = cpu_to_le16(inode->i_uid);
raw_inode->i_gid = cpu_to_le16(inode->i_gid);
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- raw_inode->i_size = cpu_to_le32(inode->i_size);
+ raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
raw_inode->i_atime = cpu_to_le32(inode->i_atime);
raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
@@ -674,7 +718,7 @@
raw_inode->i_size_high =
cpu_to_le32(inode->u.ext3_i.i_high_size);
#else
- raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32);
+ raw_inode->i_size_high = cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
#endif
}
raw_inode->i_version = cpu_to_le32(inode->u.ext3_i.i_version);
@@ -813,14 +857,14 @@
struct buffer_head *bh = sb->u.ext3_sb.s_sbh;

/* @@@ Error, null handle? */
- handle = journal_start(EXT3_JOURNAL(inode), 1);
+ handle = ext3_journal_start(inode, 1);

/* If this is the first large file
* created, add a flag to the superblock */
es->s_feature_ro_compat |=
cpu_to_le32(EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
journal_dirty_metadata(handle, bh); /*@@@err*/
- journal_stop(handle);
+ ext3_journal_stop(handle, inode);
}
}
#endif
@@ -835,7 +879,7 @@
* required is one. */

/* @@@ Error, null handle? */
- handle = journal_start(EXT3_JOURNAL(inode), 1);
+ handle = ext3_journal_start(inode, 1);
retval = ext3_reserve_inode_write(handle, inode, &iloc);
if (retval)
goto out_stop;
@@ -877,7 +921,7 @@
retval = ext3_mark_iloc_dirty(handle, inode, &iloc);

out_stop:
- journal_stop(handle);
+ ext3_journal_stop(handle, inode);
out:
return retval;
}
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ioctl.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ioctl.c Thu Jun 29 21:36:18 2000
@@ -77,6 +77,7 @@
if (get_user(inode->u.ext3_i.i_version, (int *) arg))
return -EFAULT;
inode->i_ctime = CURRENT_TIME;
+ inode->i_generation = inode->u.ext3_i.i_version;
mark_inode_dirty(inode);
return 0;
default:
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/namei.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/namei.c Tue Jul 4 15:35:08 2000
@@ -373,8 +373,7 @@
handle_t *handle;
int err = -EIO;

- handle = journal_start(EXT3_JOURNAL(dir),
- EXT3_DATA_TRANS_BLOCKS + 3);
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);

/*
* N.B. Several error exits in ext3_new_inode don't set err.
@@ -406,7 +405,7 @@
d_instantiate(dentry, inode);

out:
- journal_stop(handle);
+	ext3_journal_stop(handle, dir);	/* inode may be NULL on error exits */
return err;
}

@@ -418,8 +417,7 @@
int err = -EIO;
handle_t *handle = 0;

- handle = journal_start(EXT3_JOURNAL(dir),
- EXT3_DATA_TRANS_BLOCKS + 3);
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);

inode = ext3_new_inode (handle, dir, mode, &err);
if (!inode)
@@ -438,6 +436,10 @@
if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb,
EXT3_FEATURE_INCOMPAT_FILETYPE))
de->file_type = EXT3_FT_REG_FILE;
+ } else if (S_ISSOCK(inode->i_mode)) {
+ if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb,
+ EXT3_FEATURE_INCOMPAT_FILETYPE))
+ de->file_type = EXT3_FT_SOCK;
} else if (S_ISCHR(inode->i_mode)) {
inode->i_op = &chrdev_inode_operations;
if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb,
@@ -463,7 +465,7 @@
d_instantiate(dentry, inode);
brelse(bh);
out_stop:
- journal_stop(handle);
+	ext3_journal_stop(handle, dir);	/* inode may be NULL on error exits */
return err;

out_no_entry:
@@ -485,8 +487,7 @@
if (dir->i_nlink >= EXT3_LINK_MAX)
goto out;

- handle = journal_start(EXT3_JOURNAL(dir),
- EXT3_DATA_TRANS_BLOCKS + 4);
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 4);
err = -EIO;
inode = ext3_new_inode (handle, dir, S_IFDIR, &err);
if (!inode)
@@ -545,7 +546,7 @@
err = 0;

out_stop:
- journal_stop(handle);
+	ext3_journal_stop(handle, dir);	/* inode may be NULL on error exits */
out:
return err;

@@ -628,8 +629,7 @@
struct ext3_dir_entry_2 * de;
handle_t *handle;

- handle = journal_start(EXT3_JOURNAL(dir),
- EXT3_DELETE_TRANS_BLOCKS);
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);

retval = -ENOENT;
bh = ext3_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &de);
@@ -669,11 +669,41 @@
handle->h_sync = 1;

end_rmdir:
- journal_stop(handle);
+ ext3_journal_stop(handle, dir);
brelse (bh);
return retval;
}

+/* ext3_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext3_orphan_cleanup().
+ */
+void ext3_orphan_add(handle_t *handle, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ lock_super(sb);
+ if (!list_empty(&inode->u.ext3_i.i_orphan)) { /* already on the in-memory orphan list: nothing to do */
+ unlock_super(sb);
+ return;
+ }
+ journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); /* new entry points at the old list head */
+ EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); /* superblock now points at this inode */
+ journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); /* mirror the on-disk insertion in memory */
+ ext3_mark_inode_dirty(handle, inode);
+ unlock_super(sb);
+
+ jfs_debug(4, "superblock will point to %ld\n", inode->i_ino);
+ jfs_debug(4, "orphan inode %ld will point to %d\n",
+ inode->i_ino, NEXT_ORPHAN(inode));
+}
+
int ext3_unlink(struct inode * dir, struct dentry *dentry)
{
int retval;
@@ -682,8 +712,7 @@
struct ext3_dir_entry_2 * de;
handle_t *handle;

- handle = journal_start(EXT3_JOURNAL(dir),
- EXT3_DELETE_TRANS_BLOCKS);
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);

retval = -ENOENT;
bh = ext3_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &de);
@@ -713,13 +742,15 @@
dir->u.ext3_i.i_flags &= ~EXT3_BTREE_FL;
ext3_mark_inode_dirty(handle, dir);
inode->i_nlink--;
+ if (!inode->i_nlink)
+ ext3_orphan_add(handle, inode);
ext3_mark_inode_dirty(handle, inode);
inode->i_ctime = dir->i_ctime;
retval = 0;
d_delete(dentry); /* This also frees the inode */

end_unlink:
- journal_stop(handle);
+ ext3_journal_stop(handle, dir);
brelse (bh);
return retval;
}
@@ -734,8 +765,7 @@
char c;
handle_t *handle;

- handle = journal_start(EXT3_JOURNAL(dir),
- EXT3_DATA_TRANS_BLOCKS + 5);
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);

if (!(inode = ext3_new_inode (handle, dir, S_IFLNK, &err)))
goto out;
@@ -788,7 +818,7 @@
d_instantiate(dentry, inode);
err = 0;
out:
- journal_stop(handle);
+ ext3_journal_stop(handle, dir);
return err;

out_no_entry:
@@ -813,8 +843,7 @@
if (inode->i_nlink >= EXT3_LINK_MAX)
return -EMLINK;

- handle = journal_start(EXT3_JOURNAL(dir),
- EXT3_DATA_TRANS_BLOCKS);
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);

bh = ext3_add_entry (handle, dir, dentry->d_name.name, dentry->d_name.len, &de, &err);
if (!bh)
@@ -829,6 +858,8 @@
de->file_type = EXT3_FT_DIR;
else if (S_ISLNK(inode->i_mode))
de->file_type = EXT3_FT_SYMLINK;
+ else if (S_ISSOCK(inode->i_mode))
+ de->file_type = EXT3_FT_SOCK;
else if (S_ISCHR(inode->i_mode))
de->file_type = EXT3_FT_CHRDEV;
else if (S_ISBLK(inode->i_mode))
@@ -847,7 +878,7 @@
d_instantiate(dentry, inode);
err = 0;
out:
- journal_stop(handle);
+ ext3_journal_stop(handle, dir);
return err;
}

@@ -870,8 +901,7 @@

old_bh = new_bh = dir_bh = NULL;

- handle = journal_start(EXT3_JOURNAL(old_dir),
- 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);

old_bh = ext3_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len, &old_de);
/*
@@ -972,6 +1002,6 @@
brelse (dir_bh);
brelse (old_bh);
brelse (new_bh);
- journal_stop(handle);
+ ext3_journal_stop(handle, old_dir);
return retval;
}
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/super.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/super.c Tue Jul 4 23:47:07 2000
@@ -44,6 +44,8 @@
static void ext3_commit_super (struct super_block * sb,
struct ext3_super_block * es,
int sync);
+static void ext3_mark_recovery_complete(struct super_block * sb,
+ struct ext3_super_block * es);

void ext3_error (struct super_block * sb, const char * function,
const char * fmt, ...)
@@ -137,6 +139,8 @@
brelse (EXT3_SB(sb)->s_block_bitmap[i]);
brelse (EXT3_SB(sb)->s_sbh);

+ J_ASSERT (list_empty(&EXT3_SB(sb)->s_orphan));
+
MOD_DEC_USE_COUNT;
return;
}
@@ -197,7 +201,7 @@
else if (!strcmp (this_char, "errors")) {
if (!value || !*value) {
printk ("EXT3-fs: the errors option requires "
- "an argument");
+ "an argument\n");
return 0;
}
if (!strcmp (value, "continue")) {
@@ -414,6 +418,73 @@
return 1;
}

+/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were either unlinked but still held open, or in the
+ * middle of a truncate, at the time of a crash. We delete the former and
+ * finish truncating the latter; MS_RDONLY is cleared while we do so.
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext3_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+static void ext3_orphan_cleanup (struct super_block * sb,
+ struct ext3_super_block * es)
+{
+ unsigned int s_flags = sb->s_flags; /* saved so the original flags can be restored on exit */
+ int nr_orphans = 0, nr_truncates = 0;
+ if (!es->s_last_orphan) {
+ jfs_debug(4, "no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (s_flags & MS_RDONLY) {
+ printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on read-only fs\n",
+ kdevname(sb->s_dev));
+ sb->s_flags &= ~MS_RDONLY;
+ }
+
+ while (es->s_last_orphan) {
+ struct inode *inode;
+
+ if (!(inode =
+ ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+ es->s_last_orphan = 0; /* head entry rejected: abandon the rest of the chain */
+ break;
+ }
+
+ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); /* in-memory list entry, as during normal operation */
+ if (inode->i_nlink) {
+ jfs_debug(2, "truncating inode %ld to %ld bytes\n",
+ inode->i_ino, inode->i_size);
+ ext3_truncate(inode);
+ nr_truncates++;
+ } else {
+ jfs_debug(2, "deleting unreferenced inode %ld\n",
+ inode->i_ino);
+ nr_orphans++;
+ }
+ iput(inode); /* The delete magic happens here! */
+ }
+
+#define PLURAL(x) (x), ((x)==1) ? "" : "s" /* expands to two printk arguments: the count and its suffix */
+
+ if (nr_orphans)
+ printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
+ kdevname(sb->s_dev), PLURAL(nr_orphans));
+ if (nr_truncates)
+ printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
+ kdevname(sb->s_dev), PLURAL(nr_truncates));
+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+}
+
#define log2(n) ffz(~(n))

struct super_block * ext3_read_super (struct super_block * sb, void * data,
@@ -670,6 +741,7 @@
*/
sb->s_dev = dev;
sb->s_op = &ext3_sops;
+ INIT_LIST_HEAD(&sb->u.ext3_sb.s_orphan); /* unlinked but open files */
unlock_super (sb);

err = 0;
@@ -699,7 +771,10 @@
sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO), NULL);
if (!sb->s_root)
goto error_out;
- ext3_setup_super (sb, es);
+ ext3_setup_super(sb, es);
+ ext3_orphan_cleanup(sb, es);
+ ext3_mark_recovery_complete(sb, es);
+ printk (KERN_INFO "EXT3-fs: recovery complete.\n");
return sb;

error_out:
@@ -755,7 +830,7 @@
* can get read-write access to the device.
*/

- if (es->s_feature_incompat & EXT3_FEATURE_INCOMPAT_RECOVER) {
+ if (es->s_feature_incompat & cpu_to_le32(EXT3_FEATURE_INCOMPAT_RECOVER)) {
if (sb->s_flags & MS_RDONLY) {
printk(KERN_ERR "EXT3-fs: WARNING: recovery required on readonly filesystem.\n");
if (is_read_only(sb->s_dev)) {
@@ -785,21 +860,6 @@
}

EXT3_SB(sb)->s_journal = journal;
-
- /*
- * Have we just finished recovery? If so, and if we are
- * mounting the filesystem readonly, then we will end up with a
- * consistent fs on disk. Record that fact if so.
- */
-
- if (le32_to_cpu(es->s_feature_incompat) & EXT3_FEATURE_INCOMPAT_RECOVER) {
- printk (KERN_INFO "EXT3-fs: recovery complete.\n");
- if (sb->s_flags & MS_RDONLY) {
- es->s_feature_incompat &= ~(cpu_to_le32(EXT3_FEATURE_INCOMPAT_RECOVER));
- ext3_commit_super(sb, es, 1);
- }
- }
-
return 0;
}

@@ -848,13 +908,33 @@
{
es->s_wtime = cpu_to_le32(CURRENT_TIME);
mark_buffer_dirty(sb->u.ext3_sb.s_sbh, 1);
- sb->s_dirt = 0;
if (sync) {
ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
wait_on_buffer(sb->u.ext3_sb.s_sbh);
}
}

+
+/*
+ * Flush the journal, then, if the RECOVER feature is set and the
+ * filesystem is read-only (at mount or after a remount to read-only),
+ * the fs on disk is consistent: clear the flag and sync the superblock.
+ */
+static void ext3_mark_recovery_complete(struct super_block * sb,
+ struct ext3_super_block * es)
+{
+ journal_flush(EXT3_SB(sb)->s_journal); /* done unconditionally, before the feature test */
+ if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
+ if (sb->s_flags & MS_RDONLY) {
+ EXT3_SB(sb)->s_feature_incompat &= ~EXT3_FEATURE_INCOMPAT_RECOVER; /* clear the in-memory copy first */
+ es->s_feature_incompat = cpu_to_le32(EXT3_SB(sb)->s_feature_incompat); /* then mirror it into the superblock buffer */
+ es->s_mtime = cpu_to_le32(CURRENT_TIME);
+ ext3_commit_super(sb, es, 1); /* sync=1: write the superblock and wait for it */
+ sb->s_dirt = 0;
+ }
+ }
+}
+
/*
* In the second extended file system, it is not necessary to
* write the super block since we use a mapping of the
@@ -868,20 +948,22 @@

void ext3_write_super (struct super_block * sb)
{
- struct ext3_super_block * es;
+ tid_t wait_tid;

+ sb->s_dirt = 0; /* now cleared before, rather than after, the commit below */
+
if (!(sb->s_flags & MS_RDONLY)) {
journal_t *journal;

journal = EXT3_SB(sb)->s_journal;
- es = sb->u.ext3_sb.s_es;

- if (journal->j_running_transaction)
+ if (journal->j_running_transaction) {
+ wait_tid = journal->j_running_transaction->t_tid; /* sample the tid before starting the commit */
log_start_commit(journal, journal->j_running_transaction);
- if (journal->j_committing_transaction)
+ log_wait_commit(journal, wait_tid); /* wait for that same transaction */
+ } else if (journal->j_committing_transaction)
log_wait_commit(journal, journal->j_committing_transaction->t_tid);
}
- sb->s_dirt = 0;
}

int ext3_remount (struct super_block * sb, int * flags, char * data)
@@ -912,7 +994,6 @@
* to disable replay of the journal when we next remount
*/
sb->s_flags |= MS_RDONLY;
- journal_flush(EXT3_SB(sb)->s_journal);

/*
* OK, test if we are remounting a valid rw partition
@@ -923,11 +1004,7 @@
(sb->u.ext3_sb.s_mount_state & EXT3_VALID_FS))
es->s_state = cpu_to_le16(sb->u.ext3_sb.s_mount_state);

- es->s_feature_incompat &= cpu_to_le32(~EXT3_FEATURE_INCOMPAT_RECOVER);
- es->s_mtime = cpu_to_le32(CURRENT_TIME);
- mark_buffer_dirty(sb->u.ext3_sb.s_sbh, 1);
- sb->s_dirt = 1;
- ext3_commit_super (sb, es, 1);
+ ext3_mark_recovery_complete(sb, es);
}
else {
/*
@@ -938,6 +1015,7 @@
sb->u.ext3_sb.s_mount_state = le16_to_cpu(es->s_state);
sb->s_flags &= ~MS_RDONLY;
ext3_setup_super (sb, es);
+ sb->s_dirt = 1;
}
return 0;
}
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/truncate.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/truncate.c Wed Jul 5 14:53:52 2000
@@ -136,8 +136,7 @@
if (needed > EXT3_MAX_TRANS_DATA)
needed = EXT3_MAX_TRANS_DATA;

- return journal_start(EXT3_JOURNAL(inode),
- EXT3_DATA_TRANS_BLOCKS + needed);
+ return ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
}

static int extend_transaction(handle_t *handle, struct inode *inode)
@@ -487,6 +486,19 @@

handle = start_transaction(inode);

+ /* Add inode to orphan list, so that if this truncate spans multiple
+ * transactions, and we crash and don't recover the last transaction
+ * we will resume the truncate when the filesystem recovers.
+ */
+ ext3_orphan_add(handle, inode);
+ ext3_mark_inode_dirty(handle, inode);
+
+ /* The orphan list will now protect us from a crash before the
+ * truncate completes, so it is finally safe to propagate the
+ * new inode size (held for now in i_size) into the on-disk
+ * inode. */
+ inode->u.ext3_i.i_disksize = inode->i_size;
+
while (1) {
retry = trunc_direct(handle, inode);
retry |= trunc_indirect (handle, inode,
@@ -500,7 +512,7 @@
retry |= trunc_tindirect (handle, inode);
if (!retry)
break;
- journal_stop(handle);
+ ext3_journal_stop(handle, inode);
current->counter = 0;
run_task_queue(&tq_disk);
current->policy |= SCHED_YIELD;
@@ -528,6 +540,10 @@
}
}
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (inode->i_nlink) {
+ ext3_orphan_del(handle, inode);
+ NEXT_ORPHAN(inode) = 0;
+ }
ext3_mark_inode_dirty(handle, inode);
- journal_stop(handle);
+ ext3_journal_stop(handle, inode);
}
--- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/commit.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/commit.c Thu Jun 29 23:27:37 2000
@@ -38,7 +38,7 @@
int blocknr;
char *tagp = NULL;
journal_header_t *header;
- journal_block_tag_t *tag = NULL, *last_tag;
+ journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
int tag_flag;
--- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/journal.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/journal.c Tue Jul 4 12:30:32 2000
@@ -131,6 +131,11 @@
journal->j_commit_request = transaction->t_tid;
}

+ if (journal->j_commit_timer_active) {
+ journal->j_commit_timer_active = 0;
+ del_timer(journal->j_commit_timer);
+ }
+
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
jfs_debug(1, "Journal thread exiting.\n");
@@ -318,7 +323,7 @@
journal_file_buffer(bh_in, transaction, BJ_Shadow);
journal_file_buffer(new_bh, transaction, BJ_IO);

- return do_escape + (done_copy_out << 1);
+ return do_escape | (done_copy_out << 1);
}

@@ -784,6 +789,12 @@
while (!err && journal->j_checkpoint_transactions != NULL)
err = log_do_checkpoint(journal, journal->j_maxlen);
unlock_journal(journal);
+
+ J_ASSERT(!journal->j_running_transaction);
+ J_ASSERT(!journal->j_committing_transaction);
+ J_ASSERT(!journal->j_checkpoint_transactions);
+ J_ASSERT(journal->j_head == journal->j_tail);
+ J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);

return err;
}
--- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/recovery.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/recovery.c Tue Jul 4 17:29:54 2000
@@ -223,7 +223,7 @@
*/

if (!sb->s_start) {
- jfs_debug(1, "No recovery required, last transaction %ld\n",
+ jfs_debug(1, "No recovery required, last transaction %d\n",
ntohl(sb->s_sequence));
journal->j_transaction_sequence = ++next_commit_ID;
return 0;
@@ -327,8 +327,10 @@

/* If it is the commit block, then we are all done! */

- if (tmp->h_blocktype == htonl(JFS_COMMIT_BLOCK))
+ if (tmp->h_blocktype == htonl(JFS_COMMIT_BLOCK)) {
+ brelse(bh);
break;
+ }

/* A descriptor block: we can now write all of
* the data blocks. Yay, useful work is finally
@@ -376,7 +378,7 @@
}

mark_buffer_dirty(nbh, 1);
- ll_rw_block(WRITE, 1, &nbh);
+ // ll_rw_block(WRITE, 1, &nbh);
brelse(obh);
brelse(nbh);
}
@@ -389,10 +391,11 @@
break;

} /* end of tag loop */
+
+ brelse(bh);

} /* end of descriptor block loop */
- brelse(bh);
-
+
/* We have now replayed that entire transaction: start
* looking for the next transaction. */
next_commit_ID++;
--- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/transaction.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/transaction.c Thu Jun 29 23:29:49 2000
@@ -507,7 +507,7 @@
* To deal with that, journal_get_undo_access requests write access to a
* buffer for parts of non-rewindable operations such as delete
* operations on the bitmaps. The journaling code must keep a copy of
-` * the buffer's contents prior to the undo_access call until such time
+ * the buffer's contents prior to the undo_access call until such time
* as we know that the buffer has definitely been committed to disk.
*
* We never need to know which transaction the committed data is part
--- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs.h.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs.h Tue Jul 4 16:43:28 2000
@@ -36,8 +36,8 @@
/*
* The second extended file system version
*/
-#define EXT3FS_DATE "2000/3/22"
-#define EXT3FS_VERSION "0.0.2d"
+#define EXT3FS_DATE "2000/07/04"
+#define EXT3FS_VERSION "0.0.2e"

/*
* Debug code
@@ -406,8 +406,10 @@
*/
__u8 s_journal_uuid[16]; /* uuid of journal superblock */
__u32 s_journal_inum; /* inode number of journal file */
+ __u32 s_journal_dev; /* device number of journal file */
+ __u32 s_last_orphan; /* start of list of inodes to delete */

- __u32 s_reserved[199]; /* Padding to the end of the block */
+ __u32 s_reserved[197]; /* Padding to the end of the block */
};

#ifdef __KERNEL__
@@ -419,6 +421,8 @@
#define EXT3_SB(sb) (sb)
#endif

+#define NEXT_ORPHAN(inode) inode->u.ext3_i.i_dtime
+
/*
* Codes for operating systems
*/
@@ -586,6 +590,7 @@
/* ialloc.c */
extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int, int *);
extern void ext3_free_inode (handle_t *, struct inode *);
+extern struct inode * ext3_orphan_get (struct super_block * sb, ino_t ino);
extern unsigned long ext3_count_free_inodes (struct super_block *);
extern void ext3_check_inodes_bitmap (struct super_block *);

@@ -601,6 +606,7 @@
extern void ext3_read_inode (struct inode *);
extern void ext3_write_inode (struct inode *);
extern void ext3_put_inode (struct inode *);
+extern void ext3_orphan_del (handle_t *handle, struct inode *);
extern void ext3_delete_inode (struct inode *);
extern int ext3_sync_inode (handle_t *, struct inode *);
extern int ext3_notify_change(struct dentry *, struct iattr *);
@@ -616,6 +622,7 @@
extern int ext3_create (struct inode *,struct dentry *,int);
extern int ext3_mkdir (struct inode *,struct dentry *,int);
extern int ext3_rmdir (struct inode *,struct dentry *);
+extern void ext3_orphan_add(handle_t *, struct inode *);
extern int ext3_unlink (struct inode *,struct dentry *);
extern int ext3_symlink (struct inode *,struct dentry *,const char *);
extern int ext3_link (struct dentry *, struct inode *, struct dentry *);
--- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_i.h.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_i.h Wed Jul 5 14:45:43 2000
@@ -36,7 +36,13 @@
__u32 i_prealloc_block;
__u32 i_prealloc_count;
__u32 i_high_size;
+ struct list_head i_orphan; /* unlinked but open inodes */
int i_new_inode:1; /* Is a freshly allocated inode */
+ /* i_disksize keeps track of what the inode size is ON DISK, not
+ * in memory. During truncate, i_size is set to 0 by the VFS
+ * but the filesystem won't set i_disksize to 0 until the
+ * truncate is actually under way. */
+ off_t i_disksize;
};

#endif /* _LINUX_EXT3_FS_I */
--- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_sb.h.~1~ Thu Jun 29 17:41:18 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_sb.h Tue Jul 4 17:01:54 2000
@@ -65,6 +65,7 @@
/* Journaling */
struct inode * s_journal_inode;
struct journal_s * s_journal;
+ struct list_head s_orphan;
};

#endif /* _LINUX_EXT3_FS_SB */
--- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_jfs.h.~1~ Fri Jun 30 19:53:35 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_jfs.h Wed Jul 5 14:55:34 2000
@@ -113,4 +113,26 @@
return err;
}

+
+/*
+ * Wrappers for journal_start/journal_stop.
+ *
+ * The only special thing we need to do here is to make sure that all
+ * journal_stop calls result in the superblock being marked dirty, so
+ * that sync() will call the filesystem's write_super callback if
+ * appropriate.
+ */
+
+static inline handle_t *ext3_journal_start (struct inode *inode, int nblocks)
+{
+ return journal_start(EXT3_JOURNAL(inode), nblocks);
+}
+
+static inline int ext3_journal_stop (handle_t *handle, struct inode *inode)
+{
+ int rc = journal_stop(handle);
+ inode->i_sb->s_dirt = 1; /* inode must be non-NULL: it is used only to reach the superblock */
+ return rc;
+}
+
#endif /* _LINUX_EXT3_JFS_H */
--- linux-2.2.17pre9.ext3-0.0.2e/include/linux/fs.h.~1~ Thu Jun 29 17:41:20 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/fs.h Wed Jul 5 14:54:03 2000
@@ -618,9 +618,6 @@
short int s_ibasket_max;
struct list_head s_dirty; /* dirty inodes */

- /* Pointer to journaling control structure for this filesystem */
- journal_t * s_journal;
-
/* Filesystem-specific data: */
union {
struct minix_sb_info minix_sb;