/* @@@ This will eventually have to be a data-style operation,
not metadata */
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/dir.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/dir.c Thu Jun 29 17:36:49 2000
@@ -196,15 +196,18 @@
* version stamp to detect whether or
* not the directory has been modified
* during the copy operation.
+ * AV: It can't be modified, but it fscking
+ * can be seeked by another process that shares
+ * the descriptor.
*/
- unsigned long version = inode->i_version;
+ unsigned long version = filp->f_version;
--- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/fsync.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/fsync.c Tue Jul 4 15:00:23 2000
@@ -46,10 +46,21 @@
if (!bh)
return 0;
if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
- brelse (bh);
- return -1;
+ /* There can be a parallel read(2) that started read-I/O
+ on the buffer so we can't assume that there's been
+ an I/O error without first waiting for I/O completion. */
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ {
+ brelse (bh);
+ return -1;
+ }
}
if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
+ if (wait)
+ /* when we return from fsync all the blocks
+ must be _just_ stored on disk */
+ wait_on_buffer(bh);
brelse (bh);
return 0;
}
@@ -262,7 +273,7 @@
struct inode *inode = dentry->d_inode;
handle_t *handle;
/*
- * This function increments the inode version number
- *
- * This may be used one day by the NFS server
- */
-static void inc_inode_version (struct inode * inode,
- struct ext3_group_desc *gdp,
- int mode)
-{
- inode->u.ext3_i.i_version++;
- mark_inode_dirty(inode);
-
- return;
-}
-
-/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -497,13 +482,15 @@
inode->u.ext3_i.i_file_acl = 0;
inode->u.ext3_i.i_dir_acl = 0;
inode->u.ext3_i.i_dtime = 0;
+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
inode->u.ext3_i.i_block_group = i;
inode->i_op = NULL;
if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
inode->i_flags |= MS_SYNCHRONOUS;
insert_inode_hash(inode);
+ inode->i_generation = inode_generation_count++;
+ inode->u.ext3_i.i_version = inode->i_generation;
ext3_mark_inode_dirty(handle, inode);
- inc_inode_version (inode, gdp, mode);
/*
+ * ext3_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ *
+ * The inode's list predecessor and its on-disk "next orphan" link are only
+ * sampled once the superblock lock is held.  Sampling them earlier would
+ * let a concurrent ext3_orphan_add()/ext3_orphan_del() change our
+ * neighbours in between, and we would then rewrite the superblock (or the
+ * wrong predecessor) with a stale link and corrupt the on-disk chain.
+ *
+ * handle: transaction used to journal whichever object holds the link to
+ *         this inode (the superblock or the predecessor inode).
+ * inode:  inode to take off the orphan list; nothing is done if it is not
+ *         currently on the list.
+ */
+void ext3_orphan_del(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct list_head *prev;
+	ino_t ino_next;
+
+	lock_super(sb);
+	if (list_empty(&inode->u.ext3_i.i_orphan)) {
+		unlock_super(sb);
+		return;
+	}
+
+	/* Only now are the neighbours stable. */
+	prev = inode->u.ext3_i.i_orphan.prev;
+	ino_next = NEXT_ORPHAN(inode);
+
+	jfs_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+	list_del(&inode->u.ext3_i.i_orphan);
+	/* Re-initialise the entry in case list_del() leaves stale pointers
+	 * behind, so that later list_empty() tests see it as unlinked. */
+	INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+
+	if (prev == &sbi->s_orphan) {
+		/* We were first in the chain: the superblock pointed at us. */
+		jfs_debug(4, "superblock will point to %ld\n", ino_next);
+		journal_get_write_access(handle, sbi->s_sbh);
+		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+		journal_dirty_metadata(handle, sbi->s_sbh);
+	} else {
+		/* Otherwise the previous orphan inode pointed at us. */
+		struct inode *i_prev =
+			list_entry(prev, struct inode, u.ext3_i.i_orphan);
+
+		jfs_debug(4, "orphan inode %ld will point to %ld\n",
+			  i_prev->i_ino, ino_next);
+		NEXT_ORPHAN(i_prev) = ino_next;
+		ext3_mark_inode_dirty(handle, i_prev);
+	}
+	unlock_super(sb);
+}
+
+/*
* Called at the last iput() if i_nlink is zero.
*/
void ext3_delete_inode (struct inode * inode)
@@ -51,19 +85,24 @@
if (inode->i_ino == EXT3_ACL_IDX_INO ||
inode->i_ino == EXT3_ACL_DATA_INO)
return;
- inode->u.ext3_i.i_dtime = CURRENT_TIME;
- handle = journal_start(EXT3_JOURNAL(inode),
- EXT3_DELETE_TRANS_BLOCKS);
+ /* When we delete an inode, we increment its i_version. If it
+ is ever read in from disk again, it will have a different
+ i_version. */
+ inode->u.ext3_i.i_version++;
+
+ handle = ext3_journal_start(inode, EXT3_DELETE_TRANS_BLOCKS);
/* If this is the first large file
* created, add a flag to the superblock */
es->s_feature_ro_compat |=
cpu_to_le32(EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
journal_dirty_metadata(handle, bh); /*@@@err*/
- journal_stop(handle);
+ ext3_journal_stop(handle, inode);
}
}
#endif
@@ -835,7 +879,7 @@
* required is one. */
+/*
+ * ext3_orphan_add() records an unlinked or truncated inode as an orphan.
+ *
+ * The inode becomes the new head of the chain that starts at the
+ * superblock's s_last_orphan field, so that it can be found again if we
+ * crash before the file is closed/deleted, or if the inode truncate spans
+ * multiple transactions and the last one is not recovered after a crash.
+ *
+ * At filesystem recovery time, ext3_orphan_cleanup() walks this chain,
+ * deleting unlinked inodes and truncating linked inodes.
+ *
+ * Nothing is done if the inode is already on the in-memory orphan list.
+ */
+void ext3_orphan_add(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	lock_super(sb);
+	if (list_empty(&inode->u.ext3_i.i_orphan)) {
+		struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+		journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+		/* Point at the old head of the chain, then take its place. */
+		NEXT_ORPHAN(inode) = le32_to_cpu(es->s_last_orphan);
+		es->s_last_orphan = cpu_to_le32(inode->i_ino);
+		journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+		list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
+		ext3_mark_inode_dirty(handle, inode);
+		unlock_super(sb);
+
+		jfs_debug(4, "superblock will point to %ld\n", inode->i_ino);
+		jfs_debug(4, "orphan inode %ld will point to %d\n",
+			  inode->i_ino, NEXT_ORPHAN(inode));
+		return;
+	}
+	unlock_super(sb);
+}
+
int ext3_unlink(struct inode * dir, struct dentry *dentry)
{
int retval;
@@ -682,8 +712,7 @@
struct ext3_dir_entry_2 * de;
handle_t *handle;
+/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash. We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext3_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ *
+ * sb is the superblock being cleaned up; es is its ext3 superblock, whose
+ * s_last_orphan field heads the chain. If the chain is non-empty, MS_RDONLY
+ * is cleared from sb->s_flags while it is processed and the saved flags are
+ * put back before returning. The numbers of inodes deleted and truncated
+ * are reported with printk().
+ */
+static void ext3_orphan_cleanup (struct super_block * sb,
+ struct ext3_super_block * es)
+{
+ unsigned int s_flags = sb->s_flags; /* saved so the flags can be restored on exit */
+ int nr_orphans = 0, nr_truncates = 0; /* counters for the summary messages */
+ if (!es->s_last_orphan) {
+ jfs_debug(4, "no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (s_flags & MS_RDONLY) {
+ printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on read-only fs\n",
+ kdevname(sb->s_dev));
+ sb->s_flags &= ~MS_RDONLY; /* lifted only for the duration of the cleanup */
+ }
+
+ while (es->s_last_orphan) { /* NOTE(review): nothing here advances the head except the failure path; presumably ext3_truncate()/iput() do so via ext3_orphan_del() - confirm */
+ struct inode *inode;
+
+ if (!(inode =
+ ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+ es->s_last_orphan = 0; /* lookup failed: abandon the rest of the chain */
+ break;
+ }
+
+ list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); /* mirror the on-disk link in the in-memory list */
+ if (inode->i_nlink) { /* still linked: only a truncate was pending */
+ jfs_debug(2, "truncating inode %ld to %ld bytes\n",
+ inode->i_ino, inode->i_size);
+ ext3_truncate(inode);
+ nr_truncates++;
+ } else { /* no links left: the inode itself is to be deleted */
+ jfs_debug(2, "deleting unreferenced inode %ld\n",
+ inode->i_ino);
+ nr_orphans++;
+ }
+ iput(inode); /* The delete magic happens here! */
+ }
+
+#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+
+ if (nr_orphans)
+ printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
+ kdevname(sb->s_dev), PLURAL(nr_orphans)); /* PLURAL expands to two arguments: the count and its suffix */
+ if (nr_truncates)
+ printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
+ kdevname(sb->s_dev), PLURAL(nr_truncates));
+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+}
+
#define log2(n) ffz(~(n))
error_out:
@@ -755,7 +830,7 @@
* can get read-write access to the device.
*/
- if (es->s_feature_incompat & EXT3_FEATURE_INCOMPAT_RECOVER) {
+ if (es->s_feature_incompat & cpu_to_le32(EXT3_FEATURE_INCOMPAT_RECOVER)) {
if (sb->s_flags & MS_RDONLY) {
printk(KERN_ERR "EXT3-fs: WARNING: recovery required on readonly filesystem.\n");
if (is_read_only(sb->s_dev)) {
@@ -785,21 +860,6 @@
}
EXT3_SB(sb)->s_journal = journal;
-
- /*
- * Have we just finished recovery? If so, and if we are
- * mounting the filesystem readonly, then we will end up with a
- * consistent fs on disk. Record that fact if so.
- */
-
- if (le32_to_cpu(es->s_feature_incompat) & EXT3_FEATURE_INCOMPAT_RECOVER) {
- printk (KERN_INFO "EXT3-fs: recovery complete.\n");
- if (sb->s_flags & MS_RDONLY) {
- es->s_feature_incompat &= ~(cpu_to_le32(EXT3_FEATURE_INCOMPAT_RECOVER));
- ext3_commit_super(sb, es, 1);
- }
- }
-
return 0;
}
+
+/*
+ * ext3_mark_recovery_complete() notes on disk that recovery has finished.
+ *
+ * The journal is always flushed.  If the RECOVER incompat feature is still
+ * set and we are mounting the filesystem readonly, we will end up with a
+ * consistent fs on disk, so the feature flag is cleared and the superblock
+ * is committed to record that fact.
+ */
+static void ext3_mark_recovery_complete(struct super_block * sb,
+					struct ext3_super_block * es)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	journal_flush(sbi->s_journal);
+
+	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
+		return;
+	if (!(sb->s_flags & MS_RDONLY))
+		return;
+
+	/* Clear the flag in the in-core copy, then mirror it to the
+	 * little-endian superblock field. */
+	sbi->s_feature_incompat &= ~EXT3_FEATURE_INCOMPAT_RECOVER;
+	es->s_feature_incompat = cpu_to_le32(sbi->s_feature_incompat);
+	es->s_mtime = cpu_to_le32(CURRENT_TIME);
+	ext3_commit_super(sb, es, 1);
+	sb->s_dirt = 0;
+}
+
/*
* In the second extended file system, it is not necessary to
* write the super block since we use a mapping of the
@@ -868,20 +948,22 @@
journal = EXT3_SB(sb)->s_journal;
- es = sb->u.ext3_sb.s_es;
- if (journal->j_running_transaction)
+ if (journal->j_running_transaction) {
+ wait_tid = journal->j_running_transaction->t_tid;
log_start_commit(journal, journal->j_running_transaction);
- if (journal->j_committing_transaction)
+ log_wait_commit(journal, wait_tid);
+ } else if (journal->j_committing_transaction)
log_wait_commit(journal, journal->j_committing_transaction->t_tid);
}
- sb->s_dirt = 0;
}
int ext3_remount (struct super_block * sb, int * flags, char * data)
@@ -912,7 +994,6 @@
* to disable replay of the journal when we next remount
*/
sb->s_flags |= MS_RDONLY;
- journal_flush(EXT3_SB(sb)->s_journal);
/*
* OK, test if we are remounting a valid rw partition
@@ -923,11 +1004,7 @@
(sb->u.ext3_sb.s_mount_state & EXT3_VALID_FS))
es->s_state = cpu_to_le16(sb->u.ext3_sb.s_mount_state);
} /* end of descriptor block loop */
- brelse(bh);
-
+
/* We have now replayed that entire transaction: start
* looking for the next transaction. */
next_commit_ID++;
--- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/transaction.c.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/transaction.c Thu Jun 29 23:29:49 2000
@@ -507,7 +507,7 @@
* To deal with that, journal_get_undo_access requests write access to a
* buffer for parts of non-rewindable operations such as delete
* operations on the bitmaps. The journaling code must keep a copy of
-` * the buffer's contents prior to the undo_access call until such time
+ * the buffer's contents prior to the undo_access call until such time
* as we know that the buffer has definitely been committed to disk.
*
* We never need to know which transaction the committed data is part
--- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs.h.~1~ Thu Jun 29 17:24:22 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs.h Tue Jul 4 16:43:28 2000
@@ -36,8 +36,8 @@
/*
* The second extended file system version
*/
-#define EXT3FS_DATE "2000/3/22"
-#define EXT3FS_VERSION "0.0.2d"
+#define EXT3FS_DATE "2000/07/04"
+#define EXT3FS_VERSION "0.0.2e"
/*
* Debug code
@@ -406,8 +406,10 @@
*/
__u8 s_journal_uuid[16]; /* uuid of journal superblock */
__u32 s_journal_inum; /* inode number of journal file */
+ __u32 s_journal_dev; /* device number of journal file */
+ __u32 s_last_orphan; /* start of list of inodes to delete */
- __u32 s_reserved[199]; /* Padding to the end of the block */
+ __u32 s_reserved[197]; /* Padding to the end of the block */
};
+
+/*
+ * Wrappers for journal_start/end.
+ *
+ * The only special thing we need to do here is to make sure that all
+ * journal_end calls result in the superblock being marked dirty, so
+ * that sync() will call the filesystem's write_super callback if
+ * appropriate.
+ */
+
+static inline handle_t *ext3_journal_start (struct inode *inode, int nblocks)
+{
+	/* Start a handle on the journal that EXT3_JOURNAL() yields for
+	 * this inode, passing nblocks straight through. */
+	journal_t *journal = EXT3_JOURNAL(inode);
+
+	return journal_start(journal, nblocks);
+}
+
+static inline int ext3_journal_stop (handle_t *handle, struct inode *inode)
+{
+	const int err = journal_stop(handle);
+
+	/* Mark the superblock dirty on every stop so that sync() will call
+	 * the filesystem's write_super callback if appropriate. */
+	inode->i_sb->s_dirt = 1;
+	return err;
+}
+
#endif /* _LINUX_EXT3_JFS_H */
--- linux-2.2.17pre9.ext3-0.0.2e/include/linux/fs.h.~1~ Thu Jun 29 17:41:20 2000
+++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/fs.h Wed Jul 5 14:54:03 2000
@@ -618,9 +618,6 @@
short int s_ibasket_max;
struct list_head s_dirty; /* dirty inodes */
- /* Pointer to journaling control structure for this filesystem */
- journal_t * s_journal;
-
/* Filesystem-specific data: */
union {
struct minix_sb_info minix_sb;