diff -rc2P linux/Documentation/Configure.help linux-2.4.13/Documentation/Configure.help
*** linux/Documentation/Configure.help  Sat Oct 20 22:17:19 2001
--- linux-2.4.13/Documentation/Configure.help   Fri Nov  9 16:58:00 2001
***************
*** 12059,12062 ****
--- 12059,12132 ----
   wants to say Y here.

+ Ext3 journaling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+   This is the journaling version of the Second extended file system
+   (often called ext3), the de facto standard Linux file system
+   (method to organize files on a storage device) for hard disks.
+
+   The journaling code included in this driver means you do not have
+   to run e2fsck (file system checker) on your file systems after a
+   crash.  The journal keeps track of any changes that were being made
+   at the time the system crashed, and can ensure that your file system
+   is consistent without the need for a lengthy check.
+
+   Other than adding the journal to the filesystem, the on-disk format of
+   ext3 is identical to ext2.  It is possible to freely switch between
+   using the ext3 driver and the ext2 driver, as long as the filesystem
+   has been cleanly unmounted, or e2fsck is run on the filesystem.
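+
+   For example, a cleanly unmounted ext3 partition can be mounted as
+   plain ext2 with "mount -t ext2 /dev/hda1 /mnt"; the journal is then
+   simply ignored.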
+
+   To add a journal on an existing ext2 filesystem or change the behavior
+   of ext3 file systems, you can use the tune2fs utility ("man tune2fs").
+   To modify attributes of files and directories on ext3 file systems,
+   use chattr ("man chattr").  You need to be using e2fsprogs version
+   1.20 or later in order to create ext3 journals (available at
+   <http://sourceforge.net/projects/e2fsprogs/>).
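+
+   For example, "tune2fs -j /dev/hda1" adds a journal to the existing
+   ext2 filesystem on /dev/hda1, after which it can be mounted as ext3.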
+
+   If you want to compile this file system as a module ( = code which
+   can be inserted in and removed from the running kernel whenever you
+   want), say M here and read Documentation/modules.txt. The module
+   will be called ext3.o. Be aware however that the file system of your
+   root partition (the one containing the directory /) cannot be
+   compiled as a module, and so this may be dangerous.
+
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+   This is a generic journaling layer for block devices. It is currently
+   used by the ext3 file system, but it could also be used to add journal
+   support to other file systems or block devices such as RAID or LVM.
+
+   If you are using the ext3 filesystem, you need to say Y here. If you
+   are not using ext3 then you will probably want to say N.
+
+   If you want to compile this device as a module ( = code which can be
+   inserted in and removed from the running kernel whenever you want),
+   say M here and read Documentation/modules.txt. The module will be called
+   jbd.o. If you are compiling ext3 into the kernel, you cannot compile
+   this code as a module.
+
+ JBD (ext3) debugging support
+ CONFIG_JBD_DEBUG
+   If you are using the ext3 journaled file system (or potentially any
+   other file system/device using JBD), this option allows you to enable
+   debugging output while the system is running, in order to help track
+   down any problems you are having.  By default the debugging output
+   will be turned off.
+
+   If you select Y here, then you will be able to turn on debugging with
+   "echo N > /proc/sys/fs/jbd-debug", where N is a number between 1 and 5,
+   the higher the number, the more debugging output is generated.  To turn
+   debugging off again, do "echo 0 > /proc/sys/fs/jbd-debug".
+
+ Buffer Head tracing (DEBUG)
+ CONFIG_BUFFER_DEBUG
+   If you are a kernel developer working with file systems or in the block
+   device layer, this buffer head tracing may help you to track down bugs
+   in your code.  This enables some debugging macros (BUFFER_TRACE, etc)
+   which allow you to track the state of a buffer through various layers
+   of code.  The debugging code is used primarily by ext3 and JBD code.
+
+   Because this option adds considerably to the size of each buffer
+   head, most people will want to say N here.
+
 BFS file system support (EXPERIMENTAL)
 CONFIG_BFS_FS
diff -rc2P linux/drivers/block/ll_rw_blk.c linux-2.4.13/drivers/block/ll_rw_blk.c
*** linux/drivers/block/ll_rw_blk.c     Sat Oct 13 13:30:30 2001
--- linux-2.4.13/drivers/block/ll_rw_blk.c      Fri Nov  9 16:58:00 2001
***************
*** 672,677 ****
          down by us so at this point flushpage will block and
          won't clear the mapped bit under us. */
!       if (!buffer_mapped(bh))
               BUG();

       /*
--- 672,679 ----
          down by us so at this point flushpage will block and
          won't clear the mapped bit under us. */
!       if (!buffer_mapped(bh)) {
!               print_buffer_trace(bh);
               BUG();
+       }

       /*
***************
*** 1007,1013 ****
               switch(rw) {
               case WRITE:
!                       if (!atomic_set_buffer_clean(bh))
                               /* Hmmph! Nothing to write */
                               goto end_io;
                       __mark_buffer_clean(bh);
                       break;
--- 1009,1018 ----
               switch(rw) {
               case WRITE:
!                       if (!atomic_set_buffer_clean(bh)) {
!                               BUFFER_TRACE(bh, "already clean");
                               /* Hmmph! Nothing to write */
                               goto end_io;
+                       }
+                       BUFFER_TRACE(bh, "set clean, write underway");
                       __mark_buffer_clean(bh);
                       break;
***************
*** 1032,1037 ****
 sorry:
       /* Make sure we don't get infinite dirty retries.. */
!       for (i = 0; i < nr; i++)
               mark_buffer_clean(bhs[i]);
 }

--- 1037,1044 ----
 sorry:
       /* Make sure we don't get infinite dirty retries.. */
!       for (i = 0; i < nr; i++) {
!               BUFFER_TRACE(bhs[i], "sorry");
               mark_buffer_clean(bhs[i]);
+       }
 }

***************
*** 1133,1136 ****
--- 1140,1144 ----
               queue_nr_requests = 128;

+
       /*
        * Batch frees according to queue length
diff -rc2P linux/drivers/block/loop.c linux-2.4.13/drivers/block/loop.c
*** linux/drivers/block/loop.c  Mon Oct 15 21:53:51 2001
--- linux-2.4.13/drivers/block/loop.c   Fri Nov  9 16:58:00 2001
***************
*** 187,190 ****
--- 187,192 ----
       while (len > 0) {
               int IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize;
+               int transfer_result;
+
               size = PAGE_CACHE_SIZE - offset;
               if (size > len)
***************
*** 198,205 ****
               kaddr = page_address(page);
               flush_dcache_page(page);
!               if (lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV))
!                       goto write_fail;
               if (aops->commit_write(file, page, offset, offset+size))
                       goto unlock;
               data += size;
               len -= size;
--- 200,216 ----
               kaddr = page_address(page);
               flush_dcache_page(page);
!               transfer_result = lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV);
!               if (transfer_result) {
!                       /*
!                        * The transfer failed, but we still write the data to
!                        * keep prepare/commit calls balanced.
!                        */
!                       printk(KERN_ERR "loop: transfer error block %ld\n", index);
!                       memset(kaddr + offset, 0, size);
!               }
               if (aops->commit_write(file, page, offset, offset+size))
                       goto unlock;
+               if (transfer_result)
+                       goto unlock;
               data += size;
               len -= size;
***************
*** 213,220 ****
       return 0;

- write_fail:
-       printk(KERN_ERR "loop: transfer error block %ld\n", index);
-       ClearPageUptodate(page);
-       kunmap(page);
 unlock:
       UnlockPage(page);
--- 224,227 ----
diff -rc2P linux/drivers/ide/ide-disk.c linux-2.4.13/drivers/ide/ide-disk.c
*** linux/drivers/ide/ide-disk.c        Thu Oct 11 12:14:32 2001
--- linux-2.4.13/drivers/ide/ide-disk.c Fri Nov  9 16:58:00 2001
***************
*** 368,371 ****
--- 368,392 ----
 static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
 {
+ #ifdef CONFIG_JBD_DEBUG
+       /*
+        * Silently stop writing to this disk to simulate a crash.
+        */
+       extern int journal_no_write[2];
+       int i;
+
+       if (rq->cmd != WRITE)
+               goto write_ok;
+
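+       /*
+        * A journal_no_write slot is armed when its high 16 bits carry
+        * the 0xdead tag; the low 16 bits then name the device whose
+        * writes get dropped.
+        */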
+       for (i = 0; i < 2; i++) {
+               if ((journal_no_write[i] & 0xdead0000) == 0xdead0000) {
+                       if (rq->rq_dev == (journal_no_write[i] & 0xffff)) {
+                               ide_end_request(1, HWGROUP(drive));
+                               return ide_stopped;
+                       }
+               }
+       }
+ write_ok:
+       ;
+ #endif
       if (IDE_CONTROL_REG)
               OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
diff -rc2P linux/fs/Config.in linux-2.4.13/fs/Config.in
*** linux/fs/Config.in  Thu Oct  4 18:13:18 2001
--- linux-2.4.13/fs/Config.in   Fri Nov  9 16:57:59 2001
***************
*** 21,24 ****
--- 21,32 ----
 dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL

+ tristate 'Ext3 journaling file system support (EXPERIMENTAL)' CONFIG_EXT3_FS
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+ define_bool CONFIG_JBD $CONFIG_EXT3_FS
+ dep_mbool '  JBD (ext3) debugging support' CONFIG_JBD_DEBUG $CONFIG_JBD
+ bool 'Buffer Head tracing (DEBUG)' CONFIG_BUFFER_DEBUG
+
 # msdos file systems
 tristate 'DOS FAT fs support' CONFIG_FAT_FS
diff -rc2P linux/fs/Makefile linux-2.4.13/fs/Makefile
*** linux/fs/Makefile   Thu Oct  4 18:13:18 2001
--- linux-2.4.13/fs/Makefile    Fri Nov  9 16:58:00 2001
***************
*** 8,12 ****
 O_TARGET := fs.o

! export-objs :=        filesystems.o open.o dcache.o
 mod-subdirs :=        nls

--- 8,12 ----
 O_TARGET := fs.o

! export-objs :=        filesystems.o open.o dcache.o buffer.o jbd-kernel.o
 mod-subdirs :=        nls

***************
*** 15,19 ****
               fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
               dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
!               filesystems.o namespace.o

 ifeq ($(CONFIG_QUOTA),y)
--- 15,19 ----
               fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
               dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
!               filesystems.o namespace.o jbd-kernel.o

 ifeq ($(CONFIG_QUOTA),y)
***************
*** 27,30 ****
--- 27,32 ----

 # Do not add any filesystems before this line
+ subdir-$(CONFIG_EXT3_FS)      += ext3    # Before ext2 so root fs can be ext3
+ subdir-$(CONFIG_JBD)          += jbd
 subdir-$(CONFIG_EXT2_FS)      += ext2
 subdir-$(CONFIG_CRAMFS)               += cramfs
diff -rc2P linux/fs/buffer.c linux-2.4.13/fs/buffer.c
*** linux/fs/buffer.c   Tue Oct 23 20:54:19 2001
--- linux-2.4.13/fs/buffer.c    Fri Nov  9 16:57:59 2001
***************
*** 46,49 ****
--- 46,51 ----
 #include <linux/iobuf.h>
 #include <linux/highmem.h>
+ #include <linux/jbd.h>
+ #include <linux/module.h>
 #include <linux/completion.h>

***************
*** 614,619 ****
    by the user.

!    Thus invalidate_buffers in general usage is not allwowed to trash dirty
!    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.

    NOTE: In the case where the user removed a removable-media-disk even if
--- 616,625 ----
    by the user.

!    Thus invalidate_buffers in general usage is not allowed to trash
!    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
!    be preserved.  These buffers are simply skipped.
!
!    We also skip buffers which are still in use.  For example this can
!    happen if a userspace program is reading the block device.

    NOTE: In the case where the user removed a removable-media-disk even if
***************
*** 718,721 ****
--- 724,728 ----
       bh->b_end_io = handler;
       bh->b_private = private;
+       buffer_trace_init(&bh->b_history);
 }

***************
*** 727,730 ****
--- 734,738 ----
       struct page *page;

+       BUFFER_TRACE(bh, "enter");
       mark_buffer_uptodate(bh, uptodate);

***************
*** 1093,1096 ****
--- 1101,1110 ----
 }

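+ /*
+  * Restart the flush timer on a buffer: bdflush will not consider the
+  * buffer for writeback until the configured buffer age has elapsed
+  * again.
+  */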
+ void set_buffer_flushtime(struct buffer_head *bh)
+ {
+       bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+ }
+ EXPORT_SYMBOL(set_buffer_flushtime);
+
 /*
  * A buffer may need to be moved from one buffer list to another
***************
*** 1100,1103 ****
--- 1114,1120 ----
 {
       int dispose = BUF_CLEAN;
+
+       BUFFER_TRACE(bh, "enter");
+
       if (buffer_locked(bh))
               dispose = BUF_LOCKED;
***************
*** 1111,1114 ****
--- 1128,1132 ----
               __insert_into_lru_list(bh, dispose);
       }
+       BUFFER_TRACE(bh, "exit");
 }

***************
*** 1125,1128 ****
--- 1143,1147 ----
 void __brelse(struct buffer_head * buf)
 {
+       BUFFER_TRACE(buf, "entry");
       if (atomic_read(&buf->b_count)) {
               put_bh(buf);
***************
*** 1138,1141 ****
--- 1157,1161 ----
 void __bforget(struct buffer_head * buf)
 {
+       BUFFER_TRACE(buf, "enter");
       mark_buffer_clean(buf);
       __brelse(buf);
***************
*** 1168,1175 ****
  * Note: the caller should wake up the buffer_wait list if needed.
  */
! static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
 {
       if (bh->b_inode)
               BUG();
       if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
               kmem_cache_free(bh_cachep, bh);
--- 1188,1207 ----
  * Note: the caller should wake up the buffer_wait list if needed.
  */
! static void __put_unused_buffer_head(struct buffer_head * bh)
 {
       if (bh->b_inode)
               BUG();
+
+       J_ASSERT_BH(bh, bh->b_prev_free == 0);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       if (buffer_jbd(bh)) {
+               J_ASSERT_BH(bh, bh2jh(bh)->b_transaction == 0);
+               J_ASSERT_BH(bh, bh2jh(bh)->b_next_transaction == 0);
+               J_ASSERT_BH(bh, bh2jh(bh)->b_frozen_data == 0);
+               J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data == 0);
+       }
+ #endif
+       buffer_trace_init(&bh->b_history);
+
       if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
               kmem_cache_free(bh_cachep, bh);
***************
*** 1185,1188 ****
--- 1217,1228 ----
 }

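+ /*
+  * Locked wrapper around __put_unused_buffer_head(), exported so that
+  * code outside buffer.c (such as a modular JBD) can release buffer
+  * heads obtained from get_unused_buffer_head().
+  */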
+ void put_unused_buffer_head(struct buffer_head *bh)
+ {
+       spin_lock(&unused_list_lock);
+       __put_unused_buffer_head(bh);
+       spin_unlock(&unused_list_lock);
+ }
+ EXPORT_SYMBOL(put_unused_buffer_head);
+
 /*
  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
***************
*** 1190,1194 ****
  * buffer heads is now handled in create_buffers().
  */
! static struct buffer_head * get_unused_buffer_head(int async)
 {
       struct buffer_head * bh;
--- 1230,1234 ----
  * buffer heads is now handled in create_buffers().
  */
! struct buffer_head * get_unused_buffer_head(int async)
 {
       struct buffer_head * bh;
***************
*** 1211,1214 ****
--- 1251,1255 ----
               bh->b_blocknr = -1;
               bh->b_this_page = NULL;
+               buffer_trace_init(&bh->b_history);
               return bh;
       }
***************
*** 1224,1227 ****
--- 1265,1269 ----
                       nr_unused_buffer_heads--;
                       spin_unlock(&unused_list_lock);
+                       buffer_trace_init(&bh->b_history);
                       return bh;
               }
***************
*** 1231,1234 ****
--- 1273,1277 ----
       return NULL;
 }
+ EXPORT_SYMBOL(get_unused_buffer_head);

 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
***************
*** 1245,1248 ****
--- 1288,1292 ----
               bh->b_data = page_address(page) + offset;
 }
+ EXPORT_SYMBOL(set_bh_page);

 /*
***************
*** 1328,1331 ****
--- 1372,1376 ----
 {
       if (buffer_mapped(bh)) {
+               BUFFER_TRACE(bh, "entry");
               mark_buffer_clean(bh);
               lock_buffer(bh);
***************
*** 1338,1341 ****
--- 1383,1411 ----
 }

+ /**
+  * try_to_release_page - release old fs-specific metadata on a page
+  * @page: the locked page to be released
+  * @gfp_mask: memory allocation flags
+  */
+
+ int try_to_release_page(struct page * page, int gfp_mask)
+ {
+       if (!PageLocked(page))
+               BUG();
+
+       if (!page->mapping)
+               goto try_to_free;
+       if (!page->mapping->a_ops->releasepage)
+               goto try_to_free;
+       if (page->mapping->a_ops->releasepage(page, gfp_mask))
+               goto try_to_free;
+       /*
+        * We couldn't release buffer metadata; don't even bother trying
+        * to release buffers.
+        */
+       return 0;
+ try_to_free:
+       return try_to_free_buffers(page, gfp_mask);
+ }
+
 /*
  * We don't have to release all buffers here, but
***************
*** 1381,1385 ****
        */
       if (!offset) {
!               if (!try_to_free_buffers(page, 0))
                       return 0;
       }
--- 1451,1455 ----
        */
       if (!offset) {
!               if (!try_to_release_page(page, 0))
                       return 0;
       }
***************
*** 1409,1412 ****
--- 1479,1483 ----
       page_cache_get(page);
 }
+ EXPORT_SYMBOL(create_empty_buffers);

 /*
***************
*** 1427,1431 ****
--- 1498,1505 ----

       old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+       J_ASSERT_BH(bh, old_bh != bh);
       if (old_bh) {
+               BUFFER_TRACE(old_bh, "old_bh - entry");
+               J_ASSERT_BH(old_bh, !buffer_jlist_eq(old_bh, BJ_Metadata));
               mark_buffer_clean(old_bh);
               wait_on_buffer(old_bh);
***************
*** 1449,1454 ****

 /*
!  * block_write_full_page() is SMP-safe - currently it's still
!  * being called with the kernel lock held, but the code is ready.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
--- 1523,1527 ----

 /*
!  * block_write_full_page() is SMP threaded - the kernel lock is not held.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
***************
*** 1484,1489 ****
                       if (err)
                               goto out;
!                       if (buffer_new(bh))
                               unmap_underlying_metadata(bh);
               }
               bh = bh->b_this_page;
--- 1557,1564 ----
                       if (err)
                               goto out;
!                       if (buffer_new(bh)) {
!                               BUFFER_TRACE(bh, "new: call unmap_underlying_metadata");
                               unmap_underlying_metadata(bh);
+                       }
               }
               bh = bh->b_this_page;
***************
*** 1493,1496 ****
--- 1568,1572 ----
       /* Stage 2: lock the buffers, mark them clean */
       do {
+               BUFFER_TRACE(bh, "lock it");
               lock_buffer(bh);
               set_buffer_async_io(bh);
***************
*** 1549,1554 ****
--- 1625,1632 ----
                               goto out;
                       if (buffer_new(bh)) {
+                               BUFFER_TRACE(bh, "new: call unmap_underlying_metadata");
                               unmap_underlying_metadata(bh);
                               if (Page_Uptodate(page)) {
+                                       BUFFER_TRACE(bh, "setting uptodate");
                                       set_bit(BH_Uptodate, &bh->b_state);
                                       continue;
***************
*** 1564,1567 ****
--- 1642,1646 ----
               }
               if (Page_Uptodate(page)) {
+                       BUFFER_TRACE(bh, "setting uptodate");
                       set_bit(BH_Uptodate, &bh->b_state);
                       continue;
***************
*** 1569,1572 ****
--- 1648,1652 ----
               if (!buffer_uptodate(bh) &&
                    (block_start < from || block_end > to)) {
+                       BUFFER_TRACE(bh, "reading");
                       ll_rw_block(READ, 1, &bh);
                       *wait_bh++=bh;
***************
*** 1607,1610 ****
--- 1687,1691 ----
                       set_bit(BH_Uptodate, &bh->b_state);
                       if (!atomic_set_buffer_dirty(bh)) {
+                               BUFFER_TRACE(bh, "mark dirty");
                               __mark_dirty(bh);
                               buffer_insert_inode_data_queue(bh, inode);
***************
*** 1890,1893 ****
--- 1971,1975 ----
       kunmap(page);

+       BUFFER_TRACE(bh, "zeroed end of block");
       __mark_buffer_dirty(bh);
       err = 0;
***************
*** 2447,2450 ****
--- 2529,2534 ----
       return 0;
 }
+ EXPORT_SYMBOL(try_to_free_buffers);
+ EXPORT_SYMBOL(buffermem_pages);

 /* ================== Debugging =================== */
diff -rc2P linux/fs/ext3/Makefile linux-2.4.13/fs/ext3/Makefile
*** linux/fs/ext3/Makefile      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/Makefile       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,16 ----
+ #
+ # Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+ # unless it's something special (ie not a .c file).
+ #
+ # Note 2! The CFLAGS definitions are now in the main makefile...
+
+ O_TARGET := ext3.o
+
+ obj-y    := acl.o balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+               ioctl.o namei.o super.o symlink.o
+ obj-m    := $(O_TARGET)
+
+ include $(TOPDIR)/Rules.make
diff -rc2P linux/fs/ext3/acl.c linux-2.4.13/fs/ext3/acl.c
*** linux/fs/ext3/acl.c Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/acl.c  Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,17 ----
+ /*
+  * linux/fs/ext3/acl.c
+  *
+  * Copyright (C) 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+
+
+ /*
+  * This file will contain the Access Control Lists management for the
+  * second extended file system.
+  */
diff -rc2P linux/fs/ext3/balloc.c linux-2.4.13/fs/ext3/balloc.c
*** linux/fs/ext3/balloc.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/balloc.c       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,995 ----
+ /*
+  *  linux/fs/ext3/balloc.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  Enhanced block allocation by Stephen Tweedie ([email protected]), 1993
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/config.h>
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+ /*
+  * balloc.c contains the blocks allocation and deallocation routines
+  */
+
+ /*
+  * The free blocks are managed by bitmaps.  A file system contains several
+  * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+  * block for inodes, N blocks for the inode table and data blocks.
+  *
+  * The file system contains group descriptors which are located after the
+  * super block.  Each descriptor contains the number of the bitmap block and
+  * the free blocks count in the block.  The descriptors are loaded in memory
+  * when a file system is mounted (see ext3_read_super).
+  */
+
+
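+ /* True if block b lies within [first, first + len - 1]. */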
+ #define in_range(b, first, len)       ((b) >= (first) && (b) <= (first) + (len) - 1)
+
+ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+                                            unsigned int block_group,
+                                            struct buffer_head ** bh)
+ {
+       unsigned long group_desc;
+       unsigned long desc;
+       struct ext3_group_desc * gdp;
+
+       if (block_group >= sb->u.ext3_sb.s_groups_count) {
+               ext3_error (sb, "ext3_get_group_desc",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+                           block_group, sb->u.ext3_sb.s_groups_count);
+
+               return NULL;
+       }
+
+       group_desc = block_group / EXT3_DESC_PER_BLOCK(sb);
+       desc = block_group % EXT3_DESC_PER_BLOCK(sb);
+       if (!sb->u.ext3_sb.s_group_desc[group_desc]) {
+               ext3_error (sb, "ext3_get_group_desc",
+                           "Group descriptor not loaded - "
+                           "block_group = %d, group_desc = %lu, desc = %lu",
+                            block_group, group_desc, desc);
+               return NULL;
+       }
+
+       gdp = (struct ext3_group_desc *)
+             sb->u.ext3_sb.s_group_desc[group_desc]->b_data;
+       if (bh)
+               *bh = sb->u.ext3_sb.s_group_desc[group_desc];
+       return gdp + desc;
+ }
+
+ /*
+  * Read the bitmap for a given block_group, reading into the specified
+  * slot in the superblock's bitmap cache.
+  *
+  * Return >=0 on success or a -ve error code.
+  */
+
+ static int read_block_bitmap (struct super_block * sb,
+                              unsigned int block_group,
+                              unsigned long bitmap_nr)
+ {
+       struct ext3_group_desc * gdp;
+       struct buffer_head * bh = NULL;
+       int retval = -EIO;
+
+       gdp = ext3_get_group_desc (sb, block_group, NULL);
+       if (!gdp)
+               goto error_out;
+       retval = 0;
+       bh = bread (sb->s_dev,
+                       le32_to_cpu(gdp->bg_block_bitmap), sb->s_blocksize);
+       if (!bh) {
+               ext3_error (sb, "read_block_bitmap",
+                           "Cannot read block bitmap - "
+                           "block_group = %d, block_bitmap = %lu",
+                           block_group, (unsigned long) gdp->bg_block_bitmap);
+               retval = -EIO;
+       }
+       /*
+        * On IO error, just leave a zero in the superblock's block pointer for
+        * this group.  The IO will be retried next time.
+        */
+ error_out:
+       sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group;
+       sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh;
+       return retval;
+ }
+
+ /*
+  * load_block_bitmap loads the block bitmap for a blocks group
+  *
+  * It maintains a cache for the last bitmaps loaded.  This cache is managed
+  * with a LRU algorithm.
+  *
+  * Notes:
+  * 1/ There is one cache per mounted file system.
+  * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
+  *    this function reads the bitmap without maintaining a LRU cache.
+  *
+  * Return the slot used to store the bitmap, or a -ve error code.
+  */
+ static int __load_block_bitmap (struct super_block * sb,
+                               unsigned int block_group)
+ {
+       int i, j, retval = 0;
+       unsigned long block_bitmap_number;
+       struct buffer_head * block_bitmap;
+
+       if (block_group >= sb->u.ext3_sb.s_groups_count)
+               ext3_panic (sb, "load_block_bitmap",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+                           block_group, sb->u.ext3_sb.s_groups_count);
+
+       if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) {
+               if (sb->u.ext3_sb.s_block_bitmap[block_group]) {
+                       if (sb->u.ext3_sb.s_block_bitmap_number[block_group] ==
+                           block_group)
+                               return block_group;
+                       ext3_error (sb, "__load_block_bitmap",
+                                   "block_group != block_bitmap_number");
+               }
+               retval = read_block_bitmap (sb, block_group, block_group);
+               if (retval < 0)
+                       return retval;
+               return block_group;
+       }
+
+       for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+                   sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++)
+               ;
+       if (i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+           sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) {
+               block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i];
+               block_bitmap = sb->u.ext3_sb.s_block_bitmap[i];
+               for (j = i; j > 0; j--) {
+                       sb->u.ext3_sb.s_block_bitmap_number[j] =
+                               sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+                       sb->u.ext3_sb.s_block_bitmap[j] =
+                               sb->u.ext3_sb.s_block_bitmap[j - 1];
+               }
+               sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number;
+               sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap;
+
+               /*
+                * There's still one special case here --- if block_bitmap == 0
+                * then our last attempt to read the bitmap failed and we have
+                * just ended up caching that failure.  Try again to read it.
+                */
+               if (!block_bitmap)
+                       retval = read_block_bitmap (sb, block_group, 0);
+       } else {
+               if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED)
+                       sb->u.ext3_sb.s_loaded_block_bitmaps++;
+               else
+                       brelse (sb->u.ext3_sb.s_block_bitmap
+                                       [EXT3_MAX_GROUP_LOADED - 1]);
+               for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1;
+                                       j > 0;  j--) {
+                       sb->u.ext3_sb.s_block_bitmap_number[j] =
+                               sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+                       sb->u.ext3_sb.s_block_bitmap[j] =
+                               sb->u.ext3_sb.s_block_bitmap[j - 1];
+               }
+               retval = read_block_bitmap (sb, block_group, 0);
+       }
+       return retval;
+ }
+
+ /*
+  * Load the block bitmap for a given block group.  First of all do a couple
+  * of fast lookups for common cases and then pass the request onto the guts
+  * of the bitmap loader.
+  *
+  * Return the slot number of the group in the superblock bitmap cache's on
+  * success, or a -ve error code.
+  *
+  * There is still one inconsistency here --- if the number of groups in this
+  * filesystem is <= EXT3_MAX_GROUP_LOADED, then we have no way of
+  * differentiating between a group for which we have never performed a bitmap
+  * IO request, and a group for which the last bitmap read request failed.
+  */
+ static inline int load_block_bitmap (struct super_block * sb,
+                                    unsigned int block_group)
+ {
+       int slot;
+
+       /*
+        * Do the lookup for the slot.  First of all, check if we're asking
+        * for the same slot as last time, and did we succeed that last time?
+        */
+       if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 &&
+           sb->u.ext3_sb.s_block_bitmap_number[0] == block_group &&
+           sb->u.ext3_sb.s_block_bitmap[0]) {
+               return 0;
+       }
+       /*
+        * Or can we do a fast lookup based on a loaded group on a filesystem
+        * small enough to be mapped directly into the superblock?
+        */
+       else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED &&
+                sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group
+                       && sb->u.ext3_sb.s_block_bitmap[block_group]) {
+               slot = block_group;
+       }
+       /*
+        * If not, then do a full lookup for this block group.
+        */
+       else {
+               slot = __load_block_bitmap (sb, block_group);
+       }
+
+       /*
+        * <0 means we just got an error
+        */
+       if (slot < 0)
+               return slot;
+
+       /*
+        * If it's a valid slot, we may still have cached a previous IO error,
+        * in which case the bh in the superblock cache will be zero.
+        */
+       if (!sb->u.ext3_sb.s_block_bitmap[slot])
+               return -EIO;
+
+       /*
+        * Must have been read in OK to get this far.
+        */
+       return slot;
+ }
+
+ /* Free given blocks, update quota and i_blocks field */
+ void ext3_free_blocks (handle_t *handle, struct inode * inode,
+                       unsigned long block, unsigned long count)
+ {
+       struct buffer_head *bitmap_bh;
+       struct buffer_head *gd_bh;
+       unsigned long block_group;
+       unsigned long bit;
+       unsigned long i;
+       int bitmap_nr;
+       unsigned long overflow;
+       struct super_block * sb;
+       struct ext3_group_desc * gdp;
+       struct ext3_super_block * es;
+       int err = 0, ret;
+       int dquot_freed_blocks = 0;
+
+       sb = inode->i_sb;
+       if (!sb) {
+               printk ("ext3_free_blocks: nonexistent device");
+               return;
+       }
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       if (block < le32_to_cpu(es->s_first_data_block) ||
+           (block + count) > le32_to_cpu(es->s_blocks_count)) {
+               ext3_error (sb, "ext3_free_blocks",
+                           "Freeing blocks not in datazone - "
+                           "block = %lu, count = %lu", block, count);
+               goto error_return;
+       }
+
+       ext3_debug ("freeing block %lu\n", block);
+
+ do_more:
+       overflow = 0;
+       block_group = (block - le32_to_cpu(es->s_first_data_block)) /
+                     EXT3_BLOCKS_PER_GROUP(sb);
+       bit = (block - le32_to_cpu(es->s_first_data_block)) %
+                     EXT3_BLOCKS_PER_GROUP(sb);
+       /*
+        * Check to see if we are freeing blocks across a group
+        * boundary.
+        */
+       if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
+               overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
+               count -= overflow;
+       }
+       bitmap_nr = load_block_bitmap (sb, block_group);
+       if (bitmap_nr < 0)
+               goto error_return;
+
+       bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+       gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
+       if (!gdp)
+               goto error_return;
+
+       if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
+           in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
+           in_range (block, le32_to_cpu(gdp->bg_inode_table),
+                     sb->u.ext3_sb.s_itb_per_group) ||
+           in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
+                     sb->u.ext3_sb.s_itb_per_group))
+               ext3_error (sb, "ext3_free_blocks",
+                           "Freeing blocks in system zones - "
+                           "Block = %lu, count = %lu",
+                           block, count);
+
+       /*
+        * We are about to start releasing blocks in the bitmap,
+        * so we need undo access.
+        */
+       /* @@@ check errors */
+       BUFFER_TRACE(bitmap_bh, "getting undo access");
+       err = ext3_journal_get_undo_access(handle, bitmap_bh);
+       if (err)
+               goto error_return;
+
+       /*
+        * We are about to modify some metadata.  Call the journal APIs
+        * to unshare ->b_data if a currently-committing transaction is
+        * using it
+        */
+       BUFFER_TRACE(gd_bh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, gd_bh);
+       if (err)
+               goto error_return;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (err)
+               goto error_return;
+
+       for (i = 0; i < count; i++) {
+               /*
+                * An HJ special.  This is expensive...
+                */
+ #ifdef CONFIG_JBD_DEBUG
+               {
+                       struct buffer_head *debug_bh;
+                       debug_bh = get_hash_table(sb->s_dev, block + i,
+                                                       sb->s_blocksize);
+                       if (debug_bh) {
+                               BUFFER_TRACE(debug_bh, "Deleted!");
+                               if (!bh2jh(bitmap_bh)->b_committed_data)
+                                       BUFFER_TRACE(debug_bh,
+                                               "No committed data in bitmap");
+                               BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
+                               __brelse(debug_bh);
+                       }
+               }
+ #endif
+               BUFFER_TRACE(bitmap_bh, "clear bit");
+               if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
+                       ext3_error (sb, __FUNCTION__,
+                                     "bit already cleared for block %lu",
+                                     block + i);
+                       BUFFER_TRACE(bitmap_bh, "bit already cleared");
+               } else {
+                       dquot_freed_blocks++;
+                       gdp->bg_free_blocks_count =
+                         cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
+                       es->s_free_blocks_count =
+                         cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
+               }
+               /* @@@ This prevents newly-allocated data from being
+                * freed and then reallocated within the same
+                * transaction.
+                *
+                * Ideally we would want to allow that to happen, but to
+                * do so requires making journal_forget() capable of
+                * revoking the queued write of a data block, which
+                * implies blocking on the journal lock.  *forget()
+                * cannot block due to truncate races.
+                *
+                * Eventually we can fix this by making journal_forget()
+                * return a status indicating whether or not it was able
+                * to revoke the buffer.  On successful revoke, it is
+                * safe not to set the allocation bit in the committed
+                * bitmap, because we know that there is no outstanding
+                * activity on the buffer any more and so it is safe to
+                * reallocate it.
+                */
+               BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
+               J_ASSERT_BH(bitmap_bh,
+                               bh2jh(bitmap_bh)->b_committed_data != NULL);
+               ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
+       }
+
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+
+       /* And the group descriptor block */
+       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+       ret = ext3_journal_dirty_metadata(handle, gd_bh);
+       if (!err) err = ret;
+
+       /* And the superblock */
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock");
+       ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       if (!err) err = ret;
+
+       if (overflow && !err) {
+               block += count;
+               count = overflow;
+               goto do_more;
+       }
+       sb->s_dirt = 1;
+ error_return:
+       ext3_std_error(sb, err);
+       unlock_super(sb);
+       if (dquot_freed_blocks)
+               DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+       return;
+ }
+
+ /* For ext3 allocations, we must not reuse any blocks which are
+  * allocated in the bitmap buffer's "last committed data" copy.  This
+  * prevents deletes from freeing up the page for reuse until we have
+  * committed the delete transaction.
+  *
+  * If we didn't do this, then deleting something and reallocating it as
+  * data would allow the old block to be overwritten before the
+  * transaction committed (because we force data to disk before commit).
+  * This would lead to corruption if we crashed between overwriting the
+  * data and committing the delete.
+  *
+  * @@@ We may want to make this allocation behaviour conditional on
+  * data-writes at some point, and disable it for metadata allocations or
+  * sync-data inodes.
+  */
+ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+ {
+       if (ext3_test_bit(nr, bh->b_data))
+               return 0;
+       if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
+               return 1;
+       return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
+ }
+
+ /*
+  * Find an allocatable block in a bitmap.  We honour both the bitmap and
+  * its last-committed copy (if that exists), and perform the "most
+  * appropriate allocation" algorithm of looking for a free block near
+  * the initial goal; then for a free byte somewhere in the bitmap; then
+  * for any free bit in the bitmap.
+  */
+ static int find_next_usable_block(int start,
+                       struct buffer_head *bh, int maxblocks)
+ {
+       int here, next;
+       char *p, *r;
+
+       if (start > 0) {
+               /*
+                * The goal was occupied; search forward for a free
+                * block within the next XX blocks.
+                *
+                * end_goal is more or less random, but it has to be
+                * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
+                * next 64-bit boundary is simple..
+                */
+               int end_goal = (start + 63) & ~63;
+               here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
+               if (here < end_goal && ext3_test_allocatable(here, bh))
+                       return here;
+
+               ext3_debug ("Bit not found near goal\n");
+
+       }
+
+       here = start;
+       if (here < 0)
+               here = 0;
+
+       /*
+        * There has been no free block found in the near vicinity of
+        * the goal: do a search forward through the block groups,
+        * searching in each group first for an entire free byte in the
+        * bitmap and then for any free bit.
+        *
+        * Search first in the remainder of the current group
+        */
+       p = ((char *) bh->b_data) + (here >> 3);
+       r = memscan(p, 0, (maxblocks - here + 7) >> 3);
+       next = (r - ((char *) bh->b_data)) << 3;
+
+       if (next < maxblocks && ext3_test_allocatable(next, bh))
+               return next;
+
+       /* The bitmap search --- search forward alternately
+        * through the actual bitmap and the last-committed copy
+        * until we find a bit free in both. */
+
+       while (here < maxblocks) {
+               next  = ext3_find_next_zero_bit ((unsigned long *) bh->b_data,
+                                                maxblocks, here);
+               if (next >= maxblocks)
+                       return -1;
+               if (ext3_test_allocatable(next, bh))
+                       return next;
+
+               J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
+               here = ext3_find_next_zero_bit
+                       ((unsigned long *) bh2jh(bh)->b_committed_data,
+                        maxblocks, next);
+       }
+       return -1;
+ }
+
+ /*
+  * ext3_new_block uses a goal block to assist allocation.  If the goal is
+  * free, or there is a free block within 32 blocks of the goal, that block
+  * is allocated.  Otherwise a forward search is made for a free block; within
+  * each block group the search first looks for an entire free byte in the block
+  * bitmap, and then for any free bit if that fails.
+  * This function also updates quota and i_blocks field.
+  */
+ int ext3_new_block (handle_t *handle, struct inode * inode,
+               unsigned long goal, u32 * prealloc_count,
+               u32 * prealloc_block, int * errp)
+ {
+       struct buffer_head * bh, *bhtmp;
+       struct buffer_head * bh2;
+ #if 0
+       char * p, * r;
+ #endif
+       int i, j, k, tmp, alloctmp;
+       int bitmap_nr;
+       int fatal = 0, err;
+       struct super_block * sb;
+       struct ext3_group_desc * gdp;
+       struct ext3_super_block * es;
+ #ifdef EXT3FS_DEBUG
+       static int goal_hits = 0, goal_attempts = 0;
+ #endif
+       *errp = -ENOSPC;
+       sb = inode->i_sb;
+       if (!sb) {
+               printk ("ext3_new_block: nonexistent device");
+               return 0;
+       }
+
+       /*
+        * Check quota for allocation of this block.
+        */
+       if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+               *errp = -EDQUOT;
+               return 0;
+       }
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       if (le32_to_cpu(es->s_free_blocks_count) <=
+                       le32_to_cpu(es->s_r_blocks_count) &&
+           ((sb->u.ext3_sb.s_resuid != current->fsuid) &&
+            (sb->u.ext3_sb.s_resgid == 0 ||
+             !in_group_p (sb->u.ext3_sb.s_resgid)) &&
+            !capable(CAP_SYS_RESOURCE)))
+               goto out;
+
+       ext3_debug ("goal=%lu.\n", goal);
+
+       /*
+        * First, test whether the goal block is free.
+        */
+       if (goal < le32_to_cpu(es->s_first_data_block) ||
+           goal >= le32_to_cpu(es->s_blocks_count))
+               goal = le32_to_cpu(es->s_first_data_block);
+       i = (goal - le32_to_cpu(es->s_first_data_block)) /
+                       EXT3_BLOCKS_PER_GROUP(sb);
+       gdp = ext3_get_group_desc (sb, i, &bh2);
+       if (!gdp)
+               goto io_error;
+
+       if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+               j = ((goal - le32_to_cpu(es->s_first_data_block)) %
+                               EXT3_BLOCKS_PER_GROUP(sb));
+ #ifdef EXT3FS_DEBUG
+               if (j)
+                       goal_attempts++;
+ #endif
+               bitmap_nr = load_block_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       goto io_error;
+
+               bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+
+               ext3_debug ("goal is at %d:%d.\n", i, j);
+
+               if (ext3_test_allocatable(j, bh)) {
+ #ifdef EXT3FS_DEBUG
+                       goal_hits++;
+                       ext3_debug ("goal bit allocated.\n");
+ #endif
+                       goto got_block;
+               }
+
+               j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb));
+               if (j >= 0)
+                       goto search_back;
+       }
+
+       ext3_debug ("Bit not found in block group %d.\n", i);
+
+       /*
+        * Now search the rest of the groups.  We assume that
+        * i and gdp correctly point to the last group visited.
+        */
+       for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) {
+               i++;
+               if (i >= sb->u.ext3_sb.s_groups_count)
+                       i = 0;
+               gdp = ext3_get_group_desc (sb, i, &bh2);
+               if (!gdp) {
+                       *errp = -EIO;
+                       goto out;
+               }
+               if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+                       bitmap_nr = load_block_bitmap (sb, i);
+                       if (bitmap_nr < 0)
+                               goto io_error;
+
+                       bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+                       j = find_next_usable_block(-1, bh,
+                                                  EXT3_BLOCKS_PER_GROUP(sb));
+                       if (j >= 0)
+                               goto search_back;
+               }
+       }
+
+       /* No space left on the device */
+       unlock_super (sb);
+       return 0;
+
+ search_back:
+       /*
+        * We have succeeded in finding a free byte in the block
+        * bitmap.  Now search backwards up to 7 bits to find the
+        * start of this group of free blocks.
+        */
+       for (   k = 0;
+               k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh);
+               k++, j--)
+               ;
+
+ got_block:
+
+       ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count);
+
+       /* Make sure we use undo access for the bitmap, because it is
+        * critical that we do the frozen_data COW on bitmap buffers in
+        * all cases even if the buffer is in BJ_Forget state in the
+        * committing transaction.  */
+       BUFFER_TRACE(bh, "get undo access for marking new block");
+       fatal = ext3_journal_get_undo_access(handle, bh);
+       if (fatal) goto out;
+
+       BUFFER_TRACE(bh2, "get_write_access");
+       fatal = ext3_journal_get_write_access(handle, bh2);
+       if (fatal) goto out;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (fatal) goto out;
+
+       tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb)
+                               + le32_to_cpu(es->s_first_data_block);
+
+       if (tmp == le32_to_cpu(gdp->bg_block_bitmap) ||
+           tmp == le32_to_cpu(gdp->bg_inode_bitmap) ||
+           in_range (tmp, le32_to_cpu(gdp->bg_inode_table),
+                     sb->u.ext3_sb.s_itb_per_group))
+               ext3_error (sb, "ext3_new_block",
+                           "Allocating block in system zone - "
+                           "block = %u", tmp);
+
+       /* The superblock lock should guard against anybody else beating
+        * us to this point! */
+       J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data));
+       BUFFER_TRACE(bh, "setting bitmap bit");
+       ext3_set_bit(j, bh->b_data);
+
+ #ifdef CONFIG_JBD_DEBUG
+       {
+               struct buffer_head *debug_bh;
+
+               /* Record bitmap buffer state in the newly allocated block */
+               debug_bh = get_hash_table(sb->s_dev, tmp, sb->s_blocksize);
+               if (debug_bh) {
+                       BUFFER_TRACE(debug_bh, "state when allocated");
+                       BUFFER_TRACE2(debug_bh, bh, "bitmap state");
+                       brelse(debug_bh);
+               }
+       }
+ #endif
+       if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data)
+               J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data));
+       bhtmp = bh;
+       alloctmp = j;
+
+       ext3_debug ("found bit %d\n", j);
+
+       /*
+        * Do block preallocation now if required.
+        */
+ #ifdef EXT3_PREALLOCATE
+       /*
+        * akpm: this is not enabled for ext3.  Need to use
+        * ext3_test_allocatable()
+        */
+       /* Writer: ->i_prealloc* */
+       if (prealloc_count && !*prealloc_count) {
+               int     prealloc_goal;
+               unsigned long next_block = tmp + 1;
+
+               prealloc_goal = es->s_prealloc_blocks ?
+                       es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS;
+
+               *prealloc_block = next_block;
+               /* Writer: end */
+               for (k = 1;
+                    k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb);
+                    k++, next_block++) {
+                       if (DQUOT_PREALLOC_BLOCK(inode, 1))
+                               break;
+                       /* Writer: ->i_prealloc* */
+                       if (*prealloc_block + *prealloc_count != next_block ||
+                           ext3_set_bit (j + k, bh->b_data)) {
+                               /* Writer: end */
+                               DQUOT_FREE_BLOCK(inode, 1);
+                               break;
+                       }
+                       (*prealloc_count)++;
+                       /* Writer: end */
+               }
+               /*
+                * As soon as we go for per-group spinlocks we'll need these
+                * done inside the loop above.
+                */
+               gdp->bg_free_blocks_count =
+                       cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
+                              (k - 1));
+               es->s_free_blocks_count =
+                       cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) -
+                              (k - 1));
+               ext3_debug ("Preallocated a further %lu bits.\n",
+                              (k - 1));
+       }
+ #endif
+
+       j = tmp;
+
+       BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block");
+       err = ext3_journal_dirty_metadata(handle, bh);
+       if (!fatal) fatal = err;
+
+       if (j >= le32_to_cpu(es->s_blocks_count)) {
+               ext3_error (sb, "ext3_new_block",
+                           "block(%d) >= blocks count(%d) - "
+                           "block_group = %d, es == %p ",j,
+                       le32_to_cpu(es->s_blocks_count), i, es);
+               goto out;
+       }
+
+       /*
+        * It is up to the caller to add the new buffer to a journal
+        * list of some description.  We don't know in advance whether
+        * the caller wants to use it as metadata or data.
+        */
+
+       ext3_debug ("allocating block %d. "
+                   "Goal hits %d of %d.\n", j, goal_hits, goal_attempts);
+
+       gdp->bg_free_blocks_count =
+                       cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
+       es->s_free_blocks_count =
+                       cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1);
+
+       BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor");
+       err = ext3_journal_dirty_metadata(handle, bh2);
+       if (!fatal) fatal = err;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "journal_dirty_metadata for superblock");
+       err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       if (!fatal) fatal = err;
+
+       sb->s_dirt = 1;
+       if (fatal)
+               goto out;
+
+       unlock_super (sb);
+       *errp = 0;
+       return j;
+
+ io_error:
+       *errp = -EIO;
+ out:
+       if (fatal) {
+               *errp = fatal;
+               ext3_std_error(sb, fatal);
+       }
+       unlock_super (sb);
+       return 0;
+
+ }
+
+ unsigned long ext3_count_free_blocks (struct super_block * sb)
+ {
+ #ifdef EXT3FS_DEBUG
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+               bitmap_nr = load_block_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr],
+                                    sb->s_blocksize);
+               printk ("group %d: stored = %d, counted = %lu\n",
+                       i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+               bitmap_count += x;
+       }
+       printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
+              le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+       unlock_super (sb);
+       return bitmap_count;
+ #else
+       return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count);
+ #endif
+ }
+
+ static inline int block_in_use (unsigned long block,
+                               struct super_block * sb,
+                               unsigned char * map)
+ {
+       return ext3_test_bit ((block -
+               le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) %
+                        EXT3_BLOCKS_PER_GROUP(sb), map);
+ }
+
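+ /*
+  * Return true if a is zero or an exact power of b (including b^0,
+  * i.e. a == 1).
+  */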
+ static inline int test_root(int a, int b)
+ {
+       if (a == 0)
+               return 1;
+       while (1) {
+               if (a == 1)
+                       return 1;
+               if (a % b)
+                       return 0;
+               a = a / b;
+       }
+ }
+
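+ /*
+  * With the sparse_super feature, backup superblocks and group
+  * descriptors are present only in groups 0, 1 and powers of 3, 5
+  * and 7.
+  */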
+ int ext3_group_sparse(int group)
+ {
+       return (test_root(group, 3) || test_root(group, 5) ||
+               test_root(group, 7));
+ }
+
+ /**
+  *    ext3_bg_has_super - number of blocks used by the superblock in group
+  *    @sb: superblock for filesystem
+  *    @group: group number to check
+  *
+  *    Return the number of blocks used by the superblock (primary or backup)
+  *    in this group.  Currently this will be only 0 or 1.
+  */
+ int ext3_bg_has_super(struct super_block *sb, int group)
+ {
+       if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
+           !ext3_group_sparse(group))
+               return 0;
+       return 1;
+ }
+
+ /**
+  *    ext3_bg_num_gdb - number of blocks used by the group table in group
+  *    @sb: superblock for filesystem
+  *    @group: group number to check
+  *
+  *    Return the number of blocks used by the group descriptor table
+  *    (primary or backup) in this group.  In the future there may be a
+  *    different number of descriptor blocks in each group.
+  */
+ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
+ {
+       if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
+           !ext3_group_sparse(group))
+               return 0;
+       return EXT3_SB(sb)->s_gdb_count;
+ }
+
+ #ifdef CONFIG_EXT3_CHECK
+ /* Called at mount-time, super-block is locked */
+ void ext3_check_blocks_bitmap (struct super_block * sb)
+ {
+       struct buffer_head * bh;
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x, j;
+       unsigned long desc_blocks;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+               bitmap_nr = load_block_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
+
+               if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data))
+                       ext3_error(sb, __FUNCTION__,
+                                  "Superblock in group %d is marked free", i);
+
+               desc_blocks = ext3_bg_num_gdb(sb, i);
+               for (j = 0; j < desc_blocks; j++)
+                       if (!ext3_test_bit(j + 1, bh->b_data))
+                               ext3_error(sb, __FUNCTION__,
+                                          "Descriptor block #%lu in group "
+                                          "%d is marked free", j, i);
+
+               if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
+                                               sb, bh->b_data))
+                       ext3_error (sb, "ext3_check_blocks_bitmap",
+                                   "Block bitmap for group %d is marked free",
+                                   i);
+
+               if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
+                                               sb, bh->b_data))
+                       ext3_error (sb, "ext3_check_blocks_bitmap",
+                                   "Inode bitmap for group %d is marked free",
+                                   i);
+
+               for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++)
+                       if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
+                                                       sb, bh->b_data))
+                               ext3_error (sb, "ext3_check_blocks_bitmap",
+                                           "Block #%lu of the inode table in "
+                                           "group %d is marked free", j, i);
+
+               x = ext3_count_free (bh, sb->s_blocksize);
+               if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
+                       ext3_error (sb, "ext3_check_blocks_bitmap",
+                                   "Wrong free blocks count for group %d, "
+                                   "stored = %d, counted = %lu", i,
+                                   le16_to_cpu(gdp->bg_free_blocks_count), x);
+               bitmap_count += x;
+       }
+       if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
+               ext3_error (sb, "ext3_check_blocks_bitmap",
+                       "Wrong free blocks count in super block, "
+                       "stored = %lu, counted = %lu",
+                       (unsigned long)le32_to_cpu(es->s_free_blocks_count),
+                       bitmap_count);
+ }
+ #endif
diff -rc2P linux/fs/ext3/bitmap.c linux-2.4.13/fs/ext3/bitmap.c
*** linux/fs/ext3/bitmap.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/bitmap.c       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,26 ----
+ /*
+  *  linux/fs/ext3/bitmap.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+
+ #include <linux/fs.h>
+
+
+ static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
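+ /*
+  * Each entry above is the number of zero bits in its 4-bit index,
+  * e.g. nibblemap[0x5] == 2 since 0101 has two zero bits.  A byte is
+  * counted a nibble at a time, so 0xa5 (1010 0101) contributes
+  * nibblemap[0x5] + nibblemap[0xa] = 2 + 2 = 4 free bits.
+  */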
+
+ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
+ {
+       unsigned int i;
+       unsigned long sum = 0;
+
+       if (!map)
+               return (0);
+       for (i = 0; i < numchars; i++)
+               sum += nibblemap[map->b_data[i] & 0xf] +
+                       nibblemap[(map->b_data[i] >> 4) & 0xf];
+       return (sum);
+ }
diff -rc2P linux/fs/ext3/dir.c linux-2.4.13/fs/ext3/dir.c
*** linux/fs/ext3/dir.c Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/dir.c  Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,190 ----
+ /*
+  *  linux/fs/ext3/dir.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/dir.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 directory handling functions
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+
+ static unsigned char ext3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
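+
+ /*
+  * The table above is indexed by the on-disk de->file_type value
+  * (EXT3_FT_UNKNOWN through EXT3_FT_SYMLINK) and yields the DT_*
+  * constant that readdir passes to filldir as d_type.
+  */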
+
+ static int ext3_readdir(struct file *, void *, filldir_t);
+
+ struct file_operations ext3_dir_operations = {
+       read:           generic_read_dir,
+       readdir:        ext3_readdir,           /* BKL held */
+       ioctl:          ext3_ioctl,             /* BKL held */
+       fsync:          ext3_sync_file,         /* BKL held */
+ };
+
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+                         struct ext3_dir_entry_2 * de,
+                         struct buffer_head * bh,
+                         unsigned long offset)
+ {
+       const char * error_msg = NULL;
+       const int rlen = le16_to_cpu(de->rec_len);
+
+       if (rlen < EXT3_DIR_REC_LEN(1))
+               error_msg = "rec_len is smaller than minimal";
+       else if (rlen % 4 != 0)
+               error_msg = "rec_len % 4 != 0";
+       else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+               error_msg = "rec_len is too small for name_len";
+       else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+               error_msg = "directory entry across blocks";
+       else if (le32_to_cpu(de->inode) >
+                       le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
+               error_msg = "inode out of bounds";
+
+       if (error_msg != NULL)
+               ext3_error (dir->i_sb, function,
+                       "bad entry in directory #%lu: %s - "
+                       "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+                       dir->i_ino, error_msg, offset,
+                       (unsigned long) le32_to_cpu(de->inode),
+                       rlen, de->name_len);
+       return error_msg == NULL ? 1 : 0;
+ }
+
+ static int ext3_readdir(struct file * filp,
+                        void * dirent, filldir_t filldir)
+ {
+       int error = 0;
+       unsigned long offset, blk;
+       int i, num, stored;
+       struct buffer_head * bh, * tmp, * bha[16];
+       struct ext3_dir_entry_2 * de;
+       struct super_block * sb;
+       int err;
+       struct inode *inode = filp->f_dentry->d_inode;
+
+       sb = inode->i_sb;
+
+       stored = 0;
+       bh = NULL;
+       offset = filp->f_pos & (sb->s_blocksize - 1);
+
+       while (!error && !stored && filp->f_pos < inode->i_size) {
+               blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
+               bh = ext3_bread (0, inode, blk, 0, &err);
+               if (!bh) {
+                       ext3_error (sb, "ext3_readdir",
+                               "directory #%lu contains a hole at offset %lu",
+                               inode->i_ino, (unsigned long)filp->f_pos);
+                       filp->f_pos += sb->s_blocksize - offset;
+                       continue;
+               }
+
+               /*
+                * Do the readahead
+                */
+               if (!offset) {
+                       for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
+                            i > 0; i--) {
+                               tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
+                               if (tmp && !buffer_uptodate(tmp) &&
+                                               !buffer_locked(tmp))
+                                       bha[num++] = tmp;
+                               else
+                                       brelse (tmp);
+                       }
+                       if (num) {
+                               ll_rw_block (READA, num, bha);
+                               for (i = 0; i < num; i++)
+                                       brelse (bha[i]);
+                       }
+               }
+
+ revalidate:
+               /* If the dir block has changed since the last call to
+                * readdir(2), then we might be pointing to an invalid
+                * dirent right now.  Scan from the start of the block
+                * to make sure. */
+               if (filp->f_version != inode->i_version) {
+                       for (i = 0; i < sb->s_blocksize && i < offset; ) {
+                               de = (struct ext3_dir_entry_2 *)
+                                       (bh->b_data + i);
+                               /* It's too expensive to do a full
+                                * dirent test each time round this
+                                * loop, but we do have to test at
+                                * least that it is non-zero.  A
+                                * failure will be detected in the
+                                * dirent test below. */
+                               if (le16_to_cpu(de->rec_len) <
+                                               EXT3_DIR_REC_LEN(1))
+                                       break;
+                               i += le16_to_cpu(de->rec_len);
+                       }
+                       offset = i;
+                       filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+                               | offset;
+                       filp->f_version = inode->i_version;
+               }
+
+               while (!error && filp->f_pos < inode->i_size
+                      && offset < sb->s_blocksize) {
+                       de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
+                       if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
+                                                  bh, offset)) {
+                               /* On error, skip the f_pos to the
+                                * next block. */
+                               filp->f_pos = (filp->f_pos |
+                                               (sb->s_blocksize - 1)) + 1;
+                               brelse (bh);
+                               return stored;
+                       }
+                       offset += le16_to_cpu(de->rec_len);
+                       if (le32_to_cpu(de->inode)) {
+                               /* We might block in the next section
+                                * if the data destination is
+                                * currently swapped out.  So, use a
+                                * version stamp to detect whether or
+                                * not the directory has been modified
+                                * during the copy operation.
+                                */
+                               unsigned long version = filp->f_version;
+                               unsigned char d_type = DT_UNKNOWN;
+
+                               if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+                                               EXT3_FEATURE_INCOMPAT_FILETYPE)
+                                               && de->file_type < EXT3_FT_MAX)
+                                       d_type =
+                                         ext3_filetype_table[de->file_type];
+                               error = filldir(dirent, de->name,
+                                               de->name_len,
+                                               filp->f_pos,
+                                               le32_to_cpu(de->inode),
+                                               d_type);
+                               if (error)
+                                       break;
+                               if (version != filp->f_version)
+                                       goto revalidate;
+                               stored ++;
+                       }
+                       filp->f_pos += le16_to_cpu(de->rec_len);
+               }
+               offset = 0;
+               brelse (bh);
+       }
+       UPDATE_ATIME(inode);
+       return 0;
+ }
diff -rc2P linux/fs/ext3/file.c linux-2.4.13/fs/ext3/file.c
*** linux/fs/ext3/file.c        Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/file.c Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,97 ----
+ /*
+  *  linux/fs/ext3/file.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/file.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 fs regular file handling primitives
+  *
+  *  64-bit file support on 64-bit platforms by Jakub Jelinek
+  *    ([email protected])
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+
+ /*
+  * Called when an inode is released. Note that this is different
+  * from ext3_file_open: open gets called at every open, but release
+  * gets called only when /all/ the files are closed.
+  */
+ static int ext3_release_file (struct inode * inode, struct file * filp)
+ {
+       if (filp->f_mode & FMODE_WRITE)
+               ext3_discard_prealloc (inode);
+       return 0;
+ }
+
+ /*
+  * Called when an inode is about to be opened.
+  * We use this to disallow opening RW large files on 32bit systems if
+  * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
+  * on this flag in sys_open.
+  */
+ static int ext3_open_file (struct inode * inode, struct file * filp)
+ {
+       if (!(filp->f_flags & O_LARGEFILE) &&
+           inode->i_size > 0x7FFFFFFFLL)
+               return -EFBIG;
+       return 0;
+ }
+
+ /*
+  * ext3_file_write().
+  *
+  * Most things are done in ext3_prepare_write() and ext3_commit_write().
+  */
+
+ static ssize_t
+ ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
+ {
+       int ret;
+       struct inode *inode = file->f_dentry->d_inode;
+
+       ret = generic_file_write(file, buf, count, ppos);
+       if ((ret >= 0) && IS_SYNC(inode)) {
+               if (file->f_flags & O_SYNC) {
+                       /*
+                        * generic_osync_inode() has already done the sync
+                        */
+               } else {
+                       int ret2 = ext3_force_commit(inode->i_sb);
+                       if (ret2)
+                               ret = ret2;
+               }
+       }
+       return ret;
+ }
+
+ struct file_operations ext3_file_operations = {
+       llseek:         generic_file_llseek,    /* BKL held */
+       read:           generic_file_read,      /* BKL not held.  Don't need */
+       write:          ext3_file_write,        /* BKL not held.  Don't need */
+       ioctl:          ext3_ioctl,             /* BKL held */
+       mmap:           generic_file_mmap,
+       open:           ext3_open_file,         /* BKL not held.  Don't need */
+       release:        ext3_release_file,      /* BKL not held.  Don't need */
+       fsync:          ext3_sync_file,         /* BKL held */
+ };
+
+ struct inode_operations ext3_file_inode_operations = {
+       truncate:       ext3_truncate,          /* BKL held */
+       setattr:        ext3_setattr,           /* BKL held */
+ };
+
diff -rc2P linux/fs/ext3/fsync.c linux-2.4.13/fs/ext3/fsync.c
*** linux/fs/ext3/fsync.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/fsync.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,69 ----
+ /*
+  *  linux/fs/ext3/fsync.c
+  *
+  *  Copyright (C) 1993  Stephen Tweedie ([email protected])
+  *  from
+  *  Copyright (C) 1992  Remy Card ([email protected])
+  *                      Laboratoire MASI - Institut Blaise Pascal
+  *                      Universite Pierre et Marie Curie (Paris VI)
+  *  from
+  *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3fs fsync primitive
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  *
+  *  Removed unnecessary code duplication for little endian machines
+  *  and excessive __inline__s.
+  *        Andi Kleen, 1997
+  *
+  * Major simplifications and cleanup - we only need to do the metadata, because
+  * we can depend on generic_block_fdatasync() to sync the data blocks.
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+
+ /*
+  * akpm: A new design for ext3_sync_file().
+  *
+  * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+  * There cannot be a transaction open by this task. (AKPM: quotas?)
+  * Another task could have dirtied this inode.  Its data can be in any
+  * state in the journalling system.
+  *
+  * What we do is just kick off a commit and wait on it.  This will snapshot the
+  * inode to disk.
+  *
+  * Note that there is a serious optimisation we can make here: if the current
+  * inode is not part of j_running_transaction or j_committing_transaction
+  * then we have nothing to do.  That would require implementation of t_ilist,
+  * which isn't too hard.
+  */
+
+ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
+ {
+       struct inode *inode = dentry->d_inode;
+       int ret;
+
+       J_ASSERT(ext3_journal_current_handle() == 0);
+
+       /*
+        * fsync_inode_buffers() just walks i_dirty_buffers and waits
+        * on them.  It's a no-op for full data journalling because
+        * i_dirty_buffers will be empty.
+        * Really, we only need to start I/O on the dirty buffers -
+        * we'll end up waiting on them in commit.
+        */
+       ret = fsync_inode_buffers(inode);
+
+       ext3_force_commit(inode->i_sb);
+
+       return ret;
+ }
diff -rc2P linux/fs/ext3/ialloc.c linux-2.4.13/fs/ext3/ialloc.c
*** linux/fs/ext3/ialloc.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/ialloc.c       Fri Nov  9 17:03:46 2001
***************
*** 0 ****
--- 1,664 ----
+ /*
+  *  linux/fs/ext3/ialloc.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  BSD ufs-inspired inode and directory allocation by
+  *  Stephen Tweedie ([email protected]), 1993
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+ #include <asm/bitops.h>
+ #include <asm/byteorder.h>
+
+ /*
+  * ialloc.c contains the inodes allocation and deallocation routines
+  */
+
+ /*
+  * The free inodes are managed by bitmaps.  A file system contains several
+  * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+  * block for inodes, N blocks for the inode table and data blocks.
+  *
+  * The file system contains group descriptors which are located after the
+  * super block.  Each descriptor contains the number of the bitmap block and
+  * the free blocks count in the block.  The descriptors are loaded in memory
+  * when a file system is mounted (see ext3_read_super).
+  */
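+
+ /*
+  * Schematically, each block group is laid out as below; the superblock
+  * and group-descriptor copies appear only in the groups selected by
+  * ext3_bg_has_super():
+  *
+  *  +-------+-------------+--------------+--------------+-------------+------+
+  *  | super | group descs | block bitmap | inode bitmap | inode table | data |
+  *  +-------+-------------+--------------+--------------+-------------+------+
+  */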
+
+
+ /*
+  * Read the inode allocation bitmap for a given block_group, reading
+  * into the specified slot in the superblock's bitmap cache.
+  *
+  * Return >=0 on success or a -ve error code.
+  */
+ static int read_inode_bitmap (struct super_block * sb,
+                              unsigned long block_group,
+                              unsigned int bitmap_nr)
+ {
+       struct ext3_group_desc * gdp;
+       struct buffer_head * bh = NULL;
+       int retval = 0;
+
+       gdp = ext3_get_group_desc (sb, block_group, NULL);
+       if (!gdp) {
+               retval = -EIO;
+               goto error_out;
+       }
+       bh = bread (sb->s_dev,
+                       le32_to_cpu(gdp->bg_inode_bitmap), sb->s_blocksize);
+       if (!bh) {
+               ext3_error (sb, "read_inode_bitmap",
+                           "Cannot read inode bitmap - "
+                           "block_group = %lu, inode_bitmap = %lu",
+                           block_group,
+                           (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap));
+               retval = -EIO;
+       }
+       /*
+        * On IO error, just leave a zero in the superblock's block pointer for
+        * this group.  The IO will be retried next time.
+        */
+ error_out:
+       sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group;
+       sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh;
+       return retval;
+ }
+
+ /*
+  * load_inode_bitmap loads the inode bitmap for a blocks group
+  *
+  * It maintains a cache for the last bitmaps loaded.  This cache is managed
+  * with a LRU algorithm.
+  *
+  * Notes:
+  * 1/ There is one cache per mounted file system.
+  * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
+  *    this function reads the bitmap without maintaining a LRU cache.
+  *
+  * Return the slot used to store the bitmap, or a -ve error code.
+  */
+ static int load_inode_bitmap (struct super_block * sb,
+                             unsigned int block_group)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long inode_bitmap_number;
+       struct buffer_head * inode_bitmap;
+       int i, j, retval = 0;
+
+       if (block_group >= sbi->s_groups_count)
+               ext3_panic (sb, "load_inode_bitmap",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+                           block_group, sbi->s_groups_count);
+       if (sbi->s_loaded_inode_bitmaps > 0 &&
+           sbi->s_inode_bitmap_number[0] == block_group &&
+           sbi->s_inode_bitmap[0] != NULL)
+               return 0;
+       if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) {
+               if (sbi->s_inode_bitmap[block_group]) {
+                       if (sbi->s_inode_bitmap_number[block_group] !=
+                                               block_group)
+                               ext3_panic(sb, "load_inode_bitmap",
+                                       "block_group != inode_bitmap_number");
+                       return block_group;
+               }
+               retval = read_inode_bitmap(sb, block_group, block_group);
+               if (retval < 0)
+                       return retval;
+               return block_group;
+       }
+
+       for (i = 0; i < sbi->s_loaded_inode_bitmaps &&
+                   sbi->s_inode_bitmap_number[i] != block_group; i++)
+               /* do nothing */;
+       if (i < sbi->s_loaded_inode_bitmaps &&
+           sbi->s_inode_bitmap_number[i] == block_group) {
+               inode_bitmap_number = sbi->s_inode_bitmap_number[i];
+               inode_bitmap = sbi->s_inode_bitmap[i];
+               for (j = i; j > 0; j--) {
+                       sbi->s_inode_bitmap_number[j] =
+                               sbi->s_inode_bitmap_number[j - 1];
+                       sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
+               }
+               sbi->s_inode_bitmap_number[0] = inode_bitmap_number;
+               sbi->s_inode_bitmap[0] = inode_bitmap;
+
+               /*
+                * There's still one special case here --- if inode_bitmap == 0
+                * then our last attempt to read the bitmap failed and we have
+                * just ended up caching that failure.  Try again to read it.
+                */
+               if (!inode_bitmap)
+                       retval = read_inode_bitmap (sb, block_group, 0);
+       } else {
+               if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED)
+                       sbi->s_loaded_inode_bitmaps++;
+               else
+                       brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]);
+               for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) {
+                       sbi->s_inode_bitmap_number[j] =
+                               sbi->s_inode_bitmap_number[j - 1];
+                       sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
+               }
+               retval = read_inode_bitmap (sb, block_group, 0);
+       }
+       return retval;
+ }
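+
+ /*
+  * For illustration: once there are more than EXT3_MAX_GROUP_LOADED
+  * groups, the cache above is kept in MRU order with slot 0 the most
+  * recently used.  Loading groups 7, then 2, leaves the slots as
+  * [2, 7, ...]; a second lookup of group 7 finds it mid-array and
+  * rotates it back to slot 0 without any further I/O.
+  */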
+
+ /*
+  * NOTE! When we get the inode, we're the only people
+  * that have access to it, and as such there are no
+  * race conditions we have to worry about. The inode
+  * is not on the hash-lists, and it cannot be reached
+  * through the filesystem because the directory entry
+  * has been deleted earlier.
+  *
+  * HOWEVER: we must make sure that we get no aliases,
+  * which means that we have to call "clear_inode()"
+  * _before_ we mark the inode not in use in the inode
+  * bitmaps. Otherwise a newly created file might use
+  * the same inode number (not actually the same pointer
+  * though), and then we'd have two inodes sharing the
+  * same inode number and space on the hard disk.
+  */
+ void ext3_free_inode (handle_t *handle, struct inode * inode)
+ {
+       struct super_block * sb = inode->i_sb;
+       int is_directory;
+       unsigned long ino;
+       struct buffer_head * bh;
+       struct buffer_head * bh2;
+       unsigned long block_group;
+       unsigned long bit;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       struct ext3_super_block * es;
+       int fatal = 0, err;
+
+       if (!inode->i_dev) {
+               printk ("ext3_free_inode: inode has no device\n");
+               return;
+       }
+       if (atomic_read(&inode->i_count) > 1) {
+               printk ("ext3_free_inode: inode has count=%d\n",
+                                       atomic_read(&inode->i_count));
+               return;
+       }
+       if (inode->i_nlink) {
+               printk ("ext3_free_inode: inode has nlink=%d\n",
+                       inode->i_nlink);
+               return;
+       }
+       if (!sb) {
+               printk("ext3_free_inode: inode on nonexistent device\n");
+               return;
+       }
+
+       ino = inode->i_ino;
+       ext3_debug ("freeing inode %lu\n", ino);
+
+       /*
+        * Note: we must free any quota before locking the superblock,
+        * as writing the quota to disk may need the lock as well.
+        */
+       DQUOT_INIT(inode);
+       DQUOT_FREE_INODE(inode);
+       DQUOT_DROP(inode);
+
+       is_directory = S_ISDIR(inode->i_mode);
+
+       /* Do this BEFORE marking the inode not in use or returning an error */
+       clear_inode (inode);
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+               ext3_error (sb, "ext3_free_inode",
+                           "reserved or nonexistent inode %lu", ino);
+               goto error_return;
+       }
+       block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+       bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+       bitmap_nr = load_inode_bitmap (sb, block_group);
+       if (bitmap_nr < 0)
+               goto error_return;
+
+       bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+
+       BUFFER_TRACE(bh, "get_write_access");
+       fatal = ext3_journal_get_write_access(handle, bh);
+       if (fatal)
+               goto error_return;
+
+       /* Ok, now we can actually update the inode bitmaps.. */
+       if (!ext3_clear_bit (bit, bh->b_data))
+               ext3_error (sb, "ext3_free_inode",
+                             "bit already cleared for inode %lu", ino);
+       else {
+               gdp = ext3_get_group_desc (sb, block_group, &bh2);
+
+               BUFFER_TRACE(bh2, "get_write_access");
+               fatal = ext3_journal_get_write_access(handle, bh2);
+               if (fatal) goto error_return;
+
+               BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
+               fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+               if (fatal) goto error_return;
+
+               if (gdp) {
+                       gdp->bg_free_inodes_count = cpu_to_le16(
+                               le16_to_cpu(gdp->bg_free_inodes_count) + 1);
+                       if (is_directory)
+                               gdp->bg_used_dirs_count = cpu_to_le16(
+                                 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+               }
+               BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, bh2);
+               if (!fatal) fatal = err;
+               es->s_free_inodes_count =
+                       cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
+               BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
+                                       "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+               if (!fatal) fatal = err;
+       }
+       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+       err = ext3_journal_dirty_metadata(handle, bh);
+       if (!fatal)
+               fatal = err;
+       sb->s_dirt = 1;
+ error_return:
+       ext3_std_error(sb, fatal);
+       unlock_super(sb);
+ }
+
+ /*
+  * There are two policies for allocating an inode.  If the new inode is
+  * a directory, then a forward search is made for a block group with both
+  * free space and a low directory-to-inode ratio; if that fails, then of
+  * the groups with above-average free space, the one with the fewest
+  * directories is chosen.
+  *
+  * For other inodes, search forward from the parent directory's block
+  * group to find a free inode.
+  */
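+
+ /*
+  * For illustration, suppose a filesystem has 3 groups and 30 free
+  * inodes in total, so the per-group average is 10.  If the groups
+  * have (free inodes, free blocks) of (12, 100), (4, 500) and
+  * (14, 300), the directory loop below places a new directory in the
+  * third group: of the two groups at or above the average free-inode
+  * count, it is the one with more free blocks.
+  */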
+ struct inode * ext3_new_inode (handle_t *handle,
+                               const struct inode * dir, int mode)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh;
+       struct buffer_head * bh2;
+       int i, j, avefreei;
+       struct inode * inode;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       struct ext3_group_desc * tmp;
+       struct ext3_super_block * es;
+       int err = 0;
+
+       /* Cannot create files in a deleted directory */
+       if (!dir || !dir->i_nlink)
+               return ERR_PTR(-EPERM);
+
+       sb = dir->i_sb;
+       inode = new_inode(sb);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       init_rwsem(&inode->u.ext3_i.truncate_sem);
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+ repeat:
+       gdp = NULL;
+       i = 0;
+
+       if (S_ISDIR(mode)) {
+               avefreei = le32_to_cpu(es->s_free_inodes_count) /
+                       sb->u.ext3_sb.s_groups_count;
+               if (!gdp) {
+                       for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
+                               struct buffer_head *temp_buffer;
+                               tmp = ext3_get_group_desc (sb, j, &temp_buffer);
+                               if (tmp &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count) &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count) >=
+                                                       avefreei) {
+                                       if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) >
+                                               le16_to_cpu(gdp->bg_free_blocks_count))) {
+                                               i = j;
+                                               gdp = tmp;
+                                               bh2 = temp_buffer;
+                                       }
+                               }
+                       }
+               }
+       } else {
+               /*
+                * Try to place the inode in its parent directory
+                */
+               i = dir->u.ext3_i.i_block_group;
+               tmp = ext3_get_group_desc (sb, i, &bh2);
+               if (tmp && le16_to_cpu(tmp->bg_free_inodes_count))
+                       gdp = tmp;
+               else
+               {
+                       /*
+                        * Use a quadratic hash to find a group with a
+                        * free inode
+                        */
+                       for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
+                               i += j;
+                               if (i >= sb->u.ext3_sb.s_groups_count)
+                                       i -= sb->u.ext3_sb.s_groups_count;
+                               tmp = ext3_get_group_desc (sb, i, &bh2);
+                               if (tmp &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count)) {
+                                       gdp = tmp;
+                                       break;
+                               }
+                       }
+               }
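+               /*
+                * For illustration: starting from the parent's group g,
+                * the quadratic hash above probes groups g+1, g+3, g+7,
+                * g+15, ... (i advances by 1, 2, 4, 8, ... with
+                * wrap-around), spreading a few probes across the
+                * whole filesystem.
+                */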
+               if (!gdp) {
+                       /*
+                        * That failed: try linear search for a free inode
+                        */
+                       i = dir->u.ext3_i.i_block_group + 1;
+                       for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
+                               if (++i >= sb->u.ext3_sb.s_groups_count)
+                                       i = 0;
+                               tmp = ext3_get_group_desc (sb, i, &bh2);
+                               if (tmp &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count)) {
+                                       gdp = tmp;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       err = -ENOSPC;
+       if (!gdp)
+               goto fail;
+
+       err = -EIO;
+       bitmap_nr = load_inode_bitmap (sb, i);
+       if (bitmap_nr < 0)
+               goto fail;
+
+       bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+
+       if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data,
+                                     EXT3_INODES_PER_GROUP(sb))) <
+           EXT3_INODES_PER_GROUP(sb)) {
+               BUFFER_TRACE(bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err) goto fail;
+
+               if (ext3_set_bit (j, bh->b_data)) {
+                       ext3_error (sb, "ext3_new_inode",
+                                     "bit already set for inode %d", j);
+                       goto repeat;
+               }
+               BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, bh);
+               if (err) goto fail;
+       } else {
+               if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) {
+                       ext3_error (sb, "ext3_new_inode",
+                                   "Free inodes count corrupted in group %d",
+                                   i);
+                       /* Is it really ENOSPC? */
+                       err = -ENOSPC;
+                       if (sb->s_flags & MS_RDONLY)
+                               goto fail;
+
+                       BUFFER_TRACE(bh2, "get_write_access");
+                       err = ext3_journal_get_write_access(handle, bh2);
+                       if (err) goto fail;
+                       gdp->bg_free_inodes_count = 0;
+                       BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+                       err = ext3_journal_dirty_metadata(handle, bh2);
+                       if (err) goto fail;
+               }
+               goto repeat;
+       }
+       j += i * EXT3_INODES_PER_GROUP(sb) + 1;
+       if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
+               ext3_error (sb, "ext3_new_inode",
+                           "reserved inode or inode > inodes count - "
+                           "block_group = %d, inode = %d", i, j);
+               err = -EIO;
+               goto fail;
+       }
+
+       BUFFER_TRACE(bh2, "get_write_access");
+       err = ext3_journal_get_write_access(handle, bh2);
+       if (err) goto fail;
+       gdp->bg_free_inodes_count =
+               cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+       if (S_ISDIR(mode))
+               gdp->bg_used_dirs_count =
+                       cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+       BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+       err = ext3_journal_dirty_metadata(handle, bh2);
+       if (err) goto fail;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (err) goto fail;
+       es->s_free_inodes_count =
+               cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
+       err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       sb->s_dirt = 1;
+       if (err) goto fail;
+
+       inode->i_uid = current->fsuid;
+       if (test_opt (sb, GRPID))
+               inode->i_gid = dir->i_gid;
+       else if (dir->i_mode & S_ISGID) {
+               inode->i_gid = dir->i_gid;
+               if (S_ISDIR(mode))
+                       mode |= S_ISGID;
+       } else
+               inode->i_gid = current->fsgid;
+       inode->i_mode = mode;
+
+       inode->i_ino = j;
+       /* This is the optimal IO size (for stat), not the fs block size */
+       inode->i_blksize = PAGE_SIZE;
+       inode->i_blocks = 0;
+       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+       inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
+       if (S_ISLNK(mode))
+               inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FILE_FL |
+                       EXT3_IMMUTABLE_LINK_FL | EXT3_APPEND_FL);
+ #ifdef EXT3_FRAGMENTS
+       inode->u.ext3_i.i_faddr = 0;
+       inode->u.ext3_i.i_frag_no = 0;
+       inode->u.ext3_i.i_frag_size = 0;
+ #endif
+       inode->u.ext3_i.i_file_acl = 0;
+       inode->u.ext3_i.i_dir_acl = 0;
+       inode->u.ext3_i.i_dtime = 0;
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+ #ifdef EXT3_PREALLOCATE
+       inode->u.ext3_i.i_prealloc_count = 0;
+ #endif
+       inode->u.ext3_i.i_block_group = i;
+
+       if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
+               inode->i_flags |= S_SYNC;
+       if (IS_SYNC(inode))
+               handle->h_sync = 1;
+       insert_inode_hash(inode);
+       inode->i_generation = event++;
+
+       inode->u.ext3_i.i_state = EXT3_STATE_NEW;
+       err = ext3_mark_inode_dirty(handle, inode);
+       if (err) goto fail;
+
+       unlock_super (sb);
+       if (DQUOT_ALLOC_INODE(inode)) {
+               DQUOT_DROP(inode);
+               inode->i_flags |= S_NOQUOTA;
+               inode->i_nlink = 0;
+               iput(inode);
+               return ERR_PTR(-EDQUOT);
+       }
+       ext3_debug ("allocating inode %lu\n", inode->i_ino);
+       return inode;
+
+ fail:
+       unlock_super(sb);
+       iput(inode);
+       ext3_std_error(sb, err);
+       return ERR_PTR(err);
+ }
+
+ /* Verify that we are loading a valid orphan from disk */
+ struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
+ {
+       ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
+       unsigned long block_group;
+       int bit;
+       int bitmap_nr;
+       struct buffer_head *bh;
+       struct inode *inode = NULL;
+
+       /* Error cases - e2fsck has already cleaned up for us */
+       if (ino > max_ino) {
+               ext3_warning(sb, __FUNCTION__,
+                            "bad orphan ino %ld!  e2fsck was run?\n", ino);
+               return NULL;
+       }
+
+       block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+       bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+       if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
+           !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
+               ext3_warning(sb, __FUNCTION__,
+                            "inode bitmap error for orphan %ld\n", ino);
+               return NULL;
+       }
+
+       /* Having the inode bit set should be a 100% indicator that this
+        * is a valid orphan (no e2fsck run on fs).  Orphans also include
+        * inodes that were being truncated, so we can't check i_nlink==0.
+        */
+       if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
+           is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
+               ext3_warning(sb, __FUNCTION__,
+                            "bad orphan inode %ld!  e2fsck was run?\n", ino);
+               printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
+                      bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
+               printk(KERN_NOTICE "inode=%p\n", inode);
+               if (inode) {
+                       printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+                              is_bad_inode(inode));
+                       printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n",
+                              NEXT_ORPHAN(inode));
+                       printk(KERN_NOTICE "max_ino=%ld\n", max_ino);
+               }
+               /* Avoid freeing blocks if we got a bad deleted inode */
+               if (inode && inode->i_nlink == 0)
+                       inode->i_blocks = 0;
+               iput(inode);
+               return NULL;
+       }
+
+       return inode;
+ }
+
+ unsigned long ext3_count_free_inodes (struct super_block * sb)
+ {
+ #ifdef EXT3FS_DEBUG
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+               bitmap_nr = load_inode_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+                                    EXT3_INODES_PER_GROUP(sb) / 8);
+               printk ("group %d: stored = %d, counted = %lu\n",
+                       i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+               bitmap_count += x;
+       }
+       printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
+               le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+       unlock_super (sb);
+       return desc_count;
+ #else
+       return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
+ #endif
+ }
+
+ #ifdef CONFIG_EXT3_CHECK
+ /* Called at mount-time, super-block is locked */
+ void ext3_check_inodes_bitmap (struct super_block * sb)
+ {
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+               bitmap_nr = load_inode_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+                                    EXT3_INODES_PER_GROUP(sb) / 8);
+               if (le16_to_cpu(gdp->bg_free_inodes_count) != x)
+                       ext3_error (sb, "ext3_check_inodes_bitmap",
+                                   "Wrong free inodes count in group %d, "
+                                   "stored = %d, counted = %lu", i,
+                                   le16_to_cpu(gdp->bg_free_inodes_count), x);
+               bitmap_count += x;
+       }
+       if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
+               ext3_error (sb, "ext3_check_inodes_bitmap",
+                           "Wrong free inodes count in super block, "
+                           "stored = %lu, counted = %lu",
+                           (unsigned long)le32_to_cpu(es->s_free_inodes_count),
+                           bitmap_count);
+ }
+ #endif
diff -rc2P linux/fs/ext3/inode.c linux-2.4.13/fs/ext3/inode.c
*** linux/fs/ext3/inode.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/inode.c        Fri Nov  9 17:03:19 2001
***************
*** 0 ****
--- 1,2676 ----
+ /*
+  *  linux/fs/ext3/inode.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/inode.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Goal-directed block allocation by Stephen Tweedie
+  *    ([email protected]), 1993, 1998
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  *  64-bit file support on 64-bit platforms by Jakub Jelinek
+  *    ([email protected])
+  *
+  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
+  */
+
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/jbd.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+ #include <linux/module.h>
+
+
+ /*
+  * SEARCH_FROM_ZERO forces each block allocation to search from the start
+  * of the filesystem.  This is to force rapid reallocation of recently-freed
+  * blocks.  The file fragmentation is horrendous.
+  */
+ #undef SEARCH_FROM_ZERO
+
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (e.g. indirect blocks) must be
+  * revoked in all cases.
+  *
+  * "bh" may be NULL: a metadata block may have been freed from memory
+  * but there may still be a record of it in the journal, and that record
+  * still needs to be revoked.
+  */
+
+ static int ext3_forget(handle_t *handle, int is_metadata,
+                      struct inode *inode, struct buffer_head *bh,
+                      int blocknr)
+ {
+       int err;
+
+       BUFFER_TRACE(bh, "enter");
+
+       jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+                 "data mode %lx\n",
+                 bh, is_metadata, inode->i_mode,
+                 test_opt(inode->i_sb, DATA_FLAGS));
+
+       /* Never use the revoke function if we are doing full data
+        * journaling: there is no need to, and a V1 superblock won't
+        * support it.  Otherwise, only skip the revoke on un-journaled
+        * data blocks. */
+
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
+           (!is_metadata && !ext3_should_journal_data(inode))) {
+               if (bh) {
+                       BUFFER_TRACE(bh, "call journal_forget");
+                       ext3_journal_forget(handle, bh);
+               }
+               return 0;
+       }
+
+       /*
+        * data!=journal && (is_metadata || should_journal_data(inode))
+        */
+       BUFFER_TRACE(bh, "call ext3_journal_revoke");
+       err = ext3_journal_revoke(handle, blocknr, bh);
+       if (err)
+               ext3_abort(inode->i_sb, __FUNCTION__,
+                          "error %d when attempting revoke", err);
+       BUFFER_TRACE(bh, "exit");
+       return err;
+ }
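+
+ /*
+  * To summarise the cases above: under data=journal every block is
+  * simply forgotten; otherwise metadata, and data that was journaled
+  * for this inode, must be revoked so that a later journal replay
+  * cannot resurrect the freed block, while un-journaled data blocks
+  * only need journal_forget.
+  */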
+
+ /*
+  * Truncate transactions can be complex and absolutely huge.  So we need to
+  * be able to restart the transaction at a convenient checkpoint to make
+  * sure we don't overflow the journal.
+  *
+  * start_transaction gets us a new handle for a truncate transaction,
+  * and extend_transaction tries to extend the existing one a bit.  If
+  * extend fails, we need to propagate the failure up and restart the
+  * transaction in the top-level truncate loop. --sct
+  */
+
+ static handle_t *start_transaction(struct inode *inode)
+ {
+       long needed;
+       handle_t *result;
+
+       needed = inode->i_blocks;
+       if (needed > EXT3_MAX_TRANS_DATA)
+               needed = EXT3_MAX_TRANS_DATA;
+
+       result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
+       if (!IS_ERR(result))
+               return result;
+
+       ext3_std_error(inode->i_sb, PTR_ERR(result));
+       return result;
+ }
+
+ /*
+  * Try to extend this transaction for the purposes of truncation.
+  *
+  * Returns 0 if we managed to create more room.  If we can't create more
+  * room, and the transaction must be restarted we return 1.
+  */
+ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+ {
+       long needed;
+
+       if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+               return 0;
+       needed = inode->i_blocks;
+       if (needed > EXT3_MAX_TRANS_DATA)
+               needed = EXT3_MAX_TRANS_DATA;
+       if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
+               return 0;
+       return 1;
+ }
+
+ /*
+  * Restart the transaction associated with *handle.  This does a commit,
+  * so before we call here everything must be consistently dirtied against
+  * this transaction.
+  */
+ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+ {
+       long needed = inode->i_blocks;
+       if (needed > EXT3_MAX_TRANS_DATA)
+               needed = EXT3_MAX_TRANS_DATA;
+       jbd_debug(2, "restarting handle %p\n", handle);
+       return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed);
+ }
+
+ /*
+  * Called at each iput()
+  */
+ void ext3_put_inode (struct inode * inode)
+ {
+       ext3_discard_prealloc (inode);
+ }
+
+ /*
+  * Called at the last iput() if i_nlink is zero.
+  */
+ void ext3_delete_inode (struct inode * inode)
+ {
+       handle_t *handle;
+
+       if (is_bad_inode(inode) ||
+           inode->i_ino == EXT3_ACL_IDX_INO ||
+           inode->i_ino == EXT3_ACL_DATA_INO)
+               goto no_delete;
+
+       lock_kernel();
+       handle = start_transaction(inode);
+       if (IS_ERR(handle)) {
+               /* If we're going to skip the normal cleanup, we still
+                * need to make sure that the in-core orphan linked list
+                * is properly cleaned up. */
+               ext3_orphan_del(NULL, inode);
+
+               ext3_std_error(inode->i_sb, PTR_ERR(handle));
+               unlock_kernel();
+               goto no_delete;
+       }
+
+       if (IS_SYNC(inode))
+               handle->h_sync = 1;
+       inode->i_size = 0;
+       if (inode->i_blocks)
+               ext3_truncate(inode);
+       /*
+        * Kill off the orphan record which ext3_truncate created.
+        * AKPM: I think this can be inside the above `if'.
+        * Note that ext3_orphan_del() has to be able to cope with the
+        * deletion of a non-existent orphan - this is because we don't
+        * know if ext3_truncate() actually created an orphan record.
+        * (Well, we could do this if we need to, but heck - it works)
+        */
+       ext3_orphan_del(handle, inode);
+       inode->u.ext3_i.i_dtime = CURRENT_TIME;
+
+       /*
+        * One subtle ordering requirement: if anything has gone wrong
+        * (transaction abort, IO errors, whatever), then we can still
+        * do these next steps (the fs will already have been marked as
+        * having errors), but we can't free the inode if the mark_dirty
+        * fails.
+        */
+       if (ext3_mark_inode_dirty(handle, inode))
+               /* If that failed, just do the required in-core inode clear. */
+               clear_inode(inode);
+       else
+               ext3_free_inode(handle, inode);
+       ext3_journal_stop(handle, inode);
+       unlock_kernel();
+       return;
+ no_delete:
+       clear_inode(inode);     /* We must guarantee clearing of inode... */
+ }
+
+ void ext3_discard_prealloc (struct inode * inode)
+ {
+ #ifdef EXT3_PREALLOCATE
+       lock_kernel();
+       /* Writer: ->i_prealloc* */
+       if (inode->u.ext3_i.i_prealloc_count) {
+               unsigned short total = inode->u.ext3_i.i_prealloc_count;
+               unsigned long block = inode->u.ext3_i.i_prealloc_block;
+               inode->u.ext3_i.i_prealloc_count = 0;
+               inode->u.ext3_i.i_prealloc_block = 0;
+               /* Writer: end */
+               ext3_free_blocks (inode, block, total);
+       }
+       unlock_kernel();
+ #endif
+ }
+
+ static int ext3_alloc_block (handle_t *handle,
+                       struct inode * inode, unsigned long goal, int *err)
+ {
+ #ifdef EXT3FS_DEBUG
+       static unsigned long alloc_hits = 0, alloc_attempts = 0;
+ #endif
+       unsigned long result;
+
+ #ifdef EXT3_PREALLOCATE
+       /* Writer: ->i_prealloc* */
+       if (inode->u.ext3_i.i_prealloc_count &&
+           (goal == inode->u.ext3_i.i_prealloc_block ||
+            goal + 1 == inode->u.ext3_i.i_prealloc_block))
+       {
+               result = inode->u.ext3_i.i_prealloc_block++;
+               inode->u.ext3_i.i_prealloc_count--;
+               /* Writer: end */
+               ext3_debug ("preallocation hit (%lu/%lu).\n",
+                           ++alloc_hits, ++alloc_attempts);
+       } else {
+               ext3_discard_prealloc (inode);
+               ext3_debug ("preallocation miss (%lu/%lu).\n",
+                           alloc_hits, ++alloc_attempts);
+               if (S_ISREG(inode->i_mode))
+                       result = ext3_new_block (inode, goal,
+                                &inode->u.ext3_i.i_prealloc_count,
+                                &inode->u.ext3_i.i_prealloc_block, err);
+               else
+                       result = ext3_new_block (inode, goal, 0, 0, err);
+               /*
+                * AKPM: this is somewhat sticky.  I'm not surprised it was
+                * disabled in 2.2's ext3.  Need to integrate b_committed_data
+                * guarding with preallocation, if indeed preallocation is
+                * effective.
+                */
+       }
+ #else
+       result = ext3_new_block (handle, inode, goal, 0, 0, err);
+ #endif
+       return result;
+ }
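+
+ /*
+  * Example of the preallocation window above (assuming EXT3_PREALLOCATE
+  * is enabled): if ->i_prealloc_block == N and a sequential write asks
+  * for goal == N (or goal + 1 == N), the hit path hands out block N and
+  * advances the window to N + 1; any other goal discards what is left
+  * of the window and falls back to ext3_new_block().
+  */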
+
+
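+ /*
+  * Each Indirect describes one step of a lookup through the block tree:
+  * ->key is the block number found there (little-endian on disk), ->p is
+  * the address the number was read from (inside the inode for the root
+  * step, inside bh->b_data otherwise), and ->bh is the indirect block
+  * hosting that address (NULL for the root step).
+  */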
+ typedef struct {
+       u32     *p;
+       u32     key;
+       struct buffer_head *bh;
+ } Indirect;
+
+ static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
+ {
+       p->key = *(p->p = v);
+       p->bh = bh;
+ }
+
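+ /*
+  * verify_chain() re-checks that none of the pointers we sampled have
+  * changed since we read them - e.g. because truncate removed part of
+  * the branch while we were reading it.  Callers treat a failure here
+  * as -EAGAIN and retry the lookup from scratch.
+  */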
+ static inline int verify_chain(Indirect *from, Indirect *to)
+ {
+       while (from <= to && from->key == *from->p)
+               from++;
+       return (from > to);
+ }
+
+ /**
+  *    ext3_block_to_path - parse the block number into array of offsets
+  *    @inode: inode in question (we are only interested in its superblock)
+  *    @i_block: block number to be parsed
+  *    @offsets: array to store the offsets in
+  *
+  *    To store the locations of file's data ext3 uses a data structure common
+  *    for UNIX filesystems - tree of pointers anchored in the inode, with
+  *    data blocks at leaves and indirect blocks in intermediate nodes.
+  *    This function translates the block number into path in that tree -
+  *    return value is the path length and @offsets[n] is the offset of
+  *    pointer to (n+1)th node in the nth one. If @i_block is out of range
+  *    (negative or too large), a warning is printed and zero is returned.
+  *
+  *    Note: function doesn't find node addresses, so no IO is needed. All
+  *    we need to know is the capacity of indirect blocks (taken from the
+  *    inode->i_sb).
+  */
+
+ /*
+  * Portability note: the last comparison (check that we fit into triple
+  * indirect block) is spelled differently, because otherwise on an
+  * architecture with 32-bit longs and 8Kb pages we might get into trouble
+  * if our filesystem had 8Kb blocks. We might use long long, but that would
+  * kill us on x86. Oh, well, at least the sign propagation does not matter -
+  * i_block would have to be negative in the very beginning, so we would not
+  * get there at all.
+  */
+
+ static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
+ {
+       int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+       int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
+       const long direct_blocks = EXT3_NDIR_BLOCKS,
+               indirect_blocks = ptrs,
+               double_blocks = (1 << (ptrs_bits * 2));
+       int n = 0;
+
+       if (i_block < 0) {
+               ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
+       } else if (i_block < direct_blocks) {
+               offsets[n++] = i_block;
+       } else if ( (i_block -= direct_blocks) < indirect_blocks) {
+               offsets[n++] = EXT3_IND_BLOCK;
+               offsets[n++] = i_block;
+       } else if ((i_block -= indirect_blocks) < double_blocks) {
+               offsets[n++] = EXT3_DIND_BLOCK;
+               offsets[n++] = i_block >> ptrs_bits;
+               offsets[n++] = i_block & (ptrs - 1);
+       } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+               offsets[n++] = EXT3_TIND_BLOCK;
+               offsets[n++] = i_block >> (ptrs_bits * 2);
+               offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+               offsets[n++] = i_block & (ptrs - 1);
+       } else {
+               ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
+       }
+       return n;
+ }
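+
+ /*
+  * A worked example of the mapping above, assuming a 1KB block size
+  * (so ptrs == 256 and ptrs_bits == 8): i_block == 5 is a direct block
+  * and yields the depth-1 path {5}; i_block == 300 is past the 12
+  * direct and 256 single-indirect slots, so after the subtractions
+  * i_block == 32 and we get the depth-3 path
+  * {EXT3_DIND_BLOCK, 32 >> 8 == 0, 32 & 255 == 32}.
+  */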
+
+ /**
+  *    ext3_get_branch - read the chain of indirect blocks leading to data
+  *    @inode: inode in question
+  *    @depth: depth of the chain (1 - direct pointer, etc.)
+  *    @offsets: offsets of pointers in inode/indirect blocks
+  *    @chain: place to store the result
+  *    @err: here we store the error value
+  *
+  *    Function fills the array of triples <key, p, bh> and returns %NULL
+  *    if everything went OK or the pointer to the last filled triple
+  *    (incomplete one) otherwise. Upon return, chain[i].key contains
+  *    the number of the (i+1)-th block in the chain (as it is stored in memory,
+  *    i.e. little-endian 32-bit), chain[i].p contains the address of that
+  *    number (it points into struct inode for i==0 and into the bh->b_data
+  *    for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+  *    block for i>0 and NULL for i==0. In other words, it holds the block
+  *    numbers of the chain, addresses they were taken from (and where we can
+  *    verify that chain did not change) and buffer_heads hosting these
+  *    numbers.
+  *
+  *    Function stops when it stumbles upon zero pointer (absent block)
+  *            (pointer to last triple returned, *@err == 0)
+  *    or when it gets an IO error reading an indirect block
+  *            (ditto, *@err == -EIO)
+  *    or when it notices that chain had been changed while it was reading
+  *            (ditto, *@err == -EAGAIN)
+  *    or when it reads all @depth-1 indirect blocks successfully and finds
+  *    the whole chain, all the way to the data (returns %NULL, *err == 0).
+  */
+ static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
+                                Indirect chain[4], int *err)
+ {
+       kdev_t dev = inode->i_dev;
+       int blocksize = inode->i_sb->s_blocksize;
+       Indirect *p = chain;
+       struct buffer_head *bh;
+
+       *err = 0;
+       /* i_data is not going away, no lock needed */
+       add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
+       if (!p->key)
+               goto no_block;
+       while (--depth) {
+               bh = bread(dev, le32_to_cpu(p->key), blocksize);
+               if (!bh)
+                       goto failure;
+               /* Reader: pointers */
+               if (!verify_chain(chain, p))
+                       goto changed;
+               add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+               /* Reader: end */
+               if (!p->key)
+                       goto no_block;
+       }
+       return NULL;
+
+ changed:
+       *err = -EAGAIN;
+       goto no_block;
+ failure:
+       *err = -EIO;
+ no_block:
+       return p;
+ }
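+
+ /*
+  * To illustrate: for the depth-3 path {EXT3_DIND_BLOCK, 0, 32} the
+  * filled chain has chain[0] holding the double-indirect block number
+  * (read from the inode's i_data), chain[1] the indirect block number
+  * (read from slot 0 of the double-indirect block) and chain[2] the
+  * data block number (read from slot 32 of the indirect block).  A zero
+  * key at any level means the rest of the branch is a hole.
+  */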
+
+ /**
+  *    ext3_find_near - find a place for allocation with sufficient locality
+  *    @inode: owner
+  *    @ind: descriptor of indirect block.
+  *
+  *    This function returns the preferred place for block allocation.
+  *    It is used when the heuristic for sequential allocation fails.
+  *    Rules are:
+  *      + if there is a block to the left of our position - allocate near it.
+  *      + if pointer will live in indirect block - allocate near that block.
+  *      + if pointer will live in inode - allocate in the same
+  *        cylinder group.
+  *    Caller must make sure that @ind is valid and will stay that way.
+  */
+
+ static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+ {
+       u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
+       u32 *p;
+
+       /* Try to find previous block */
+       for (p = ind->p - 1; p >= start; p--)
+               if (*p)
+                       return le32_to_cpu(*p);
+
+       /* No such thing, so let's try location of indirect block */
+       if (ind->bh)
+               return ind->bh->b_blocknr;
+
+       /*
+        * Is it going to be referred to from the inode itself? OK, just put
+        * it into the same cylinder group then.
+        */
+       return (inode->u.ext3_i.i_block_group *
+               EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+              le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
+ }
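+
+ /*
+  * For instance, when we allocate the first data block hanging off a
+  * new indirect block, the backward scan above finds no earlier pointer,
+  * so we place the data next to the indirect block itself
+  * (ind->bh->b_blocknr), keeping data and metadata close together.
+  */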
+
+ /**
+  *    ext3_find_goal - find a preferred place for allocation.
+  *    @inode: owner
+  *    @block:  block we want
+  *    @chain:  chain of indirect blocks
+  *    @partial: pointer to the last triple within a chain
+  *    @goal:  place to store the result.
+  *
+  *    Normally this function finds the preferred place for block allocation,
+  *    stores it in *@goal and returns zero. If the branch had been changed
+  *    under us we return -EAGAIN.
+  */
+
+ static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
+                         Indirect *partial, unsigned long *goal)
+ {
+       /* Writer: ->i_next_alloc* */
+       if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
+               inode->u.ext3_i.i_next_alloc_block++;
+               inode->u.ext3_i.i_next_alloc_goal++;
+       }
+ #ifdef SEARCH_FROM_ZERO
+       inode->u.ext3_i.i_next_alloc_block = 0;
+       inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+       /* Writer: end */
+       /* Reader: pointers, ->i_next_alloc* */
+       if (verify_chain(chain, partial)) {
+               /*
+                * try the heuristic for sequential allocation,
+                * failing that at least try to get decent locality.
+                */
+               if (block == inode->u.ext3_i.i_next_alloc_block)
+                       *goal = inode->u.ext3_i.i_next_alloc_goal;
+               if (!*goal)
+                       *goal = ext3_find_near(inode, partial);
+ #ifdef SEARCH_FROM_ZERO
+               *goal = 0;
+ #endif
+               return 0;
+       }
+       /* Reader: end */
+       return -EAGAIN;
+ }
+
+ /**
+  *    ext3_alloc_branch - allocate and set up a chain of blocks.
+  *    @inode: owner
+  *    @num: depth of the chain (number of blocks to allocate)
+  *    @offsets: offsets (in the blocks) to store the pointers to next.
+  *    @branch: place to store the chain in.
+  *
+  *    This function allocates @num blocks, zeroes out all but the last one,
+  *    links them into chain and (if we are synchronous) writes them to disk.
+  *    In other words, it prepares a branch that can be spliced onto the
+  *    inode. It stores the information about that chain in the branch[], in
+  *    the same format as ext3_get_branch() would do. We call it after we
+  *    have read the existing part of the chain; partial points to the last
+  *    triple of that (the one with zero ->key). On exit we have the same
+  *    picture as after a successful ext3_get_block(), except that in one
+  *    place chain is disconnected - *branch->p is still zero (we did not
+  *    set the last link), but branch->key contains the number that should
+  *    be placed into *branch->p to fill that gap.
+  *
+  *    If allocation fails we free all blocks we've allocated (and forget
+  *    their buffer_heads) and return the error value from the failed
+  *    ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+  *    as described above and return 0.
+  */
+
+ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
+                            int num,
+                            unsigned long goal,
+                            int *offsets,
+                            Indirect *branch)
+ {
+       int blocksize = inode->i_sb->s_blocksize;
+       int n = 0, keys = 0;
+       int err = 0;
+       int i;
+       int parent = ext3_alloc_block(handle, inode, goal, &err);
+
+       branch[0].key = cpu_to_le32(parent);
+       if (parent) {
+               for (n = 1; n < num; n++) {
+                       struct buffer_head *bh;
+                       /* Allocate the next block */
+                       int nr = ext3_alloc_block(handle, inode, parent, &err);
+                       if (!nr)
+                               break;
+                       branch[n].key = cpu_to_le32(nr);
+                       keys = n+1;
+
+                       /*
+                        * Get buffer_head for parent block, zero it out
+                        * and set the pointer to new one, then send
+                        * parent to disk.
+                        */
+                       bh = getblk(inode->i_dev, parent, blocksize);
+                       branch[n].bh = bh;
+                       lock_buffer(bh);
+                       BUFFER_TRACE(bh, "call get_create_access");
+                       err = ext3_journal_get_create_access(handle, bh);
+                       if (err) {
+                               unlock_buffer(bh);
+                               brelse(bh);
+                               break;
+                       }
+
+                       memset(bh->b_data, 0, blocksize);
+                       branch[n].p = (u32*) bh->b_data + offsets[n];
+                       *branch[n].p = branch[n].key;
+                       BUFFER_TRACE(bh, "marking uptodate");
+                       mark_buffer_uptodate(bh, 1);
+                       unlock_buffer(bh);
+
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       err = ext3_journal_dirty_metadata(handle, bh);
+                       if (err)
+                               break;
+
+                       parent = nr;
+               }
+               if (IS_SYNC(inode))
+                       handle->h_sync = 1;
+       }
+       if (n == num)
+               return 0;
+
+       /* Allocation failed, free what we already allocated */
+       for (i = 1; i < keys; i++) {
+               BUFFER_TRACE(branch[i].bh, "call journal_forget");
+               ext3_journal_forget(handle, branch[i].bh);
+       }
+       for (i = 0; i < keys; i++)
+               ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
+       return err;
+ }
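+
+ /*
+  * Example: if a depth-3 lookup found only the double-indirect root, we
+  * are called with num == 2 and allocate one indirect block plus one
+  * data block.  The indirect block is zeroed and the data block number
+  * is written into it at offsets[1]; branch[0].key then holds the
+  * indirect block's number, waiting for ext3_splice_branch() to store
+  * it into the double-indirect block.
+  */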
+
+ /**
+  *    ext3_splice_branch - splice the allocated branch onto inode.
+  *    @inode: owner
+  *    @block: (logical) number of block we are adding
+  *    @chain: chain of indirect blocks (with a missing link - see
+  *            ext3_alloc_branch)
+  *    @where: location of missing link
+  *    @num:   number of blocks we are adding
+  *
+  *    This function verifies that chain (up to the missing link) had not
+  *    changed, fills the missing link and does all housekeeping needed in
+  *    inode (->i_blocks, etc.). In case of success we end up with the full
+  *    chain to new block and return 0. Otherwise (== chain had been changed)
+  *    we free the new blocks (forgetting their buffer_heads, indeed) and
+  *    return -EAGAIN.
+  */
+
+ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
+                             Indirect chain[4], Indirect *where, int num)
+ {
+       int i;
+       int err = 0;
+
+       /*
+        * If we're splicing into a [td]indirect block (as opposed to the
+        * inode) then we need to get write access to the [td]indirect block
+        * before the splice.
+        */
+       if (where->bh) {
+               BUFFER_TRACE(where->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, where->bh);
+               if (err)
+                       goto err_out;
+       }
+       /* Verify that place we are splicing to is still there and vacant */
+
+       /* Writer: pointers, ->i_next_alloc* */
+       if (!verify_chain(chain, where-1) || *where->p)
+               /* Writer: end */
+               goto changed;
+
+       /* That's it */
+
+       *where->p = where->key;
+       inode->u.ext3_i.i_next_alloc_block = block;
+       inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+ #ifdef SEARCH_FROM_ZERO
+       inode->u.ext3_i.i_next_alloc_block = 0;
+       inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+       /* Writer: end */
+
+       /* We are done with atomic stuff, now do the rest of housekeeping */
+
+       inode->i_ctime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+
+       /* had we spliced it onto indirect block? */
+       if (where->bh) {
+               /*
+                * akpm: If we spliced it onto an indirect block, we haven't
+                * altered the inode.  Note however that if it is being spliced
+                * onto an indirect block at the very end of the file (the
+                * file is growing) then we *will* alter the inode to reflect
+                * the new i_size.  But that is not done here - it is done in
+                * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
+                */
+               jbd_debug(5, "splicing indirect only\n");
+               BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, where->bh);
+               if (err)
+                       goto err_out;
+       } else {
+               /*
+                * OK, we spliced it into the inode itself on a direct block.
+                * Inode was dirtied above.
+                */
+               jbd_debug(5, "splicing direct\n");
+       }
+       return err;
+
+ changed:
+       /*
+        * AKPM: if where[i].bh isn't part of the current updating
+        * transaction then we explode nastily.  Test this code path.
+        */
+       jbd_debug(1, "the chain changed: try again\n");
+       err = -EAGAIN;
+
+ err_out:
+       for (i = 1; i < num; i++) {
+               BUFFER_TRACE(where[i].bh, "call journal_forget");
+               ext3_journal_forget(handle, where[i].bh);
+       }
+       /* For the normal collision cleanup case, we free up the blocks.
+        * On genuine filesystem errors we don't even think about doing
+        * that. */
+       if (err == -EAGAIN)
+               for (i = 0; i < num; i++)
+                       ext3_free_blocks(handle, inode,
+                                        le32_to_cpu(where[i].key), 1);
+       return err;
+ }
+
+ /*
+  * Allocation strategy is simple: if we have to allocate something, we will
+  * have to go the whole way to leaf. So let's do it before attaching anything
+  * to tree, set linkage between the newborn blocks, write them if sync is
+  * required, recheck the path, free and repeat if check fails, otherwise
+  * set the last missing link (that will protect us from any truncate-generated
+  * removals - all blocks on the path are immune now) and possibly force the
+  * write on the parent block.
+  * That has a nice additional property: no special recovery from the failed
+  * allocations is needed - we simply release blocks and do not touch anything
+  * reachable from inode.
+  *
+  * akpm: `handle' can be NULL if create == 0.
+  */
+
+ static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
+                                long iblock,
+                                struct buffer_head *bh_result, int create)
+ {
+       int err = -EIO;
+       int offsets[4];
+       Indirect chain[4];
+       Indirect *partial;
+       unsigned long goal;
+       int left;
+       int depth = ext3_block_to_path(inode, iblock, offsets);
+       loff_t new_size;
+
+       J_ASSERT(handle != NULL || create == 0);
+
+       if (depth == 0)
+               goto out;
+
+       lock_kernel();
+ reread:
+       partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+
+       /* Simplest case - block found, no allocation needed */
+       if (!partial) {
+               bh_result->b_state &= ~(1UL << BH_New);
+ got_it:
+               bh_result->b_dev = inode->i_dev;
+               bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
+               bh_result->b_state |= (1UL << BH_Mapped);
+               /* Clean up and exit */
+               partial = chain+depth-1; /* the whole chain */
+               goto cleanup;
+       }
+
+       /* Next simple case - plain lookup or failed read of indirect block */
+       if (!create || err == -EIO) {
+ cleanup:
+               while (partial > chain) {
+                       BUFFER_TRACE(partial->bh, "call brelse");
+                       brelse(partial->bh);
+                       partial--;
+               }
+               BUFFER_TRACE(bh_result, "returned");
+               unlock_kernel();
+ out:
+               return err;
+       }
+
+       /*
+        * Indirect block might be removed by truncate while we were
+        * reading it. Handling of that case (forget what we've got and
+        * reread) is taken out of the main path.
+        */
+       if (err == -EAGAIN)
+               goto changed;
+
+       if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
+               goto changed;
+
+       left = (chain + depth) - partial;
+
+       /*
+        * Block out ext3_truncate while we alter the tree
+        */
+       down_read(&inode->u.ext3_i.truncate_sem);
+       err = ext3_alloc_branch(handle, inode, left, goal,
+                                       offsets+(partial-chain), partial);
+
+       /* The ext3_splice_branch call will free and forget any buffers
+        * on the new chain if there is a failure, but that risks using
+        * up transaction credits, especially for bitmaps where the
+        * credits cannot be returned.  Can we handle this somehow?  We
+        * may need to return -EAGAIN upwards in the worst case.  --sct */
+       if (!err)
+               err = ext3_splice_branch(handle, inode, iblock, chain,
+                                        partial, left);
+       up_read(&inode->u.ext3_i.truncate_sem);
+       if (err == -EAGAIN)
+               goto changed;
+       if (err)
+               goto cleanup;
+
+       new_size = inode->i_size;
+       /*
+        * This is not racy against ext3_truncate's modification of i_disksize
+        * because VM/VFS ensures that the file cannot be extended while
+        * truncate is in progress.  It is racy between multiple parallel
+        * instances of get_block, but we have the BKL.
+        */
+       if (new_size > inode->u.ext3_i.i_disksize)
+               inode->u.ext3_i.i_disksize = new_size;
+
+       bh_result->b_state |= (1UL << BH_New);
+       goto got_it;
+
+ changed:
+       while (partial > chain) {
+               jbd_debug(1, "buffer chain changed, retrying\n");
+               BUFFER_TRACE(partial->bh, "brelsing");
+               brelse(partial->bh);
+               partial--;
+       }
+       goto reread;
+ }
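+
+ /*
+  * In outline, the function above is: read the branch with
+  * ext3_get_branch(); if it is complete, just map bh_result.  For a
+  * plain lookup (or an IO error) clean up and return.  Otherwise pick
+  * an allocation goal (ext3_find_goal), build the missing tail of the
+  * branch (ext3_alloc_branch) and splice it in (ext3_splice_branch),
+  * all under truncate_sem, restarting from `reread' whenever -EAGAIN
+  * reports that the tree changed underneath us.
+  */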
+
+ static int ext3_get_block(struct inode *inode, long iblock,
+                       struct buffer_head *bh_result, int create)
+ {
+       handle_t *handle = 0;
+       int ret;
+
+       if (create) {
+               handle = ext3_journal_current_handle();
+               J_ASSERT(handle != 0);
+       }
+       ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
+       return ret;
+ }
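+
+ /*
+  * ext3_get_block() is the get_block callback handed to the generic
+  * block_* helpers (see ext3_aops below).  For create == 1 it relies on
+  * the transaction already opened by the caller - hence the assertion
+  * that a current handle exists.
+  */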
+
+ /*
+  * `handle' can be NULL if create is zero
+  */
+ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
+                               long block, int create, int * errp)
+ {
+       struct buffer_head dummy;
+       int fatal = 0, err;
+
+       J_ASSERT(handle != NULL || create == 0);
+
+       dummy.b_state = 0;
+       dummy.b_blocknr = -1000;
+       buffer_trace_init(&dummy.b_history);
+       *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
+       if (!*errp && buffer_mapped(&dummy)) {
+               struct buffer_head *bh;
+               bh = getblk(dummy.b_dev, dummy.b_blocknr,
+                                       inode->i_sb->s_blocksize);
+               if (buffer_new(&dummy)) {
+                       J_ASSERT(create != 0);
+                       J_ASSERT(handle != 0);
+
+                       /* Now that we do not always journal data, we
+                          should keep in mind whether this should
+                          always journal the new buffer as metadata.
+                          For now, regular file writes use
+                          ext3_get_block instead, so it's not a
+                          problem. */
+                       lock_kernel();
+                       lock_buffer(bh);
+                       BUFFER_TRACE(bh, "call get_create_access");
+                       fatal = ext3_journal_get_create_access(handle, bh);
+                       if (!fatal) {
+                               memset(bh->b_data, 0,
+                                      inode->i_sb->s_blocksize);
+                               mark_buffer_uptodate(bh, 1);
+                       }
+                       unlock_buffer(bh);
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       err = ext3_journal_dirty_metadata(handle, bh);
+                       if (!fatal) fatal = err;
+                       unlock_kernel();
+               } else {
+                       BUFFER_TRACE(bh, "not a new buffer");
+               }
+               if (fatal) {
+                       *errp = fatal;
+                       brelse(bh);
+                       bh = NULL;
+               }
+               return bh;
+       }
+       return NULL;
+ }
+
+ struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
+                              int block, int create, int *err)
+ {
+       struct buffer_head * bh;
+       int prev_blocks;
+
+       prev_blocks = inode->i_blocks;
+
+       bh = ext3_getblk (handle, inode, block, create, err);
+       if (!bh)
+               return bh;
+ #ifdef EXT3_PREALLOCATE
+       /*
+        * If the inode has grown, and this is a directory, then use a few
+        * more of the preallocated blocks to keep directory fragmentation
+        * down.  The preallocated blocks are guaranteed to be contiguous.
+        */
+       if (create &&
+           S_ISDIR(inode->i_mode) &&
+           inode->i_blocks > prev_blocks &&
+           EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+                                   EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
+               int i;
+               struct buffer_head *tmp_bh;
+
+               for (i = 1;
+                    inode->u.ext3_i.i_prealloc_count &&
+                    i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
+                    i++) {
+                       /*
+                        * ext3_getblk will zero out the contents of the
+                        * directory for us
+                        */
+                       tmp_bh = ext3_getblk(handle, inode,
+                                               block+i, create, err);
+                       if (!tmp_bh) {
+                               brelse (bh);
+                               return 0;
+                       }
+                       brelse (tmp_bh);
+               }
+       }
+ #endif
+       if (buffer_uptodate(bh))
+               return bh;
+       ll_rw_block (READ, 1, &bh);
+       wait_on_buffer (bh);
+       if (buffer_uptodate(bh))
+               return bh;
+       brelse (bh);
+       *err = -EIO;
+       return NULL;
+ }
+
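+ /*
+  * walk_page_buffers() applies @fn to each buffer of the page that
+  * overlaps the byte range @from..@to, stopping at the first error.
+  * For buffers outside the range it only records in *@partial (if
+  * non-NULL) whether the page would remain partially non-uptodate.
+  */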
+ static int walk_page_buffers( handle_t *handle,
+                               struct buffer_head *head,
+                               unsigned from,
+                               unsigned to,
+                               int *partial,
+                               int (*fn)(      handle_t *handle,
+                                               struct buffer_head *bh))
+ {
+       struct buffer_head *bh;
+       unsigned block_start, block_end;
+       unsigned blocksize = head->b_size;
+       int err, ret = 0;
+
+       for (   bh = head, block_start = 0;
+               ret == 0 && (bh != head || !block_start);
+               block_start = block_end, bh = bh->b_this_page)
+       {
+               block_end = block_start + blocksize;
+               if (block_end <= from || block_start >= to) {
+                       if (partial && !buffer_uptodate(bh))
+                               *partial = 1;
+                       continue;
+               }
+               err = (*fn)(handle, bh);
+               if (!ret)
+                       ret = err;
+       }
+       return ret;
+ }
+
+ /*
+  * To preserve ordering, it is essential that the hole instantiation and
+  * the data write be encapsulated in a single transaction.  We cannot
+  * close off a transaction and start a new one between the ext3_get_block()
+  * and the commit_write().  So doing the journal_start at the start of
+  * prepare_write() is the right place.
+  *
+  * Also, this function can nest inside ext3_writepage() ->
+  * block_write_full_page(). In that case, we *know* that ext3_writepage()
+  * has generated enough buffer credits to do the whole page.  So we won't
+  * block on the journal in that case, which is good, because the caller may
+  * be PF_MEMALLOC.
+  *
+  * By accident, ext3 can be reentered when a transaction is open via
+  * quota file writes.  If we were to commit the transaction while thus
+  * reentered, there can be a deadlock - we would be holding a quota
+  * lock, and the commit would never complete if another thread had a
+  * transaction open and was blocking on the quota lock - a ranking
+  * violation.
+  *
+  * So what we do is to rely on the fact that journal_stop/journal_start
+  * will _not_ run commit under these circumstances because handle->h_ref
+  * is elevated.  We'll still have enough credits for the tiny quotafile
+  * write.
+  */
+
+ static int do_journal_get_write_access(handle_t *handle,
+                                      struct buffer_head *bh)
+ {
+       return ext3_journal_get_write_access(handle, bh);
+ }
+
+ static int ext3_prepare_write(struct file *file, struct page *page,
+                             unsigned from, unsigned to)
+ {
+       struct inode *inode = page->mapping->host;
+       handle_t *handle = ext3_journal_current_handle();
+       int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+
+       lock_kernel();
+       handle = ext3_journal_start(inode, needed_blocks);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+       ret = block_prepare_write(page, from, to, ext3_get_block);
+       if (ret != 0)
+               goto prepare_write_failed;
+
+       if (ext3_should_journal_data(inode))
+               ret = walk_page_buffers(handle, page->buffers,
+                               from, to, NULL, do_journal_get_write_access);
+ prepare_write_failed:
+       if (ret)
+               ext3_journal_stop(handle, inode);
+ out:
+       unlock_kernel();
+       return ret;
+ }
+
+ static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
+ {
+       return ext3_journal_dirty_data(handle, bh, 0);
+ }
+
+ /*
+  * For ext3_writepage().  We also brelse() the buffer to account for
+  * the bget() which ext3_writepage() performs.
+  */
+ static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
+ {
+       int ret = ext3_journal_dirty_data(handle, bh, 1);
+       __brelse(bh);
+       return ret;
+ }
+
+ /* For commit_write() in data=journal mode */
+ static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+ {
+       set_bit(BH_Uptodate, &bh->b_state);
+       return ext3_journal_dirty_metadata(handle, bh);
+ }
+
+ /*
+  * We need to pick up the new inode size which generic_commit_write gave us.
+  * `file' can be NULL - eg, when called from block_symlink().
+  *
+  * ext3 inode->i_dirty_buffers policy:  If we're journalling data we
+  * definitely don't want them to appear on the inode at all - instead
+  * we need to manage them at the JBD layer and we need to intercept
+  * the relevant sync operations and translate them into journal operations.
+  *
+  * If we're not journalling data then we can just leave the buffers
+  * on ->i_dirty_buffers.  If someone writes them out for us then thanks.
+  * Otherwise we'll do it in commit, if we're using ordered data.
+  */
+
+ static int ext3_commit_write(struct file *file, struct page *page,
+                            unsigned from, unsigned to)
+ {
+       handle_t *handle = ext3_journal_current_handle();
+       struct inode *inode = page->mapping->host;
+       int ret = 0, ret2;
+
+       lock_kernel();
+       if (ext3_should_journal_data(inode)) {
+               /*
+                * Here we duplicate the generic_commit_write() functionality
+                */
+               int partial = 0;
+               loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+               ret = walk_page_buffers(handle, page->buffers,
+                       from, to, &partial, commit_write_fn);
+               if (!partial)
+                       SetPageUptodate(page);
+               kunmap(page);
+               if (pos > inode->i_size)
+                       inode->i_size = pos;
+               set_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state);
+       } else {
+               if (ext3_should_order_data(inode)) {
+                       ret = walk_page_buffers(handle, page->buffers,
+                               from, to, NULL, journal_dirty_sync_data);
+               }
+               /* Be careful here if generic_commit_write becomes a
+                * required invocation after block_prepare_write. */
+               if (ret == 0)
+                       ret = generic_commit_write(file, page, from, to);
+       }
+       if (inode->i_size > inode->u.ext3_i.i_disksize) {
+               inode->u.ext3_i.i_disksize = inode->i_size;
+               ret2 = ext3_mark_inode_dirty(handle, inode);
+               if (!ret)
+                       ret = ret2;
+       }
+       ret2 = ext3_journal_stop(handle, inode);
+       unlock_kernel();
+       if (!ret)
+               ret = ret2;
+       return ret;
+ }
+
+ /*
+  * bmap() is special.  It gets used by applications such as lilo and by
+  * the swapper to find the on-disk block of a specific piece of data.
+  *
+  * Naturally, this is dangerous if the block concerned is still in the
+  * journal.  If somebody makes a swapfile on an ext3 data-journaling
+  * filesystem and enables swap, then they may get a nasty shock when the
+  * data getting swapped to that swapfile suddenly gets overwritten by
+  * the original zeros written out previously to the journal and
+  * awaiting writeback in the kernel's buffer cache.
+  *
+  * So, if we see any bmap calls here on a modified, data-journaled file,
+  * take extra steps to flush any blocks which might be in the cache.
+  */
+ static int ext3_bmap(struct address_space *mapping, long block)
+ {
+       struct inode *inode = mapping->host;
+       journal_t *journal;
+       int err;
+
+       if (test_and_clear_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state)) {
+               /*
+                * This is a REALLY heavyweight approach, but the use of
+                * bmap on dirty files is expected to be extremely rare:
+                * only if we run lilo or swapon on a freshly made file
+                * do we expect this to happen.
+                *
+                * (bmap requires CAP_SYS_RAWIO so this does not
+                * represent an unprivileged user DOS attack --- we'd be
+                * in trouble if mortal users could trigger this path at
+                * will.)
+                *
+                * NB. EXT3_STATE_JDATA is not set on files other than
+                * regular files.  If somebody wants to bmap a directory
+                * or symlink and gets confused because the buffer
+                * hasn't yet been flushed to disk, they deserve
+                * everything they get.
+                */
+
+               journal = EXT3_JOURNAL(inode);
+               journal_lock_updates(journal);
+               err = journal_flush(journal);
+               journal_unlock_updates(journal);
+
+               if (err)
+                       return 0;
+       }
+
+       return generic_block_bmap(mapping, block, ext3_get_block);
+ }
+
+ static int bget_one(handle_t *handle, struct buffer_head *bh)
+ {
+       atomic_inc(&bh->b_count);
+       return 0;
+ }
+
+ /*
+  * Note that we always start a transaction even if we're not journalling
+  * data.  This is to preserve ordering: any hole instantiation within
+  * __block_write_full_page -> ext3_get_block() should be journalled
+  * along with the data so we don't crash and then get metadata which
+  * refers to old data.
+  *
+  * In all journalling modes block_write_full_page() will start the I/O.
+  *
+  * Problem:
+  *
+  *    ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+  *            ext3_writepage()
+  *
+  * Similar for:
+  *
+  *    ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+  *
+  * Same applies to ext3_get_block().  We will deadlock on various things like
+  * lock_journal and i_truncate_sem.
+  *
+  * Setting PF_MEMALLOC here doesn't work - too many internal memory
+  * allocations fail.
+  *
+  * 16May01: If we're reentered then journal_current_handle() will be
+  *        non-zero. We simply *return*.
+  *
+  * 1 July 2001: @@@ FIXME:
+  *   In journalled data mode, a data buffer may be metadata against the
+  *   current transaction.  But the same file is part of a shared mapping
+  *   and someone does a writepage() on it.
+  *
+  *   We will move the buffer onto the async_data list, but *after* it has
+  *   been dirtied. So there's a small window where we have dirty data on
+  *   BJ_Metadata.
+  *
+  *   Note that this only applies to the last partial page in the file.  The
+  *   bit which block_write_full_page() uses prepare/commit for.  (That's
+  *   broken code anyway: it's wrong for msync()).
+  *
+  *   It's a rare case: affects the final partial page, for journalled data
+  *   where the file is subject to both write() and writepage() in the same
+  *   transaction.  To fix it we'll need a custom block_write_full_page().
+  *   We'll probably need that anyway for journalling writepage() output.
+  *
+  * We don't honour synchronous mounts for writepage().  That would be
+  * disastrous.  Any write() or metadata operation will sync the fs for
+  * us.
+  */
+ static int ext3_writepage(struct page *page)
+ {
+       struct inode *inode = page->mapping->host;
+       struct buffer_head *page_buffers;
+       handle_t *handle = NULL;
+       int ret = 0, err;
+       int needed;
+       int order_data;
+
+       J_ASSERT(PageLocked(page));
+
+       /*
+        * We give up here if we're reentered, because it might be
+        * for a different filesystem.  One *could* look for a
+        * nested transaction opportunity.
+        */
+       lock_kernel();
+       if (ext3_journal_current_handle())
+               goto out_fail;
+
+       needed = ext3_writepage_trans_blocks(inode);
+       if (current->flags & PF_MEMALLOC)
+               handle = ext3_journal_try_start(inode, needed);
+       else
+               handle = ext3_journal_start(inode, needed);
+
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_fail;
+       }
+
+       order_data = ext3_should_order_data(inode) ||
+                       ext3_should_journal_data(inode);
+
+       unlock_kernel();
+
+       page_buffers = NULL;    /* Purely to prevent compiler warning */
+
+       /* bget() all the buffers */
+       if (order_data) {
+               if (!page->buffers)
+                       create_empty_buffers(page,
+                               inode->i_dev, inode->i_sb->s_blocksize);
+               page_buffers = page->buffers;
+               walk_page_buffers(handle, page_buffers, 0,
+                               PAGE_CACHE_SIZE, NULL, bget_one);
+       }
+
+       ret = block_write_full_page(page, ext3_get_block);
+
+       /*
+        * The page can become unlocked at any point now, and
+        * truncate can then come in and change things.  So we
+        * can't touch *page from now on.  But *page_buffers is
+        * safe due to elevated refcount.
+        */
+
+       handle = ext3_journal_current_handle();
+       lock_kernel();
+
+       /* And attach them to the current transaction */
+       if (order_data) {
+               err = walk_page_buffers(handle, page_buffers,
+                       0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
+               if (!ret)
+                       ret = err;
+       }
+
+       err = ext3_journal_stop(handle, inode);
+       if (!ret)
+               ret = err;
+       unlock_kernel();
+       return ret;
+
+ out_fail:
+
+       unlock_kernel();
+       SetPageDirty(page);
+       UnlockPage(page);
+       return ret;
+ }
+
+ static int ext3_readpage(struct file *file, struct page *page)
+ {
+       return block_read_full_page(page, ext3_get_block);
+ }
+
+
+ static int ext3_flushpage(struct page *page, unsigned long offset)
+ {
+       journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+       return journal_flushpage(journal, page, offset);
+ }
+
+ static int ext3_releasepage(struct page *page, int wait)
+ {
+       journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+       return journal_try_to_free_buffers(journal, page, wait);
+ }
+
+
+ struct address_space_operations ext3_aops = {
+       readpage:       ext3_readpage,          /* BKL not held.  Don't need */
+       writepage:      ext3_writepage,         /* BKL not held.  We take it */
+       sync_page:      block_sync_page,
+       prepare_write:  ext3_prepare_write,     /* BKL not held.  We take it */
+       commit_write:   ext3_commit_write,      /* BKL not held.  We take it */
+       bmap:           ext3_bmap,              /* BKL held */
+       flushpage:      ext3_flushpage,         /* BKL not held.  Don't need */
+       releasepage:    ext3_releasepage,       /* BKL not held.  Don't need */
+ };
+
+ /*
+  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
+  * up to the end of the block which corresponds to `from'.
+  * This is required during truncate. We need to physically zero the tail end
+  * of that block so it doesn't yield old data if the file is later grown.
+  */
+ static int ext3_block_truncate_page(handle_t *handle,
+               struct address_space *mapping, loff_t from)
+ {
+       unsigned long index = from >> PAGE_CACHE_SHIFT;
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned blocksize, iblock, length, pos;
+       struct inode *inode = mapping->host;
+       struct page *page;
+       struct buffer_head *bh;
+       int err;
+
+       blocksize = inode->i_sb->s_blocksize;
+       length = offset & (blocksize - 1);
+
+       /* Block boundary? Nothing to do */
+       if (!length)
+               return 0;
+
+       length = blocksize - length;
+       iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+       page = grab_cache_page(mapping, index);
+       err = -ENOMEM;
+       if (!page)
+               goto out;
+
+       if (!page->buffers)
+               create_empty_buffers(page, inode->i_dev, blocksize);
+
+       /* Find the buffer that contains "offset" */
+       bh = page->buffers;
+       pos = blocksize;
+       while (offset >= pos) {
+               bh = bh->b_this_page;
+               iblock++;
+               pos += blocksize;
+       }
+
+       err = 0;
+       if (!buffer_mapped(bh)) {
+               /* Hole? Nothing to do */
+               if (buffer_uptodate(bh))
+                       goto unlock;
+               ext3_get_block(inode, iblock, bh, 0);
+               /* Still unmapped? Nothing to do */
+               if (!buffer_mapped(bh))
+                       goto unlock;
+       }
+
+       /* Ok, it's mapped. Make sure it's up-to-date */
+       if (Page_Uptodate(page))
+               set_bit(BH_Uptodate, &bh->b_state);
+
+       if (!buffer_uptodate(bh)) {
+               err = -EIO;
+               ll_rw_block(READ, 1, &bh);
+               wait_on_buffer(bh);
+               /* Uhhuh. Read error. Complain and punt. */
+               if (!buffer_uptodate(bh))
+                       goto unlock;
+       }
+
+       if (ext3_should_journal_data(inode)) {
+               BUFFER_TRACE(bh, "get write access");
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err)
+                       goto unlock;
+       }
+
+       memset(kmap(page) + offset, 0, length);
+       flush_dcache_page(page);
+       kunmap(page);
+
+       BUFFER_TRACE(bh, "zeroed end of block");
+
+       err = 0;
+       if (ext3_should_journal_data(inode)) {
+               err = ext3_journal_dirty_metadata(handle, bh);
+       } else {
+               if (ext3_should_order_data(inode))
+                       err = ext3_journal_dirty_data(handle, bh, 0);
+               __mark_buffer_dirty(bh);
+       }
+
+ unlock:
+       UnlockPage(page);
+       page_cache_release(page);
+ out:
+       return err;
+ }
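+
+ /*
+  * Arithmetic example (assuming 4KB pages and a 1KB block size): for
+  * from == 1500 we get index == 0, offset == 1500 and
+  * length == 1024 - (1500 & 1023) == 548, and the buffer walk lands on
+  * the page's second block (iblock == 1); the memset then clears page
+  * bytes 1500..2047.
+  */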
+
+ /*
+  * Probably it should be a library function... search for first non-zero word
+  * or memcmp with zero_page, whatever is better for particular architecture.
+  * Linus?
+  */
+ static inline int all_zeroes(u32 *p, u32 *q)
+ {
+       while (p < q)
+               if (*p++)
+                       return 0;
+       return 1;
+ }
+
+ /**
+  *    ext3_find_shared - find the indirect blocks for partial truncation.
+  *    @inode:   inode in question
+  *    @depth:   depth of the affected branch
+  *    @offsets: offsets of pointers in that branch (see ext3_block_to_path)
+  *    @chain:   place to store the pointers to partial indirect blocks
+  *    @top:     place to the (detached) top of branch
+  *
+  *    This is a helper function used by ext3_truncate().
+  *
+  *    When we do truncate() we may have to clean the ends of several
+  *    indirect blocks but leave the blocks themselves alive. Block is
+  *    partially truncated if some data below the new i_size is referred to
+  *    from it (and it is on the path to the first completely truncated
+  *    data block, indeed).  We have to free the top of that path along
+  *    with everything to the right of the path. Since no allocation
+  *    past the truncation point is possible until ext3_truncate()
+  *    finishes, we may safely do the latter, but top of branch may
+  *    require special attention - pageout below the truncation point
+  *    might try to populate it.
+  *
+  *    We atomically detach the top of branch from the tree, store the
+  *    block number of its root in *@top, pointers to buffer_heads of
+  *    partially truncated blocks - in @chain[].bh and pointers to
+  *    their last elements that should not be removed - in
+  *    @chain[].p. Return value is the pointer to last filled element
+  *    of @chain.
+  *
+  *    The actual freeing of the subtrees is left to the caller:
+  *            a) free the subtree starting from *@top
+  *            b) free the subtrees whose roots are stored in
+  *                    (@chain[i].p+1 .. end of @chain[i].bh->b_data)
+  *            c) free the subtrees growing from the inode past the @chain[0].
+  *                    (no partially truncated stuff there).  */
+
+ static Indirect *ext3_find_shared(struct inode *inode,
+                               int depth,
+                               int offsets[4],
+                               Indirect chain[4],
+                               u32 *top)
+ {
+       Indirect *partial, *p;
+       int k, err;
+
+       *top = 0;
+       /* Make k index the deepest non-null offset + 1 */
+       for (k = depth; k > 1 && !offsets[k-1]; k--)
+               ;
+       partial = ext3_get_branch(inode, k, offsets, chain, &err);
+       /* Writer: pointers */
+       if (!partial)
+               partial = chain + k-1;
+       /*
+        * If the branch acquired continuation since we've looked at it -
+        * fine, it should all survive and (new) top doesn't belong to us.
+        */
+       if (!partial->key && *partial->p)
+               /* Writer: end */
+               goto no_top;
+       for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
+               ;
+       /*
+        * OK, we've found the last block that must survive. The rest of our
+        * branch should be detached before unlocking. However, if that rest
+        * of branch is all ours and does not grow immediately from the inode
+        * it's easier to cheat and just decrement partial->p.
+        */
+       if (p == chain + k - 1 && p > chain) {
+               p->p--;
+       } else {
+               *top = *p->p;
+               /* Nope, don't do this in ext3.  Must leave the tree intact */
+ #if 0
+               *p->p = 0;
+ #endif
+       }
+       /* Writer: end */
+
+       while(partial > p)
+       {
+               brelse(partial->bh);
+               partial--;
+       }
+ no_top:
+       return partial;
+ }
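+
+ /*
+  * Example (the common partial-truncate case): when only the tail of one
+  * indirect block is being truncated and earlier pointers in it survive,
+  * partial stays at the deepest level with live pointers to its left, so
+  * the p->p-- branch is taken: the caller's sweep of
+  * (partial->p + 1 .. end of block) then covers every slot that must go,
+  * and *top stays zero because the branch root itself survives.
+  */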
+
+ /*
+  * Zero a number of block pointers in either an inode or an indirect block.
+  * If we restart the transaction we must again get write access to the
+  * indirect block for further modification.
+  *
+  * We release `count' blocks on disk, but (last - first) may be greater
+  * than `count' because there can be holes in there.
+  */
+ static void
+ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
+               unsigned long block_to_free, unsigned long count,
+               u32 *first, u32 *last)
+ {
+       u32 *p;
+       kdev_t dev = inode->i_sb->s_dev;
+       unsigned long blocksize = inode->i_sb->s_blocksize;
+
+       if (try_to_extend_transaction(handle, inode)) {
+               if (bh) {
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       ext3_journal_dirty_metadata(handle, bh);
+               }
+               ext3_mark_inode_dirty(handle, inode);
+               ext3_journal_test_restart(handle, inode);
+               BUFFER_TRACE(bh, "get_write_access");
+               ext3_journal_get_write_access(handle, bh);
+       }
+
+       /*
+        * Any buffers which are on the journal will be in memory. We find
+        * them on the hash table so journal_revoke() will run journal_forget()
+        * on them.  We've already detached each block from the file, so
+        * bforget() in journal_forget() should be safe.
+        *
+        * AKPM: turn on bforget in journal_forget()!!!
+        */
+       for (p = first; p < last; p++) {
+               u32 nr = le32_to_cpu(*p);
+               if (nr) {
+                       struct buffer_head *bh;
+
+                       *p = 0;
+                       bh = get_hash_table(dev, nr, blocksize);
+                       ext3_forget(handle, 0, inode, bh, nr);
+               }
+       }
+
+       ext3_free_blocks(handle, inode, block_to_free, count);
+ }
+
+ /**
+  * ext3_free_data - free a list of data blocks
+  * @handle:   handle for this transaction
+  * @inode:    inode we are dealing with
+  * @this_bh:  indirect buffer_head which contains *@first and *@last
+  * @first:    array of block numbers
+  * @last:     points immediately past the end of array
+  *
+  * We are freeing all blocks referred to from that array (numbers are stored
+  * as little-endian 32-bit) and updating @inode->i_blocks appropriately.
+  *
+  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
+  * blocks are contiguous then releasing them at one time will only affect one
+  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+  * actually use a lot of journal space.
+  *
+  * @this_bh will be %NULL if @first and @last point into the inode's direct
+  * block pointers.
+  */
+ static void ext3_free_data(handle_t *handle, struct inode *inode,
+                          struct buffer_head *this_bh, u32 *first, u32 *last)
+ {
+       unsigned long block_to_free = 0;    /* Starting block # of a run */
+       unsigned long count = 0;            /* Number of blocks in the run */
+       u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
+                                              corresponding to
+                                              block_to_free */
+       unsigned long nr;                   /* Current block # */
+       u32 *p;                             /* Pointer into inode/ind
+                                              for current block */
+       int err;
+
+       if (this_bh) {                          /* For indirect block */
+               BUFFER_TRACE(this_bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, this_bh);
+               /* Important: if we can't update the indirect pointers
+                * to the blocks, we can't free them. */
+               if (err)
+                       return;
+       }
+
+       for (p = first; p < last; p++) {
+               nr = le32_to_cpu(*p);
+               if (nr) {
+                       /* accumulate blocks to free if they're contiguous */
+                       if (count == 0) {
+                               block_to_free = nr;
+                               block_to_free_p = p;
+                               count = 1;
+                       } else if (nr == block_to_free + count) {
+                               count++;
+                       } else {
+                               ext3_clear_blocks(handle, inode, this_bh,
+                                                 block_to_free,
+                                                 count, block_to_free_p, p);
+                               block_to_free = nr;
+                               block_to_free_p = p;
+                               count = 1;
+                       }
+               }
+       }
+
+       if (count > 0)
+               ext3_clear_blocks(handle, inode, this_bh, block_to_free,
+                                 count, block_to_free_p, p);
+
+       if (this_bh) {
+               BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
+               ext3_journal_dirty_metadata(handle, this_bh);
+       }
+ }
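+
+ /*
+  * Run coalescing example: for pointers {100, 101, 102, 0, 500} we make
+  * two calls to ext3_clear_blocks() - one for the run starting at 100
+  * with count == 3 (the hole is simply skipped, which is why
+  * last - first may exceed count) and one for the single block 500 at
+  * the end of the loop.
+  */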
+
+ /**
+  *    ext3_free_branches - free an array of branches
+  *    @handle: JBD handle for this transaction
+  *    @inode: inode we are dealing with
+  *    @parent_bh: the buffer_head which contains *@first and *@last
+  *    @first: array of block numbers
+  *    @last:  pointer immediately past the end of array
+  *    @depth: depth of the branches to free
+  *
+  *    We are freeing all blocks referred to from these branches (numbers are
+  *    stored as little-endian 32-bit) and updating @inode->i_blocks
+  *    appropriately.
+  */
+ static void ext3_free_branches(handle_t *handle, struct inode *inode,
+                              struct buffer_head *parent_bh,
+                              u32 *first, u32 *last, int depth)
+ {
+       unsigned long nr;
+       u32 *p;
+
+       if (is_handle_aborted(handle))
+               return;
+
+       if (depth--) {
+               struct buffer_head *bh;
+               int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+               p = last;
+               while (--p >= first) {
+                       nr = le32_to_cpu(*p);
+                       if (!nr)
+                               continue;               /* A hole */
+
+                       /* Go read the buffer for the next level down */
+                       bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize);
+
+                       /*
+                        * A read failure? Report error and clear slot
+                        * (should be rare).
+                        */
+                       if (!bh) {
+                               ext3_error(inode->i_sb, "ext3_free_branches",
+                                          "Read failure, inode=%ld, block=%ld",
+                                          inode->i_ino, nr);
+                               continue;
+                       }
+
+                       /* This zaps the entire block.  Bottom up. */
+                       BUFFER_TRACE(bh, "free child branches");
+                       ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
+                                          (u32*)bh->b_data + addr_per_block,
+                                          depth);
+
+                       /*
+                        * We've probably journalled the indirect block several
+                        * times during the truncate.  But it's no longer
+                        * needed and we now drop it from the transaction via
+                        * journal_revoke().
+                        *
+                        * That's easy if it's exclusively part of this
+                        * transaction.  But if it's part of the committing
+                        * transaction then journal_forget() will simply
+                        * brelse() it.  That means that if the underlying
+                        * block is reallocated in ext3_get_block(),
+                        * unmap_underlying_metadata() will find this block
+                        * and will try to get rid of it.  damn, damn.
+                        *
+                        * If this block has already been committed to the
+                        * journal, a revoke record will be written.  And
+                        * revoke records must be emitted *before* clearing
+                        * this block's bit in the bitmaps.
+                        */
+                       ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+
+                       /*
+                        * Everything below this pointer has been
+                        * released.  Now let this top-of-subtree go.
+                        *
+                        * We want the freeing of this indirect block to be
+                        * atomic in the journal with the updating of the
+                        * bitmap block which owns it.  So make some room in
+                        * the journal.
+                        *
+                        * We zero the parent pointer *after* freeing its
+                        * pointee in the bitmaps, so if extend_transaction()
+                        * for some reason fails to put the bitmap changes and
+                        * the release into the same transaction, recovery
+                        * will merely complain about releasing a free block,
+                        * rather than leaking blocks.
+                        */
+                       if (is_handle_aborted(handle))
+                               return;
+                       if (try_to_extend_transaction(handle, inode)) {
+                               ext3_mark_inode_dirty(handle, inode);
+                               ext3_journal_test_restart(handle, inode);
+                       }
+
+                       ext3_free_blocks(handle, inode, nr, 1);
+
+                       if (parent_bh) {
+                               /*
+                                * The block which we have just freed is
+                                * pointed to by an indirect block: journal it
+                                */
+                               BUFFER_TRACE(parent_bh, "get_write_access");
+                               if (!ext3_journal_get_write_access(handle,
+                                                                  parent_bh)){
+                                       *p = 0;
+                                       BUFFER_TRACE(parent_bh,
+                                       "call ext3_journal_dirty_metadata");
+                                       ext3_journal_dirty_metadata(handle,
+                                                                   parent_bh);
+                               }
+                       }
+               }
+       } else {
+               /* We have reached the bottom of the tree. */
+               BUFFER_TRACE(parent_bh, "free data blocks");
+               ext3_free_data(handle, inode, parent_bh, first, last);
+       }
+ }
+
+ /*
+  * ext3_truncate()
+  *
+  * We block out ext3_get_block() block instantiations across the entire
+  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
+  * simultaneously on behalf of the same inode.
+  *
+  * As we work through the truncate and commit bits of it to the journal there
+  * is one core, guiding principle: the file's tree must always be consistent on
+  * disk.  We must be able to restart the truncate after a crash.
+  *
+  * The file's tree may be transiently inconsistent in memory (although it
+  * probably isn't), but whenever we close off and commit a journal transaction,
+  * the contents of (the filesystem + the journal) must be consistent and
+  * restartable.  It's pretty simple, really: bottom up, right to left (although
+  * left-to-right works OK too).
+  *
+  * Note that at recovery time, journal replay occurs *before* the restart of
+  * truncate against the orphan inode list.
+  *
+  * The committed inode has the new, desired i_size (which is the same as
+  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
+  * that this inode's truncate did not complete and it will again call
+  * ext3_truncate() to have another go.  So there will be instantiated blocks
+  * to the right of the truncation point in a crashed ext3 filesystem.  But
+  * that's fine - as long as they are linked from the inode, the post-crash
+  * ext3_truncate() run will find them and release them.
+  */
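+
+ /*
+  * For illustration (assuming a 1K block size, so 256 block numbers fit
+  * in one indirect block): file blocks 0-11 live in the inode's direct
+  * slots, blocks 12-267 hang off the indirect block, blocks 268-65803
+  * off the double indirect block, and everything above that off the
+  * triple indirect block.  ext3_block_to_path() maps last_block onto
+  * that layout, and the depth arguments (1, 2, 3) passed to
+  * ext3_free_branches() below name those three levels of indirection.
+  */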
+
+ void ext3_truncate(struct inode * inode)
+ {
+       handle_t *handle;
+       u32 *i_data = inode->u.ext3_i.i_data;
+       int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+       int offsets[4];
+       Indirect chain[4];
+       Indirect *partial;
+       u32 nr = 0;
+       int n;
+       long last_block;
+       unsigned blocksize;
+
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode))
+               return;
+
+       ext3_discard_prealloc(inode);
+
+       handle = start_transaction(inode);
+       if (IS_ERR(handle))
+               return;         /* AKPM: return what? */
+
+       blocksize = inode->i_sb->s_blocksize;
+       last_block = (inode->i_size + blocksize-1)
+                                       >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+
+       ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
+
+       n = ext3_block_to_path(inode, last_block, offsets);
+       if (n == 0)
+               goto out_stop;  /* error */
+
+       /*
+        * OK.  This truncate is going to happen.  We add the inode to the
+        * orphan list, so that if this truncate spans multiple transactions,
+        * and we crash, we will resume the truncate when the filesystem
+        * recovers.  It also marks the inode dirty, to catch the new size.
+        *
+        * Implication: the file must always be in a sane, consistent
+        * truncatable state while each transaction commits.
+        */
+       if (ext3_orphan_add(handle, inode))
+               goto out_stop;
+
+       /*
+        * The orphan list entry will now protect us from any crash which
+        * occurs before the truncate completes, so it is now safe to propagate
+        * the new, shorter inode size (held for now in i_size) into the
+        * on-disk inode. We do this via i_disksize, which is the value which
+        * ext3 *really* writes onto the disk inode.
+        */
+       inode->u.ext3_i.i_disksize = inode->i_size;
+
+       /*
+        * From here we block out all ext3_get_block() callers who want to
+        * modify the block allocation tree.
+        */
+       down_write(&inode->u.ext3_i.truncate_sem);
+
+       if (n == 1) {           /* direct blocks */
+               ext3_free_data(handle, inode, NULL, i_data+offsets[0],
+                              i_data + EXT3_NDIR_BLOCKS);
+               goto do_indirects;
+       }
+
+       partial = ext3_find_shared(inode, n, offsets, chain, &nr);
+       /* Kill the top of shared branch (not detached) */
+       if (nr) {
+               if (partial == chain) {
+                       /* Shared branch grows from the inode */
+                       ext3_free_branches(handle, inode, NULL,
+                                          &nr, &nr+1, (chain+n-1) - partial);
+                       *partial->p = 0;
+                       /*
+                        * We mark the inode dirty prior to restart,
+                        * and prior to stop.  No need for it here.
+                        */
+               } else {
+                       /* Shared branch grows from an indirect block */
+                       BUFFER_TRACE(partial->bh, "get_write_access");
+                       ext3_free_branches(handle, inode, partial->bh,
+                                       partial->p,
+                                       partial->p+1, (chain+n-1) - partial);
+               }
+       }
+       /* Clear the ends of indirect blocks on the shared branch */
+       while (partial > chain) {
+               ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
+                                  (u32*)partial->bh->b_data + addr_per_block,
+                                  (chain+n-1) - partial);
+               BUFFER_TRACE(partial->bh, "call brelse");
+               brelse (partial->bh);
+               partial--;
+       }
+ do_indirects:
+       /* Kill the remaining (whole) subtrees */
+       switch (offsets[0]) {
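+               /*
+                * Note: these cases fall through deliberately.  If the
+                * truncation point lies among the direct blocks, all
+                * three whole indirect subtrees must be freed; if it
+                * lies under the single indirect tree, only the double
+                * and triple indirect trees remain; and so on.
+                */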
+               default:
+                       nr = i_data[EXT3_IND_BLOCK];
+                       if (nr) {
+                               ext3_free_branches(handle, inode, NULL,
+                                                  &nr, &nr+1, 1);
+                               i_data[EXT3_IND_BLOCK] = 0;
+                       }
+               case EXT3_IND_BLOCK:
+                       nr = i_data[EXT3_DIND_BLOCK];
+                       if (nr) {
+                               ext3_free_branches(handle, inode, NULL,
+                                                  &nr, &nr+1, 2);
+                               i_data[EXT3_DIND_BLOCK] = 0;
+                       }
+               case EXT3_DIND_BLOCK:
+                       nr = i_data[EXT3_TIND_BLOCK];
+                       if (nr) {
+                               ext3_free_branches(handle, inode, NULL,
+                                                  &nr, &nr+1, 3);
+                               i_data[EXT3_TIND_BLOCK] = 0;
+                       }
+               case EXT3_TIND_BLOCK:
+                       ;
+       }
+       up_write(&inode->u.ext3_i.truncate_sem);
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+
+       /* In a multi-transaction truncate, we only make the final
+        * transaction synchronous */
+       if (IS_SYNC(inode))
+               handle->h_sync = 1;
+ out_stop:
+       /*
+        * If this was a simple ftruncate(), and the file will remain alive
+        * then we need to clear up the orphan record which we created above.
+        * However, if this was a real unlink then we were called by
+        * ext3_delete_inode(), and we allow that function to clean up the
+        * orphan info for us.
+        */
+       if (inode->i_nlink)
+               ext3_orphan_del(handle, inode);
+
+       ext3_journal_stop(handle, inode);
+ }
+
+ /*
+  * ext3_get_inode_loc returns with an extra refcount against the
+  * inode's underlying buffer_head on success.
+  */
+
+ int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
+ {
+       struct buffer_head *bh = 0;
+       unsigned long block;
+       unsigned long block_group;
+       unsigned long group_desc;
+       unsigned long desc;
+       unsigned long offset;
+       struct ext3_group_desc * gdp;
+
+       if ((inode->i_ino != EXT3_ROOT_INO &&
+               inode->i_ino != EXT3_ACL_IDX_INO &&
+               inode->i_ino != EXT3_ACL_DATA_INO &&
+               inode->i_ino != EXT3_JOURNAL_INO &&
+               inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+               inode->i_ino > le32_to_cpu(
+                       inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "bad inode number: %lu", inode->i_ino);
+               goto bad_inode;
+       }
+       block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
+       if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "group >= groups count");
+               goto bad_inode;
+       }
+       group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
+       desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
+       bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
+       if (!bh) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "Descriptor not loaded");
+               goto bad_inode;
+       }
+
+       gdp = (struct ext3_group_desc *) bh->b_data;
+       /*
+        * Figure out the offset within the block group inode table
+        */
+       offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
+               EXT3_INODE_SIZE(inode->i_sb);
+       block = le32_to_cpu(gdp[desc].bg_inode_table) +
+               (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
+       if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "unable to read inode block - "
+                           "inode=%lu, block=%lu", inode->i_ino, block);
+               goto bad_inode;
+       }
+       offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
+
+       iloc->bh = bh;
+       iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
+       iloc->block_group = block_group;
+
+       return 0;
+
+  bad_inode:
+       return -EIO;
+ }
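+
+ /*
+  * A worked example of the arithmetic above, assuming a geometry of
+  * 1K blocks, 128-byte inodes and 2048 inodes per group: for inode
+  * 3000, block_group = 2999 / 2048 = 1 and offset = (2999 % 2048) * 128
+  * = 121728, so the inode lives 121728 >> 10 = 118 blocks into group
+  * 1's inode table, at byte offset 121728 & 1023 = 896 in that block.
+  */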
+
+ void ext3_read_inode(struct inode * inode)
+ {
+       struct ext3_iloc iloc;
+       struct ext3_inode *raw_inode;
+       struct buffer_head *bh;
+       int block;
+
+       if(ext3_get_inode_loc(inode, &iloc))
+               goto bad_inode;
+       bh = iloc.bh;
+       raw_inode = iloc.raw_inode;
+       init_rwsem(&inode->u.ext3_i.truncate_sem);
+       inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+       inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+       inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+       if(!(test_opt (inode->i_sb, NO_UID32))) {
+               inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+               inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+       }
+       inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+       inode->i_size = le32_to_cpu(raw_inode->i_size);
+       inode->i_atime = le32_to_cpu(raw_inode->i_atime);
+       inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
+       inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
+       inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
+       /* We now have enough fields to check if the inode was active or not.
+        * This is needed because nfsd might try to access dead inodes;
+        * the test is the same one that e2fsck uses.
+        * NeilBrown 1999oct15
+        */
+       if (inode->i_nlink == 0) {
+               if (inode->i_mode == 0 ||
+                   !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
+                       /* this inode is deleted */
+                       brelse (bh);
+                       goto bad_inode;
+               }
+               /* The only unlinked inodes we let through here have
+                * valid i_mode and are being read by the orphan
+                * recovery code: that's fine, we're about to complete
+                * the process of deleting those. */
+       }
+       inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
+                                        * (for stat), not the fs block
+                                        * size */
+       inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+       inode->i_version = ++event;
+       inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
+ #ifdef EXT3_FRAGMENTS
+       inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
+       inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
+       inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
+ #endif
+       inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+       if (!S_ISREG(inode->i_mode)) {
+               inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+       } else {
+               inode->i_size |=
+                       ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+       }
+       inode->u.ext3_i.i_disksize = inode->i_size;
+       inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+ #ifdef EXT3_PREALLOCATE
+       inode->u.ext3_i.i_prealloc_count = 0;
+ #endif
+       inode->u.ext3_i.i_block_group = iloc.block_group;
+
+       /*
+        * NOTE! The in-memory inode i_data array is in little-endian order
+        * even on big-endian machines: we do NOT byteswap the block numbers!
+        */
+       for (block = 0; block < EXT3_N_BLOCKS; block++)
+               inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+
+       brelse (iloc.bh);
+
+       if (inode->i_ino == EXT3_ACL_IDX_INO ||
+           inode->i_ino == EXT3_ACL_DATA_INO)
+               /* Nothing to do */ ;
+       else if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+       } else if (S_ISDIR(inode->i_mode)) {
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
+       } else if (S_ISLNK(inode->i_mode)) {
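+               /*
+                * Fast symlinks (i_blocks == 0) keep the target path in
+                * the inode's i_data area itself, so they need no
+                * address space operations; longer targets live in a
+                * data block and go through the page cache.
+                */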
+               if (!inode->i_blocks)
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+               else {
+                       inode->i_op = &page_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext3_aops;
+               }
+       } else
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(iloc.raw_inode->i_block[0]));
+       /* inode->i_attr_flags = 0;                             unused */
+       if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
+               inode->i_flags |= S_SYNC;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_APPEND;     unused */
+               inode->i_flags |= S_APPEND;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FILE_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE;  unused */
+               inode->i_flags |= S_IMMUTABLE_FILE;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_LINK_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_LINK; unused */
+               inode->i_flags |= S_IMMUTABLE_LINK;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_NOATIME;    unused */
+               inode->i_flags |= S_NOATIME;
+       }
+       return;
+
+ bad_inode:
+       make_bad_inode(inode);
+       return;
+ }
+
+ /*
+  * Post the struct inode info into an on-disk inode location in the
+  * buffer-cache.  This gobbles the caller's reference to the
+  * buffer_head in the inode location struct.
+  */
+
+ static int ext3_do_update_inode(handle_t *handle,
+                               struct inode *inode,
+                               struct ext3_iloc *iloc)
+ {
+       struct ext3_inode *raw_inode = iloc->raw_inode;
+       struct buffer_head *bh = iloc->bh;
+       int err = 0, rc, block;
+
+       if (handle) {
+               BUFFER_TRACE(bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err)
+                       goto out_brelse;
+       }
+       raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+       if(!(test_opt(inode->i_sb, NO_UID32))) {
+               raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
+               raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ /*
+  * Fix up interoperability with old kernels. Otherwise, old inodes get
+  * re-used with the upper 16 bits of the uid/gid intact
+  */
+               if(!inode->u.ext3_i.i_dtime) {
+                       raw_inode->i_uid_high =
+                               cpu_to_le16(high_16_bits(inode->i_uid));
+                       raw_inode->i_gid_high =
+                               cpu_to_le16(high_16_bits(inode->i_gid));
+               } else {
+                       raw_inode->i_uid_high = 0;
+                       raw_inode->i_gid_high = 0;
+               }
+       } else {
+               raw_inode->i_uid_low =
+                       cpu_to_le16(fs_high2lowuid(inode->i_uid));
+               raw_inode->i_gid_low =
+                       cpu_to_le16(fs_high2lowgid(inode->i_gid));
+               raw_inode->i_uid_high = 0;
+               raw_inode->i_gid_high = 0;
+       }
+       raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+       raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
+       raw_inode->i_atime = cpu_to_le32(inode->i_atime);
+       raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
+       raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
+       raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+       raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
+       raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
+ #ifdef EXT3_FRAGMENTS
+       raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
+       raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
+       raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
+ #else
+       /* If we are not tracking these fields in the in-memory inode,
+        * then preserve them on disk, but still initialise them to zero
+        * for new inodes. */
+       if (inode->u.ext3_i.i_state & EXT3_STATE_NEW) {
+               raw_inode->i_faddr = 0;
+               raw_inode->i_frag = 0;
+               raw_inode->i_fsize = 0;
+       }
+ #endif
+       raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
+       if (!S_ISREG(inode->i_mode)) {
+               raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
+       } else {
+               raw_inode->i_size_high =
+                       cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
+               if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
+                       struct super_block *sb = inode->i_sb;
+                       if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
+                           EXT3_SB(sb)->s_es->s_rev_level ==
+                                       cpu_to_le32(EXT3_GOOD_OLD_REV)) {
+                              /* If this is the first large file
+                               * created, add a flag to the superblock.
+                               */
+                               err = ext3_journal_get_write_access(handle,
+                                               sb->u.ext3_sb.s_sbh);
+                               if (err)
+                                       goto out_brelse;
+                               ext3_update_dynamic_rev(sb);
+                               EXT3_SET_RO_COMPAT_FEATURE(sb,
+                                       EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
+                               sb->s_dirt = 1;
+                               handle->h_sync = 1;
+                               err = ext3_journal_dirty_metadata(handle,
+                                               sb->u.ext3_sb.s_sbh);
+                       }
+               }
+       }
+       raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+       if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+               raw_inode->i_block[0] =
+                       cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
+       else for (block = 0; block < EXT3_N_BLOCKS; block++)
+               raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
+
+       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+       rc = ext3_journal_dirty_metadata(handle, bh);
+       if (!err)
+               err = rc;
+       inode->u.ext3_i.i_state &= ~EXT3_STATE_NEW;
+
+ out_brelse:
+       brelse (bh);
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+
+ /*
+  * ext3_write_inode()
+  *
+  * We are called from a few places:
+  *
+  * - Within generic_file_write() for O_SYNC files.
+  *   Here, there will be no transaction running. We wait for any running
+  *   transaction to commit.
+  *
+  * - Within sys_sync(), kupdate and such.
+  *   We wait on commit, if told to.
+  *
+  * - Within prune_icache() (PF_MEMALLOC == true)
+  *   Here we simply return.  We can't afford to block kswapd on the
+  *   journal commit.
+  *
+  * In all cases it is actually safe for us to return without doing anything,
+  * because the inode has been copied into a raw inode buffer in
+  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
+  * knfsd.
+  *
+  * Note that we are absolutely dependent upon all inode dirtiers doing the
+  * right thing: they *must* call mark_inode_dirty() after dirtying info in
+  * which we are interested.
+  *
+  * It would be a bug for them to not do this.  The code:
+  *
+  *    mark_inode_dirty(inode)
+  *    stuff();
+  *    inode->i_size = expr;
+  *
+  * is in error because a kswapd-driven write_inode() could occur while
+  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
+  * will no longer be on the superblock's dirty inode list.
+  */
+ void ext3_write_inode(struct inode *inode, int wait)
+ {
+       if (current->flags & PF_MEMALLOC)
+               return;
+
+       if (ext3_journal_current_handle()) {
+               jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+               return;
+       }
+
+       if (!wait)
+               return;
+
+       ext3_force_commit(inode->i_sb);
+ }
+
+ /*
+  * ext3_setattr()
+  *
+  * Called from notify_change.
+  *
+  * We want to trap VFS attempts to truncate the file as soon as
+  * possible.  In particular, we want to make sure that when the VFS
+  * shrinks i_size, we put the inode on the orphan list and modify
+  * i_disksize immediately, so that during the subsequent flushing of
+  * dirty pages and freeing of disk blocks, we can guarantee that any
+  * commit will leave the blocks being flushed in an unused state on
+  * disk.  (On recovery, the inode will get truncated and the blocks will
+  * be freed, so we have a strong guarantee that no future commit will
+  * leave these blocks visible to the user.)
+  *
+  * This is only needed for regular files.  rmdir() has its own path, and
+  * we can never truncate a directory except on final unlink (at which
+  * point i_nlink is zero so recovery is easy.)
+  *
+  * Called with the BKL.
+  */
+
+ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
+ {
+       struct inode *inode = dentry->d_inode;
+       int error, rc;
+
+       error = inode_change_ok(inode, attr);
+       if (error)
+               return error;
+
+       if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+               handle_t *handle;
+
+               handle = ext3_journal_start(inode, 3);
+               if (IS_ERR(handle)) {
+                       error = PTR_ERR(handle);
+                       goto err_out;
+               }
+
+               error = ext3_orphan_add(handle, inode);
+               inode->u.ext3_i.i_disksize = attr->ia_size;
+               rc = ext3_mark_inode_dirty(handle, inode);
+               if (!error)
+                       error = rc;
+               ext3_journal_stop(handle, inode);
+       }
+
+       inode_setattr(inode, attr);
+
+       /* If inode_setattr's call to ext3_truncate failed to get a
+        * transaction handle at all, we need to clean up the in-core
+        * orphan list manually. */
+       if (inode->i_nlink)
+               ext3_orphan_del(NULL, inode);
+
+ err_out:
+       ext3_std_error(inode->i_sb, error);
+       return error;
+ }
+
+
+ /*
+  * akpm: how many blocks doth make a writepage()?
+  *
+  * With N blocks per page, it may be:
+  * N data blocks
+  * 2 indirect blocks
+  * 2 dindirect blocks
+  * 1 tindirect block
+  * N+5 bitmap blocks (from the above)
+  * N+5 group descriptor summary blocks
+  * 1 inode block
+  * 1 superblock.
+  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
+  *
+  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
+  *
+  * With ordered or writeback data it's the same, less the N data blocks.
+  *
+  * If the inode's direct blocks can hold an integral number of pages then a
+  * page cannot straddle two indirect blocks, and we can only touch one indirect
+  * and dindirect block, and the "5" above becomes "3".
+  *
+  * This still overestimates under most circumstances.  If we were to pass the
+  * start and end offsets in here as well we could do block_to_path() on each
+  * block and work out the exact number of indirects which are touched.  Pah.
+  */
+
+ int ext3_writepage_trans_blocks(struct inode *inode)
+ {
+       int bpp = ext3_journal_blocks_per_page(inode);
+       int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+       int ret;
+
+       if (ext3_should_journal_data(inode))
+               ret = 3 * (bpp + indirects) + 2;
+       else
+               ret = 2 * (bpp + indirects) + 2;
+
+ #ifdef CONFIG_QUOTA
+       ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
+ #endif
+
+       return ret;
+ }
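+
+ /*
+  * Plugging in some example numbers: on a 1K-block filesystem with 4K
+  * pages, bpp = 4 and EXT3_NDIR_BLOCKS (12) is a multiple of bpp, so
+  * indirects = 3.  Full data journaling then reserves 3 * (4 + 3) + 2 =
+  * 23 blocks per writepage, and ordered/writeback mode reserves
+  * 2 * (4 + 3) + 2 = 16, plus 2 * EXT3_SINGLEDATA_TRANS_BLOCKS when
+  * quota is configured in.
+  */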
+
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+                    struct inode *inode,
+                    struct ext3_iloc *iloc)
+ {
+       int err = 0;
+
+       if (handle) {
+               /* the do_update_inode consumes one bh->b_count */
+               atomic_inc(&iloc->bh->b_count);
+               err = ext3_do_update_inode(handle, inode, iloc);
+               /* ext3_do_update_inode() does journal_dirty_metadata */
+               brelse(iloc->bh);
+       } else {
+               printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n");
+       }
+       return err;
+ }
+
+ /*
+  * On success, we end up with an outstanding reference count against
+  * iloc->bh.  This _must_ be cleaned up later.
+  */
+
+ int
+ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+                        struct ext3_iloc *iloc)
+ {
+       int err = 0;
+       if (handle) {
+               err = ext3_get_inode_loc(inode, iloc);
+               if (!err) {
+                       BUFFER_TRACE(iloc->bh, "get_write_access");
+                       err = ext3_journal_get_write_access(handle, iloc->bh);
+                       if (err) {
+                               brelse(iloc->bh);
+                               iloc->bh = NULL;
+                       }
+               }
+       }
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+
+ /*
+  * akpm: What we do here is to mark the in-core inode as clean
+  * with respect to inode dirtiness (it may still be data-dirty).
+  * This means that the in-core inode may be reaped by prune_icache
+  * without having to perform any I/O.  This is a very good thing,
+  * because *any* task may call prune_icache - even ones which
+  * have a transaction open against a different journal.
+  *
+  * Is this cheating?  Not really.  Sure, we haven't written the
+  * inode out, but prune_icache isn't a user-visible syncing function.
+  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+  * we start and wait on commits.
+  *
+  * Is this efficient/effective?  Well, we're being nice to the system
+  * by cleaning up our inodes proactively so they can be reaped
+  * without I/O.  But we are potentially leaving up to five seconds'
+  * worth of inodes floating about which prune_icache wants us to
+  * write out.  One way to fix that would be to get prune_icache()
+  * to do a write_super() to free up some memory.  That would have the
+  * desired effect.
+  */
+ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
+ {
+       struct ext3_iloc iloc;
+       int err;
+
+       err = ext3_reserve_inode_write(handle, inode, &iloc);
+       if (!err)
+               err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+       return err;
+ }
+
+ /*
+  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
+  *
+  * We're really interested in the case where a file is being extended.
+  * i_size has been changed by generic_commit_write() and we thus need
+  * to include the updated inode in the current transaction.
+  *
+  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+  * are allocated to the file.
+  *
+  * If the inode is marked synchronous, we don't honour that here - doing
+  * so would cause a commit on atime updates, which we don't bother doing.
+  * We handle synchronous inodes at the highest possible level.
+  */
+ void ext3_dirty_inode(struct inode *inode)
+ {
+       handle_t *current_handle = ext3_journal_current_handle();
+       handle_t *handle;
+
+       lock_kernel();
+       handle = ext3_journal_start(inode, 1);
+       if (IS_ERR(handle))
+               goto out;
+       if (current_handle &&
+               current_handle->h_transaction != handle->h_transaction) {
+               /* This task has a transaction open against a different fs */
+               printk(KERN_EMERG __FUNCTION__": transactions do not match!\n");
+       } else {
+               jbd_debug(5, "marking dirty.  outer handle=%p\n",
+                               current_handle);
+               ext3_mark_inode_dirty(handle, inode);
+       }
+       ext3_journal_stop(handle, inode);
+ out:
+       unlock_kernel();
+ }
+
+ #ifdef AKPM
+ /*
+  * Bind an inode's backing buffer_head into this transaction, to prevent
+  * it from being flushed to disk early.  Unlike
+  * ext3_reserve_inode_write, this leaves behind no bh reference and
+  * returns no iloc structure, so the caller needs to repeat the iloc
+  * lookup to mark the inode dirty later.
+  */
+ static inline int
+ ext3_pin_inode(handle_t *handle, struct inode *inode)
+ {
+       struct ext3_iloc iloc;
+
+       int err = 0;
+       if (handle) {
+               err = ext3_get_inode_loc(inode, &iloc);
+               if (!err) {
+                       BUFFER_TRACE(iloc.bh, "get_write_access");
+                       err = journal_get_write_access(handle, iloc.bh);
+                       if (!err)
+                               err = ext3_journal_dirty_metadata(handle,
+                                                                 iloc.bh);
+                       brelse(iloc.bh);
+               }
+       }
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+ #endif
+
+ int ext3_change_inode_journal_flag(struct inode *inode, int val)
+ {
+       journal_t *journal;
+       handle_t *handle;
+       int err;
+
+       /*
+        * We have to be very careful here: changing a data block's
+        * journaling status dynamically is dangerous.  If we write a
+        * data block to the journal, change the status and then delete
+        * that block, we risk forgetting to revoke the old log record
+        * from the journal and so a subsequent replay can corrupt data.
+        * So, first we make sure that the journal is empty and that
+        * nobody is changing anything.
+        */
+
+       journal = EXT3_JOURNAL(inode);
+       if (is_journal_aborted(journal) || IS_RDONLY(inode))
+               return -EROFS;
+
+       journal_lock_updates(journal);
+       journal_flush(journal);
+
+       /*
+        * OK, there are no updates running now, and all cached data is
+        * synced to disk.  We are now in a completely consistent state
+        * which doesn't have anything in the journal, and we know that
+        * no filesystem updates are running, so it is safe to modify
+        * the inode's in-core data-journaling state flag now.
+        */
+
+       if (val)
+               inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
+       else
+               inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
+
+       journal_unlock_updates(journal);
+
+       /* Finally we can mark the inode as dirty. */
+
+       handle = ext3_journal_start(inode, 1);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       err = ext3_mark_inode_dirty(handle, inode);
+       handle->h_sync = 1;
+       ext3_journal_stop(handle, inode);
+       ext3_std_error(inode->i_sb, err);
+
+       return err;
+ }
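+
+ /*
+  * This is reached via EXT3_IOC_SETFLAGS (fs/ext3/ioctl.c) whenever
+  * EXT3_JOURNAL_DATA_FL changes; from user space that corresponds to
+  * toggling the 'j' attribute with a sufficiently recent chattr, e.g.
+  * "chattr +j file" or "chattr -j file".
+  */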
+
+
+ /*
+  * ext3_aops_journal_start().
+  *
+  * <This function died, but the comment lives on>
+  *
+  * We need to take the inode semaphore *outside* the
+  * journal_start/journal_stop.  Otherwise, a different task could do a
+  * wait_for_commit() while holding ->i_sem, which deadlocks.  The rule
+  * is: transaction open/closes are considered to be a locking operation
+  * and they nest *inside* ->i_sem.
+  * ----------------------------------------------------------------------------
+  * Possible problem:
+  *    ext3_file_write()
+  *    -> generic_file_write()
+  *       -> __alloc_pages()
+  *          -> page_launder()
+  *             -> ext3_writepage()
+  *
+  * And the writepage can be on a different fs while we have a
+  * transaction open against this one!  Bad.
+  *
+  * I tried making the task PF_MEMALLOC here, but that simply results in
+  * 0-order allocation failures passed back to generic_file_write().
+  * Instead, we rely on the reentrancy protection in ext3_writepage().
+  * ----------------------------------------------------------------------------
+  * When we do the journal_start() here we don't really need to reserve
+  * any blocks - we won't need any until we hit ext3_prepare_write(),
+  * which does all the needed journal extending.  However!  There is a
+  * problem with quotas:
+  *
+  * Thread 1:
+  * sys_sync
+  * ->sync_dquots
+  *   ->commit_dquot
+  *     ->lock_dquot
+  *     ->write_dquot
+  *       ->ext3_file_write
+  *         ->journal_start
+  *         ->ext3_prepare_write
+  *           ->journal_extend
+  *           ->journal_start
+  * Thread 2:
+  * ext3_create                (for example)
+  * ->ext3_new_inode
+  *   ->dquot_initialize
+  *     ->lock_dquot
+  *
+  * Deadlock.  Thread 1's journal_start blocks because thread 2 has a
+  * transaction open.  Thread 2's transaction will never close because
+  * thread 2 is stuck waiting for the dquot lock.
+  *
+  * So.  We must ensure that thread 1 *never* needs to extend the journal
+  * for quota writes.  We do that by reserving enough journal blocks
+  * here, in ext3_aops_journal_start(), to ensure that the forthcoming "see if we
+  * need to extend" test in ext3_prepare_write() succeeds.
+  */
+
+
+ MODULE_LICENSE("GPL");
diff -rc2P linux/fs/ext3/ioctl.c linux-2.4.13/fs/ext3/ioctl.c
*** linux/fs/ext3/ioctl.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/ioctl.c        Fri Nov  9 17:03:13 2001
***************
*** 0 ****
--- 1,176 ----
+ /*
+  * linux/fs/ext3/ioctl.c
+  *
+  * Copyright (C) 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/sched.h>
+ #include <asm/uaccess.h>
+
+
+ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+               unsigned long arg)
+ {
+       unsigned int flags;
+
+       ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+       switch (cmd) {
+       case EXT3_IOC_GETFLAGS:
+               flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
+               return put_user(flags, (int *) arg);
+       case EXT3_IOC_SETFLAGS: {
+               handle_t *handle = NULL;
+               int err;
+               struct ext3_iloc iloc;
+               unsigned int oldflags;
+               unsigned int jflag;
+
+               if (IS_RDONLY(inode))
+                       return -EROFS;
+
+               if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+                       return -EPERM;
+
+               if (get_user(flags, (int *) arg))
+                       return -EFAULT;
+
+               oldflags = inode->u.ext3_i.i_flags;
+
+               /* The JOURNAL_DATA flag is modifiable only by root */
+               jflag = flags & EXT3_JOURNAL_DATA_FL;
+
+               /*
+                * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+                * the relevant capability.
+                *
+                * This test looks nicer. Thanks to Pauline Middelink
+                */
+               if ((flags ^ oldflags) & (EXT3_APPEND_FL |
+                               EXT3_IMMUTABLE_FILE_FL |
+                               EXT3_IMMUTABLE_LINK_FL)) {
+                       if (!capable(CAP_LINUX_IMMUTABLE))
+                               return -EPERM;
+               }
+
+               /*
+                * The JOURNAL_DATA flag can only be changed by
+                * the relevant capability.
+                */
+               if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
+                       if (!capable(CAP_SYS_RESOURCE))
+                               return -EPERM;
+               }
+
+               handle = ext3_journal_start(inode, 1);
+               if (IS_ERR(handle))
+                       return PTR_ERR(handle);
+               if (IS_SYNC(inode))
+                       handle->h_sync = 1;
+               err = ext3_reserve_inode_write(handle, inode, &iloc);
+               if (err)
+                       goto flags_err;
+
+               flags = flags & EXT3_FL_USER_MODIFIABLE;
+               flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
+               inode->u.ext3_i.i_flags = flags;
+
+               if (flags & EXT3_SYNC_FL)
+                       inode->i_flags |= S_SYNC;
+               else
+                       inode->i_flags &= ~S_SYNC;
+               if (flags & EXT3_APPEND_FL)
+                       inode->i_flags |= S_APPEND;
+               else
+                       inode->i_flags &= ~S_APPEND;
+               if (flags & EXT3_IMMUTABLE_FILE_FL)
+                       inode->i_flags |= S_IMMUTABLE_FILE;
+               else
+                       inode->i_flags &= ~S_IMMUTABLE_FILE;
+               if (flags & EXT3_IMMUTABLE_LINK_FL)
+                       inode->i_flags |= S_IMMUTABLE_LINK;
+               else
+                       inode->i_flags &= ~S_IMMUTABLE_LINK;
+
+               if (flags & EXT3_NOATIME_FL)
+                       inode->i_flags |= S_NOATIME;
+               else
+                       inode->i_flags &= ~S_NOATIME;
+               inode->i_ctime = CURRENT_TIME;
+
+               err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ flags_err:
+               ext3_journal_stop(handle, inode);
+               if (err)
+                       return err;
+
+               if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
+                       err = ext3_change_inode_journal_flag(inode, jflag);
+               return err;
+       }
+       case EXT3_IOC_GETVERSION:
+       case EXT3_IOC_GETVERSION_OLD:
+               return put_user(inode->i_generation, (int *) arg);
+       case EXT3_IOC_SETVERSION:
+       case EXT3_IOC_SETVERSION_OLD: {
+               handle_t *handle;
+               struct ext3_iloc iloc;
+               __u32 generation;
+               int err;
+
+               if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+                       return -EPERM;
+               if (IS_RDONLY(inode))
+                       return -EROFS;
+               if (get_user(generation, (int *) arg))
+                       return -EFAULT;
+
+               handle = ext3_journal_start(inode, 1);
+               if (IS_ERR(handle))
+                       return PTR_ERR(handle);
+               err = ext3_reserve_inode_write(handle, inode, &iloc);
+               if (err == 0) {
+                       inode->i_ctime = CURRENT_TIME;
+                       inode->i_generation = generation;
+                       err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+               }
+               /* stop the handle even on error, or it would be leaked */
+               ext3_journal_stop(handle, inode);
+               return err;
+       }
+ #ifdef CONFIG_JBD_DEBUG
+       case EXT3_IOC_WAIT_FOR_READONLY:
+               /*
+                * This is racy - by the time we're woken up and running,
+                * the superblock could be released.  And the module could
+                * have been unloaded.  So sue me.
+                *
+                * Returns 1 if it slept, else zero.
+                */
+               {
+                       struct super_block *sb = inode->i_sb;
+                       DECLARE_WAITQUEUE(wait, current);
+                       int ret = 0;
+
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+                       if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) {
+                               schedule();
+                               ret = 1;
+                       }
+                       remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+                       return ret;
+               }
+ #endif
+       default:
+               return -ENOTTY;
+       }
+ }
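+
+ /*
+  * For illustration, a minimal user-space sketch of driving the flags
+  * ioctls above.  It assumes the ext2 ioctl numbers and flag names
+  * from <linux/ext2_fs.h>, which ext3 is believed to share:
+  *
+  *    int fd = open("/mnt/somefile", O_RDONLY);
+  *    int flags;
+  *
+  *    if (fd >= 0 && ioctl(fd, EXT2_IOC_GETFLAGS, &flags) == 0) {
+  *            flags |= EXT2_NOATIME_FL;
+  *            ioctl(fd, EXT2_IOC_SETFLAGS, &flags);
+  *    }
+  *
+  * Setting EXT2_NOATIME_FL this way ends up in the S_NOATIME case of
+  * EXT3_IOC_SETFLAGS above.
+  */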
diff -rc2P linux/fs/ext3/namei.c linux-2.4.13/fs/ext3/namei.c
*** linux/fs/ext3/namei.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/namei.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1125 ----
+ /*
+  *  linux/fs/ext3/namei.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/namei.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  *  Directory entry file type support and forward compatibility hooks
+  *    for B-tree directories by Theodore Ts'o ([email protected]), 1998
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+
+ /*
+  * define how far ahead to read directories while searching them.
+  */
+ #define NAMEI_RA_CHUNKS  2
+ #define NAMEI_RA_BLOCKS  4
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
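+
+ /*
+  * With the values above, ext3_find_entry() keeps up to NAMEI_RA_SIZE
+  * (2 * 4 = 8) directory blocks in flight in its readahead buffer.
+  */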
+
+ /*
+  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+  *
+  * `len <= EXT3_NAME_LEN' is guaranteed by caller.
+  * `de != NULL' is guaranteed by caller.
+  */
+ static inline int ext3_match (int len, const char * const name,
+                             struct ext3_dir_entry_2 * de)
+ {
+       if (len != de->name_len)
+               return 0;
+       if (!de->inode)
+               return 0;
+       return !memcmp(name, de->name, len);
+ }
+
+ /*
+  * Returns 0 if not found, -1 on failure, and 1 on success
+  */
+ static inline int search_dirblock(struct buffer_head * bh,
+                                 struct inode *dir,
+                                 struct dentry *dentry,
+                                 unsigned long offset,
+                                 struct ext3_dir_entry_2 ** res_dir)
+ {
+       struct ext3_dir_entry_2 * de;
+       char * dlimit;
+       int de_len;
+       const char *name = dentry->d_name.name;
+       int namelen = dentry->d_name.len;
+
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       dlimit = bh->b_data + dir->i_sb->s_blocksize;
+       while ((char *) de < dlimit) {
+               /* this code is executed quadratically often */
+               /* do minimal checking `by hand' */
+
+               if ((char *) de + namelen <= dlimit &&
+                   ext3_match (namelen, name, de)) {
+                       /* found a match - just to be sure, do a full check */
+                       if (!ext3_check_dir_entry("ext3_find_entry",
+                                                 dir, de, bh, offset))
+                               return -1;
+                       *res_dir = de;
+                       return 1;
+               }
+               /* prevent looping on a bad block */
+               de_len = le16_to_cpu(de->rec_len);
+               if (de_len <= 0)
+                       return -1;
+               offset += de_len;
+               de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
+       }
+       return 0;
+ }
+
+ /*
+  *    ext3_find_entry()
+  *
+  * finds an entry in the specified directory with the wanted name. It
+  * returns the cache buffer in which the entry was found, and the entry
+  * itself (as a parameter - res_dir). It does NOT read the inode of the
+  * entry - you'll have to do that yourself if you want to.
+  *
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+                                       struct ext3_dir_entry_2 ** res_dir)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh_use[NAMEI_RA_SIZE];
+       struct buffer_head * bh, *ret = NULL;
+       unsigned long start, block, b;
+       int ra_max = 0;         /* Number of bh's in the readahead
+                                  buffer, bh_use[] */
+       int ra_ptr = 0;         /* Current index into readahead
+                                  buffer */
+       int num = 0;
+       int nblocks, i, err;
+       struct inode *dir = dentry->d_parent->d_inode;
+
+       *res_dir = NULL;
+       sb = dir->i_sb;
+
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+       start = dir->u.ext3_i.i_dir_start_lookup;
+       if (start >= nblocks)
+               start = 0;
+       block = start;
+ restart:
+       do {
+               /*
+                * We deal with the read-ahead logic here.
+                */
+               if (ra_ptr >= ra_max) {
+                       /* Refill the readahead buffer */
+                       ra_ptr = 0;
+                       b = block;
+                       for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+                               /*
+                                * Terminate if we reach the end of the
+                                * directory and must wrap, or if our
+                                * search has finished at this block.
+                                */
+                               if (b >= nblocks || (num && block == start)) {
+                                       bh_use[ra_max] = NULL;
+                                       break;
+                               }
+                               num++;
+                               bh = ext3_getblk(NULL, dir, b++, 0, &err);
+                               bh_use[ra_max] = bh;
+                               if (bh)
+                                       ll_rw_block(READ, 1, &bh);
+                       }
+               }
+               if ((bh = bh_use[ra_ptr++]) == NULL)
+                       goto next;
+               wait_on_buffer(bh);
+               if (!buffer_uptodate(bh)) {
+                       /* read error, skip block & hope for the best */
+                       brelse(bh);
+                       goto next;
+               }
+               i = search_dirblock(bh, dir, dentry,
+                           block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+               if (i == 1) {
+                       dir->u.ext3_i.i_dir_start_lookup = block;
+                       ret = bh;
+                       goto cleanup_and_exit;
+               } else {
+                       brelse(bh);
+                       if (i < 0)
+                               goto cleanup_and_exit;
+               }
+       next:
+               if (++block >= nblocks)
+                       block = 0;
+       } while (block != start);
+
+       /*
+        * If the directory has grown while we were searching, then
+        * search the last part of the directory before giving up.
+        */
+       block = nblocks;
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+       if (block < nblocks) {
+               start = 0;
+               goto restart;
+       }
+
+ cleanup_and_exit:
+       /* Clean up the read-ahead blocks */
+       for (; ra_ptr < ra_max; ra_ptr++)
+               brelse (bh_use[ra_ptr]);
+       return ret;
+ }
+
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+       struct inode * inode;
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
+
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       bh = ext3_find_entry(dentry, &de);
+       inode = NULL;
+       if (bh) {
+               unsigned long ino = le32_to_cpu(de->inode);
+               brelse (bh);
+               inode = iget(dir->i_sb, ino);
+
+               if (!inode)
+                       return ERR_PTR(-EACCES);
+       }
+       d_add(dentry, inode);
+       return NULL;
+ }
+
+ #define S_SHIFT 12
+ static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
+       [S_IFREG >> S_SHIFT]    EXT3_FT_REG_FILE,
+       [S_IFDIR >> S_SHIFT]    EXT3_FT_DIR,
+       [S_IFCHR >> S_SHIFT]    EXT3_FT_CHRDEV,
+       [S_IFBLK >> S_SHIFT]    EXT3_FT_BLKDEV,
+       [S_IFIFO >> S_SHIFT]    EXT3_FT_FIFO,
+       [S_IFSOCK >> S_SHIFT]   EXT3_FT_SOCK,
+       [S_IFLNK >> S_SHIFT]    EXT3_FT_SYMLINK,
+ };
+
+ static inline void ext3_set_de_type(struct super_block *sb,
+                               struct ext3_dir_entry_2 *de,
+                               umode_t mode) {
+       if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
+               de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
+
+ /*
+  *    ext3_add_entry()
+  *
+  * adds a file entry to the specified directory, using the same
+  * semantics as ext3_find_entry(). It returns 0 on success, or a
+  * negative errno (for example -EEXIST or -ENOSPC) on failure.
+  *
+  * NOTE!! The inode part of 'de' is left at 0 - which means you
+  * may not sleep between calling this and putting something into
+  * the entry, as someone else might have used it while you slept.
+  */
+
+ /*
+  * AKPM: the journalling code here looks wrong on the error paths
+  */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+       struct inode *inode)
+ {
+       struct inode *dir = dentry->d_parent->d_inode;
+       const char *name = dentry->d_name.name;
+       int namelen = dentry->d_name.len;
+       unsigned long offset;
+       unsigned short rec_len;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de, * de1;
+       struct super_block * sb;
+       int     retval;
+
+       sb = dir->i_sb;
+
+       if (!namelen)
+               return -EINVAL;
+       bh = ext3_bread (handle, dir, 0, 0, &retval);
+       if (!bh)
+               return retval;
+       rec_len = EXT3_DIR_REC_LEN(namelen);
+       offset = 0;
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       while (1) {
+               if ((char *)de >= sb->s_blocksize + bh->b_data) {
+                       brelse (bh);
+                       bh = ext3_bread (handle, dir,
+                               offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+                       if (!bh)
+                               return retval;
+                       if (dir->i_size <= offset) {
+                               if (dir->i_size == 0) {
+                                       brelse(bh);
+                                       return -ENOENT;
+                               }
+
+                               ext3_debug ("creating next block\n");
+
+                               BUFFER_TRACE(bh, "get_write_access");
+                               ext3_journal_get_write_access(handle, bh);
+                               de = (struct ext3_dir_entry_2 *) bh->b_data;
+                               de->inode = 0;
+                               de->rec_len = cpu_to_le16(sb->s_blocksize);
+                               dir->u.ext3_i.i_disksize =
+                                       dir->i_size = offset + sb->s_blocksize;
+                               dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+                               ext3_mark_inode_dirty(handle, dir);
+                       } else {
+
+                               ext3_debug ("skipping to next block\n");
+
+                               de = (struct ext3_dir_entry_2 *) bh->b_data;
+                       }
+               }
+               if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+                                          offset)) {
+                       brelse (bh);
+                       return -ENOENT;
+               }
+               if (ext3_match (namelen, name, de)) {
+                               brelse (bh);
+                               return -EEXIST;
+               }
+               if ((le32_to_cpu(de->inode) == 0 &&
+                               le16_to_cpu(de->rec_len) >= rec_len) ||
+                   (le16_to_cpu(de->rec_len) >=
+                               EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+                       BUFFER_TRACE(bh, "get_write_access");
+                       ext3_journal_get_write_access(handle, bh);
+                       /* By now the buffer is marked for journaling */
+                       offset += le16_to_cpu(de->rec_len);
+                       if (le32_to_cpu(de->inode)) {
+                               de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+                                       EXT3_DIR_REC_LEN(de->name_len));
+                               de1->rec_len =
+                                       cpu_to_le16(le16_to_cpu(de->rec_len) -
+                                       EXT3_DIR_REC_LEN(de->name_len));
+                               de->rec_len = cpu_to_le16(
+                                               EXT3_DIR_REC_LEN(de->name_len));
+                               de = de1;
+                       }
+                       de->file_type = EXT3_FT_UNKNOWN;
+                       if (inode) {
+                               de->inode = cpu_to_le32(inode->i_ino);
+                               ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+                       } else
+                               de->inode = 0;
+                       de->name_len = namelen;
+                       memcpy (de->name, name, namelen);
+                       /*
+                        * XXX shouldn't update any times until successful
+                        * completion of syscall, but too many callers depend
+                        * on this.
+                        *
+                        * XXX similarly, too many callers depend on
+                        * ext3_new_inode() setting the times, but error
+                        * recovery deletes the inode, so the worst that can
+                        * happen is that the times are slightly out of date
+                        * and/or different from the directory change time.
+                        */
+                       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+                       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+                       ext3_mark_inode_dirty(handle, dir);
+                       dir->i_version = ++event;
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       ext3_journal_dirty_metadata(handle, bh);
+                       brelse(bh);
+                       return 0;
+               }
+               offset += le16_to_cpu(de->rec_len);
+               de = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       }
+       brelse (bh);
+       return -ENOSPC;
+ }
+
+ /*
+  * ext3_delete_entry deletes a directory entry by merging it with the
+  * previous entry
+  */
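+ /*
+  * Editorial sketch: the victim record is not erased; the previous
+  * record absorbs it by growing its rec_len, so directory scans step
+  * straight over the dead space:
+  *
+  *    before:    [ A rec_len=16 ][ B rec_len=16 ][ C ... ]
+  *    delete B:  [ A rec_len=32                 ][ C ... ]
+  *
+  * If the victim is the first record in the block there is no previous
+  * record to grow, so its inode field is zeroed instead (the pde ==
+  * NULL case below).
+  */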
+ static int ext3_delete_entry (handle_t *handle,
+                             struct inode * dir,
+                             struct ext3_dir_entry_2 * de_del,
+                             struct buffer_head * bh)
+ {
+       struct ext3_dir_entry_2 * de, * pde;
+       int i;
+
+       i = 0;
+       pde = NULL;
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       while (i < bh->b_size) {
+               if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
+                       return -EIO;
+               if (de == de_del)  {
+                       BUFFER_TRACE(bh, "get_write_access");
+                       ext3_journal_get_write_access(handle, bh);
+                       if (pde)
+                               pde->rec_len =
+                                       cpu_to_le16(le16_to_cpu(pde->rec_len) +
+                                                   le16_to_cpu(de->rec_len));
+                       else
+                               de->inode = 0;
+                       dir->i_version = ++event;
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       ext3_journal_dirty_metadata(handle, bh);
+                       return 0;
+               }
+               i += le16_to_cpu(de->rec_len);
+               pde = de;
+               de = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       }
+       return -ENOENT;
+ }
+
+ /*
+  * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
+  * do not perform it in these functions.  We perform it at the call site,
+  * if it is needed.
+  */
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+       inode->i_nlink++;
+ }
+
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+       inode->i_nlink--;
+ }
+
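+ /*
+  * Helper for create/mknod/symlink: add the new inode under the dentry
+  * and instantiate the dentry on success; on failure, drop the link
+  * count and iput() the inode so that it is deleted rather than leaked.
+  */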
+ static int ext3_add_nondir(handle_t *handle,
+               struct dentry *dentry, struct inode *inode)
+ {
+       int err = ext3_add_entry(handle, dentry, inode);
+       if (!err) {
+               d_instantiate(dentry, inode);
+               return 0;
+       }
+       ext3_dec_count(handle, inode);
+       iput(inode);
+       return err;
+ }
+
+ /*
+  * By the time this is called, we already have created
+  * the directory cache entry for the new file, but it
+  * is so far negative - it has no inode.
+  *
+  * If the create succeeds, we fill in the inode information
+  * with d_instantiate().
+  */
+ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode)
+ {
+       handle_t *handle;
+       struct inode * inode;
+       int err;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, mode);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               ext3_mark_inode_dirty(handle, inode);
+               err = ext3_add_nondir(handle, dentry, inode);
+       }
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
+
+ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
+                       int mode, int rdev)
+ {
+       handle_t *handle;
+       struct inode *inode;
+       int err;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, mode);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               init_special_inode(inode, mode, rdev);
+               ext3_mark_inode_dirty(handle, inode);
+               err = ext3_add_nondir(handle, dentry, inode);
+       }
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
+
+ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+ {
+       handle_t *handle;
+       struct inode * inode;
+       struct buffer_head * dir_block;
+       struct ext3_dir_entry_2 * de;
+       int err;
+
+       if (dir->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, S_IFDIR);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+       inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
+       inode->i_blocks = 0;
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+               inode->i_nlink--; /* is this nlink == 0? */
+               ext3_mark_inode_dirty(handle, inode);
+               iput (inode);
+               goto out_stop;
+       }
+       BUFFER_TRACE(dir_block, "get_write_access");
+       ext3_journal_get_write_access(handle, dir_block);
+       de = (struct ext3_dir_entry_2 *) dir_block->b_data;
+       de->inode = cpu_to_le32(inode->i_ino);
+       de->name_len = 1;
+       de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
+       strcpy (de->name, ".");
+       ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+       de = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       de->inode = cpu_to_le32(dir->i_ino);
+       de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
+       de->name_len = 2;
+       strcpy (de->name, "..");
+       ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+       inode->i_nlink = 2;
+       BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+       ext3_journal_dirty_metadata(handle, dir_block);
+       brelse (dir_block);
+       inode->i_mode = S_IFDIR | mode;
+       if (dir->i_mode & S_ISGID)
+               inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+       if (err)
+               goto out_no_entry;
+       dir->i_nlink++;
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+       d_instantiate(dentry, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+
+ out_no_entry:
+       inode->i_nlink = 0;
+       ext3_mark_inode_dirty(handle, inode);
+       iput (inode);
+       goto out_stop;
+ }
+
+ /*
+  * routine to check that the specified directory is empty (for rmdir)
+  */
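+ /*
+  * Editorial sketch: block 0 of a well-formed directory starts with
+  *
+  *    [ "." inode=self rec_len=12 ][ ".." inode=parent rec_len=rest ]
+  *
+  * (EXT3_DIR_REC_LEN(1) == 12, as laid out by ext3_mkdir() above), so
+  * empty_dir() verifies those two records and then scans the rest of
+  * the directory for any record with a nonzero inode field.
+  */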
+ static int empty_dir (struct inode * inode)
+ {
+       unsigned long offset;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de, * de1;
+       struct super_block * sb;
+       int err;
+
+       sb = inode->i_sb;
+       if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
+           !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
+               ext3_warning (inode->i_sb, "empty_dir",
+                             "bad directory (dir #%lu) - no data block",
+                             inode->i_ino);
+               return 1;
+       }
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       de1 = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       if (le32_to_cpu(de->inode) != inode->i_ino ||
+                       !le32_to_cpu(de1->inode) ||
+                       strcmp (".", de->name) ||
+                       strcmp ("..", de1->name)) {
+               ext3_warning (inode->i_sb, "empty_dir",
+                             "bad directory (dir #%lu) - no `.' or `..'",
+                             inode->i_ino);
+               brelse (bh);
+               return 1;
+       }
+       offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+       de = (struct ext3_dir_entry_2 *)
+                       ((char *) de1 + le16_to_cpu(de1->rec_len));
+       while (offset < inode->i_size) {
+               if (!bh ||
+                       (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+                       brelse (bh);
+                       bh = ext3_bread (NULL, inode,
+                               offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
+                       if (!bh) {
+ #if 0
+                               ext3_error (sb, "empty_dir",
+                               "directory #%lu contains a hole at offset %lu",
+                                       inode->i_ino, offset);
+ #endif
+                               offset += sb->s_blocksize;
+                               continue;
+                       }
+                       de = (struct ext3_dir_entry_2 *) bh->b_data;
+               }
+               if (!ext3_check_dir_entry ("empty_dir", inode, de, bh,
+                                          offset)) {
+                       brelse (bh);
+                       return 1;
+               }
+               if (le32_to_cpu(de->inode)) {
+                       brelse (bh);
+                       return 0;
+               }
+               offset += le16_to_cpu(de->rec_len);
+               de = (struct ext3_dir_entry_2 *)
+                               ((char *) de + le16_to_cpu(de->rec_len));
+       }
+       brelse (bh);
+       return 1;
+ }
+
+ /* ext3_orphan_add() links an unlinked or truncated inode into a list of
+  * such inodes, starting at the superblock, in case we crash before the
+  * file is closed/deleted, or in case the inode truncate spans multiple
+  * transactions and the last transaction is not recovered after a crash.
+  *
+  * At filesystem recovery time, we walk this list deleting unlinked
+  * inodes and truncating linked inodes in ext3_orphan_cleanup().
+  */
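+ /*
+  * Editorial sketch of the resulting on-disk chain:
+  *
+  *    es->s_last_orphan --> inode 14 --> inode 93 --> 0
+  *
+  * where each arrow is a NEXT_ORPHAN() field, i.e. an inode field that
+  * is unused while the inode is live, so the list costs no extra disk
+  * space.  New orphans are pushed onto the head of the chain.
+  */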
+ int ext3_orphan_add(handle_t *handle, struct inode *inode)
+ {
+       struct super_block *sb = inode->i_sb;
+       struct ext3_iloc iloc;
+       int err = 0, rc;
+
+       lock_super(sb);
+       if (!list_empty(&inode->u.ext3_i.i_orphan))
+               goto out_unlock;
+
+       /* Orphan handling is only valid for files with data blocks
+        * being truncated, or files being unlinked. */
+
+       /* @@@ FIXME: Observation from aviro:
+        * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
+        * here (on lock_super()), so race with ext3_link() which might bump
+        * ->i_nlink: say, a character device - not a regular file,
+        * not a directory, not a symlink - with ->i_nlink > 0.
+        */
+       J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+               S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (err)
+               goto out_unlock;
+
+       err = ext3_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto out_unlock;
+
+       /* Insert this inode at the head of the on-disk orphan list... */
+       NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
+       EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+       err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
+       if (!err)
+               err = rc;
+
+       /* Only add to the head of the in-memory list if all the
+        * previous operations succeeded.  If the orphan_add is going to
+        * fail (possibly taking the journal offline), we can't risk
+        * leaving the inode on the orphan list: stray orphan-list
+        * entries can cause panics at unmount time.
+        *
+        * This is safe: on error we're going to ignore the orphan list
+        * anyway on the next recovery. */
+       if (!err)
+               list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
+
+       jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+       jbd_debug(4, "orphan inode %ld will point to %d\n",
+                       inode->i_ino, NEXT_ORPHAN(inode));
+ out_unlock:
+       unlock_super(sb);
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+
+ /*
+  * ext3_orphan_del() removes an unlinked or truncated inode from the list
+  * of such inodes stored on disk, because it is finally being cleaned up.
+  */
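+ /*
+  * Editorial note: the on-disk chain is singly linked, so unlinking an
+  * element needs its predecessor.  The doubly-linked in-memory i_orphan
+  * list supplies it: prev is either the superblock list head (we were
+  * first, so s_last_orphan is repointed) or the previous orphan inode
+  * (whose NEXT_ORPHAN field is repointed).
+  */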
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+       struct list_head *prev;
+       struct ext3_sb_info *sbi;
+       ino_t ino_next;
+       struct ext3_iloc iloc;
+       int err = 0;
+
+       lock_super(inode->i_sb);
+       if (list_empty(&inode->u.ext3_i.i_orphan)) {
+               unlock_super(inode->i_sb);
+               return 0;
+       }
+
+       ino_next = NEXT_ORPHAN(inode);
+       prev = inode->u.ext3_i.i_orphan.prev;
+       sbi = EXT3_SB(inode->i_sb);
+
+       jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+
+       list_del(&inode->u.ext3_i.i_orphan);
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+
+       /* If we're on an error path, we may not have a valid
+        * transaction handle with which to update the orphan list on
+        * disk, but we still need to remove the inode from the linked
+        * list in memory. */
+       if (!handle)
+               goto out;
+
+       err = ext3_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto out_err;
+
+       if (prev == &sbi->s_orphan) {
+               jbd_debug(4, "superblock will point to %ld\n", ino_next);
+               BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, sbi->s_sbh);
+               if (err)
+                       goto out_brelse;
+               sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+               err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+       } else {
+               struct ext3_iloc iloc2;
+               struct inode *i_prev =
+                       list_entry(prev, struct inode, u.ext3_i.i_orphan);
+
+               jbd_debug(4, "orphan inode %ld will point to %ld\n",
+                         i_prev->i_ino, ino_next);
+               err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
+               if (err)
+                       goto out_brelse;
+               NEXT_ORPHAN(i_prev) = ino_next;
+               err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
+       }
+       if (err)
+               goto out_brelse;
+       NEXT_ORPHAN(inode) = 0;
+       err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+       if (err)
+               goto out_brelse;
+
+ out_err:
+       ext3_std_error(inode->i_sb, err);
+ out:
+       unlock_super(inode->i_sb);
+       return err;
+
+ out_brelse:
+       brelse(iloc.bh);
+       goto out_err;
+ }
+
+ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
+ {
+       int retval;
+       struct inode * inode;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
+
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       retval = -ENOENT;
+       bh = ext3_find_entry (dentry, &de);
+       if (!bh)
+               goto end_rmdir;
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = dentry->d_inode;
+       DQUOT_INIT(inode);
+
+       retval = -EIO;
+       if (le32_to_cpu(de->inode) != inode->i_ino)
+               goto end_rmdir;
+
+       retval = -ENOTEMPTY;
+       if (!empty_dir (inode))
+               goto end_rmdir;
+
+       retval = ext3_delete_entry(handle, dir, de, bh);
+       if (retval)
+               goto end_rmdir;
+       if (inode->i_nlink != 2)
+               ext3_warning (inode->i_sb, "ext3_rmdir",
+                             "empty directory has nlink!=2 (%d)",
+                             inode->i_nlink);
+       inode->i_version = ++event;
+       inode->i_nlink = 0;
+       /* There's no need to set i_disksize: the fact that i_nlink is
+        * zero will ensure that the right thing happens during any
+        * recovery. */
+       inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+
+ end_rmdir:
+       ext3_journal_stop(handle, dir);
+       brelse (bh);
+       return retval;
+ }
+
+ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
+ {
+       int retval;
+       struct inode * inode;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
+
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       retval = -ENOENT;
+       bh = ext3_find_entry (dentry, &de);
+       if (!bh)
+               goto end_unlink;
+
+       inode = dentry->d_inode;
+       DQUOT_INIT(inode);
+
+       retval = -EIO;
+       if (le32_to_cpu(de->inode) != inode->i_ino)
+               goto end_unlink;
+
+       if (!inode->i_nlink) {
+               ext3_warning (inode->i_sb, "ext3_unlink",
+                             "Deleting nonexistent file (%lu), %d",
+                             inode->i_ino, inode->i_nlink);
+               inode->i_nlink = 1;
+       }
+       retval = ext3_delete_entry(handle, dir, de, bh);
+       if (retval)
+               goto end_unlink;
+       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+       inode->i_nlink--;
+       if (!inode->i_nlink)
+               ext3_orphan_add(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       inode->i_ctime = dir->i_ctime;
+       retval = 0;
+
+ end_unlink:
+       ext3_journal_stop(handle, dir);
+       brelse (bh);
+       return retval;
+ }
+
+ static int ext3_symlink (struct inode * dir,
+               struct dentry *dentry, const char * symname)
+ {
+       handle_t *handle;
+       struct inode * inode;
+       int l, err;
+
+       l = strlen(symname)+1;
+       if (l > dir->i_sb->s_blocksize)
+               return -ENAMETOOLONG;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+
+       if (l > sizeof (inode->u.ext3_i.i_data)) {
+               inode->i_op = &page_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               /*
+                * block_symlink() calls back into ext3_prepare/commit_write.
+                * We have a transaction open.  All is sweetness.  It also sets
+                * i_size in generic_commit_write().
+                */
+               err = block_symlink(inode, symname, l);
+               if (err)
+                       goto out_no_entry;
+       } else {
+               inode->i_op = &ext3_fast_symlink_inode_operations;
+               memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
+               inode->i_size = l-1;
+       }
+       inode->u.ext3_i.i_disksize = inode->i_size;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_nondir(handle, dentry, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+
+ out_no_entry:
+       ext3_dec_count(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       iput (inode);
+       goto out_stop;
+ }
+
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+       handle_t *handle;
+       struct inode *inode = old_dentry->d_inode;
+       int err;
+
+       if (S_ISDIR(inode->i_mode))
+               return -EPERM;
+
+       if (inode->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode->i_ctime = CURRENT_TIME;
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
+
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_nondir(handle, dentry, inode);
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
+
+ #define PARENT_INO(buffer) \
+       ((struct ext3_dir_entry_2 *) ((char *) buffer + \
+       le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode
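+ /*
+  * Editorial note: the first record in block 0 of a directory is always
+  * ".", so stepping over its rec_len lands on "..", whose inode field
+  * holds the parent directory's inode number.  The value is
+  * little-endian on disk: read it through le32_to_cpu() and store
+  * through cpu_to_le32(), as the rename code below does.
+  */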
+
+ /*
+  * Anybody can rename anything with this: the permission checks are left to the
+  * higher-level routines.
+  */
+ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
+                          struct inode * new_dir,struct dentry *new_dentry)
+ {
+       handle_t *handle;
+       struct inode * old_inode, * new_inode;
+       struct buffer_head * old_bh, * new_bh, * dir_bh;
+       struct ext3_dir_entry_2 * old_de, * new_de;
+       int retval;
+
+       old_bh = new_bh = dir_bh = NULL;
+
+       handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+               handle->h_sync = 1;
+
+       old_bh = ext3_find_entry (old_dentry, &old_de);
+       /*
+        *  The check on the inode number is _not_ there to guard against
+        *  possible IO errors.
+        *  We might rmdir the source, keep it as pwd of some process
+        *  and merrily kill the link to whatever was created under the
+        *  same name. Goodbye sticky bit ;-<
+        */
+       old_inode = old_dentry->d_inode;
+       retval = -ENOENT;
+       if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+               goto end_rename;
+
+       new_inode = new_dentry->d_inode;
+       new_bh = ext3_find_entry (new_dentry, &new_de);
+       if (new_bh) {
+               if (!new_inode) {
+                       brelse (new_bh);
+                       new_bh = NULL;
+               } else {
+                       DQUOT_INIT(new_inode);
+               }
+       }
+       if (S_ISDIR(old_inode->i_mode)) {
+               if (new_inode) {
+                       retval = -ENOTEMPTY;
+                       if (!empty_dir (new_inode))
+                               goto end_rename;
+               }
+               retval = -EIO;
+               dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
+               if (!dir_bh)
+                       goto end_rename;
+               if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+                       goto end_rename;
+               retval = -EMLINK;
+               if (!new_inode && new_dir!=old_dir &&
+                               new_dir->i_nlink >= EXT3_LINK_MAX)
+                       goto end_rename;
+       }
+       if (!new_bh) {
+               retval = ext3_add_entry (handle, new_dentry, old_inode);
+               if (retval)
+                       goto end_rename;
+       } else {
+               BUFFER_TRACE(new_bh, "get write access");
+               BUFFER_TRACE(new_bh, "get_write_access");
+               ext3_journal_get_write_access(handle, new_bh);
+               new_de->inode = cpu_to_le32(old_inode->i_ino);
+               if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+                                             EXT3_FEATURE_INCOMPAT_FILETYPE))
+                       new_de->file_type = old_de->file_type;
+               new_dir->i_version = ++event;
+               BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
+               ext3_journal_dirty_metadata(handle, new_bh);
+               brelse(new_bh);
+               new_bh = NULL;
+       }
+
+       /*
+        * Like most other Unix systems, set the ctime for inodes on a
+        * rename.
+        */
+       old_inode->i_ctime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, old_inode);
+
+       /*
+        * ok, that's it
+        */
+       ext3_delete_entry(handle, old_dir, old_de, old_bh);
+
+       if (new_inode) {
+               new_inode->i_nlink--;
+               new_inode->i_ctime = CURRENT_TIME;
+       }
+       old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+       old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       if (dir_bh) {
+               BUFFER_TRACE(dir_bh, "get_write_access");
+               ext3_journal_get_write_access(handle, dir_bh);
+               PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+               BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+               ext3_journal_dirty_metadata(handle, dir_bh);
+               old_dir->i_nlink--;
+               if (new_inode) {
+                       new_inode->i_nlink--;
+               } else {
+                       new_dir->i_nlink++;
+                       new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+                       ext3_mark_inode_dirty(handle, new_dir);
+               }
+       }
+       ext3_mark_inode_dirty(handle, old_dir);
+       if (new_inode) {
+               ext3_mark_inode_dirty(handle, new_inode);
+               if (!new_inode->i_nlink)
+                       ext3_orphan_add(handle, new_inode);
+       }
+       retval = 0;
+
+ end_rename:
+       brelse (dir_bh);
+       brelse (old_bh);
+       brelse (new_bh);
+       ext3_journal_stop(handle, old_dir);
+       return retval;
+ }
+
+ /*
+  * directories can handle most operations...
+  */
+ struct inode_operations ext3_dir_inode_operations = {
+       create:         ext3_create,            /* BKL held */
+       lookup:         ext3_lookup,            /* BKL held */
+       link:           ext3_link,              /* BKL held */
+       unlink:         ext3_unlink,            /* BKL held */
+       symlink:        ext3_symlink,           /* BKL held */
+       mkdir:          ext3_mkdir,             /* BKL held */
+       rmdir:          ext3_rmdir,             /* BKL held */
+       mknod:          ext3_mknod,             /* BKL held */
+       rename:         ext3_rename,            /* BKL held */
+ };
diff -rc2P linux/fs/ext3/super.c linux-2.4.13/fs/ext3/super.c
*** linux/fs/ext3/super.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/super.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1743 ----
+ /*
+  *  linux/fs/ext3/super.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/inode.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/config.h>
+ #include <linux/module.h>
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+ #include <linux/blkdev.h>
+ #include <linux/smp_lock.h>
+ #include <asm/uaccess.h>
+
+ #ifdef CONFIG_JBD_DEBUG
+ static int ext3_ro_after; /* Make fs read-only after this many jiffies */
+ #endif
+
+ static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
+ static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
+                              int);
+ static void ext3_commit_super (struct super_block * sb,
+                              struct ext3_super_block * es,
+                              int sync);
+ static void ext3_mark_recovery_complete(struct super_block * sb,
+                                       struct ext3_super_block * es);
+ static void ext3_clear_journal_err(struct super_block * sb,
+                                  struct ext3_super_block * es);
+
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Debug code for turning filesystems "read-only" after a specified
+  * amount of time.  This is for crash/recovery testing.
+  */
+
+ static void make_rdonly(kdev_t dev, int *no_write)
+ {
+       if (dev) {
+               printk(KERN_WARNING "Turning device %s read-only\n",
+                      bdevname(dev));
+               *no_write = 0xdead0000 + dev;
+       }
+ }
+
+ static void turn_fs_readonly(unsigned long arg)
+ {
+       struct super_block *sb = (struct super_block *)arg;
+
+       make_rdonly(sb->s_dev, &journal_no_write[0]);
+       make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]);
+       wake_up(&EXT3_SB(sb)->ro_wait_queue);
+ }
+
+ static void setup_ro_after(struct super_block *sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       init_timer(&sbi->turn_ro_timer);
+       if (ext3_ro_after) {
+               printk(KERN_DEBUG "fs will go read-only in %d jiffies\n",
+                      ext3_ro_after);
+               init_waitqueue_head(&sbi->ro_wait_queue);
+               journal_no_write[0] = 0;
+               journal_no_write[1] = 0;
+               sbi->turn_ro_timer.function = turn_fs_readonly;
+               sbi->turn_ro_timer.data = (unsigned long)sb;
+               sbi->turn_ro_timer.expires = jiffies + ext3_ro_after;
+               ext3_ro_after = 0;
+               add_timer(&sbi->turn_ro_timer);
+       }
+ }
+
+ static void clear_ro_after(struct super_block *sb)
+ {
+       del_timer_sync(&EXT3_SB(sb)->turn_ro_timer);
+       journal_no_write[0] = 0;
+       journal_no_write[1] = 0;
+       ext3_ro_after = 0;
+ }
+ #else
+ #define setup_ro_after(sb)    do {} while (0)
+ #define clear_ro_after(sb)    do {} while (0)
+ #endif
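+ /*
+  * Usage sketch (editorial; the device name is illustrative): with
+  * CONFIG_JBD_DEBUG enabled, a mount such as
+  *
+  *    mount -t ext3 -o ro-after=3000 /dev/hda5 /mnt
+  *
+  * arms the timer so the device goes read-only roughly 3000 jiffies
+  * after mount, simulating a crash for recovery testing (see the
+  * "ro-after" case in parse_options() below).
+  */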
+
+
+ static char error_buf[1024];
+
+ /* Determine the appropriate response to ext3_error on a given filesystem */
+
+ static int ext3_error_behaviour(struct super_block *sb)
+ {
+       /* First check for mount-time options */
+       if (test_opt (sb, ERRORS_PANIC))
+               return EXT3_ERRORS_PANIC;
+       if (test_opt (sb, ERRORS_RO))
+               return EXT3_ERRORS_RO;
+       if (test_opt (sb, ERRORS_CONT))
+               return EXT3_ERRORS_CONTINUE;
+
+       /* If no overrides were specified on the mount, then fall back
+        * to the default behaviour set in the filesystem's superblock
+        * on disk. */
+       switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
+       case EXT3_ERRORS_PANIC:
+               return EXT3_ERRORS_PANIC;
+       case EXT3_ERRORS_RO:
+               return EXT3_ERRORS_RO;
+       default:
+               break;
+       }
+       return EXT3_ERRORS_CONTINUE;
+ }
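+ /*
+  * Example (editorial): ext3_error_behaviour() gives the mount option
+  * precedence over the on-disk default.  "mount -o errors=panic"
+  * overrides whatever default was stored in the superblock (e.g. by
+  * "tune2fs -e remount-ro"), and if neither specifies a behaviour we
+  * fall back to EXT3_ERRORS_CONTINUE.
+  */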
+
+ /* Deal with the reporting of failure conditions on a filesystem such as
+  * inconsistencies detected or read IO failures.
+  *
+  * On ext2, we can store the error state of the filesystem in the
+  * superblock.  That is not possible on ext3, because we may have other
+  * write ordering constraints on the superblock which prevent us from
+  * writing it out straight away; and given that the journal is about to
+  * be aborted, we can't rely on the current, or future, transactions to
+  * write out the superblock safely.
+  *
+  * We'll just use the journal_abort() error code to record an error in
+  * the journal instead.  On recovery, the journal will complain about
+  * that error until we've noted it down and cleared it.
+  */
+
+ static void ext3_handle_error(struct super_block *sb)
+ {
+       struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+       EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+       es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+
+       if (sb->s_flags & MS_RDONLY)
+               return;
+
+       if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) {
+               EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+               journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+       }
+
+       if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+               panic ("EXT3-fs (device %s): panic forced after error\n",
+                      bdevname(sb->s_dev));
+
+       if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) {
+               printk (KERN_CRIT "Remounting filesystem read-only\n");
+               sb->s_flags |= MS_RDONLY;
+       }
+
+       ext3_commit_super(sb, es, 1);
+ }
+
+ void ext3_error (struct super_block * sb, const char * function,
+                const char * fmt, ...)
+ {
+       va_list args;
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+
+       printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n",
+               bdevname(sb->s_dev), function, error_buf);
+
+       ext3_handle_error(sb);
+ }
+
+ const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16])
+ {
+       char *errstr = NULL;
+
+       switch (errno) {
+       case -EIO:
+               errstr = "IO failure";
+               break;
+       case -ENOMEM:
+               errstr = "Out of memory";
+               break;
+       case -EROFS:
+               if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
+                       errstr = "Journal has aborted";
+               else
+                       errstr = "Readonly filesystem";
+               break;
+       default:
+               /* If the caller passed in an extra buffer for unknown
+                * errors, textualise them now.  Else we just return
+                * NULL. */
+               if (nbuf) {
+                       /* Check for truncated error codes... */
+                       if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+                               errstr = nbuf;
+               }
+
+               break;
+       }
+
+       return errstr;
+ }
+
+ /* __ext3_std_error decodes expected errors from journaling functions
+  * automatically and invokes the appropriate error response.  */
+
+ void __ext3_std_error (struct super_block * sb, const char * function,
+                      int errno)
+ {
+       char nbuf[16];
+       const char *errstr = ext3_decode_error(sb, errno, nbuf);
+
+       printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
+               bdevname(sb->s_dev), function, errstr);
+
+       ext3_handle_error(sb);
+ }
+
+ /*
+  * ext3_abort is a much stronger failure handler than ext3_error.  The
+  * abort function may be used to deal with unrecoverable failures such
+  * as journal IO errors or ENOMEM at a critical moment in log management.
+  *
+  * We unconditionally force the filesystem into an ABORT|READONLY state,
+  * unless the error response on the fs has been set to panic in which
+  * case we take the easy way out and panic immediately.
+  */
+
+ void ext3_abort (struct super_block * sb, const char * function,
+                const char * fmt, ...)
+ {
+       va_list args;
+
+       printk (KERN_CRIT "ext3_abort called.\n");
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+
+       if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+               panic ("EXT3-fs panic (device %s): %s: %s\n",
+                      bdevname(sb->s_dev), function, error_buf);
+
+       printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n",
+               bdevname(sb->s_dev), function, error_buf);
+
+       if (sb->s_flags & MS_RDONLY)
+               return;
+
+       printk (KERN_CRIT "Remounting filesystem read-only\n");
+       sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+       sb->s_flags |= MS_RDONLY;
+       sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
+       journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+ }
+
+ /* Deal with the reporting of failure conditions while running, such as
+  * inconsistencies in operation or invalid system states.
+  *
+  * Use ext3_error() for cases of invalid filesystem states, as that will
+  * record an error on disk and force a filesystem check on the next boot.
+  */
+ NORET_TYPE void ext3_panic (struct super_block * sb, const char * function,
+                           const char * fmt, ...)
+ {
+       va_list args;
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+
+       /* this is to prevent panic from syncing this filesystem */
+       /* AKPM: is this sufficient? */
+       sb->s_flags |= MS_RDONLY;
+       panic ("EXT3-fs panic (device %s): %s: %s\n",
+              bdevname(sb->s_dev), function, error_buf);
+ }
+
+ void ext3_warning (struct super_block * sb, const char * function,
+                  const char * fmt, ...)
+ {
+       va_list args;
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+       printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n",
+               bdevname(sb->s_dev), function, error_buf);
+ }
+
+ void ext3_update_dynamic_rev(struct super_block *sb)
+ {
+       struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+       if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
+               return;
+
+       ext3_warning(sb, __FUNCTION__,
+                    "updating to rev %d because of new feature flag, "
+                    "running e2fsck is recommended",
+                    EXT3_DYNAMIC_REV);
+
+       es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
+       es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
+       es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
+       /* leave es->s_feature_*compat flags alone */
+       /* es->s_uuid will be set by e2fsck if empty */
+
+       /*
+        * The rest of the superblock fields should be zero, and if not it
+        * means they are likely already in use, so leave them alone.  We
+        * can leave it up to e2fsck to clean up any inconsistencies there.
+        */
+ }
+
+ /*
+  * Open the external journal device
+  */
+ static struct block_device *ext3_blkdev_get(kdev_t dev)
+ {
+       struct block_device *bdev;
+       int err = -ENODEV;
+
+       bdev = bdget(kdev_t_to_nr(dev));
+       if (bdev == NULL)
+               goto fail;
+       err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
+       if (err < 0)
+               goto fail;
+       return bdev;
+
+ fail:
+       printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n",
+                       bdevname(dev), err);
+       return NULL;
+ }
+
+ /*
+  * Release the journal device
+  */
+ static int ext3_blkdev_put(struct block_device *bdev)
+ {
+       return blkdev_put(bdev, BDEV_FS);
+ }
+
+ static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+ {
+       struct block_device *bdev;
+       int ret = -ENODEV;
+
+       bdev = sbi->journal_bdev;
+       if (bdev) {
+               ret = ext3_blkdev_put(bdev);
+               sbi->journal_bdev = NULL;
+       }
+       return ret;
+ }
+
+ #define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan)
+
+ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
+ {
+       struct list_head *l;
+
+       printk(KERN_ERR "sb orphan head is %d\n",
+              le32_to_cpu(sbi->s_es->s_last_orphan));
+
+       printk(KERN_ERR "sb_info orphan list:\n");
+       list_for_each(l, &sbi->s_orphan) {
+               struct inode *inode = orphan_list_entry(l);
+               printk(KERN_ERR "  "
+                      "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n",
+                      inode->i_dev, inode->i_ino, inode,
+                      inode->i_mode, inode->i_nlink,
+                      le32_to_cpu(NEXT_ORPHAN(inode)));
+       }
+ }
+
+ void ext3_put_super (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       struct ext3_super_block *es = sbi->s_es;
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
+
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               es->s_state = cpu_to_le16(sbi->s_mount_state);
+               BUFFER_TRACE(sbi->s_sbh, "marking dirty");
+               mark_buffer_dirty(sbi->s_sbh);
+               ext3_commit_super(sb, es, 1);
+       }
+
+       for (i = 0; i < sbi->s_gdb_count; i++)
+               brelse(sbi->s_group_desc[i]);
+       kfree(sbi->s_group_desc);
+       for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
+               brelse(sbi->s_inode_bitmap[i]);
+       for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
+               brelse(sbi->s_block_bitmap[i]);
+       brelse(sbi->s_sbh);
+
+       /* Debugging code just in case the in-memory inode orphan list
+        * isn't empty.  The on-disk one can be non-empty if we've
+        * detected an error and taken the fs readonly, but the
+        * in-memory list had better be clean by this point. */
+       if (!list_empty(&sbi->s_orphan))
+               dump_orphan_list(sb, sbi);
+       J_ASSERT(list_empty(&sbi->s_orphan));
+
+       invalidate_buffers(sb->s_dev);
+       if (j_dev != sb->s_dev) {
+               /*
+                * Invalidate the journal device's buffers.  We don't want them
+                * floating about in memory - the physical journal device may
+                * be hotswapped, and stale buffers break the `ro-after'
+                * testing code.
+                */
+               fsync_no_super(j_dev);
+               invalidate_buffers(j_dev);
+               ext3_blkdev_remove(sbi);
+       }
+       clear_ro_after(sb);
+
+       return;
+ }
+
+ static struct super_operations ext3_sops = {
+       read_inode:     ext3_read_inode,        /* BKL held */
+       write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
+       dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
+       put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
+       delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
+       put_super:      ext3_put_super,         /* BKL held */
+       write_super:    ext3_write_super,       /* BKL held */
+       write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
+       unlockfs:       ext3_unlockfs,          /* BKL not held.  We take it */
+       statfs:         ext3_statfs,            /* BKL held */
+       remount_fs:     ext3_remount,           /* BKL held */
+ };
+
+ static int want_value(char *value, char *option)
+ {
+       if (!value || !*value) {
+               printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n",
+                      option);
+               return -1;
+       }
+       return 0;
+ }
+
+ static int want_null_value(char *value, char *option)
+ {
+       if (*value) {
+               printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n",
+                      option, value);
+               return -1;
+       }
+       return 0;
+ }
+
+ static int want_numeric(char *value, char *option, unsigned long *number)
+ {
+       if (want_value(value, option))
+               return -1;
+       *number = simple_strtoul(value, &value, 0);
+       if (want_null_value(value, option))
+               return -1;
+       return 0;
+ }
+
+ /*
+  * This function has been shamelessly adapted from the msdos fs
+  */
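+ /*
+  * Example (editorial): a typical option string handed down from
+  * mount(8) might be
+  *
+  *    errors=remount-ro,data=ordered,sb=8193
+  *
+  * i.e. comma-separated tokens, each optionally carrying an "=value"
+  * part, which the strtok()/strchr() calls below pick apart.
+  */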
+ static int parse_options (char * options, unsigned long * sb_block,
+                         struct ext3_sb_info *sbi,
+                         unsigned long * inum,
+                         int is_remount)
+ {
+       unsigned long *mount_options = &sbi->s_mount_opt;
+       uid_t *resuid = &sbi->s_resuid;
+       gid_t *resgid = &sbi->s_resgid;
+       char * this_char;
+       char * value;
+
+       if (!options)
+               return 1;
+       for (this_char = strtok (options, ",");
+            this_char != NULL;
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+                       set_opt (*mount_options, NO_UID32);
+               }
+               else if (!strcmp (this_char, "abort"))
+                       set_opt (*mount_options, ABORT);
+               else if (!strcmp (this_char, "check")) {
+                       if (!value || !*value || !strcmp (value, "none"))
+                               clear_opt (*mount_options, CHECK);
+                       else
+ #ifdef CONFIG_EXT3_CHECK
+                               set_opt (*mount_options, CHECK);
+ #else
+                               printk(KERN_ERR
+                                      "EXT3 Check option not supported\n");
+ #endif
+               }
+               else if (!strcmp (this_char, "debug"))
+                       set_opt (*mount_options, DEBUG);
+               else if (!strcmp (this_char, "errors")) {
+                       if (want_value(value, "errors"))
+                               return 0;
+                       if (!strcmp (value, "continue")) {
+                               clear_opt (*mount_options, ERRORS_RO);
+                               clear_opt (*mount_options, ERRORS_PANIC);
+                               set_opt (*mount_options, ERRORS_CONT);
+                       }
+                       else if (!strcmp (value, "remount-ro")) {
+                               clear_opt (*mount_options, ERRORS_CONT);
+                               clear_opt (*mount_options, ERRORS_PANIC);
+                               set_opt (*mount_options, ERRORS_RO);
+                       }
+                       else if (!strcmp (value, "panic")) {
+                               clear_opt (*mount_options, ERRORS_CONT);
+                               clear_opt (*mount_options, ERRORS_RO);
+                               set_opt (*mount_options, ERRORS_PANIC);
+                       }
+                       else {
+                               printk (KERN_ERR
+                                       "EXT3-fs: Invalid errors option: %s\n",
+                                       value);
+                               return 0;
+                       }
+               }
+               else if (!strcmp (this_char, "grpid") ||
+                        !strcmp (this_char, "bsdgroups"))
+                       set_opt (*mount_options, GRPID);
+               else if (!strcmp (this_char, "minixdf"))
+                       set_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nocheck"))
+                       clear_opt (*mount_options, CHECK);
+               else if (!strcmp (this_char, "nogrpid") ||
+                        !strcmp (this_char, "sysvgroups"))
+                       clear_opt (*mount_options, GRPID);
+               else if (!strcmp (this_char, "resgid")) {
+                       unsigned long v;
+                       if (want_numeric(value, "resgid", &v))
+                               return 0;
+                       *resgid = v;
+               }
+               else if (!strcmp (this_char, "resuid")) {
+                       unsigned long v;
+                       if (want_numeric(value, "resuid", &v))
+                               return 0;
+                       *resuid = v;
+               }
+               else if (!strcmp (this_char, "sb")) {
+                       if (want_numeric(value, "sb", sb_block))
+                               return 0;
+               }
+ #ifdef CONFIG_JBD_DEBUG
+               else if (!strcmp (this_char, "ro-after")) {
+                       unsigned long v;
+                       if (want_numeric(value, "ro-after", &v))
+                               return 0;
+                       ext3_ro_after = v;
+               }
+ #endif
+               /* Silently ignore the quota options */
+               else if (!strcmp (this_char, "grpquota")
+                        || !strcmp (this_char, "noquota")
+                        || !strcmp (this_char, "quota")
+                        || !strcmp (this_char, "usrquota"))
+                       /* Don't do anything ;-) */ ;
+               else if (!strcmp (this_char, "journal")) {
+                       /* @@@ FIXME */
+                       /* Eventually we will want to be able to create
+                            a journal file here.  For now, only allow the
+                            user to specify an existing inode to be the
+                            journal file. */
+                       if (is_remount) {
+                               printk(KERN_ERR "EXT3-fs: cannot specify "
+                                      "journal on remount\n");
+                               return 0;
+                       }
+
+                       if (want_value(value, "journal"))
+                               return 0;
+                       if (!strcmp (value, "update"))
+                               set_opt (*mount_options, UPDATE_JOURNAL);
+                       else if (want_numeric(value, "journal", inum))
+                               return 0;
+               }
+               else if (!strcmp (this_char, "noload"))
+                       set_opt (*mount_options, NOLOAD);
+               else if (!strcmp (this_char, "data")) {
+                       int data_opt = 0;
+
+                       if (want_value(value, "data"))
+                               return 0;
+                       if (!strcmp (value, "journal"))
+                               data_opt = EXT3_MOUNT_JOURNAL_DATA;
+                       else if (!strcmp (value, "ordered"))
+                               data_opt = EXT3_MOUNT_ORDERED_DATA;
+                       else if (!strcmp (value, "writeback"))
+                               data_opt = EXT3_MOUNT_WRITEBACK_DATA;
+                       else {
+                               printk (KERN_ERR
+                                       "EXT3-fs: Invalid data option: %s\n",
+                                       value);
+                               return 0;
+                       }
+                       if (is_remount) {
+                               if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) !=
+                                                       data_opt) {
+                                       printk(KERN_ERR
+                                              "EXT3-fs: cannot change data "
+                                              "mode on remount\n");
+                                       return 0;
+                               }
+                       } else {
+                               *mount_options &= ~EXT3_MOUNT_DATA_FLAGS;
+                               *mount_options |= data_opt;
+                       }
+               } else {
+                       printk (KERN_ERR
+                               "EXT3-fs: Unrecognized mount option %s\n",
+                               this_char);
+                       return 0;
+               }
+       }
+       return 1;
+ }
+
+ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
+                           int read_only)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       int res = 0;
+
+       if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
+               printk (KERN_ERR "EXT3-fs warning: revision level too high, "
+                       "forcing read-only mode\n");
+               res = MS_RDONLY;
+       }
+       if (read_only)
+               return res;
+       if (!(sbi->s_mount_state & EXT3_VALID_FS))
+               printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
+                       "running e2fsck is recommended\n");
+       else if ((sbi->s_mount_state & EXT3_ERROR_FS))
+               printk (KERN_WARNING
+                       "EXT3-fs warning: mounting fs with errors, "
+                       "running e2fsck is recommended\n");
+       else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+                le16_to_cpu(es->s_mnt_count) >=
+                (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+               printk (KERN_WARNING
+                       "EXT3-fs warning: maximal mount count reached, "
+                       "running e2fsck is recommended\n");
+       else if (le32_to_cpu(es->s_checkinterval) &&
+               (le32_to_cpu(es->s_lastcheck) +
+                       le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME))
+               printk (KERN_WARNING
+                       "EXT3-fs warning: checktime reached, "
+                       "running e2fsck is recommended\n");
+ #if 0
+               /* @@@ We _will_ want to clear the valid bit if we find
+                    inconsistencies, to force a fsck at reboot.  But for
+                    a plain journaled filesystem we can keep it set as
+                    valid forever! :) */
+       es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
+ #endif
+       if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+               es->s_max_mnt_count =
+                       (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
+       es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
+       es->s_mtime = cpu_to_le32(CURRENT_TIME);
+       ext3_update_dynamic_rev(sb);
+       EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+       ext3_commit_super (sb, es, 1);
+       if (test_opt (sb, DEBUG))
+               printk (KERN_INFO
+                       "[EXT3 FS %s, %s, bs=%lu, gc=%lu, "
+                       "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+                       EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize,
+                       sbi->s_groups_count,
+                       EXT3_BLOCKS_PER_GROUP(sb),
+                       EXT3_INODES_PER_GROUP(sb),
+                       sbi->s_mount_opt);
+       printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+                               bdevname(sb->s_dev));
+       if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+               printk("external journal on %s\n",
+                               bdevname(EXT3_SB(sb)->s_journal->j_dev));
+       } else {
+               printk("internal journal\n");
+       }
+ #ifdef CONFIG_EXT3_CHECK
+       if (test_opt (sb, CHECK)) {
+               ext3_check_blocks_bitmap (sb);
+               ext3_check_inodes_bitmap (sb);
+       }
+ #endif
+       setup_ro_after(sb);
+       return res;
+ }
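+
+ /*
+  * The mount count and check interval consulted above are tunable from
+  * user space with tune2fs (e2fsprogs), e.g. (illustrative):
+  *
+  *    tune2fs -c 25 /dev/hda1        force a check every 25 mounts
+  *    tune2fs -i 30d /dev/hda1       ... or at most every 30 days
+  */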
+
+ static int ext3_check_descriptors (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+       struct ext3_group_desc * gdp = NULL;
+       int desc_block = 0;
+       int i;
+
+       ext3_debug ("Checking group descriptors");
+
+       for (i = 0; i < sbi->s_groups_count; i++)
+       {
+               if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
+                       gdp = (struct ext3_group_desc *)
+                                       sbi->s_group_desc[desc_block++]->b_data;
+               if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
+                   le32_to_cpu(gdp->bg_block_bitmap) >=
+                               block + EXT3_BLOCKS_PER_GROUP(sb))
+               {
+                       ext3_error (sb, "ext3_check_descriptors",
+                                   "Block bitmap for group %d"
+                                   " not in group (block %lu)!",
+                                   i, (unsigned long)
+                                       le32_to_cpu(gdp->bg_block_bitmap));
+                       return 0;
+               }
+               if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
+                   le32_to_cpu(gdp->bg_inode_bitmap) >=
+                               block + EXT3_BLOCKS_PER_GROUP(sb))
+               {
+                       ext3_error (sb, "ext3_check_descriptors",
+                                   "Inode bitmap for group %d"
+                                   " not in group (block %lu)!",
+                                   i, (unsigned long)
+                                       le32_to_cpu(gdp->bg_inode_bitmap));
+                       return 0;
+               }
+               if (le32_to_cpu(gdp->bg_inode_table) < block ||
+                   le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
+                   block + EXT3_BLOCKS_PER_GROUP(sb))
+               {
+                       ext3_error (sb, "ext3_check_descriptors",
+                                   "Inode table for group %d"
+                                   " not in group (block %lu)!",
+                                   i, (unsigned long)
+                                       le32_to_cpu(gdp->bg_inode_table));
+                       return 0;
+               }
+               block += EXT3_BLOCKS_PER_GROUP(sb);
+               gdp++;
+       }
+       return 1;
+ }
+
+
+ /* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
+  * the superblock) which were deleted from all directories, but held open by
+  * a process at the time of a crash.  We walk the list and try to delete these
+  * inodes at recovery time (only with a read-write filesystem).
+  *
+  * In order to keep the orphan inode chain consistent during traversal (in
+  * case of crash during recovery), we link each inode into the superblock
+  * orphan list_head and handle it the same way as an inode deletion during
+  * normal operation (which journals the operations for us).
+  *
+  * We only do an iget() and an iput() on each inode, which is very safe if we
+  * accidentally point at an in-use or already deleted inode.  The worst that
+  * can happen in this case is that we get a "bit already cleared" message from
+  * ext3_free_inode().  The only reason we would point at a wrong inode is if
+  * e2fsck was run on this filesystem, and it must have already done the orphan
+  * inode cleanup for us, so we can safely abort without any further action.
+  */
+ static void ext3_orphan_cleanup (struct super_block * sb,
+                                struct ext3_super_block * es)
+ {
+       unsigned int s_flags = sb->s_flags;
+       int nr_orphans = 0, nr_truncates = 0;
+       if (!es->s_last_orphan) {
+               jbd_debug(4, "no orphan inodes to clean up\n");
+               return;
+       }
+
+       if (s_flags & MS_RDONLY) {
+               printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
+                      bdevname(sb->s_dev));
+               sb->s_flags &= ~MS_RDONLY;
+       }
+
+       if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) {
+               if (es->s_last_orphan)
+                       jbd_debug(1, "Errors on filesystem, "
+                                 "clearing orphan list.\n");
+               es->s_last_orphan = 0;
+               jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+               return;
+       }
+
+       while (es->s_last_orphan) {
+               struct inode *inode;
+
+               if (!(inode =
+                     ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+                       es->s_last_orphan = 0;
+                       break;
+               }
+
+               list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+               if (inode->i_nlink) {
+                       printk(KERN_DEBUG __FUNCTION__
+                               ": truncating inode %ld to %Ld bytes\n",
+                               inode->i_ino, inode->i_size);
+                       jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
+                                 inode->i_ino, inode->i_size);
+                       ext3_truncate(inode);
+                       nr_truncates++;
+               } else {
+                       printk(KERN_DEBUG __FUNCTION__
+                               ": deleting unreferenced inode %ld\n",
+                               inode->i_ino);
+                       jbd_debug(2, "deleting unreferenced inode %ld\n",
+                                 inode->i_ino);
+                       nr_orphans++;
+               }
+               iput(inode);  /* The delete magic happens here! */
+       }
+
+ #define PLURAL(x) (x), ((x)==1) ? "" : "s"
+
+       if (nr_orphans)
+               printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
+                      bdevname(sb->s_dev), PLURAL(nr_orphans));
+       if (nr_truncates)
+               printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
+                      bdevname(sb->s_dev), PLURAL(nr_truncates));
+       sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ }
+
+ #define log2(n) ffz(~(n))
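+ /* For a power-of-two n, ffz(~(n)) returns the index of the first zero bit
+  * of ~n, i.e. the position of n's (single) set bit: log2(n). */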
+
+ /*
+  * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+  * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
+  * We need to be 1 filesystem block less than the 2^32 sector limit.
+  */
+ static loff_t ext3_max_size(int bits)
+ {
+       loff_t res = EXT3_NDIR_BLOCKS;
+       res += 1LL << (bits-2);
+       res += 1LL << (2*(bits-2));
+       res += 1LL << (3*(bits-2));
+       res <<= bits;
+       if (res > (512LL << 32) - (1 << bits))
+               res = (512LL << 32) - (1 << bits);
+       return res;
+ }
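+
+ /*
+  * For example: with 4kB blocks (bits == 12) the indirect-block limit is
+  * (12 + 2^10 + 2^20 + 2^30) blocks, roughly 4TB, so the i_blocks sector
+  * limit dominates and the result is 2^41 - 4096 bytes, i.e. one block
+  * short of 2TB.
+  */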
+
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+                                     int silent)
+ {
+       struct buffer_head * bh;
+       struct ext3_super_block *es = 0;
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long sb_block = 1;
+       unsigned long logic_sb_block = 1;
+       unsigned long offset = 0;
+       unsigned long journal_inum = 0;
+       kdev_t dev = sb->s_dev;
+       int blocksize;
+       int hblock;
+       int db_count;
+       int i;
+       int needs_recovery;
+
+ #ifdef CONFIG_JBD_DEBUG
+       ext3_ro_after = 0;
+ #endif
+       /*
+        * Use the device's hardware sector size as the blocksize if it
+        * is larger than the default; otherwise use the default.  This
+        * matters for devices whose hardware sector size exceeds
+        * EXT3_MIN_BLOCK_SIZE.
+        */
+       blocksize = EXT3_MIN_BLOCK_SIZE;
+       hblock = get_hardsect_size(dev);
+       if (blocksize < hblock)
+               blocksize = hblock;
+
+       sbi->s_mount_opt = 0;
+       sbi->s_resuid = EXT3_DEF_RESUID;
+       sbi->s_resgid = EXT3_DEF_RESGID;
+       if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+               sb->s_dev = 0;
+               goto out_fail;
+       }
+
+       set_blocksize (dev, blocksize);
+
+       /*
+        * The ext3 superblock will not be buffer aligned for other than 1kB
+        * block sizes.  We need to calculate the offset from buffer start.
+        */
+       if (blocksize != EXT3_MIN_BLOCK_SIZE) {
+               logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+               offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+       }
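+       /*
+        * For instance, with a 4kB blocksize the primary superblock (1kB
+        * at byte offset 1024) lands in logical block 0 at offset 1024;
+        * with a 1kB blocksize it is logical block 1 at offset 0.
+        */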
+
+       if (!(bh = bread (dev, logic_sb_block, blocksize))) {
+               printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
+               goto out_fail;
+       }
+       /*
+        * Note: s_es must be initialized as soon as possible because
+        *       some ext3 macro-instructions depend on its value
+        */
+       es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+       sbi->s_es = es;
+       sb->s_magic = le16_to_cpu(es->s_magic);
+       if (sb->s_magic != EXT3_SUPER_MAGIC) {
+               if (!silent)
+                       printk(KERN_ERR
+                              "VFS: Can't find ext3 filesystem on dev %s.\n",
+                              bdevname(dev));
+               goto failed_mount;
+       }
+       if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
+           (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
+            EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+            EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+               printk(KERN_WARNING
+                      "EXT3-fs warning: feature flags set on rev 0 fs, "
+                      "running e2fsck is recommended\n");
+       /*
+        * Check feature flags regardless of the revision level, since we
+        * previously didn't change the revision level when setting the flags,
+        * so there is a chance incompat flags are set on a rev 0 filesystem.
+        */
+       if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) {
+               printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
+                      "unsupported optional features (%x).\n",
+                      bdevname(dev), i);
+               goto failed_mount;
+       }
+       if (!(sb->s_flags & MS_RDONLY) &&
+           (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){
+               printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
+                      "unsupported optional features (%x).\n",
+                      bdevname(dev), i);
+               goto failed_mount;
+       }
+       sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10;
+       sb->s_blocksize = 1 << sb->s_blocksize_bits;
+
+       if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE ||
+           sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) {
+               printk(KERN_ERR
+                      "EXT3-fs: Unsupported filesystem blocksize %lu on %s.\n",
+                      sb->s_blocksize, bdevname(dev));
+               goto failed_mount;
+       }
+
+       sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
+
+       if (sb->s_blocksize != blocksize) {
+               blocksize = sb->s_blocksize;
+
+               /*
+                * Make sure the blocksize for the filesystem is larger
+                * than the hardware sectorsize for the machine.
+                */
+               if (sb->s_blocksize < hblock) {
+                       printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
+                              "device blocksize %d.\n", blocksize, hblock);
+                       goto failed_mount;
+               }
+
+               brelse (bh);
+               set_blocksize (dev, sb->s_blocksize);
+               logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+               offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+               bh = bread (dev, logic_sb_block, blocksize);
+               if (!bh) {
+                       printk(KERN_ERR
+                              "EXT3-fs: Can't read superblock on 2nd try.\n");
+                       return NULL;
+               }
+               es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+               sbi->s_es = es;
+               if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) {
+                       printk (KERN_ERR
+                               "EXT3-fs: Magic mismatch, very weird !\n");
+                       goto failed_mount;
+               }
+       }
+
+       if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
+               sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
+               sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
+       } else {
+               sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+               sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+               if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) {
+                       printk (KERN_ERR
+                               "EXT3-fs: unsupported inode size: %d\n",
+                               sbi->s_inode_size);
+                       goto failed_mount;
+               }
+       }
+       sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
+                                  le32_to_cpu(es->s_log_frag_size);
+       if (blocksize != sbi->s_frag_size) {
+               printk(KERN_ERR
+                      "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
+                      sbi->s_frag_size, blocksize);
+               goto failed_mount;
+       }
+       sbi->s_frags_per_block = 1;
+       sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+       sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
+       sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+       sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
+       sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block;
+       sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
+       sbi->s_sbh = bh;
+       if (sbi->s_resuid == EXT3_DEF_RESUID)
+               sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
+       if (sbi->s_resgid == EXT3_DEF_RESGID)
+               sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+       sbi->s_mount_state = le16_to_cpu(es->s_state);
+       sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+       sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
+
+       if (sbi->s_blocks_per_group > blocksize * 8) {
+               printk (KERN_ERR
+                       "EXT3-fs: #blocks per group too big: %lu\n",
+                       sbi->s_blocks_per_group);
+               goto failed_mount;
+       }
+       if (sbi->s_frags_per_group > blocksize * 8) {
+               printk (KERN_ERR
+                       "EXT3-fs: #fragments per group too big: %lu\n",
+                       sbi->s_frags_per_group);
+               goto failed_mount;
+       }
+       if (sbi->s_inodes_per_group > blocksize * 8) {
+               printk (KERN_ERR
+                       "EXT3-fs: #inodes per group too big: %lu\n",
+                       sbi->s_inodes_per_group);
+               goto failed_mount;
+       }
+
+       sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+                              le32_to_cpu(es->s_first_data_block) +
+                              EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+                             EXT3_BLOCKS_PER_GROUP(sb);
+       db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
+                  EXT3_DESC_PER_BLOCK(sb);
+       sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+                                   GFP_KERNEL);
+       if (sbi->s_group_desc == NULL) {
+               printk (KERN_ERR "EXT3-fs: not enough memory\n");
+               goto failed_mount;
+       }
+       for (i = 0; i < db_count; i++) {
+               sbi->s_group_desc[i] = bread(dev, logic_sb_block + i + 1,
+                                            blocksize);
+               if (!sbi->s_group_desc[i]) {
+                       printk (KERN_ERR "EXT3-fs: "
+                               "can't read group descriptor %d\n", i);
+                       db_count = i;
+                       goto failed_mount2;
+               }
+       }
+       if (!ext3_check_descriptors (sb)) {
+               printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
+               goto failed_mount2;
+       }
+       for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) {
+               sbi->s_inode_bitmap_number[i] = 0;
+               sbi->s_inode_bitmap[i] = NULL;
+               sbi->s_block_bitmap_number[i] = 0;
+               sbi->s_block_bitmap[i] = NULL;
+       }
+       sbi->s_loaded_inode_bitmaps = 0;
+       sbi->s_loaded_block_bitmaps = 0;
+       sbi->s_gdb_count = db_count;
+       /*
+        * set up enough so that it can read an inode
+        */
+       sb->s_op = &ext3_sops;
+       INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+
+       sb->s_root = 0;
+
+       needs_recovery = (es->s_last_orphan != 0 ||
+                         EXT3_HAS_INCOMPAT_FEATURE(sb,
+                                   EXT3_FEATURE_INCOMPAT_RECOVER));
+
+       /*
+        * The first inode we look at is the journal inode.  Don't try
+        * root first: it may be modified in the journal!
+        */
+       if (!test_opt(sb, NOLOAD) &&
+           EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+               if (ext3_load_journal(sb, es))
+                       goto failed_mount2;
+       } else if (journal_inum) {
+               if (ext3_create_journal(sb, es, journal_inum))
+                       goto failed_mount2;
+       } else {
+               if (!silent)
+                       printk (KERN_ERR
+                               "ext3: No journal on filesystem on %s\n",
+                               bdevname(dev));
+               goto failed_mount2;
+       }
+
+       /* We have now updated the journal if required, so we can
+        * validate the data journaling mode. */
+       switch (test_opt(sb, DATA_FLAGS)) {
+       case 0:
+               /* No mode set, assume a default based on the journal
+                    capabilities: ORDERED_DATA if the journal can
+                    cope, else JOURNAL_DATA */
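+               /*
+                * Ordered and writeback modes let freed, journaled blocks
+                * be reused for unjournaled file data, so they rely on the
+                * journal's revoke records to keep replay from overwriting
+                * that data after a crash.
+                */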
+               if (journal_check_available_features
+                   (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
+                       set_opt(sbi->s_mount_opt, ORDERED_DATA);
+               else
+                       set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+               break;
+
+       case EXT3_MOUNT_ORDERED_DATA:
+       case EXT3_MOUNT_WRITEBACK_DATA:
+               if (!journal_check_available_features
+                   (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
+                       printk(KERN_ERR "EXT3-fs: Journal does not support "
+                              "requested data journaling mode\n");
+                       goto failed_mount3;
+               }
+       default:
+               break;
+       }
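+       /*
+        * The mode corresponds to the data=journal, data=ordered and
+        * data=writeback mount options, e.g. (illustrative):
+        *
+        *      mount -o data=writeback /dev/hda1 /mnt
+        *
+        * As parse_options() enforces above, the data mode cannot be
+        * changed on a remount.
+        */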
+
+       /*
+        * The journal_load will have done any necessary log recovery,
+        * so we can safely mount the rest of the filesystem now.
+        */
+
+       sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO));
+       if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) ||
+           !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) {
+               if (sb->s_root) {
+                       dput(sb->s_root);
+                       sb->s_root = NULL;
+                       printk(KERN_ERR
+                              "EXT3-fs: corrupt root inode, run e2fsck\n");
+               } else
+                       printk(KERN_ERR "EXT3-fs: get root inode failed\n");
+               goto failed_mount3;
+       }
+
+       ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+       /*
+        * akpm: core read_super() calls in here with the superblock locked.
+        * That deadlocks, because orphan cleanup needs to lock the superblock
+        * in numerous places.  Here we just pop the lock - it's relatively
+        * harmless, because we are now ready to accept write_super() requests,
+        * and aviro says that's the only reason for hanging onto the
+        * superblock lock.
+        */
+       EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
+       unlock_super(sb);       /* akpm: sigh */
+       ext3_orphan_cleanup(sb, es);
+       lock_super(sb);
+       EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
+       if (needs_recovery)
+               printk (KERN_INFO "EXT3-fs: recovery complete.\n");
+       ext3_mark_recovery_complete(sb, es);
+       printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
+               test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
+               test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+               "writeback");
+
+       return sb;
+
+ failed_mount3:
+       journal_destroy(sbi->s_journal);
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
+       kfree(sbi->s_group_desc);
+ failed_mount:
+       ext3_blkdev_remove(sbi);
+       brelse(bh);
+ out_fail:
+       return NULL;
+ }
+
+ static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
+ {
+       struct inode *journal_inode;
+       journal_t *journal;
+
+       /* First, test for the existence of a valid inode on disk.  Bad
+        * things happen if we iget() an unused inode, as the subsequent
+        * iput() will try to delete it. */
+
+       journal_inode = iget(sb, journal_inum);
+       if (!journal_inode) {
+               printk(KERN_ERR "EXT3-fs: no journal found.\n");
+               return NULL;
+       }
+       if (!journal_inode->i_nlink) {
+               make_bad_inode(journal_inode);
+               iput(journal_inode);
+               printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
+               return NULL;
+       }
+
+       jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+                 journal_inode, journal_inode->i_size);
+       if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
+               printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
+               iput(journal_inode);
+               return NULL;
+       }
+
+       journal = journal_init_inode(journal_inode);
+       if (!journal)
+               iput(journal_inode);
+       return journal;
+ }
+
+ static journal_t *ext3_get_dev_journal(struct super_block *sb,
+                                      int dev)
+ {
+       struct buffer_head * bh;
+       journal_t *journal;
+       int start;
+       int len;
+       int hblock, blocksize;
+       unsigned long sb_block;
+       unsigned long offset;
+       kdev_t journal_dev = to_kdev_t(dev);
+       struct ext3_super_block * es;
+       struct block_device *bdev;
+
+       bdev = ext3_blkdev_get(journal_dev);
+       if (bdev == NULL)
+               return NULL;
+
+       blocksize = sb->s_blocksize;
+       hblock = get_hardsect_size(journal_dev);
+       if (blocksize < hblock) {
+               printk(KERN_ERR
+                       "EXT3-fs: blocksize too small for journal device.\n");
+               goto out_bdev;
+       }
+
+       sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
+       offset = EXT3_MIN_BLOCK_SIZE % blocksize;
+       set_blocksize(dev, blocksize);
+       if (!(bh = bread(dev, sb_block, blocksize))) {
+               printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
+                      "external journal\n");
+               goto out_bdev;
+       }
+
+       es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+       if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
+           !(le32_to_cpu(es->s_feature_incompat) &
+             EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+               printk(KERN_ERR "EXT3-fs: external journal has "
+                                       "bad superblock\n");
+               brelse(bh);
+               goto out_bdev;
+       }
+
+       if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
+               printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
+               brelse(bh);
+               goto out_bdev;
+       }
+
+       len = le32_to_cpu(es->s_blocks_count);
+       start = sb_block + 1;
+       brelse(bh);     /* we're done with the superblock */
+
+       journal = journal_init_dev(journal_dev, sb->s_dev,
+                                       start, len, blocksize);
+       if (!journal) {
+               printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
+               goto out_bdev;
+       }
+       ll_rw_block(READ, 1, &journal->j_sb_buffer);
+       wait_on_buffer(journal->j_sb_buffer);
+       if (!buffer_uptodate(journal->j_sb_buffer)) {
+               printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
+               goto out_journal;
+       }
+       if (ntohl(journal->j_superblock->s_nr_users) != 1) {
+               printk(KERN_ERR "EXT3-fs: External journal has more than one "
+                                       "user (unsupported) - %d\n",
+                       ntohl(journal->j_superblock->s_nr_users));
+               goto out_journal;
+       }
+       EXT3_SB(sb)->journal_bdev = bdev;
+       return journal;
+ out_journal:
+       journal_destroy(journal);
+ out_bdev:
+       ext3_blkdev_put(bdev);
+       return NULL;
+ }
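+
+ /*
+  * An external journal device of this kind would typically be created with
+  * e2fsprogs 1.20 or later, e.g. (illustrative):
+  *
+  *    mke2fs -O journal_dev /dev/sdb1
+  *    tune2fs -J device=/dev/sdb1 /dev/hda1
+  */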
+
+ static int ext3_load_journal(struct super_block * sb,
+                            struct ext3_super_block * es)
+ {
+       journal_t *journal;
+       int journal_inum = le32_to_cpu(es->s_journal_inum);
+       int journal_dev = le32_to_cpu(es->s_journal_dev);
+       int err;
+       int really_read_only;
+
+       really_read_only = is_read_only(sb->s_dev);
+
+       /*
+        * Are we loading a blank journal or performing recovery after a
+        * crash?  For recovery, we need to check in advance whether we
+        * can get read-write access to the device.
+        */
+
+       if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
+               if (sb->s_flags & MS_RDONLY) {
+                       printk(KERN_INFO "EXT3-fs: INFO: recovery "
+                                       "required on readonly filesystem.\n");
+                       if (really_read_only) {
+                               printk(KERN_ERR "EXT3-fs: write access "
+                                       "unavailable, cannot proceed.\n");
+                               return -EROFS;
+                       }
+                       printk (KERN_INFO "EXT3-fs: write access will "
+                                       "be enabled during recovery.\n");
+               }
+       }
+
+       if (journal_inum && journal_dev) {
+               printk(KERN_ERR "EXT3-fs: filesystem has both a journal "
+                      "inode and a journal device!\n");
+               return -EINVAL;
+       }
+
+       if (journal_inum) {
+               if (!(journal = ext3_get_journal(sb, journal_inum)))
+                       return -EINVAL;
+       } else {
+               if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
+                       return -EINVAL;
+       }
+
+       if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
+               err = journal_update_format(journal);
+               if (err)  {
+                       printk(KERN_ERR "EXT3-fs: error updating journal.\n");
+                       journal_destroy(journal);
+                       return err;
+               }
+       }
+
+       if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
+               journal_wipe(journal, !really_read_only);
+
+       err = journal_load(journal);
+       if (err) {
+               printk(KERN_ERR "EXT3-fs: error loading journal.\n");
+               journal_destroy(journal);
+               return err;
+       }
+
+       EXT3_SB(sb)->s_journal = journal;
+       ext3_clear_journal_err(sb, es);
+       return 0;
+ }
+
+ static int ext3_create_journal(struct super_block * sb,
+                              struct ext3_super_block * es,
+                              int journal_inum)
+ {
+       journal_t *journal;
+
+       if (sb->s_flags & MS_RDONLY) {
+               printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
+                               "create journal.\n");
+               return -EROFS;
+       }
+
+       if (!(journal = ext3_get_journal(sb, journal_inum)))
+               return -EINVAL;
+
+       printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
+              journal_inum);
+
+       if (journal_create(journal)) {
+               printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+               journal_destroy(journal);
+               return -EIO;
+       }
+
+       EXT3_SB(sb)->s_journal = journal;
+
+       ext3_update_dynamic_rev(sb);
+       EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+       EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
+
+       es->s_journal_inum = cpu_to_le32(journal_inum);
+       sb->s_dirt = 1;
+
+       /* Make sure we flush the recovery flag to disk. */
+       ext3_commit_super(sb, es, 1);
+
+       return 0;
+ }
+
+ static void ext3_commit_super (struct super_block * sb,
+                              struct ext3_super_block * es,
+                              int sync)
+ {
+       es->s_wtime = cpu_to_le32(CURRENT_TIME);
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty");
+       mark_buffer_dirty(sb->u.ext3_sb.s_sbh);
+       if (sync) {
+               ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
+               wait_on_buffer(sb->u.ext3_sb.s_sbh);
+       }
+ }
+
+
+ /*
+  * Have we just finished recovery?  If so, and if we are mounting (or
+  * remounting) the filesystem readonly, then we will end up with a
+  * consistent fs on disk.  Record that fact.
+  */
+ static void ext3_mark_recovery_complete(struct super_block * sb,
+                                       struct ext3_super_block * es)
+ {
+       journal_flush(EXT3_SB(sb)->s_journal);
+       if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
+           sb->s_flags & MS_RDONLY) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               sb->s_dirt = 0;
+               ext3_commit_super(sb, es, 1);
+       }
+ }
+
+ /*
+  * If we are mounting (or read-write remounting) a filesystem whose journal
+  * has recorded an error from a previous lifetime, move that error to the
+  * main filesystem now.
+  */
+ static void ext3_clear_journal_err(struct super_block * sb,
+                                  struct ext3_super_block * es)
+ {
+       journal_t *journal;
+       int j_errno;
+       const char *errstr;
+
+       journal = EXT3_SB(sb)->s_journal;
+
+       /*
+        * Now check for any error status which may have been recorded in the
+        * journal by a prior ext3_error() or ext3_abort()
+        */
+
+       j_errno = journal_errno(journal);
+       if (j_errno) {
+               char nbuf[16];
+
+               errstr = ext3_decode_error(sb, j_errno, nbuf);
+               ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
+                            "from previous mount: %s", errstr);
+               ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
+                            "filesystem check.");
+
+               sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+               es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+               ext3_commit_super (sb, es, 1);
+
+               journal_clear_err(journal);
+       }
+ }
+
+ /*
+  * Force the running and committing transactions to commit,
+  * and wait on the commit.
+  */
+ int ext3_force_commit(struct super_block *sb)
+ {
+       journal_t *journal;
+       int ret;
+
+       if (sb->s_flags & MS_RDONLY)
+               return 0;
+
+       journal = EXT3_SB(sb)->s_journal;
+       sb->s_dirt = 0;
+       lock_kernel();  /* important: lock down j_running_transaction */
+       ret = ext3_journal_force_commit(journal);
+       unlock_kernel();
+       return ret;
+ }
+
+ /*
+  * Ext3 always journals updates to the superblock itself, so we don't
+  * have to propagate any other updates to the superblock on disk at this
+  * point.  Just start an async writeback to get the buffers on their way
+  * to the disk.
+  *
+  * This implicitly triggers the writebehind on sync().
+  */
+
+ static int do_sync_supers = 0;
+ MODULE_PARM(do_sync_supers, "i");
+ MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously");
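+ /* e.g. "insmod ext3.o do_sync_supers=1" makes ext3_write_super() below wait
+  * for the commit to complete (illustrative module-parameter usage). */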
+
+ void ext3_write_super (struct super_block * sb)
+ {
+       tid_t target;
+
+       if (down_trylock(&sb->s_lock) == 0)
+               BUG();          /* aviro detector */
+       sb->s_dirt = 0;
+       target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+
+       if (do_sync_supers) {
+               unlock_super(sb);
+               log_wait_commit(EXT3_SB(sb)->s_journal, target);
+               lock_super(sb);
+       }
+ }
+
+ /*
+  * LVM calls this function before a (read-only) snapshot is created.  This
+  * gives us a chance to flush the journal completely and mark the fs clean.
+  */
+ void ext3_write_super_lockfs(struct super_block *sb)
+ {
+       sb->s_dirt = 0;
+
+       lock_kernel();          /* 2.4.5 forgot to do this for us */
+       if (!(sb->s_flags & MS_RDONLY)) {
+               journal_t *journal = EXT3_SB(sb)->s_journal;
+
+               /* Now we set up the journal barrier. */
+               journal_lock_updates(journal);
+               journal_flush(journal);
+
+               /* Journal blocked and flushed, clear needs_recovery flag. */
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+       }
+       unlock_kernel();
+ }
+
+ /*
+  * Called by LVM after the snapshot is done.  We need to set the RECOVER
+  * flag again here, even though the filesystem is not technically dirty yet.
+  */
+ void ext3_unlockfs(struct super_block *sb)
+ {
+       if (!(sb->s_flags & MS_RDONLY)) {
+               lock_kernel();
+               lock_super(sb);
+               /* Restore the needs_recovery flag before the fs is unlocked. */
+               EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+               unlock_super(sb);
+               journal_unlock_updates(EXT3_SB(sb)->s_journal);
+               unlock_kernel();
+       }
+ }
+
+ int ext3_remount (struct super_block * sb, int * flags, char * data)
+ {
+       struct ext3_super_block * es;
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long tmp;
+
+       clear_ro_after(sb);
+
+       /*
+        * Allow the "check" option to be passed as a remount option.
+        */
+       if (!parse_options(data, &tmp, sbi, &tmp, 1))
+               return -EINVAL;
+
+       if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+               ext3_abort(sb, __FUNCTION__, "Abort forced by user");
+
+       es = sbi->s_es;
+
+       if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+               if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+                       return -EROFS;
+
+               if (*flags & MS_RDONLY) {
+                       /*
+                        * First of all, the unconditional stuff we have to do
+                        * to disable replay of the journal when we next remount
+                        */
+                       sb->s_flags |= MS_RDONLY;
+
+                       /*
+                        * OK, test if we are remounting a valid rw partition
+                        * readonly, and if so set the rdonly flag and then
+                        * mark the partition as valid again.
+                        */
+                       if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
+                           (sbi->s_mount_state & EXT3_VALID_FS))
+                               es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+                       ext3_mark_recovery_complete(sb, es);
+               } else {
+                       int ret;
+                       if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
+                                       ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
+                               printk(KERN_WARNING "EXT3-fs: %s: couldn't "
+                                      "remount RDWR because of unsupported "
+                                      "optional features (%x).\n",
+                                      bdevname(sb->s_dev), ret);
+                               return -EROFS;
+                       }
+                       /*
+                        * Mounting a RDONLY partition read-write, so reread
+                        * and store the current valid flag.  (It may have
+                        * been changed by e2fsck since we originally mounted
+                        * the partition.)
+                        */
+                       ext3_clear_journal_err(sb, es);
+                       sbi->s_mount_state = le16_to_cpu(es->s_state);
+                       if (!ext3_setup_super (sb, es, 0))
+                               sb->s_flags &= ~MS_RDONLY;
+               }
+       }
+       setup_ro_after(sb);
+       return 0;
+ }
+
+ int ext3_statfs (struct super_block * sb, struct statfs * buf)
+ {
+       struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+       unsigned long overhead;
+       int i;
+
+       if (test_opt (sb, MINIX_DF))
+               overhead = 0;
+       else {
+               /*
+                * Compute the overhead (FS structures)
+                */
+
+               /*
+                * All of the blocks before first_data_block are
+                * overhead
+                */
+               overhead = le32_to_cpu(es->s_first_data_block);
+
+               /*
+                * Add the overhead attributed to the superblock and
+                * block group descriptors.  If the sparse superblocks
+                * feature is turned on, then not all groups have this.
+                */
+               for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
+                       overhead += ext3_bg_has_super(sb, i) +
+                               ext3_bg_num_gdb(sb, i);
+
+               /*
+                * Every block group has an inode bitmap, a block
+                * bitmap, and an inode table.
+                */
+               overhead += (EXT3_SB(sb)->s_groups_count *
+                            (2 + EXT3_SB(sb)->s_itb_per_group));
+       }
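+       /*
+        * Hypothetical example: with first_data_block == 1, 8 groups each
+        * carrying a superblock copy plus one group descriptor block, and
+        * s_itb_per_group == 214, overhead = 1 + 8*(1+1) + 8*(2+214)
+        * = 1745 blocks.
+        */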
+
+       buf->f_type = EXT3_SUPER_MAGIC;
+       buf->f_bsize = sb->s_blocksize;
+       buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
+       buf->f_bfree = ext3_count_free_blocks (sb);
+       buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
+       if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
+               buf->f_bavail = 0;
+       buf->f_files = le32_to_cpu(es->s_inodes_count);
+       buf->f_ffree = ext3_count_free_inodes (sb);
+       buf->f_namelen = EXT3_NAME_LEN;
+       return 0;
+ }
+
+ static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super);
+
+ static int __init init_ext3_fs(void)
+ {
+       return register_filesystem(&ext3_fs_type);
+ }
+
+ static void __exit exit_ext3_fs(void)
+ {
+       unregister_filesystem(&ext3_fs_type);
+ }
+
+ EXPORT_NO_SYMBOLS;
+
+ MODULE_LICENSE("GPL");
+ module_init(init_ext3_fs)
+ module_exit(exit_ext3_fs)
diff -rc2P linux/fs/ext3/symlink.c linux-2.4.13/fs/ext3/symlink.c
*** linux/fs/ext3/symlink.c     Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/symlink.c      Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,39 ----
+ /*
+  *  linux/fs/ext3/symlink.c
+  *
+  * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/symlink.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 symlink handling code
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+       char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
+       return vfs_readlink(dentry, buffer, buflen, s);
+ }
+
+ static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
+ {
+       char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
+       return vfs_follow_link(nd, s);
+ }
+
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+       readlink:       ext3_readlink,          /* BKL not held.  Don't need */
+       follow_link:    ext3_follow_link,       /* BKL not held.  Don't need */
+ };
diff -rc2P linux/fs/inode.c linux-2.4.13/fs/inode.c
*** linux/fs/inode.c    Fri Sep 28 21:03:48 2001
--- linux-2.4.13/fs/inode.c     Fri Nov  9 16:57:59 2001
***************
*** 110,113 ****
--- 110,114 ----
               sema_init(&inode->i_sem, 1);
               sema_init(&inode->i_zombie, 1);
+               init_rwsem(&inode->i_truncate_sem);
               spin_lock_init(&inode->i_data.i_shared_lock);
       }
diff -rc2P linux/fs/jbd/Makefile linux-2.4.13/fs/jbd/Makefile
*** linux/fs/jbd/Makefile       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/Makefile        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,15 ----
+ #
+ # fs/jbd/Makefile
+ #
+ # Makefile for the linux journaling routines.
+ #
+
+ export-objs := journal.o
+ O_TARGET := jbd.o
+
+ obj-y   := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
+
+ obj-m   := $(O_TARGET)
+
+ include $(TOPDIR)/Rules.make
+
diff -rc2P linux/fs/jbd/checkpoint.c linux-2.4.13/fs/jbd/checkpoint.c
*** linux/fs/jbd/checkpoint.c   Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/checkpoint.c    Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,608 ----
+ /*
+  * linux/fs/jbd/checkpoint.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1999
+  *
+  * Copyright 1999 Red Hat Software --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Checkpoint routines for the generic filesystem journaling code.
+  * Part of the ext2fs journaling system.
+  *
+  * Checkpointing is the process of ensuring that a section of the log is
+  * committed fully to disk, so that that portion of the log can be
+  * reused.
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+
+ extern spinlock_t journal_datalist_lock;
+
+ /*
+  * Unlink a buffer from a transaction.
+  *
+  * Called with journal_datalist_lock held.
+  */
+
+ static inline void __buffer_unlink(struct journal_head *jh)
+ {
+       transaction_t *transaction;
+
+       transaction = jh->b_cp_transaction;
+       jh->b_cp_transaction = NULL;
+
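+       /* Splice jh out of the circular checkpoint list: the first test
+        * below advances the list head past jh, the second catches the
+        * case where jh was the only element (head becomes NULL). */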
+       jh->b_cpnext->b_cpprev = jh->b_cpprev;
+       jh->b_cpprev->b_cpnext = jh->b_cpnext;
+       if (transaction->t_checkpoint_list == jh)
+               transaction->t_checkpoint_list = jh->b_cpnext;
+       if (transaction->t_checkpoint_list == jh)
+               transaction->t_checkpoint_list = NULL;
+ }
+
+ /*
+  * Try to release a checkpointed buffer from its transaction.
+  * Returns 1 if we released it.
+  * Requires journal_datalist_lock
+  */
+ static int __try_to_free_cp_buf(struct journal_head *jh)
+ {
+       int ret = 0;
+       struct buffer_head *bh = jh2bh(jh);
+
+       if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+               JBUFFER_TRACE(jh, "remove from checkpoint list");
+               __journal_remove_checkpoint(jh);
+               __journal_remove_journal_head(bh);
+               BUFFER_TRACE(bh, "release");
+               /* BUF_LOCKED -> BUF_CLEAN (fwiw) */
+               refile_buffer(bh);
+               __brelse(bh);
+               ret = 1;
+       }
+       return ret;
+ }
+
+ /*
+  * log_wait_for_space: wait until there is space in the journal.
+  *
+  * Called with the journal already locked, but it will be unlocked if we have
+  * to wait for a checkpoint to free up some space in the log.
+  */
+
+ void log_wait_for_space(journal_t *journal, int nblocks)
+ {
+       while (log_space_left(journal) < nblocks) {
+               if (journal->j_flags & JFS_ABORT)
+                       return;
+               unlock_journal(journal);
+               down(&journal->j_checkpoint_sem);
+               lock_journal(journal);
+
+               /* Test again, another process may have checkpointed
+                * while we were waiting for the checkpoint lock */
+               if (log_space_left(journal) < nblocks) {
+                       log_do_checkpoint(journal, nblocks);
+               }
+               up(&journal->j_checkpoint_sem);
+       }
+ }
+
+ /*
+  * Clean up a transaction's checkpoint list.
+  *
+  * We wait for any pending IO to complete and make sure any clean
+  * buffers are removed from the transaction.
+  *
+  * Return 1 if we performed any actions which might have destroyed the
+  * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
+  * the last checkpoint buffer is cleansed)
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+ static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+ {
+       struct journal_head *jh, *next_jh, *last_jh;
+       struct buffer_head *bh;
+       int ret = 0;
+
+       assert_spin_locked(&journal_datalist_lock);
+       jh = transaction->t_checkpoint_list;
+       if (!jh)
+               return 0;
+
+       last_jh = jh->b_cpprev;
+       next_jh = jh;
+       do {
+               jh = next_jh;
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       atomic_inc(&bh->b_count);
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       /* the journal_head may have gone by now */
+                       BUFFER_TRACE(bh, "brelse");
+                       __brelse(bh);
+                       goto out_return_1;
+               }
+
+               if (jh->b_transaction != NULL) {
+                       transaction_t *transaction = jh->b_transaction;
+                       tid_t tid = transaction->t_tid;
+
+                       spin_unlock(&journal_datalist_lock);
+                       log_start_commit(journal, transaction);
+                       unlock_journal(journal);
+                       log_wait_commit(journal, tid);
+                       goto out_return_1;
+               }
+
+               /*
+                * We used to test for (jh->b_list != BUF_CLEAN) here.
+                * But unmap_underlying_metadata() can place buffer onto
+                * BUF_CLEAN. Since refile_buffer() no longer takes buffers
+                * off checkpoint lists, we cope with it here
+                */
+               /*
+                * AKPM: I think the buffer_jdirty test is redundant - it
+                * shouldn't have NULL b_transaction?
+                */
+               next_jh = jh->b_cpnext;
+               if (!buffer_dirty(bh) && !buffer_jdirty(bh)) {
+                       BUFFER_TRACE(bh, "remove from checkpoint");
+                       __journal_remove_checkpoint(jh);
+                       __journal_remove_journal_head(bh);
+                       refile_buffer(bh);
+                       __brelse(bh);
+                       ret = 1;
+               }
+
+               jh = next_jh;
+       } while (jh != last_jh);
+
+       return ret;
+ out_return_1:
+       lock_journal(journal);
+       spin_lock(&journal_datalist_lock);
+       return 1;
+ }
+
+ #define NR_BATCH      64
+
+ static void __flush_batch(struct buffer_head **bhs, int *batch_count)
+ {
+       int i;
+
+       spin_unlock(&journal_datalist_lock);
+       ll_rw_block(WRITE, *batch_count, bhs);
+       run_task_queue(&tq_disk);
+       spin_lock(&journal_datalist_lock);
+       for (i = 0; i < *batch_count; i++) {
+               struct buffer_head *bh = bhs[i];
+               clear_bit(BH_JWrite, &bh->b_state);
+               BUFFER_TRACE(bh, "brelse");
+               __brelse(bh);
+       }
+       *batch_count = 0;
+ }
+
+ /*
+  * Try to flush one buffer from the checkpoint list to disk.
+  *
+  * Return 1 if something happened which requires us to abort the current
+  * scan of the checkpoint list.
+  *
+  * Called with journal_datalist_lock held.
+  */
+ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
+                       struct buffer_head **bhs, int *batch_count,
+                       int *drop_count)
+ {
+       struct buffer_head *bh = jh2bh(jh);
+       int ret = 0;
+
+       if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
+               J_ASSERT_JH(jh, jh->b_transaction == NULL);
+
+               /*
+                * Important: we are about to write the buffer, and
+                * possibly block, while still holding the journal lock.
+                * We cannot afford to let the transaction logic start
+                * messing around with this buffer before we write it to
+                * disk, as that would break recoverability.
+                */
+               BUFFER_TRACE(bh, "queue");
+               atomic_inc(&bh->b_count);
+               J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
+               set_bit(BH_JWrite, &bh->b_state);
+               bhs[*batch_count] = bh;
+               (*batch_count)++;
+               if (*batch_count == NR_BATCH) {
+                       __flush_batch(bhs, batch_count);
+                       ret = 1;
+               }
+       } else {
+               int last_buffer = 0;
+               if (jh->b_cpnext == jh) {
+                       /* We may be about to drop the transaction.  Tell the
+                        * caller that the lists have changed.
+                        */
+                       last_buffer = 1;
+               }
+               if (__try_to_free_cp_buf(jh)) {
+                       (*drop_count)++;
+                       ret = last_buffer;
+               }
+       }
+       return ret;
+ }
+
+
+ /*
+  * Perform an actual checkpoint.  We don't just write out enough to
+  * satisfy the current blocked requests; rather, we submit a reasonably
+  * sized chunk of the outstanding data to disk at once for
+  * efficiency.  log_wait_for_space() will retry if we didn't free enough.
+  *
+  * However, we _do_ take into account the amount requested so that once
+  * the IO has been queued, we can return as soon as enough of it has
+  * completed to disk.
+  *
+  * The journal should be locked before calling this function.
+  */
+
+ /* @@@ `nblocks' is unused.  Should it be used? */
+ int log_do_checkpoint (journal_t *journal, int nblocks)
+ {
+       transaction_t *transaction, *last_transaction, *next_transaction;
+       int result;
+       int target;
+       int batch_count = 0;
+       struct buffer_head *bhs[NR_BATCH];
+
+       jbd_debug(1, "Start checkpoint\n");
+
+       /*
+        * First thing: if there are any transactions in the log which
+        * don't need checkpointing, just eliminate them from the
+        * journal straight away.
+        */
+       result = cleanup_journal_tail(journal);
+       jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+       if (result <= 0)
+               return result;
+
+       /*
+        * OK, we need to start writing disk blocks.  Try to free up a
+        * quarter of the log in a single checkpoint if we can.
+        */
+       /*
+        * AKPM: check this code.  I had a feeling a while back that it
+        * degenerates into a busy loop at unmount time.
+        */
+       target = (journal->j_last - journal->j_first) / 4;
+
+       spin_lock(&journal_datalist_lock);
+ repeat:
+       transaction = journal->j_checkpoint_transactions;
+       if (transaction == NULL)
+               goto done;
+       last_transaction = transaction->t_cpprev;
+       next_transaction = transaction;
+
+       do {
+               struct journal_head *jh, *last_jh, *next_jh;
+               int drop_count = 0;
+               int cleanup_ret, retry = 0;
+
+               transaction = next_transaction;
+               next_transaction = transaction->t_cpnext;
+               jh = transaction->t_checkpoint_list;
+               last_jh = jh->b_cpprev;
+               next_jh = jh;
+               do {
+                       jh = next_jh;
+                       next_jh = jh->b_cpnext;
+                       retry = __flush_buffer(journal, jh, bhs, &batch_count,
+                                               &drop_count);
+               } while (jh != last_jh && !retry);
+               if (batch_count) {
+                       __flush_batch(bhs, &batch_count);
+                       goto repeat;
+               }
+               if (retry)
+                       goto repeat;
+               /*
+                * We have walked the whole transaction list without
+                * finding anything to write to disk.  We had better be
+                * able to make some progress or we are in trouble.
+                */
+               cleanup_ret = __cleanup_transaction(journal, transaction);
+               J_ASSERT(drop_count != 0 || cleanup_ret != 0);
+               goto repeat;    /* __cleanup may have dropped lock */
+       } while (transaction != last_transaction);
+
+ done:
+       spin_unlock(&journal_datalist_lock);
+       result = cleanup_journal_tail(journal);
+       if (result < 0)
+               return result;
+
+       return 0;
+ }
+
+ /*
+  * Check the list of checkpoint transactions for the journal to see if
+  * we have already got rid of any since the last update of the log tail
+  * in the journal superblock.  If so, we can instantly roll the
+  * superblock forward to remove those transactions from the log.
+  *
+  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
+  *
+  * Called with the journal lock held.
+  *
+  * This is the only part of the journaling code which really needs to be
+  * aware of transaction aborts.  Checkpointing involves writing to the
+  * main filesystem area rather than to the journal, so it can proceed
+  * even in abort state, but we must not update the journal superblock if
+  * we have an abort error outstanding.
+  */
+
+ int cleanup_journal_tail(journal_t *journal)
+ {
+       transaction_t * transaction;
+       tid_t           first_tid;
+       unsigned long   blocknr, freed;
+
+       /* OK, work out the oldest transaction remaining in the log, and
+        * the log block it starts at.
+        *
+        * If the log is now empty, we need to work out which is the
+        * next transaction ID we will write, and where it will
+        * start. */
+
+       /* j_checkpoint_transactions needs locking */
+       spin_lock(&journal_datalist_lock);
+       transaction = journal->j_checkpoint_transactions;
+       if (transaction) {
+               first_tid = transaction->t_tid;
+               blocknr = transaction->t_log_start;
+       } else if ((transaction = journal->j_committing_transaction) != NULL) {
+               first_tid = transaction->t_tid;
+               blocknr = transaction->t_log_start;
+       } else if ((transaction = journal->j_running_transaction) != NULL) {
+               first_tid = transaction->t_tid;
+               blocknr = journal->j_head;
+       } else {
+               first_tid = journal->j_transaction_sequence;
+               blocknr = journal->j_head;
+       }
+       spin_unlock(&journal_datalist_lock);
+       J_ASSERT (blocknr != 0);
+
+       /* If the oldest pinned transaction is at the tail of the log
+            already then there's not much we can do right now. */
+       if (journal->j_tail_sequence == first_tid)
+               return 1;
+
+       /* OK, update the superblock to recover the freed space.
+        * Physical blocks come first: have we wrapped beyond the end of
+        * the log?  */
+       freed = blocknr - journal->j_tail;
+       if (blocknr < journal->j_tail)
+               freed = freed + journal->j_last - journal->j_first;
+
+       jbd_debug(1,
+                 "Cleaning journal tail from %d to %d (offset %lu), "
+                 "freeing %lu\n",
+                 journal->j_tail_sequence, first_tid, blocknr, freed);
+
+       journal->j_free += freed;
+       journal->j_tail_sequence = first_tid;
+       journal->j_tail = blocknr;
+       if (!(journal->j_flags & JFS_ABORT))
+               journal_update_superblock(journal, 1);
+       return 0;
+ }
+
+
+ /* Checkpoint list management */
+
+ /*
+  * journal_clean_checkpoint_list
+  *
+  * Find all the written-back checkpoint buffers in the journal and release them.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  * Returns the number of buffers reaped (for debugging).
+  */
+
+ int __journal_clean_checkpoint_list(journal_t *journal)
+ {
+       transaction_t *transaction, *last_transaction, *next_transaction;
+       int ret = 0;
+
+       transaction = journal->j_checkpoint_transactions;
+       if (transaction == NULL)
+               goto out;
+
+       last_transaction = transaction->t_cpprev;
+       next_transaction = transaction;
+       do {
+               struct journal_head *jh;
+
+               transaction = next_transaction;
+               next_transaction = transaction->t_cpnext;
+               jh = transaction->t_checkpoint_list;
+               if (jh) {
+                       struct journal_head *last_jh = jh->b_cpprev;
+                       struct journal_head *next_jh = jh;
+                       do {
+                               struct buffer_head *bh;
+
+                               jh = next_jh;
+                               next_jh = jh->b_cpnext;
+                               bh = jh2bh(jh);
+                               ret += __try_to_free_cp_buf(jh);
+                       } while (jh != last_jh);
+               }
+       } while (transaction != last_transaction);
+ out:
+       return ret;
+ }
+
+ /*
+  * journal_remove_checkpoint: called after a buffer has been committed
+  * to disk (either by being write-back flushed to disk, or being
+  * committed to the log).
+  *
+  * We cannot safely clean a transaction out of the log until all of the
+  * buffer updates committed in that transaction have safely been stored
+  * elsewhere on disk.  To achieve this, all of the buffers in a
+  * transaction need to be maintained on the transaction's checkpoint
+  * list until they have been rewritten, at which point this function is
+  * called to remove the buffer from the existing transaction's
+  * checkpoint list.
+  *
+  * This function is called with the journal locked.
+  * This function is called with journal_datalist_lock held.
+  */
+
+ void __journal_remove_checkpoint(struct journal_head *jh)
+ {
+       transaction_t *transaction;
+       journal_t *journal;
+
+       JBUFFER_TRACE(jh, "entry");
+
+       if ((transaction = jh->b_cp_transaction) == NULL) {
+               JBUFFER_TRACE(jh, "not on transaction");
+               goto out;
+       }
+
+       journal = transaction->t_journal;
+
+       __buffer_unlink(jh);
+
+       if (transaction->t_checkpoint_list != NULL)
+               goto out;
+       JBUFFER_TRACE(jh, "transaction has no more buffers");
+
+       /* There is one special case to worry about: if we have just
+            pulled the buffer off a committing transaction's forget list,
+            then even if the checkpoint list is empty, the transaction
+            obviously cannot be dropped! */
+
+       if (transaction == journal->j_committing_transaction) {
+               JBUFFER_TRACE(jh, "belongs to committing transaction");
+               goto out;
+       }
+
+       /* OK, that was the last buffer for the transaction: we can now
+          safely remove this transaction from the log */
+
+       __journal_drop_transaction(journal, transaction);
+
+       /* Just in case anybody was waiting for more transactions to be
+            checkpointed... */
+       wake_up(&journal->j_wait_logspace);
+ out:
+       JBUFFER_TRACE(jh, "exit");
+ }
+
+ void journal_remove_checkpoint(struct journal_head *jh)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_remove_checkpoint(jh);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * journal_insert_checkpoint: put a committed buffer onto a checkpoint
+  * list so that we know when it is safe to clean the transaction out of
+  * the log.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+ void __journal_insert_checkpoint(struct journal_head *jh,
+                              transaction_t *transaction)
+ {
+       JBUFFER_TRACE(jh, "entry");
+       J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh)));
+       J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+
+       assert_spin_locked(&journal_datalist_lock);
+       jh->b_cp_transaction = transaction;
+
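+       /* Splice jh into the transaction's circular checkpoint list:
+        * start a singleton list if it is empty, otherwise link it in
+        * just before the current head.  Either way jh becomes the
+        * new head of the list. */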
+       if (!transaction->t_checkpoint_list) {
+               jh->b_cpnext = jh->b_cpprev = jh;
+       } else {
+               jh->b_cpnext = transaction->t_checkpoint_list;
+               jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+               jh->b_cpprev->b_cpnext = jh;
+               jh->b_cpnext->b_cpprev = jh;
+       }
+       transaction->t_checkpoint_list = jh;
+ }
+
+ void journal_insert_checkpoint(struct journal_head *jh,
+                              transaction_t *transaction)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_insert_checkpoint(jh, transaction);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * We've finished with this transaction structure: adios...
+  *
+  * The transaction must have no links except for the checkpoint by this
+  * point.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+
+ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
+ {
+       assert_spin_locked(&journal_datalist_lock);
+       if (transaction->t_cpnext) {
+               transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
+               transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+               if (journal->j_checkpoint_transactions == transaction)
+                       journal->j_checkpoint_transactions =
+                               transaction->t_cpnext;
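+               /* If the list head still points at this transaction
+                * after the fixup above, it was the only element and
+                * the list is now empty. */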
+               if (journal->j_checkpoint_transactions == transaction)
+                       journal->j_checkpoint_transactions = NULL;
+       }
+
+       J_ASSERT (transaction->t_ilist == NULL);
+       J_ASSERT (transaction->t_buffers == NULL);
+       J_ASSERT (transaction->t_sync_datalist == NULL);
+       J_ASSERT (transaction->t_async_datalist == NULL);
+       J_ASSERT (transaction->t_forget == NULL);
+       J_ASSERT (transaction->t_iobuf_list == NULL);
+       J_ASSERT (transaction->t_shadow_list == NULL);
+       J_ASSERT (transaction->t_log_list == NULL);
+       J_ASSERT (transaction->t_checkpoint_list == NULL);
+       J_ASSERT (transaction->t_updates == 0);
+
+       J_ASSERT (transaction->t_journal->j_committing_transaction !=
+                                       transaction);
+
+       jbd_debug (1, "Dropping transaction %d, all done\n",
+                  transaction->t_tid);
+       kfree (transaction);
+ }
+
diff -rc2P linux/fs/jbd/commit.c linux-2.4.13/fs/jbd/commit.c
*** linux/fs/jbd/commit.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/commit.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,701 ----
+ /*
+  * linux/fs/jbd/commit.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal commit routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+
+ extern spinlock_t journal_datalist_lock;
+
+ /*
+  * Default IO end handler for temporary BJ_IO buffer_heads.
+  */
+ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+ {
+       BUFFER_TRACE(bh, "");
+       mark_buffer_uptodate(bh, uptodate);
+       unlock_buffer(bh);
+ }
+
+ /*
+  * journal_commit_transaction
+  *
+  * The primary function for committing a transaction to the log.  This
+  * function is called by the journal thread to begin a complete commit.
+  */
+ void journal_commit_transaction(journal_t *journal)
+ {
+       transaction_t *commit_transaction;
+       struct journal_head *jh, *new_jh, *descriptor;
+       struct journal_head *next_jh, *last_jh;
+       struct buffer_head *wbuf[64];
+       int bufs;
+       int flags;
+       int blocknr;
+       char *tagp = NULL;
+       journal_header_t *header;
+       journal_block_tag_t *tag = NULL;
+       int space_left = 0;
+       int first_tag = 0;
+       int tag_flag;
+       int i;
+
+       /*
+        * First job: lock down the current transaction and wait for
+        * all outstanding updates to complete.
+        */
+
+       lock_journal(journal); /* Protect journal->j_running_transaction */
+
+ #ifdef COMMIT_STATS
+       spin_lock(&journal_datalist_lock);
+       summarise_journal_usage(journal);
+       spin_unlock(&journal_datalist_lock);
+ #endif
+
+       lock_kernel();
+
+       J_ASSERT (journal->j_running_transaction != NULL);
+       J_ASSERT (journal->j_committing_transaction == NULL);
+
+       commit_transaction = journal->j_running_transaction;
+       J_ASSERT (commit_transaction->t_state == T_RUNNING);
+
+       jbd_debug (1, "JBD: starting commit of transaction %d\n",
+                  commit_transaction->t_tid);
+
+       commit_transaction->t_state = T_LOCKED;
+       while (commit_transaction->t_updates != 0) {
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_updates);
+               lock_journal(journal);
+       }
+
+       J_ASSERT (commit_transaction->t_outstanding_credits <=
+                       journal->j_max_transaction_buffers);
+
+       /* Do we need to erase the effects of a prior journal_flush? */
+       if (journal->j_flags & JFS_FLUSHED) {
+               jbd_debug(3, "super block updated\n");
+               journal_update_superblock(journal, 1);
+       } else {
+               jbd_debug(3, "superblock not updated\n");
+       }
+
+       /*
+        * First thing we are allowed to do is to discard any remaining
+        * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
+        * that there are no such buffers: if a large filesystem
+        * operation like a truncate needs to split itself over multiple
+        * transactions, then it may try to do a journal_restart() while
+        * there are still BJ_Reserved buffers outstanding.  These must
+        * be released cleanly from the current transaction.
+        *
+        * In this case, the filesystem must still reserve write access
+        * again before modifying the buffer in the new transaction, but
+        * we do not require it to remember exactly which old buffers it
+        * has reserved.  This is consistent with the existing behaviour
+        * that multiple journal_get_write_access() calls to the same
+        * buffer are perfectly permissible.
+        */
+
+       while (commit_transaction->t_reserved_list) {
+               jh = commit_transaction->t_reserved_list;
+               JBUFFER_TRACE(jh, "reserved, unused: refile");
+               journal_refile_buffer(jh);
+       }
+
+       /*
+        * Now try to drop any written-back buffers from the journal's
+        * checkpoint lists.  We do this *before* commit because it potentially
+        * frees some memory
+        */
+       spin_lock(&journal_datalist_lock);
+       __journal_clean_checkpoint_list(journal);
+       spin_unlock(&journal_datalist_lock);
+
+       /* First part of the commit: force the revoke list out to disk.
+        * The revoke code generates its own metadata blocks on disk for this.
+        *
+        * It is important that we do this while the transaction is
+        * still locked.  Generating the revoke records should not
+        * generate any IO stalls, so this should be quick; and doing
+        * the work while we have the transaction locked means that we
+        * only ever have to maintain the revoke list for one
+        * transaction at a time.
+        */
+
+       jbd_debug (3, "JBD: commit phase 1\n");
+
+       journal_write_revoke_records(journal, commit_transaction);
+
+       /*
+        * Now that we have built the revoke records, we can start
+        * reusing the revoke list for a new running transaction.  We
+        * can now safely start committing the old transaction: time to
+        * get a new running transaction for incoming filesystem updates
+        */
+
+       commit_transaction->t_state = T_FLUSH;
+
+       wake_up(&journal->j_wait_transaction_locked);
+
+       journal->j_committing_transaction = commit_transaction;
+       journal->j_running_transaction = NULL;
+
+       commit_transaction->t_log_start = journal->j_head;
+
+       unlock_kernel();
+
+       jbd_debug (3, "JBD: commit phase 2\n");
+
+       /*
+        * Now start flushing things to disk, in the order they appear
+        * on the transaction lists.  Data blocks go first.
+        */
+
+       /*
+        * Whenever we unlock the journal and sleep, things can get added
+        * onto ->t_datalist, so we have to keep looping back to write_out_data
+        * until we *know* that the list is empty.
+        */
+ write_out_data:
+
+       /*
+        * Cleanup any flushed data buffers from the data list.  Even in
+        * abort mode, we want to flush this out as soon as possible.
+        *
+        * We take journal_datalist_lock to protect the lists from
+        * journal_try_to_free_buffers().
+        */
+       spin_lock(&journal_datalist_lock);
+
+ write_out_data_locked:
+       bufs = 0;
+       next_jh = commit_transaction->t_sync_datalist;
+       if (next_jh == NULL)
+               goto sync_datalist_empty;
+       last_jh = next_jh->b_tprev;
+
+       do {
+               struct buffer_head *bh;
+
+               jh = next_jh;
+               next_jh = jh->b_tnext;
+               bh = jh2bh(jh);
+               if (!buffer_locked(bh)) {
+                       if (buffer_dirty(bh)) {
+                               BUFFER_TRACE(bh, "start journal writeout");
+                               atomic_inc(&bh->b_count);
+                               wbuf[bufs++] = bh;
+                       } else {
+                               BUFFER_TRACE(bh, "writeout complete: unfile");
+                               __journal_unfile_buffer(jh);
+                               jh->b_transaction = NULL;
+                               __journal_remove_journal_head(bh);
+                               refile_buffer(bh);
+                               __brelse(bh);
+                       }
+               }
+               if (bufs == ARRAY_SIZE(wbuf)) {
+                       /*
+                        * Major speedup: start here on the next scan
+                        */
+                       J_ASSERT(commit_transaction->t_sync_datalist != 0);
+                       commit_transaction->t_sync_datalist = jh;
+                       break;
+               }
+       } while (jh != last_jh);
+
+       if (bufs || current->need_resched) {
+               jbd_debug(2, "submit %d writes\n", bufs);
+               spin_unlock(&journal_datalist_lock);
+               unlock_journal(journal);
+               if (bufs)
+                       ll_rw_block(WRITE, bufs, wbuf);
+               if (current->need_resched)
+                       schedule();
+               journal_brelse_array(wbuf, bufs);
+               lock_journal(journal);
+               spin_lock(&journal_datalist_lock);
+               if (bufs)
+                       goto write_out_data_locked;
+       }
+
+       /*
+        * Wait for all previously submitted IO on the data list to complete.
+        */
+       jh = commit_transaction->t_sync_datalist;
+       if (jh == NULL)
+               goto sync_datalist_empty;
+
+       do {
+               struct buffer_head *bh;
+               jh = jh->b_tprev;       /* Wait on the last written */
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       /* the journal_head may have been removed now */
+                       lock_journal(journal);
+                       goto write_out_data;
+               } else if (buffer_dirty(bh)) {
+                       goto write_out_data_locked;
+               }
+       } while (jh != commit_transaction->t_sync_datalist);
+       goto write_out_data_locked;
+
+ sync_datalist_empty:
+       /*
+        * Wait for all the async writepage data.  As they become unlocked
+        * in end_buffer_io_async(), the only place where they can be
+        * reaped is in try_to_free_buffers(), and we're locked against
+        * that.
+        */
+       while ((jh = commit_transaction->t_async_datalist)) {
+               struct buffer_head *bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       lock_journal(journal);
+                       spin_lock(&journal_datalist_lock);
+                       continue;       /* List may have changed */
+               }
+               if (jh->b_next_transaction) {
+                       /*
+                        * For writepage() buffers in journalled data mode: a
+                        * later transaction may want the buffer for "metadata"
+                        */
+                       __journal_refile_buffer(jh);
+               } else {
+                       BUFFER_TRACE(bh, "finished async writeout: unfile");
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       __journal_remove_journal_head(bh);
+                       BUFFER_TRACE(bh, "finished async writeout: refile");
+                       /* It can sometimes be on BUF_LOCKED due to migration
+                        * from syncdata to asyncdata */
+                       if (bh->b_list != BUF_CLEAN)
+                               refile_buffer(bh);
+                       __brelse(bh);
+               }
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       /*
+        * If we found any dirty or locked buffers, then we should have
+        * looped back up to the write_out_data label.  If there weren't
+        * any then journal_clean_data_list should have wiped the list
+        * clean by now, so check that it is in fact empty.
+        */
+       J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+       J_ASSERT (commit_transaction->t_async_datalist == NULL);
+
+       jbd_debug (3, "JBD: commit phase 3\n");
+
+       /*
+        * Way to go: we have now written out all of the data for a
+        * transaction!  Now comes the tricky part: we need to write out
+        * metadata.  Loop over the transaction's entire buffer list:
+        */
+       commit_transaction->t_state = T_COMMIT;
+
+       descriptor = NULL;
+       bufs = 0;
+       while (commit_transaction->t_buffers) {
+
+               /* Find the next buffer to be journaled... */
+
+               jh = commit_transaction->t_buffers;
+
+               /* If we're in abort mode, we just un-journal the buffer and
+                  release it for background writing. */
+
+               if (is_journal_aborted(journal)) {
+                       JBUFFER_TRACE(jh, "journal is aborting: refile");
+                       journal_refile_buffer(jh);
+                       /* If that was the last one, we need to clean up
+                        * any descriptor buffers which may have been
+                        * already allocated, even if we are now
+                        * aborting. */
+                       if (!commit_transaction->t_buffers)
+                               goto start_journal_io;
+                       continue;
+               }
+
+               /* Make sure we have a descriptor block in which to
+                  record the metadata buffer. */
+
+               if (!descriptor) {
+                       struct buffer_head *bh;
+
+                       J_ASSERT (bufs == 0);
+
+                       jbd_debug(4, "JBD: get descriptor\n");
+
+                       descriptor = journal_get_descriptor_buffer(journal);
+                       bh = jh2bh(descriptor);
+                       jbd_debug(4, "JBD: got buffer %ld (%p)\n",
+                               bh->b_blocknr, bh->b_data);
+                       header = (journal_header_t *)&bh->b_data[0];
+                       header->h_magic     = htonl(JFS_MAGIC_NUMBER);
+                       header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK);
+                       header->h_sequence  = htonl(commit_transaction->t_tid);
+
+                       tagp = &bh->b_data[sizeof(journal_header_t)];
+                       space_left = bh->b_size - sizeof(journal_header_t);
+                       first_tag = 1;
+                       set_bit(BH_JWrite, &bh->b_state);
+                       wbuf[bufs++] = bh;
+
+                       /* Record it so that we can wait for IO
+                            completion later */
+                       BUFFER_TRACE(bh, "ph3: file as descriptor");
+                       journal_file_buffer(descriptor, commit_transaction,
+                                               BJ_LogCtl);
+               }
+
+               /* Where is the buffer to be written? */
+
+               blocknr = journal_next_log_block(journal);
+
+               /* Bump b_count to prevent truncate from stumbling over
+                    the shadowed buffer!  @@@ This can go if we ever get
+                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+               atomic_inc(&jh2bh(jh)->b_count);
+
+               /* Make a temporary IO buffer with which to write it out
+                    (this will requeue both the metadata buffer and the
+                    temporary IO buffer).  new_bh goes on BJ_IO. */
+
+               set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+               /*
+                * akpm: journal_write_metadata_buffer() sets
+                * new_bh->b_transaction to commit_transaction.
+                * We need to clean this up before we release new_bh
+                * (which is of type BJ_IO)
+                */
+               JBUFFER_TRACE(jh, "ph3: write metadata");
+               flags = journal_write_metadata_buffer(commit_transaction,
+                                                     jh, &new_jh, blocknr);
+               set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+               wbuf[bufs++] = jh2bh(new_jh);
+
+               /* Record the new block's tag in the current descriptor
+                    buffer */
+
+               tag_flag = 0;
+               if (flags & 1)
+                       tag_flag |= JFS_FLAG_ESCAPE;
+               if (!first_tag)
+                       tag_flag |= JFS_FLAG_SAME_UUID;
+
+               tag = (journal_block_tag_t *) tagp;
+               tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr);
+               tag->t_flags = htonl(tag_flag);
+               tagp += sizeof(journal_block_tag_t);
+               space_left -= sizeof(journal_block_tag_t);
+
+               if (first_tag) {
+                       memcpy (tagp, journal->j_uuid, 16);
+                       tagp += 16;
+                       space_left -= 16;
+                       first_tag = 0;
+               }
+
+               /* If there's no more to do, or if the descriptor is full,
+                  let the IO rip! */
+
+               if (bufs == ARRAY_SIZE(wbuf) ||
+                   commit_transaction->t_buffers == NULL ||
+                   space_left < sizeof(journal_block_tag_t) + 16) {
+
+                       jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+                       /* Write an end-of-descriptor marker before
+                            submitting the IOs.  "tag" still points to
+                            the last tag we set up. */
+
+                       tag->t_flags |= htonl(JFS_FLAG_LAST_TAG);
+
+ start_journal_io:
+                       unlock_journal(journal);
+                       for (i=0; i<bufs; i++) {
+                               struct buffer_head *bh = wbuf[i];
+                               set_bit(BH_Lock, &bh->b_state);
+                               clear_bit(BH_Dirty, &bh->b_state);
+                               bh->b_end_io = journal_end_buffer_io_sync;
+                               submit_bh(WRITE, bh);
+                       }
+                       if (current->need_resched)
+                               schedule();
+                       lock_journal(journal);
+
+                       /* Force a new descriptor to be generated next
+                            time round the loop. */
+                       descriptor = NULL;
+                       bufs = 0;
+               }
+       }
+
+       /* Lo and behold: we have just managed to send a transaction to
+            the log.  Before we can commit it, wait for the IO so far to
+            complete.  Control buffers being written are on the
+            transaction's t_log_list queue, and metadata buffers are on
+            the t_iobuf_list queue.
+
+          Wait for the transactions in reverse order.  That way we are
+          less likely to be woken up until all IOs have completed, and
+          so we incur less scheduling load.
+       */
+
+       jbd_debug(3, "JBD: commit phase 4\n");
+
+       /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */
+  wait_for_iobuf:
+       while (commit_transaction->t_iobuf_list != NULL) {
+               struct buffer_head *bh;
+               jh = commit_transaction->t_iobuf_list->b_tprev;
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       lock_journal(journal);
+                       goto wait_for_iobuf;
+               }
+
+               clear_bit(BH_JWrite, &jh2bh(jh)->b_state);
+
+               JBUFFER_TRACE(jh, "ph4: unfile after journal write");
+               journal_unfile_buffer(jh);
+
+               /*
+                * akpm: don't put back a buffer_head with stale pointers
+                * dangling around.
+                */
+               J_ASSERT_JH(jh, jh->b_transaction != NULL);
+               jh->b_transaction = NULL;
+
+               /*
+                * ->t_iobuf_list should contain only dummy buffer_heads
+                * which were created by journal_write_metadata_buffer().
+                */
+               bh = jh2bh(jh);
+               BUFFER_TRACE(bh, "dumping temporary bh");
+               journal_unlock_journal_head(jh);
+               __brelse(bh);
+               J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
+               put_unused_buffer_head(bh);
+
+               /* We also have to unlock and free the corresponding
+                    shadowed buffer */
+               jh = commit_transaction->t_shadow_list->b_tprev;
+               bh = jh2bh(jh);
+               clear_bit(BH_JWrite, &bh->b_state);
+               J_ASSERT_BH(bh, buffer_jdirty(bh));
+
+               /* The metadata is now released for reuse, but we need
+                    to remember it against this transaction so that when
+                    we finally commit, we can do any checkpointing
+                    required. */
+               JBUFFER_TRACE(jh, "file as BJ_Forget");
+               journal_file_buffer(jh, commit_transaction, BJ_Forget);
+               /* Wake up any transactions which were waiting for this
+                  IO to complete */
+               wake_up(&bh->b_wait);
+               JBUFFER_TRACE(jh, "brelse shadowed buffer");
+               __brelse(bh);
+       }
+
+       J_ASSERT (commit_transaction->t_shadow_list == NULL);
+
+       jbd_debug(3, "JBD: commit phase 5\n");
+
+       /* Here we wait for the revoke record and descriptor record buffers */
+  wait_for_ctlbuf:
+       while (commit_transaction->t_log_list != NULL) {
+               struct buffer_head *bh;
+
+               jh = commit_transaction->t_log_list->b_tprev;
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       lock_journal(journal);
+                       goto wait_for_ctlbuf;
+               }
+
+               BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+               clear_bit(BH_JWrite, &bh->b_state);
+               journal_unfile_buffer(jh);
+               jh->b_transaction = NULL;
+               journal_unlock_journal_head(jh);
+               __brelse(bh);           /* One for getblk */
+               /* AKPM: bforget here */
+       }
+
+       jbd_debug(3, "JBD: commit phase 6\n");
+
+       /* Done it all: now write the commit record.  We should have
+        * cleaned up our previous buffers by now, so if we are in abort
+        * mode we can now just skip the rest of the journal write
+        * entirely. */
+
+       if (is_journal_aborted(journal))
+               goto skip_commit;
+
+       descriptor = journal_get_descriptor_buffer(journal);
+
+       /* Write a commit header into each 512-byte sector of the
+        * descriptor block (AKPM's buglet: `i' was missing here). */
+       for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
+               journal_header_t *tmp = (journal_header_t *)
+                       (jh2bh(descriptor)->b_data + i);
+               tmp->h_magic = htonl(JFS_MAGIC_NUMBER);
+               tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK);
+               tmp->h_sequence = htonl(commit_transaction->t_tid);
+       }
+
+       unlock_journal(journal);
+       JBUFFER_TRACE(descriptor, "write commit block");
+       {
+               struct buffer_head *bh = jh2bh(descriptor);
+               ll_rw_block(WRITE, 1, &bh);
+               wait_on_buffer(bh);
+               __brelse(bh);           /* One for getblk() */
+               journal_unlock_journal_head(descriptor);
+       }
+       lock_journal(journal);
+
+       /* End of a transaction!  Finally, we can do checkpoint
+            processing: any buffers committed as a result of this
+            transaction can be removed from any checkpoint list it was on
+            before. */
+
+ skip_commit:
+
+       jbd_debug(3, "JBD: commit phase 7\n");
+
+       J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+       J_ASSERT(commit_transaction->t_async_datalist == NULL);
+       J_ASSERT(commit_transaction->t_buffers == NULL);
+       J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
+       J_ASSERT(commit_transaction->t_iobuf_list == NULL);
+       J_ASSERT(commit_transaction->t_shadow_list == NULL);
+       J_ASSERT(commit_transaction->t_log_list == NULL);
+
+       while (commit_transaction->t_forget) {
+               transaction_t *cp_transaction;
+               struct buffer_head *bh;
+
+               jh = commit_transaction->t_forget;
+               J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
+                       jh->b_transaction == journal->j_running_transaction);
+
+               /*
+                * If there is undo-protected committed data against
+                * this buffer, then we can remove it now.  If it is a
+                * buffer needing such protection, the old frozen_data
+                * field now points to a committed version of the
+                * buffer, so rotate that field to the new committed
+                * data.
+                *
+                * Otherwise, we can just throw away the frozen data now.
+                */
+               if (jh->b_committed_data) {
+                       kfree(jh->b_committed_data);
+                       jh->b_committed_data = NULL;
+                       if (jh->b_frozen_data) {
+                               jh->b_committed_data = jh->b_frozen_data;
+                               jh->b_frozen_data = NULL;
+                       }
+               } else if (jh->b_frozen_data) {
+                       kfree(jh->b_frozen_data);
+                       jh->b_frozen_data = NULL;
+               }
+
+               spin_lock(&journal_datalist_lock);
+               cp_transaction = jh->b_cp_transaction;
+               if (cp_transaction) {
+                       JBUFFER_TRACE(jh, "remove from old cp transaction");
+                       J_ASSERT_JH(jh, commit_transaction != cp_transaction);
+                       __journal_remove_checkpoint(jh);
+               }
+
+               /* Only re-checkpoint the buffer_head if it is marked
+                * dirty.  If the buffer was added to the BJ_Forget list
+                * by journal_forget, it may no longer be dirty and
+                * there's no point in keeping a checkpoint record for
+                * it. */
+               bh = jh2bh(jh);
+               if (buffer_jdirty(bh)) {
+                       JBUFFER_TRACE(jh, "add to new checkpointing trans");
+                       __journal_insert_checkpoint(jh, commit_transaction);
+                       JBUFFER_TRACE(jh, "refile for checkpoint writeback");
+                       __journal_refile_buffer(jh);
+               } else {
+                       J_ASSERT_BH(bh, !buffer_dirty(bh));
+                       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+               }
+               spin_unlock(&journal_datalist_lock);
+       }
+
+       /* Done with this transaction! */
+
+       jbd_debug(3, "JBD: commit phase 8\n");
+
+       J_ASSERT (commit_transaction->t_state == T_COMMIT);
+       commit_transaction->t_state = T_FINISHED;
+
+       J_ASSERT (commit_transaction == journal->j_committing_transaction);
+       journal->j_commit_sequence = commit_transaction->t_tid;
+       journal->j_committing_transaction = NULL;
+
+       spin_lock(&journal_datalist_lock);
+       if (commit_transaction->t_checkpoint_list == NULL) {
+               __journal_drop_transaction(journal, commit_transaction);
+       } else {
+               if (journal->j_checkpoint_transactions == NULL) {
+                       journal->j_checkpoint_transactions = commit_transaction;
+                       commit_transaction->t_cpnext = commit_transaction;
+                       commit_transaction->t_cpprev = commit_transaction;
+               } else {
+                       commit_transaction->t_cpnext =
+                               journal->j_checkpoint_transactions;
+                       commit_transaction->t_cpprev =
+                               commit_transaction->t_cpnext->t_cpprev;
+                       commit_transaction->t_cpnext->t_cpprev =
+                               commit_transaction;
+                       commit_transaction->t_cpprev->t_cpnext =
+                               commit_transaction;
+               }
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       jbd_debug(1, "JBD: commit %d complete, head %d\n",
+                 journal->j_commit_sequence, journal->j_tail_sequence);
+
+       unlock_journal(journal);
+       wake_up(&journal->j_wait_done_commit);
+ }
diff -rc2P linux/fs/jbd/journal.c linux-2.4.13/fs/jbd/journal.c
*** linux/fs/jbd/journal.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/journal.c       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1716 ----
+ /*
+  * linux/fs/jbd/journal.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Generic filesystem journal-writing code; part of the ext2fs
+  * journaling system.
+  *
+  * This file manages journals: areas of disk reserved for logging
+  * transactional updates.  This includes the kernel journaling thread
+  * which is responsible for scheduling updates to the log.
+  *
+  * We do not actually manage the physical storage of the journal in this
+  * file: that is left to a per-journal policy function, which allows us
+  * to store the journal within a filesystem-specified area for ext2
+  * journaling (ext2 can use a reserved inode for storing the log).
+  */
+
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+ #include <linux/init.h>
+ #include <linux/mm.h>
+
+ EXPORT_SYMBOL(journal_start);
+ EXPORT_SYMBOL(journal_try_start);
+ EXPORT_SYMBOL(journal_restart);
+ EXPORT_SYMBOL(journal_extend);
+ EXPORT_SYMBOL(journal_stop);
+ EXPORT_SYMBOL(journal_lock_updates);
+ EXPORT_SYMBOL(journal_unlock_updates);
+ EXPORT_SYMBOL(journal_get_write_access);
+ EXPORT_SYMBOL(journal_get_create_access);
+ EXPORT_SYMBOL(journal_get_undo_access);
+ EXPORT_SYMBOL(journal_dirty_data);
+ EXPORT_SYMBOL(journal_dirty_metadata);
+ #if 0
+ EXPORT_SYMBOL(journal_release_buffer);
+ #endif
+ EXPORT_SYMBOL(journal_forget);
+ #if 0
+ EXPORT_SYMBOL(journal_sync_buffer);
+ #endif
+ EXPORT_SYMBOL(journal_flush);
+ EXPORT_SYMBOL(journal_revoke);
+
+ EXPORT_SYMBOL(journal_init_dev);
+ EXPORT_SYMBOL(journal_init_inode);
+ EXPORT_SYMBOL(journal_update_format);
+ EXPORT_SYMBOL(journal_check_used_features);
+ EXPORT_SYMBOL(journal_check_available_features);
+ EXPORT_SYMBOL(journal_set_features);
+ EXPORT_SYMBOL(journal_create);
+ EXPORT_SYMBOL(journal_load);
+ EXPORT_SYMBOL(journal_destroy);
+ EXPORT_SYMBOL(journal_recover);
+ EXPORT_SYMBOL(journal_update_superblock);
+ EXPORT_SYMBOL(__journal_abort);
+ EXPORT_SYMBOL(journal_abort);
+ EXPORT_SYMBOL(journal_errno);
+ EXPORT_SYMBOL(journal_ack_err);
+ EXPORT_SYMBOL(journal_clear_err);
+ EXPORT_SYMBOL(log_wait_commit);
+ EXPORT_SYMBOL(log_start_commit);
+ EXPORT_SYMBOL(journal_wipe);
+ EXPORT_SYMBOL(journal_blocks_per_page);
+ EXPORT_SYMBOL(journal_flushpage);
+ EXPORT_SYMBOL(journal_try_to_free_buffers);
+ EXPORT_SYMBOL(journal_bmap);
+ EXPORT_SYMBOL(journal_force_commit);
+
+ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+
+ /*
+  * journal_datalist_lock is used to protect data buffers:
+  *
+  *    bh->b_transaction
+  *    bh->b_tprev
+  *    bh->b_tnext
+  *
+  * journal_free_buffer() is called from journal_try_to_free_buffer(), and is
+  * async wrt everything else.
+  *
+  * It is also used for checkpoint data, also to protect against
+  * journal_try_to_free_buffer():
+  *
+  *    bh->b_cp_transaction
+  *    bh->b_cpnext
+  *    bh->b_cpprev
+  *    transaction->t_checkpoint_list
+  *    transaction->t_cpnext
+  *    transaction->t_cpprev
+  *    journal->j_checkpoint_transactions
+  *
+  * It is global at this time rather than per-journal because it's
+  * impossible for __journal_free_buffer to go from a buffer_head
+  * back to a journal_t unracily (well, not true.  Fix later)
+  *
+  *
+  * The `datalist' and `checkpoint list' functions are quite
+  * separate and we could use two spinlocks here.
+  *
+  * lru_list_lock nests inside journal_datalist_lock.
+  */
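+ /*
+  * Illustrative sketch of the documented lock nesting (not a code
+  * path taken from this file):
+  *
+  *    spin_lock(&journal_datalist_lock);
+  *    spin_lock(&lru_list_lock);
+  *    ... touch both the checkpoint lists and the buffer LRU ...
+  *    spin_unlock(&lru_list_lock);
+  *    spin_unlock(&journal_datalist_lock);
+  */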
+ spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED;
+
+ /*
+  * List of all journals in the system.  Protected by the BKL.
+  */
+ static LIST_HEAD(all_journals);
+
+ /*
+  * Helper function used to manage commit timeouts
+  */
+
+ static void commit_timeout(unsigned long __data)
+ {
+       struct task_struct * p = (struct task_struct *) __data;
+
+       wake_up_process(p);
+ }
+
+ /* Static check for data structure consistency.  There's no code
+  * invoked --- we'll just get a linker failure if things aren't right.
+  */
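+ /* (When the size is right, the condition below is compile-time false:
+  * the call is optimised away and the undefined symbol is never
+  * referenced.  When it is wrong, the unresolved reference breaks the
+  * link.) */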
+ void __journal_internal_check(void)
+ {
+       extern void journal_bad_superblock_size(void);
+       if (sizeof(struct journal_superblock_s) != 1024)
+               journal_bad_superblock_size();
+ }
+
+ /*
+  * kjournald: The main thread function used to manage a logging device
+  * journal.
+  *
+  * This kernel thread is responsible for two things:
+  *
+  * 1) COMMIT:  Every so often we need to commit the current state of the
+  *    filesystem to disk.  The journal thread is responsible for writing
+  *    all of the metadata buffers to disk.
+  *
+  * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
+  *    of the data in that part of the log has been rewritten elsewhere on
+  *    the disk.  Flushing these old buffers to reclaim space in the log is
+  *    known as checkpointing, and this thread is responsible for that job.
+  */
+
+ journal_t *current_journal;   /* AKPM: debug */
+
+ int kjournald(void *arg)
+ {
+       journal_t *journal = (journal_t *) arg;
+       transaction_t *transaction;
+       struct timer_list timer;
+
+       current_journal = journal;
+
+       lock_kernel();
+       daemonize();
+       spin_lock_irq(&current->sigmask_lock);
+       sigfillset(&current->blocked);
+       recalc_sigpending(current);
+       spin_unlock_irq(&current->sigmask_lock);
+
+       sprintf(current->comm, "kjournald");
+
+       /* Set up an interval timer which can be used to trigger a
+            commit wakeup after the commit interval expires */
+       init_timer(&timer);
+       timer.data = (unsigned long) current;
+       timer.function = commit_timeout;
+       journal->j_commit_timer = &timer;
+
+       /* Record that the journal thread is running */
+       journal->j_task = current;
+       wake_up(&journal->j_wait_done_commit);
+
+       printk(KERN_INFO "kjournald starting.  Commit interval %ld seconds\n",
+                       journal->j_commit_interval / HZ);
+       list_add(&journal->j_all_journals, &all_journals);
+
+       /* And now, wait forever for commit wakeup events. */
+       while (1) {
+               if (journal->j_flags & JFS_UNMOUNT)
+                       break;
+
+               jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+                       journal->j_commit_sequence, journal->j_commit_request);
+
+               if (journal->j_commit_sequence != journal->j_commit_request) {
+                       jbd_debug(1, "OK, requests differ\n");
+                       if (journal->j_commit_timer_active) {
+                               journal->j_commit_timer_active = 0;
+                               del_timer(journal->j_commit_timer);
+                       }
+
+                       journal_commit_transaction(journal);
+                       continue;
+               }
+
+               wake_up(&journal->j_wait_done_commit);
+               interruptible_sleep_on(&journal->j_wait_commit);
+
+               jbd_debug(1, "kjournald wakes\n");
+
+               /* Were we woken up by a commit wakeup event? */
+               if ((transaction = journal->j_running_transaction) != NULL &&
+                   time_after_eq(jiffies, transaction->t_expires)) {
+                       journal->j_commit_request = transaction->t_tid;
+                       jbd_debug(1, "woke because of timeout\n");
+               }
+       }
+
+       if (journal->j_commit_timer_active) {
+               journal->j_commit_timer_active = 0;
+               del_timer_sync(journal->j_commit_timer);
+       }
+
+       list_del(&journal->j_all_journals);
+
+       journal->j_task = NULL;
+       wake_up(&journal->j_wait_done_commit);
+       jbd_debug(1, "Journal thread exiting.\n");
+       return 0;
+ }
+
+ static void journal_start_thread(journal_t *journal)
+ {
+       kernel_thread(kjournald, (void *) journal,
+                     CLONE_VM | CLONE_FS | CLONE_FILES);
+       while (!journal->j_task)
+               sleep_on(&journal->j_wait_done_commit);
+ }
+
+ static void journal_kill_thread(journal_t *journal)
+ {
+       journal->j_flags |= JFS_UNMOUNT;
+
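+       /* Handshake with kjournald: keep waking it until it notices
+        * JFS_UNMOUNT, clears j_task and signals j_wait_done_commit. */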
+       while (journal->j_task) {
+               wake_up(&journal->j_wait_commit);
+               sleep_on(&journal->j_wait_done_commit);
+       }
+ }
+
+ #if 0
+
+ This is no longer needed - we do it in commit quite efficiently.
+ Note that if this function is resurrected, the loop needs to
+ be reorganised into the next_jh/last_jh algorithm.
+
+ /*
+  * journal_clean_data_list: cleanup after data IO.
+  *
+  * Once the IO system has finished writing the buffers on the transaction's
+  * data list, we can remove those buffers from the list.  This function
+  * scans the list for such buffers and removes them cleanly.
+  *
+  * We assume that the journal is already locked.
+  * We are called with journal_datalist_lock held.
+  *
+  * AKPM: This function looks inefficient.  Approximately O(n^2)
+  * for potentially thousands of buffers.  It no longer shows on profiles
+  * because these buffers are mainly dropped in journal_commit_transaction().
+  */
+
+ void __journal_clean_data_list(transaction_t *transaction)
+ {
+       struct journal_head *jh, *next;
+
+       assert_spin_locked(&journal_datalist_lock);
+
+ restart:
+       jh = transaction->t_sync_datalist;
+       if (!jh)
+               goto out;
+       do {
+               next = jh->b_tnext;
+               if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
+                       struct buffer_head *bh = jh2bh(jh);
+                       BUFFER_TRACE(bh, "data writeout complete: unfile");
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       __journal_remove_journal_head(bh);
+                       refile_buffer(bh);
+                       __brelse(bh);
+                       goto restart;
+               }
+               jh = next;
+       } while (transaction->t_sync_datalist &&
+                       jh != transaction->t_sync_datalist);
+ out:
+       return;
+ }
+ #endif
+
+ /*
+  * journal_write_metadata_buffer: write a metadata buffer to the journal.
+  *
+  * Writes a metadata buffer to a given disk block.  The actual IO is not
+  * performed but a new buffer_head is constructed which labels the data
+  * to be written with the correct destination disk block.
+  *
+  * Any magic-number escaping which needs to be done will cause a
+  * copy-out here.  If the buffer happens to start with the
+  * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
+  * magic number is only written to the log for descriptor blocks.  In
+  * this case, we copy the data and replace the first word with 0, and we
+  * return a result code which indicates that this buffer needs to be
+  * marked as an escaped buffer in the corresponding log descriptor
+  * block.  The missing word can then be restored when the block is read
+  * during recovery.
+  *
+  * If the source buffer has already been modified by a new transaction
+  * since we took the last commit snapshot, we use the frozen copy of
+  * that data for IO.  If we end up using the existing buffer_head's data
+  * for the write, then we *have* to lock the buffer to prevent anyone
+  * else from using and possibly modifying it while the IO is in
+  * progress.
+  *
+  * The function returns a pointer to the buffer_heads to be used for IO.
+  *
+  * We assume that the journal has already been locked in this function.
+  *
+  * Return value:
+  *  <0: Error
+  * >=0: Finished OK
+  *
+  * On success:
+  * Bit 0 set == escape performed on the data
+  * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+  */
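+ /*
+  * Caller-side sketch of decoding the return value (illustrative
+  * variable names; the real caller is journal_commit_transaction()
+  * in commit.c):
+  *
+  *    flags = journal_write_metadata_buffer(txn, jh, &new_jh, blocknr);
+  *    if (flags & 1)
+  *            tag_flag |= JFS_FLAG_ESCAPE;    (escape was performed)
+  *    if (flags & 2)
+  *            the frozen copy must be freed once the IO completes
+  */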
+
+ static inline unsigned long virt_to_offset(void *p)
+ {
+       return ((unsigned long) p) & ~PAGE_MASK;
+ }
+
+ int journal_write_metadata_buffer(transaction_t *transaction,
+                                 struct journal_head  *jh_in,
+                                 struct journal_head **jh_out,
+                                 int blocknr)
+ {
+       int need_copy_out = 0;
+       int done_copy_out = 0;
+       int do_escape = 0;
+       char *mapped_data;
+       struct buffer_head *new_bh;
+       struct journal_head * new_jh;
+       struct page *new_page;
+       unsigned int new_offset;
+
+       /*
+        * The buffer really shouldn't be locked: only the current committing
+        * transaction is allowed to write it, so nobody else is allowed
+        * to do any IO.
+        *
+        * akpm: except if we're journalling data, and write() output is
+        * also part of a shared mapping, and another thread has
+        * decided to launch a writepage() against this buffer.
+        */
+       J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in)));
+
+       /*
+        * If a new transaction has already done a buffer copy-out, then
+        * we use that version of the data for the commit.
+        */
+
+       if (jh_in->b_frozen_data) {
+               done_copy_out = 1;
+               new_page = virt_to_page(jh_in->b_frozen_data);
+               new_offset = virt_to_offset(jh_in->b_frozen_data);
+       } else {
+               new_page = jh2bh(jh_in)->b_page;
+               new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
+       }
+
+       mapped_data = ((char *) kmap(new_page)) + new_offset;
+
+       /*
+        * Check for escaping
+        */
+       if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
+               need_copy_out = 1;
+               do_escape = 1;
+       }
+
+       /*
+        * Do we need to do a data copy?
+        */
+
+       if (need_copy_out && !done_copy_out) {
+               char *tmp;
+               tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
+
+               jh_in->b_frozen_data = tmp;
+               memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
+
+               /* If we get to this path, we'll always need the new
+                  address kmapped so that we can clear the escaped
+                  magic number below. */
+               kunmap(new_page);
+               new_page = virt_to_page(tmp);
+               new_offset = virt_to_offset(tmp);
+               mapped_data = ((char *) kmap(new_page)) + new_offset;
+
+               done_copy_out = 1;
+       }
+
+       /*
+        * Right, time to make up the new buffer_head.
+        */
+       do {
+               new_bh = get_unused_buffer_head(0);
+               if (!new_bh) {
+                       printk (KERN_NOTICE __FUNCTION__
+                               ": ENOMEM at get_unused_buffer_head, "
+                               "trying again.\n");
+                       current->policy |= SCHED_YIELD;
+                       schedule();
+               }
+       } while (!new_bh);
+       /* keep subsequent assertions sane */
+       new_bh->b_prev_free = 0;
+       new_bh->b_next_free = 0;
+       new_bh->b_state = 0;
+       init_buffer(new_bh, NULL, NULL);
+       atomic_set(&new_bh->b_count, 1);
+       new_jh = journal_add_journal_head(new_bh);
+
+       set_bh_page(new_bh, new_page, new_offset);
+
+       new_jh->b_transaction = NULL;
+       new_bh->b_size = jh2bh(jh_in)->b_size;
+       new_bh->b_dev = transaction->t_journal->j_dev;
+       new_bh->b_blocknr = blocknr;
+       new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty);
+
+       *jh_out = new_jh;
+
+       /*
+        * Did we need to do an escaping?  Now we've done all the
+        * copying, we can finally do so.
+        */
+
+       if (do_escape)
+               * ((unsigned int *) mapped_data) = 0;
+       kunmap(new_page);
+
+       /*
+        * The to-be-written buffer needs to get moved to the io queue,
+        * and the original buffer whose contents we are shadowing or
+        * copying is moved to the transaction's shadow queue.
+        */
+       JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+       journal_file_buffer(jh_in, transaction, BJ_Shadow);
+       JBUFFER_TRACE(new_jh, "file as BJ_IO");
+       journal_file_buffer(new_jh, transaction, BJ_IO);
+
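+       /* Encode the result: bit 0 = escape performed, bit 1 = copy-out
+        * performed (see the comment block above). */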
+       return do_escape | (done_copy_out << 1);
+ }
+
+ /*
+  * Allocation code for the journal file.  Manage the space left in the
+  * journal, so that we can begin checkpointing when appropriate.
+  */
+
+ /*
+  * log_space_left: Return the number of free blocks left in the journal.
+  *
+  * Called with the journal already locked.
+  */
+
+ int log_space_left (journal_t *journal)
+ {
+       int left = journal->j_free;
+
+       /* Be pessimistic here about the number of those free blocks
+        * which might be required for log descriptor control blocks. */
+
+ #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
+
+       left -= MIN_LOG_RESERVED_BLOCKS;
+
+       if (left <= 0)
+               return 0;
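+       /* Of what is left, report only 7/8ths as usable, as a further
+        * safety margin against descriptor-block overhead. */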
+       left -= (left >> 3);
+       return left;
+ }
+
+ /*
+  * This function must be non-allocating for PF_MEMALLOC tasks
+  */
+ tid_t log_start_commit (journal_t *journal, transaction_t *transaction)
+ {
+       tid_t target = journal->j_commit_request;
+
+       lock_kernel(); /* Protect journal->j_running_transaction */
+
+       /*
+        * A NULL transaction asks us to commit the currently running
+        * transaction, if there is one.
+        */
+       if (transaction)
+               target = transaction->t_tid;
+       else {
+               transaction = journal->j_running_transaction;
+               if (!transaction)
+                       goto out;
+               target = transaction->t_tid;
+       }
+
+       /*
+        * Are we already doing a recent enough commit?
+        */
+       if (tid_geq(journal->j_commit_request, target))
+               goto out;
+
+       /*
+        * We want a new commit: OK, mark the request and wake up the
+        * commit thread.  We do _not_ do the commit ourselves.
+        */
+
+       journal->j_commit_request = target;
+       jbd_debug(1, "JBD: requesting commit %d/%d\n",
+                 journal->j_commit_request,
+                 journal->j_commit_sequence);
+       wake_up(&journal->j_wait_commit);
+
+ out:
+       unlock_kernel();
+       return target;
+ }
+
+ /*
+  * Wait for a specified commit to complete.
+  * The caller may not hold the journal lock.
+  */
+ void log_wait_commit (journal_t *journal, tid_t tid)
+ {
+       lock_kernel();
+ #ifdef CONFIG_JBD_DEBUG
+       lock_journal(journal);
+       if (!tid_geq(journal->j_commit_request, tid)) {
+               printk(KERN_EMERG __FUNCTION__
+                       ": error: j_commit_request=%d, tid=%d\n",
+                       journal->j_commit_request, tid);
+       }
+       unlock_journal(journal);
+ #endif
+       while (tid_gt(tid, journal->j_commit_sequence)) {
+               jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+                                 tid, journal->j_commit_sequence);
+               wake_up(&journal->j_wait_commit);
+               sleep_on(&journal->j_wait_done_commit);
+       }
+       unlock_kernel();
+ }
+
+ /*
+  * Log buffer allocation routines:
+  */
+
+ unsigned long journal_next_log_block(journal_t *journal)
+ {
+       unsigned long blocknr;
+
+       J_ASSERT(journal->j_free > 1);
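+       /* Hand out the block at the current head and advance it,
+        * wrapping back to j_first at the end of the log area. */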
+
+       blocknr = journal->j_head;
+       journal->j_head++;
+       journal->j_free--;
+       if (journal->j_head == journal->j_last)
+               journal->j_head = journal->j_first;
+       return journal_bmap(journal, blocknr);
+ }
+
+ /*
+  * Conversion of logical to physical block numbers for the journal
+  *
+  * On external journals the journal blocks are identity-mapped, so
+  * this is a no-op.  If needed, we can use j_blk_offset - everything is
+  * ready.
+  */
+ unsigned long journal_bmap(journal_t *journal, unsigned long blocknr)
+ {
+       unsigned long ret;
+
+       if (journal->j_inode) {
+               ret = bmap(journal->j_inode, blocknr);
+               J_ASSERT(ret != 0);
+       } else {
+               ret = blocknr;   /* +journal->j_blk_offset */
+       }
+       return ret;
+ }
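+
+ /*
+  * Example (hypothetical numbers): for an inode-based journal whose
+  * first data block lives at physical block 5000, journal_bmap(journal, 0)
+  * returns 5000 via bmap(); for an external journal device the log
+  * block number is returned unchanged.
+  */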
+
+ /*
+  * We play buffer_head aliasing tricks to write data/metadata blocks to
+  * the journal without copying their contents, but for journal
+  * descriptor blocks we do need to generate bona fide buffers.
+  */
+
+ struct journal_head * journal_get_descriptor_buffer(journal_t *journal)
+ {
+       struct buffer_head *bh;
+       unsigned long blocknr = journal_next_log_block(journal);
+
+       bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+       bh->b_state |= (1 << BH_Dirty);
+       BUFFER_TRACE(bh, "return this buffer");
+       return journal_add_journal_head(bh);
+ }
+
+ /*
+  * Management for journal control blocks: functions to create and
+  * destroy journal_t structures, and to initialise and read existing
+  * journal blocks from disk.  */
+
+ /* First: create and setup a journal_t object in memory.  We initialise
+  * very few fields yet: that has to wait until we have created the
+  * journal structures from scratch, or loaded them from disk. */
+
+ static journal_t * journal_init_common (void)
+ {
+       journal_t *journal;
+       int err;
+
+       MOD_INC_USE_COUNT;
+
+       journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
+       if (!journal)
+               goto fail;
+       memset(journal, 0, sizeof(*journal));
+
+       init_waitqueue_head(&journal->j_wait_transaction_locked);
+       init_waitqueue_head(&journal->j_wait_logspace);
+       init_waitqueue_head(&journal->j_wait_done_commit);
+       init_waitqueue_head(&journal->j_wait_checkpoint);
+       init_waitqueue_head(&journal->j_wait_commit);
+       init_waitqueue_head(&journal->j_wait_updates);
+       init_MUTEX(&journal->j_barrier);
+       init_MUTEX(&journal->j_checkpoint_sem);
+       init_MUTEX(&journal->j_sem);
+
+       journal->j_commit_interval = (HZ * 5);
+
+       /* The journal is marked for error until we succeed with recovery! */
+       journal->j_flags = JFS_ABORT;
+
+       /* Set up a default-sized revoke table for the new mount. */
+       err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+       if (err) {
+               kfree(journal);
+               goto fail;
+       }
+       return journal;
+ fail:
+       MOD_DEC_USE_COUNT;
+       return NULL;
+ }
+
+ /* journal_init_dev and journal_init_inode:
+  *
+  * Create a journal structure assigned some fixed set of disk blocks to
+  * the journal.  We don't actually touch those disk blocks yet, but we
+  * need to set up all of the mapping information to tell the journaling
+  * system where the journal blocks are.
+  *
+  * journal_init_dev creates a journal which maps a fixed contiguous
+  * range of blocks on an arbitrary block device.
+  *
+  * journal_init_inode creates a journal which maps an on-disk inode as
+  * the journal.  The inode must exist already, must support bmap() and
+  * must have all data blocks preallocated.
+  */
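+
+ /*
+  * For example (sketch of the expected caller): ext3 uses the inode
+  * flavour for its reserved journal inode,
+  *
+  *    journal = journal_init_inode(journal_inode);
+  *
+  * while a separate journal device would use journal_init_dev() with
+  * an explicit start block, length and blocksize.
+  */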
+
+ journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
+                       int start, int len, int blocksize)
+ {
+       journal_t *journal = journal_init_common();
+       struct buffer_head *bh;
+
+       if (!journal)
+               return NULL;
+
+       journal->j_dev = dev;
+       journal->j_fs_dev = fs_dev;
+       journal->j_blk_offset = start;
+       journal->j_maxlen = len;
+       journal->j_blocksize = blocksize;
+
+       bh = getblk(journal->j_dev, start, journal->j_blocksize);
+       J_ASSERT(bh != NULL);
+       journal->j_sb_buffer = bh;
+       journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+       return journal;
+ }
+
+ journal_t * journal_init_inode (struct inode *inode)
+ {
+       struct buffer_head *bh;
+       journal_t *journal = journal_init_common();
+       int blocknr;
+
+       if (!journal)
+               return NULL;
+
+       journal->j_dev = inode->i_dev;
+       journal->j_fs_dev = inode->i_dev;
+       journal->j_inode = inode;
+       jbd_debug(1,
+                 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+                 journal, bdevname(inode->i_dev), inode->i_ino, inode->i_size,
+                 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+       journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+       journal->j_blocksize = inode->i_sb->s_blocksize;
+
+       blocknr = journal_bmap(journal, 0);
+       bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+       J_ASSERT(bh != NULL);
+       journal->j_sb_buffer = bh;
+       journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+       return journal;
+ }
+
+ /*
+  * Given a journal_t structure, initialise the various fields for
+  * startup of a new journaling session.  We use this both when creating
+  * a journal, and after recovering an old journal to reset it for
+  * subsequent use.
+  */
+
+ static int journal_reset (journal_t *journal)
+ {
+       journal_superblock_t *sb = journal->j_superblock;
+       unsigned int first, last;
+
+       first = ntohl(sb->s_first);
+       last = ntohl(sb->s_maxlen);
+
+       journal->j_first = first;
+       journal->j_last = last;
+
+       journal->j_head = first;
+       journal->j_tail = first;
+       journal->j_free = last - first;
+
+       journal->j_tail_sequence = journal->j_transaction_sequence;
+       journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+       journal->j_commit_request = journal->j_commit_sequence;
+
+       journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+       /* Add the dynamic fields and write it to disk. */
+       journal_update_superblock(journal, 1);
+
+       lock_journal(journal);
+       journal_start_thread(journal);
+       unlock_journal(journal);
+
+       return 0;
+ }
+
+ /*
+  * Given a journal_t structure which tells us which disk blocks we can
+  * use, create a new journal superblock and initialise all of the
+  * journal fields from scratch.  */
+
+ int journal_create (journal_t *journal)
+ {
+       int blocknr;
+       struct buffer_head *bh;
+       journal_superblock_t *sb;
+       int i;
+
+       if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
+               printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+                       journal->j_maxlen);
+               return -EINVAL;
+       }
+
+       if (journal->j_inode == NULL) {
+               /*
+                * We don't know what block to start at!
+                */
+               printk(KERN_EMERG __FUNCTION__
+                       ": creation of journal on external device!\n");
+               BUG();
+       }
+
+       /* Zero out the entire journal on disk.  We cannot afford to
+          have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
+       jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+       for (i = 0; i < journal->j_maxlen; i++) {
+               blocknr = journal_bmap(journal, i);
+               bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+               wait_on_buffer(bh);
+               memset (bh->b_data, 0, journal->j_blocksize);
+               BUFFER_TRACE(bh, "marking dirty");
+               mark_buffer_dirty(bh);
+               BUFFER_TRACE(bh, "marking uptodate");
+               mark_buffer_uptodate(bh, 1);
+               __brelse(bh);
+       }
+       sync_dev(journal->j_dev);
+       jbd_debug(1, "JBD: journal cleared.\n");
+
+       /* OK, fill in the initial static fields in the new superblock */
+       sb = journal->j_superblock;
+
+       sb->s_header.h_magic     = htonl(JFS_MAGIC_NUMBER);
+       sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2);
+
+       sb->s_blocksize = htonl(journal->j_blocksize);
+       sb->s_maxlen    = htonl(journal->j_maxlen);
+       sb->s_first     = htonl(1);
+
+       journal->j_transaction_sequence = 1;
+
+       journal->j_flags &= ~JFS_ABORT;
+       journal->j_format_version = 2;
+
+       return journal_reset(journal);
+ }
+
+ /*
+  * Update a journal's dynamic superblock fields and write it to disk,
+  * optionally waiting for the IO to complete.
+ */
+
+ void journal_update_superblock(journal_t *journal, int wait)
+ {
+       journal_superblock_t *sb = journal->j_superblock;
+       struct buffer_head *bh = journal->j_sb_buffer;
+
+       jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+                 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+
+       sb->s_sequence = htonl(journal->j_tail_sequence);
+       sb->s_start    = htonl(journal->j_tail);
+       sb->s_errno    = htonl(journal->j_errno);
+
+       BUFFER_TRACE(bh, "marking dirty");
+       mark_buffer_dirty(bh);
+       ll_rw_block(WRITE, 1, &bh);
+       if (wait)
+               wait_on_buffer(bh);
+
+       /* If we have just flushed the log (by marking s_start==0), then
+        * any future commit will have to be careful to update the
+        * superblock again to re-record the true start of the log. */
+
+       if (sb->s_start)
+               journal->j_flags &= ~JFS_FLUSHED;
+       else
+               journal->j_flags |= JFS_FLUSHED;
+ }
+
+
+ /*
+  * Read the superblock for a given journal, performing initial
+  * validation of the format.
+  */
+
+ static int journal_get_superblock(journal_t *journal)
+ {
+       struct buffer_head *bh;
+       journal_superblock_t *sb;
+
+       bh = journal->j_sb_buffer;
+
+       J_ASSERT(bh != NULL);
+       if (!buffer_uptodate(bh)) {
+               ll_rw_block(READ, 1, &bh);
+               wait_on_buffer(bh);
+               if (!buffer_uptodate(bh)) {
+                       printk (KERN_ERR
+                               "JBD: IO error reading journal superblock\n");
+                       return -EIO;
+               }
+       }
+
+       sb = journal->j_superblock;
+
+       if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) ||
+           sb->s_blocksize != htonl(journal->j_blocksize)) {
+               printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+               return -EINVAL;
+       }
+
+       switch(ntohl(sb->s_header.h_blocktype)) {
+       case JFS_SUPERBLOCK_V1:
+               journal->j_format_version = 1;
+               break;
+       case JFS_SUPERBLOCK_V2:
+               journal->j_format_version = 2;
+               break;
+       default:
+               printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+               return -EINVAL;
+       }
+
+       if (ntohl(sb->s_maxlen) < journal->j_maxlen)
+               journal->j_maxlen = ntohl(sb->s_maxlen);
+       else if (ntohl(sb->s_maxlen) > journal->j_maxlen) {
+               printk (KERN_WARNING "JBD: journal file too short\n");
+               return -EINVAL;
+       }
+
+       return 0;
+ }
+
+ /*
+  * Load the on-disk journal superblock and read the key fields into the
+  * journal_t.
+  */
+
+ static int load_superblock(journal_t *journal)
+ {
+       int err;
+       journal_superblock_t *sb;
+
+       err = journal_get_superblock(journal);
+       if (err)
+               return err;
+
+       sb = journal->j_superblock;
+
+       journal->j_tail_sequence = ntohl(sb->s_sequence);
+       journal->j_tail = ntohl(sb->s_start);
+       journal->j_first = ntohl(sb->s_first);
+       journal->j_last = ntohl(sb->s_maxlen);
+       journal->j_errno = ntohl(sb->s_errno);
+
+       return 0;
+ }
+
+
+ /*
+  * Given a journal_t structure which tells us which disk blocks contain
+  * a journal, read the journal from disk to initialise the in-memory
+  * structures.
+  */
+
+ int journal_load(journal_t *journal)
+ {
+       int err;
+
+       err = load_superblock(journal);
+       if (err)
+               return err;
+
+       /* If this is a V2 superblock, then we have to check the
+        * features flags on it. */
+
+       if (journal->j_format_version >= 2) {
+               journal_superblock_t *sb = journal->j_superblock;
+
+               if ((sb->s_feature_ro_compat &
+                    ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
+                   (sb->s_feature_incompat &
+                    ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
+                       printk (KERN_WARNING
+                               "JBD: Unrecognised features on journal\n");
+                       return -EINVAL;
+               }
+       }
+
+       /* Let the recovery code check whether it needs to recover any
+        * data from the journal. */
+       if (journal_recover(journal))
+               goto recovery_error;
+
+       /* OK, we've finished with the dynamic journal bits:
+        * reinitialise the dynamic contents of the superblock in memory
+        * and reset them on disk. */
+       if (journal_reset(journal))
+               goto recovery_error;
+
+       journal->j_flags &= ~JFS_ABORT;
+       journal->j_flags |= JFS_LOADED;
+       return 0;
+
+ recovery_error:
+       printk (KERN_WARNING "JBD: recovery failed\n");
+       return -EIO;
+ }
+
+ /*
+  * Release a journal_t structure once it is no longer in use by the
+  * journaled object.
+  */
+
+ void journal_destroy (journal_t *journal)
+ {
+       /* Wait for the commit thread to wake up and die. */
+       journal_kill_thread(journal);
+
+       /* Force a final log commit */
+       if (journal->j_running_transaction)
+               journal_commit_transaction(journal);
+
+       /* Force any old transactions to disk */
+       lock_journal(journal);
+       while (journal->j_checkpoint_transactions != NULL)
+               log_do_checkpoint(journal, 1);
+
+       J_ASSERT(journal->j_running_transaction == NULL);
+       J_ASSERT(journal->j_committing_transaction == NULL);
+       J_ASSERT(journal->j_checkpoint_transactions == NULL);
+
+       /* We can now mark the journal as empty. */
+       journal->j_tail = 0;
+       journal->j_tail_sequence = ++journal->j_transaction_sequence;
+       journal_update_superblock(journal, 1);
+
+       if (journal->j_inode)
+               iput(journal->j_inode);
+       if (journal->j_revoke)
+               journal_destroy_revoke(journal);
+
+       unlock_journal(journal);
+       brelse(journal->j_sb_buffer);
+       kfree(journal);
+       MOD_DEC_USE_COUNT;
+ }
+
+
+ /* Published API: Check whether the journal uses all of a given set of
+  * features.  Return true (non-zero) if it does. */
+
+ int journal_check_used_features (journal_t *journal, unsigned long compat,
+                                unsigned long ro, unsigned long incompat)
+ {
+       journal_superblock_t *sb;
+
+       if (!compat && !ro && !incompat)
+               return 1;
+       if (journal->j_format_version == 1)
+               return 0;
+
+       sb = journal->j_superblock;
+
+       if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
+           ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
+           ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
+               return 1;
+
+       return 0;
+ }
+
+ /* Published API: Check whether the journaling code supports the use of
+  * all of a given set of features on this journal.  Return true
+  * (non-zero) if it can. */
+
+ int journal_check_available_features (journal_t *journal, unsigned long compat,
+                                     unsigned long ro, unsigned long incompat)
+ {
+       journal_superblock_t *sb;
+
+       if (!compat && !ro && !incompat)
+               return 1;
+
+       sb = journal->j_superblock;
+
+       /* We can support any known requested features iff the
+        * superblock is in version 2.  Otherwise we fail to support any
+        * extended sb features. */
+
+       if (journal->j_format_version != 2)
+               return 0;
+
+       if ((compat   & JFS_KNOWN_COMPAT_FEATURES) == compat &&
+           (ro       & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
+           (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
+               return 1;
+
+       return 0;
+ }
+
+ /* Published API: Mark a given journal feature as present on the
+  * superblock.  Returns true if the requested features could be set. */
+
+ int journal_set_features (journal_t *journal, unsigned long compat,
+                         unsigned long ro, unsigned long incompat)
+ {
+       journal_superblock_t *sb;
+
+       if (journal_check_used_features(journal, compat, ro, incompat))
+               return 1;
+
+       if (!journal_check_available_features(journal, compat, ro, incompat))
+               return 0;
+
+       jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+                 compat, ro, incompat);
+
+       sb = journal->j_superblock;
+
+       sb->s_feature_compat    |= cpu_to_be32(compat);
+       sb->s_feature_ro_compat |= cpu_to_be32(ro);
+       sb->s_feature_incompat  |= cpu_to_be32(incompat);
+
+       return 1;
+ }
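+
+ /*
+  * Usage sketch (hedged): a client that starts writing revoke records
+  * would first flag the feature, e.g.
+  *
+  *    journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE);
+  *
+  * so that older code which does not understand revoke blocks refuses
+  * to recover the journal.
+  */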
+
+
+ /*
+  * Published API:
+  * Given an initialised but unloaded journal struct, poke about in the
+  * on-disk structure to update it to the most recent supported version.
+  */
+
+ int journal_update_format (journal_t *journal)
+ {
+       journal_superblock_t *sb;
+       int err;
+
+       err = journal_get_superblock(journal);
+       if (err)
+               return err;
+
+       sb = journal->j_superblock;
+
+       switch (ntohl(sb->s_header.h_blocktype)) {
+       case JFS_SUPERBLOCK_V2:
+               return 0;
+       case JFS_SUPERBLOCK_V1:
+               return journal_convert_superblock_v1(journal, sb);
+       default:
+               break;
+       }
+       return -EINVAL;
+ }
+
+ static int journal_convert_superblock_v1(journal_t *journal,
+                                        journal_superblock_t *sb)
+ {
+       int offset, blocksize;
+       struct buffer_head *bh;
+
+       printk(KERN_WARNING
+               "JBD: Converting superblock from version 1 to 2.\n");
+
+       /* Pre-initialise new fields to zero */
+       offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
+       blocksize = ntohl(sb->s_blocksize);
+       memset(&sb->s_feature_compat, 0, blocksize-offset);
+
+       sb->s_nr_users = cpu_to_be32(1);
+       sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
+       journal->j_format_version = 2;
+
+       bh = journal->j_sb_buffer;
+       BUFFER_TRACE(bh, "marking dirty");
+       mark_buffer_dirty(bh);
+       ll_rw_block(WRITE, 1, &bh);
+       wait_on_buffer(bh);
+       return 0;
+ }
+
+
+ /*
+  * Flush all data for a given journal to disk and empty the journal.
+  * Filesystems can use this when remounting readonly to ensure that
+  * recovery does not need to happen on remount.
+  */
+
+ int journal_flush (journal_t *journal)
+ {
+       int err = 0;
+       transaction_t *transaction = NULL;
+       unsigned long old_tail;
+
+       lock_kernel();
+
+       /* Force everything buffered to the log... */
+       if (journal->j_running_transaction) {
+               transaction = journal->j_running_transaction;
+               log_start_commit(journal, transaction);
+       } else if (journal->j_committing_transaction)
+               transaction = journal->j_committing_transaction;
+
+       /* Wait for the log commit to complete... */
+       if (transaction)
+               log_wait_commit(journal, transaction->t_tid);
+
+       /* ...and flush everything in the log out to disk. */
+       lock_journal(journal);
+       while (!err && journal->j_checkpoint_transactions != NULL)
+               err = log_do_checkpoint(journal, journal->j_maxlen);
+       cleanup_journal_tail(journal);
+
+       /* Finally, mark the journal as really needing no recovery.
+        * This sets s_start==0 in the underlying superblock, which is
+        * the magic code for a fully-recovered superblock.  Any future
+        * commits of data to the journal will restore the current
+        * s_start value. */
+       old_tail = journal->j_tail;
+       journal->j_tail = 0;
+       journal_update_superblock(journal, 1);
+       journal->j_tail = old_tail;
+
+       unlock_journal(journal);
+
+       J_ASSERT(!journal->j_running_transaction);
+       J_ASSERT(!journal->j_committing_transaction);
+       J_ASSERT(!journal->j_checkpoint_transactions);
+       J_ASSERT(journal->j_head == journal->j_tail);
+       J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
+
+       unlock_kernel();
+
+       return err;
+ }
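+
+ /*
+  * Usage note: ext3 calls journal_flush() when marking recovery
+  * complete (for example on remounting read-only), so that the
+  * on-disk superblock records an empty log and a subsequent mount
+  * needs no recovery.
+  */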
+
+ /*
+  * Wipe out all of the contents of a journal, safely.  This will produce
+  * a warning if the journal contains any valid recovery information.
+  * Must be called between journal_init_*() and journal_load().
+  *
+  * If (write) is non-zero, then we wipe out the journal on disk; otherwise
+  * we merely suppress recovery.
+  */
+
+ int journal_wipe (journal_t *journal, int write)
+ {
+       journal_superblock_t *sb;
+       int err = 0;
+
+       J_ASSERT (!(journal->j_flags & JFS_LOADED));
+
+       err = load_superblock(journal);
+       if (err)
+               return err;
+
+       sb = journal->j_superblock;
+
+       if (!journal->j_tail)
+               goto no_recovery;
+
+       printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+               write ? "Clearing" : "Ignoring");
+
+       err = journal_skip_recovery(journal);
+       if (write)
+               journal_update_superblock(journal, 1);
+
+  no_recovery:
+       return err;
+ }
+
+ /*
+  * journal_dev_name: format a character string to describe on what
+  * device this journal is present.
+  */
+
+ const char * journal_dev_name(journal_t *journal)
+ {
+       kdev_t dev;
+
+       if (journal->j_inode)
+               dev = journal->j_inode->i_dev;
+       else
+               dev = journal->j_dev;
+
+       return bdevname(dev);
+ }
+
+ /*
+  * journal_abort: perform a complete, immediate shutdown of the ENTIRE
+  * journal (not of a single transaction).  This operation cannot be
+  * undone without closing and reopening the journal.
+  *
+  * The journal_abort function is intended to support higher level error
+  * recovery mechanisms such as the ext2/ext3 remount-readonly error
+  * mode.
+  *
+  * Journal abort has very specific semantics.  Any existing dirty,
+  * unjournaled buffers in the main filesystem will still be written to
+  * disk by bdflush, but the journaling mechanism will be suspended
+  * immediately and no further transaction commits will be honoured.
+  *
+  * Any dirty, journaled buffers will be written back to disk without
+  * hitting the journal.  Atomicity cannot be guaranteed on an aborted
+  * filesystem, but we _do_ attempt to leave as much data as possible
+  * behind for fsck to use for cleanup.
+  *
+  * Any attempt to get a new transaction handle on a journal which is in
+  * ABORT state will just result in an -EROFS error return.  A
+  * journal_stop on an existing handle will return -EIO if we have
+  * entered abort state during the update.
+  *
+  * Recursive transactions are not disturbed by journal abort until the
+  * final journal_stop, which will receive the -EIO error.
+  *
+  * Finally, the journal_abort call allows the caller to supply an errno
+  * which will be recorded (if possible) in the journal superblock.  This
+  * allows a client to record failure conditions in the middle of a
+  * transaction without having to complete the transaction to record the
+  * failure to disk.  ext3_error, for example, now uses this
+  * functionality.
+  *
+  * Errors which originate from within the journaling layer will NOT
+  * supply an errno; a null errno implies that absolutely no further
+  * writes are done to the journal (unless there are any already in
+  * progress).
+  */
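+
+ /*
+  * Sketch of the intended caller (ext3_error, per the note above): on
+  * detecting corruption the filesystem can do
+  *
+  *    journal_abort(journal, -EIO);
+  *
+  * and the errno is kept in the superblock for a later mount to read
+  * back via journal_errno().
+  */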
+
+ /* Quick version for internal journal use (doesn't lock the journal) */
+ void __journal_abort (journal_t *journal)
+ {
+       transaction_t *transaction;
+
+       printk (KERN_ERR "Aborting journal on device %s.\n",
+               journal_dev_name(journal));
+
+       journal->j_flags |= JFS_ABORT;
+       transaction = journal->j_running_transaction;
+       if (transaction)
+               log_start_commit(journal, transaction);
+ }
+
+ /* Full version for external use */
+ void journal_abort (journal_t *journal, int errno)
+ {
+       lock_journal(journal);
+
+       if (journal->j_flags & JFS_ABORT)
+               goto out;
+
+       if (!journal->j_errno)
+               journal->j_errno = errno;
+
+       __journal_abort(journal);
+
+       if (errno)
+               journal_update_superblock(journal, 1);
+
+  out:
+       unlock_journal(journal);
+ }
+
+ int journal_errno (journal_t *journal)
+ {
+       int err;
+
+       lock_journal(journal);
+       if (journal->j_flags & JFS_ABORT)
+               err = -EROFS;
+       else
+               err = journal->j_errno;
+       unlock_journal(journal);
+       return err;
+ }
+
+ int journal_clear_err (journal_t *journal)
+ {
+       int err = 0;
+
+       lock_journal(journal);
+       if (journal->j_flags & JFS_ABORT)
+               err = -EROFS;
+       else
+               journal->j_errno = 0;
+       unlock_journal(journal);
+       return err;
+ }
+
+ void journal_ack_err (journal_t *journal)
+ {
+       lock_journal(journal);
+       if (journal->j_errno)
+               journal->j_flags |= JFS_ACK_ERR;
+       unlock_journal(journal);
+ }
+
+ int journal_blocks_per_page(struct inode *inode)
+ {
+       return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ }
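+
+ /*
+  * Worked example: with 4K pages (PAGE_CACHE_SHIFT == 12) and a 1K
+  * blocksize filesystem (s_blocksize_bits == 10), this returns
+  * 1 << (12 - 10) == 4 buffers per page.
+  */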
+
+ /*
+  * shrink_journal_memory().
+  * Called when we're under memory pressure.  Free up all the written-back
+  * checkpointed metadata buffers.
+  */
+ void shrink_journal_memory(void)
+ {
+       struct list_head *list;
+
+       lock_kernel();
+       list_for_each(list, &all_journals) {
+               journal_t *journal =
+                       list_entry(list, journal_t, j_all_journals);
+               spin_lock(&journal_datalist_lock);
+               __journal_clean_checkpoint_list(journal);
+               spin_unlock(&journal_datalist_lock);
+       }
+       unlock_kernel();
+ }
+
+ /*
+  * Simple support for retrying memory allocations.  Introduced to help
+  * debug different VM deadlock avoidance strategies.
+  */
+ void * __jbd_kmalloc (char *where, size_t size, int flags, int retry)
+ {
+       void *p;
+       static unsigned long last_warning;
+
+       while (1) {
+               p = kmalloc(size, flags);
+               if (p)
+                       return p;
+               if (!retry)
+                       return NULL;
+               /* Log every retry for debugging.  Also log them to the
+                * syslog, but do rate-limiting on the non-debugging
+                * messages. */
+               jbd_debug(1, "ENOMEM in %s, retrying.\n", where);
+
+               if (time_after(jiffies, last_warning + 5*HZ)) {
+                       printk(KERN_NOTICE
+                              "ENOMEM in %s, retrying.\n", where);
+                       last_warning = jiffies;
+               }
+
+               current->policy |= SCHED_YIELD;
+               schedule();
+       }
+ }
+
+ /*
+  * Journal_head storage management
+  */
+ static kmem_cache_t *journal_head_cache;
+ #ifdef CONFIG_JBD_DEBUG
+ static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+ #endif
+
+ static int journal_init_journal_head_cache(void)
+ {
+       int retval;
+
+       J_ASSERT(journal_head_cache == 0);
+       journal_head_cache = kmem_cache_create("journal_head",
+                               sizeof(struct journal_head),
+                               0,              /* offset */
+                               0,              /* flags */
+                               NULL,           /* ctor */
+                               NULL);          /* dtor */
+       retval = 0;
+       if (journal_head_cache == 0) {
+               retval = -ENOMEM;
+               printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+       }
+       return retval;
+ }
+
+ static void journal_destroy_journal_head_cache(void)
+ {
+       J_ASSERT(journal_head_cache != NULL);
+       kmem_cache_destroy(journal_head_cache);
+       journal_head_cache = 0;
+ }
+
+ /*
+  * journal_head splicing and dicing
+  */
+ static struct journal_head *journal_alloc_journal_head(void)
+ {
+       struct journal_head *ret;
+       static unsigned long last_warning;
+
+ #ifdef CONFIG_JBD_DEBUG
+       atomic_inc(&nr_journal_heads);
+ #endif
+       ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+       if (ret == 0) {
+               jbd_debug(1, "out of memory for journal_head\n");
+               if (time_after(jiffies, last_warning + 5*HZ)) {
+                       printk(KERN_NOTICE "ENOMEM in " __FUNCTION__
+                              ", retrying.\n");
+                       last_warning = jiffies;
+               }
+               while (ret == 0) {
+                       current->policy |= SCHED_YIELD;
+                       schedule();
+                       ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+               }
+       }
+       return ret;
+ }
+
+ static void journal_free_journal_head(struct journal_head *jh)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+       atomic_dec(&nr_journal_heads);
+       memset(jh, 0x5b, sizeof(*jh));
+ #endif
+       kmem_cache_free(journal_head_cache, jh);
+ }
+
+ /*
+  * A journal_head is attached to a buffer_head whenever JBD has an
+  * interest in the buffer.
+  *
+  * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+  * is set.  This bit is tested in core kernel code where we need to take
+  * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
+  * there.
+  *
+  * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+  *
+  * When a buffer has its BH_JBD bit set it is immune from being released by
+  * core kernel code, mainly via ->b_count.
+  *
+  * A journal_head may be detached from its buffer_head when the journal_head's
+  * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
+  * Various places in JBD call journal_remove_journal_head() to indicate that the
+  * journal_head can be dropped if needed.
+  *
+  * Various places in the kernel want to attach a journal_head to a buffer_head
+  * _before_ attaching the journal_head to a transaction.  To protect the
+  * journal_head in this situation, journal_add_journal_head elevates the
+  * journal_head's b_jcount refcount by one.  The caller must call
+  * journal_unlock_journal_head() to undo this.
+  *
+  * So the typical usage would be:
+  *
+  *    (Attach a journal_head if needed.  Increments b_jcount)
+  *    struct journal_head *jh = journal_add_journal_head(bh);
+  *    ...
+  *    jh->b_transaction = xxx;
+  *    journal_unlock_journal_head(jh);
+  *
+  * Now, the journal_head's b_jcount is zero, but it is safe from being released
+  * because it has a non-zero b_transaction.
+  */
+
+ /*
+  * Give a buffer_head a journal_head.
+  *
+  * Doesn't need the journal lock.
+  * May sleep.
+  * Cannot be called with journal_datalist_lock held.
+  */
+ struct journal_head *journal_add_journal_head(struct buffer_head *bh)
+ {
+       struct journal_head *jh;
+
+       spin_lock(&journal_datalist_lock);
+       if (buffer_jbd(bh)) {
+               jh = bh2jh(bh);
+       } else {
+               J_ASSERT_BH(bh,
+                       (atomic_read(&bh->b_count) > 0) ||
+                       (bh->b_page && bh->b_page->mapping));
+               spin_unlock(&journal_datalist_lock);
+               jh = journal_alloc_journal_head();
+               memset(jh, 0, sizeof(*jh));
+               spin_lock(&journal_datalist_lock);
+
+               if (buffer_jbd(bh)) {
+                       /* Someone did it for us! */
+                       J_ASSERT_BH(bh, bh->b_private != NULL);
+                       journal_free_journal_head(jh);
+                       jh = bh->b_private;
+               } else {
+                       /*
+                        * We actually don't need jh_splice_lock when
+                        * adding a journal_head - only on removal.
+                        */
+                       spin_lock(&jh_splice_lock);
+                       set_bit(BH_JBD, &bh->b_state);
+                       bh->b_private = jh;
+                       jh->b_bh = bh;
+                       atomic_inc(&bh->b_count);
+                       spin_unlock(&jh_splice_lock);
+                       BUFFER_TRACE(bh, "added journal_head");
+               }
+       }
+       jh->b_jcount++;
+       spin_unlock(&journal_datalist_lock);
+       return bh->b_private;
+ }
+
+ /*
+  * journal_remove_journal_head(): if the buffer isn't attached to a transaction
+  * and has a zero b_jcount then remove and release its journal_head.  If we
+  * see that the buffer is not used by any transaction we also "logically"
+  * decrement ->b_count.
+  *
+  * We in fact take an additional increment on ->b_count as a convenience,
+  * because the caller usually wants to do additional things with the bh
+  * after calling here.
+  * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
+  * time.  Once the caller has run __brelse(), the buffer is eligible for
+  * reaping by try_to_free_buffers().
+  *
+  * Requires journal_datalist_lock.
+  */
+ void __journal_remove_journal_head(struct buffer_head *bh)
+ {
+       struct journal_head *jh = bh2jh(bh);
+
+       assert_spin_locked(&journal_datalist_lock);
+       J_ASSERT_JH(jh, jh->b_jcount >= 0);
+       atomic_inc(&bh->b_count);
+       if (jh->b_jcount == 0) {
+               if (jh->b_transaction == NULL &&
+                               jh->b_next_transaction == NULL &&
+                               jh->b_cp_transaction == NULL) {
+                       J_ASSERT_BH(bh, buffer_jbd(bh));
+                       J_ASSERT_BH(bh, jh2bh(jh) == bh);
+                       BUFFER_TRACE(bh, "remove journal_head");
+                       spin_lock(&jh_splice_lock);
+                       bh->b_private = NULL;
+                       jh->b_bh = NULL;        /* debug, really */
+                       clear_bit(BH_JBD, &bh->b_state);
+                       __brelse(bh);
+                       spin_unlock(&jh_splice_lock);
+                       journal_free_journal_head(jh);
+               } else {
+                       BUFFER_TRACE(bh, "journal_head was locked");
+               }
+       }
+ }
+
+ void journal_unlock_journal_head(struct journal_head *jh)
+ {
+       spin_lock(&journal_datalist_lock);
+       J_ASSERT_JH(jh, jh->b_jcount > 0);
+       --jh->b_jcount;
+       if (!jh->b_jcount && !jh->b_transaction) {
+               struct buffer_head *bh;
+               bh = jh2bh(jh);
+               __journal_remove_journal_head(bh);
+               __brelse(bh);
+       }
+
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ void journal_remove_journal_head(struct buffer_head *bh)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_remove_journal_head(bh);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * Module startup and shutdown
+  */
+
+ static int __init journal_init_caches(void)
+ {
+       int ret;
+
+       ret = journal_init_revoke_caches();
+       if (ret == 0)
+               ret = journal_init_journal_head_cache();
+       return ret;
+ }
+
+ static void journal_destroy_caches(void)
+ {
+       journal_destroy_revoke_caches();
+       journal_destroy_journal_head_cache();
+ }
+
+ static int __init journal_init(void)
+ {
+       int ret;
+
+       printk(KERN_INFO "Journalled Block Device driver loaded\n");
+       ret = journal_init_caches();
+       if (ret != 0)
+               journal_destroy_caches();
+       return ret;
+ }
+
+ static void __exit journal_exit(void)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+       int n = atomic_read(&nr_journal_heads);
+       if (n)
+               printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+ #endif
+       journal_destroy_caches();
+ }
+
+ MODULE_LICENSE("GPL");
+ module_init(journal_init);
+ module_exit(journal_exit);
+
diff -rc2P linux/fs/jbd/recovery.c linux-2.4.13/fs/jbd/recovery.c
*** linux/fs/jbd/recovery.c     Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/recovery.c      Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,586 ----
+ /*
+  * linux/fs/jbd/recovery.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1999
+  *
+  * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal recovery routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  */
+
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #endif
+
+ /*
+  * Maintain information about the progress of the recovery job, so that
+  * the different passes can carry information between them.
+  */
+ struct recovery_info
+ {
+       tid_t           start_transaction;
+       tid_t           end_transaction;
+
+       int             nr_replays;
+       int             nr_revokes;
+       int             nr_revoke_hits;
+ };
+
+ enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+ static int do_one_pass(journal_t *journal,
+                               struct recovery_info *info, enum passtype pass);
+ static int scan_revoke_records(journal_t *, struct buffer_head *,
+                               tid_t, struct recovery_info *);
+
+ #ifdef __KERNEL__
+
+ /* Release readahead buffers after use */
+ void journal_brelse_array(struct buffer_head *b[], int n)
+ {
+       while (--n >= 0)
+               brelse (b[n]);
+ }
+
+
+ /*
+  * When reading from the journal, we are going through the block device
+  * layer directly and so there is no readahead being done for us.  We
+  * need to implement any readahead ourselves if we want it to happen at
+  * all.  Recovery is basically one long sequential read, so make sure we
+  * do the IO in reasonably large chunks.
+  *
+  * This is not so critical that we need to be enormously clever about
+  * the readahead size, though.  128K is a purely arbitrary, good-enough
+  * fixed value.
+  */
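+
+ /*
+  * Worked example: with a 1K journal blocksize the window is
+  * 128 * 1024 / 1024 == 128 blocks, submitted to ll_rw_block() in
+  * batches of MAXBUF buffer_heads.
+  */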
+
+ #define MAXBUF 8
+ static int do_readahead(journal_t *journal, unsigned int start)
+ {
+       int err;
+       unsigned int max, nbufs, next, blocknr;
+       struct buffer_head *bh;
+
+       struct buffer_head * bufs[MAXBUF];
+
+       /* Do up to 128K of readahead */
+       max = start + (128 * 1024 / journal->j_blocksize);
+       if (max > journal->j_maxlen)
+               max = journal->j_maxlen;
+
+       /* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
+        * a time to the block device IO layer. */
+
+       nbufs = 0;
+
+       for (next = start; next < max; next++) {
+               blocknr = journal_bmap(journal, next);
+
+               if (!blocknr) {
+                       printk (KERN_ERR "JBD: bad block at offset %u\n",
+                               next);
+                       err = -EIO;
+                       goto failed;
+               }
+
+               bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+               if (!bh) {
+                       err = -ENOMEM;
+                       goto failed;
+               }
+
+               if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+                       bufs[nbufs++] = bh;
+                       if (nbufs == MAXBUF) {
+                               ll_rw_block(READ, nbufs, bufs);
+                               journal_brelse_array(bufs, nbufs);
+                               nbufs = 0;
+                       }
+               } else
+                       brelse(bh);
+       }
+
+       if (nbufs)
+               ll_rw_block(READ, nbufs, bufs);
+       err = 0;
+
+ failed:
+       if (nbufs)
+               journal_brelse_array(bufs, nbufs);
+       return err;
+ }
+
+ #endif /* __KERNEL__ */
+
+
+ /*
+  * Read a block from the journal
+  */
+
+ static int jread(struct buffer_head **bhp, journal_t *journal,
+                unsigned int offset)
+ {
+       unsigned int blocknr;
+       struct buffer_head *bh;
+
+       *bhp = NULL;
+
+       J_ASSERT (offset < journal->j_maxlen);
+
+       blocknr = journal_bmap(journal, offset);
+
+       if (!blocknr) {
+               printk (KERN_ERR "JBD: bad block at offset %u\n",
+                       offset);
+               return -EIO;
+       }
+
+       bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+       if (!bh)
+               return -ENOMEM;
+
+       if (!buffer_uptodate(bh)) {
+               /* If this is a brand new buffer, start readahead.
+                * Otherwise, we assume we are already reading it. */
+               if (!buffer_req(bh))
+                       do_readahead(journal, offset);
+               wait_on_buffer(bh);
+       }
+
+       if (!buffer_uptodate(bh)) {
+               printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+                       offset);
+               brelse(bh);
+               return -EIO;
+       }
+
+       *bhp = bh;
+       return 0;
+ }
+
+
+ /*
+  * Count the number of in-use tags in a journal descriptor block.
+  */
+
+ static int count_tags(struct buffer_head *bh, int size)
+ {
+       char *                  tagp;
+       journal_block_tag_t *   tag;
+       int                     nr = 0;
+
+       tagp = &bh->b_data[sizeof(journal_header_t)];
+
+       while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
+               tag = (journal_block_tag_t *) tagp;
+
+               nr++;
+               tagp += sizeof(journal_block_tag_t);
+               if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID)))
+                       tagp += 16;
+
+               if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG))
+                       break;
+       }
+
+       return nr;
+ }
+
+
+ /* Make sure we wrap around the log correctly! */
+ #define wrap(journal, var)                                            \
+ do {                                                                  \
+       if (var >= (journal)->j_last)                                   \
+               var -= ((journal)->j_last - (journal)->j_first);        \
+ } while (0)
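+
+ /*
+  * Example: with j_first == 1 and j_last == 1024, a block pointer that
+  * reaches 1024 wraps back to 1 (1024 - (1024 - 1)), i.e. the log is a
+  * circular buffer over blocks [j_first, j_last).
+  */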
+
+ /*
+  * journal_recover
+  *
+  * The primary function for recovering the log contents when mounting a
+  * journaled device.
+  *
+  * Recovery is done in three passes.  In the first pass, we look for the
+  * end of the log.  In the second, we assemble the list of revoke
+  * blocks.  In the third and final pass, we replay any un-revoked blocks
+  * in the log.
+  */
+
+ int journal_recover(journal_t *journal)
+ {
+       int                     err;
+       journal_superblock_t *  sb;
+
+       struct recovery_info    info;
+
+       memset(&info, 0, sizeof(info));
+       sb = journal->j_superblock;
+
+       /*
+        * The journal superblock's s_start field (the current log head)
+        * is always zero if, and only if, the journal was cleanly
+        * unmounted.
+        */
+
+       if (!sb->s_start) {
+               jbd_debug(1, "No recovery required, last transaction %d\n",
+                         ntohl(sb->s_sequence));
+               journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1;
+               return 0;
+       }
+
+
+       err = do_one_pass(journal, &info, PASS_SCAN);
+       if (!err)
+               err = do_one_pass(journal, &info, PASS_REVOKE);
+       if (!err)
+               err = do_one_pass(journal, &info, PASS_REPLAY);
+
+       jbd_debug(0, "JBD: recovery, exit status %d, "
+                 "recovered transactions %u to %u\n",
+                 err, info.start_transaction, info.end_transaction);
+       jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
+                 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+
+       /* Restart the log at the next transaction ID, thus invalidating
+        * any existing commit records in the log. */
+       journal->j_transaction_sequence = ++info.end_transaction;
+
+       journal_clear_revoke(journal);
+       fsync_no_super(journal->j_fs_dev);
+       return err;
+ }
+
+ /*
+  * journal_skip_recovery
+  *
+  * Locate any valid recovery information from the journal and set up the
+  * journal structures in memory to ignore it (presumably because the
+  * caller has evidence that it is out of date).
+  *
+  * We perform one pass over the journal to allow us to tell the user how
+  * much recovery information is being erased, and to let us initialise
+  * the journal transaction sequence numbers to the next unused ID.
+  */
+
+ int journal_skip_recovery(journal_t *journal)
+ {
+       int                     err;
+       journal_superblock_t *  sb;
+
+       struct recovery_info    info;
+
+       memset (&info, 0, sizeof(info));
+       sb = journal->j_superblock;
+
+       err = do_one_pass(journal, &info, PASS_SCAN);
+
+       if (err) {
+               printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+               ++journal->j_transaction_sequence;
+       } else {
+ #ifdef CONFIG_JBD_DEBUG
+               int dropped = info.end_transaction - ntohl(sb->s_sequence);
+ #endif
+
+               jbd_debug(0,
+                         "JBD: ignoring %d transaction%s from the journal.\n",
+                         dropped, (dropped == 1) ? "" : "s");
+               journal->j_transaction_sequence = ++info.end_transaction;
+       }
+
+       journal->j_tail = 0;
+
+       return err;
+ }
+
+ static int do_one_pass(journal_t *journal,
+                       struct recovery_info *info, enum passtype pass)
+ {
+
+       unsigned int            first_commit_ID, next_commit_ID;
+       unsigned long           next_log_block;
+       int                     err, success = 0;
+       journal_superblock_t *  sb;
+       journal_header_t *      tmp;
+       struct buffer_head *    bh;
+       unsigned int            sequence;
+       int                     blocktype;
+
+       /* Precompute the maximum number of metadata descriptors per descriptor block */
+       int                     MAX_BLOCKS_PER_DESC;
+       MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
+                              / sizeof(journal_block_tag_t));
+
+       /*
+        * First thing is to establish what we expect to find in the log
+        * (in terms of transaction IDs), and where (in terms of log
+        * block offsets): query the superblock.
+        */
+
+       sb = journal->j_superblock;
+       next_commit_ID = ntohl(sb->s_sequence);
+       next_log_block = ntohl(sb->s_start);
+
+       first_commit_ID = next_commit_ID;
+       if (pass == PASS_SCAN)
+               info->start_transaction = first_commit_ID;
+
+       jbd_debug(1, "Starting recovery pass %d\n", pass);
+
+       /*
+        * Now we walk through the log, transaction by transaction,
+        * making sure that each transaction has a commit block in the
+        * expected place.  Each complete transaction gets replayed back
+        * into the main filesystem.
+        */
+
+       while (1) {
+               int                     flags;
+               char *                  tagp;
+               journal_block_tag_t *   tag;
+               struct buffer_head *    obh;
+               struct buffer_head *    nbh;
+
+               /* If we already know where to stop the log traversal,
+                * check right now that we haven't gone past the end of
+                * the log. */
+
+               if (pass != PASS_SCAN)
+                       if (tid_geq(next_commit_ID, info->end_transaction))
+                               break;
+
+               jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+                         next_commit_ID, next_log_block, journal->j_last);
+
+               /* Skip over each chunk of the transaction looking for
+                * either the next descriptor block or the final commit
+                * record. */
+
+               jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+               err = jread(&bh, journal, next_log_block);
+               if (err)
+                       goto failed;
+
+               next_log_block++;
+               wrap(journal, next_log_block);
+
+               /* What kind of buffer is it?
+                *
+                * If it is a descriptor block, check that it has the
+                * expected sequence number.  Otherwise, we're all done
+                * here. */
+
+               tmp = (journal_header_t *)bh->b_data;
+
+               if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) {
+                       brelse(bh);
+                       break;
+               }
+
+               blocktype = ntohl(tmp->h_blocktype);
+               sequence = ntohl(tmp->h_sequence);
+               jbd_debug(3, "Found magic %d, sequence %d\n",
+                         blocktype, sequence);
+
+               if (sequence != next_commit_ID) {
+                       brelse(bh);
+                       break;
+               }
+
+               /* OK, we have a valid descriptor block which matches
+                * all of the sequence number checks.  What are we going
+                * to do with it?  That depends on the pass... */
+
+               switch(blocktype) {
+               case JFS_DESCRIPTOR_BLOCK:
+                       /* If it is a valid descriptor block, replay it
+                        * in pass REPLAY; otherwise, just skip over the
+                        * blocks it describes. */
+                       if (pass != PASS_REPLAY) {
+                               next_log_block +=
+                                       count_tags(bh, journal->j_blocksize);
+                               wrap(journal, next_log_block);
+                               brelse(bh);
+                               continue;
+                       }
+
+                       /* A descriptor block: we can now write all of
+                        * the data blocks.  Yay, useful work is finally
+                        * getting done here! */
+
+                       tagp = &bh->b_data[sizeof(journal_header_t)];
+                       while ((tagp - bh->b_data + sizeof(journal_block_tag_t))
+                              <= journal->j_blocksize) {
+                               unsigned long io_block;
+
+                               tag = (journal_block_tag_t *) tagp;
+                               flags = ntohl(tag->t_flags);
+
+                               io_block = next_log_block++;
+                               wrap(journal, next_log_block);
+                               err = jread(&obh, journal, io_block);
+                               if (err) {
+                                       /* Recover what we can, but
+                                        * report failure at the end. */
+                                       success = err;
+                                       printk (KERN_ERR
+                                               "JBD: IO error %d recovering "
+                                               "block %ld in log\n",
+                                               err, io_block);
+                               } else {
+                                       unsigned long blocknr;
+
+                                       J_ASSERT(obh != NULL);
+                                       blocknr = ntohl(tag->t_blocknr);
+
+                                       /* If the block has been
+                                        * revoked, then we're all done
+                                        * here. */
+                                       if (journal_test_revoke
+                                           (journal, blocknr,
+                                            next_commit_ID)) {
+                                               brelse(obh);
+                                               ++info->nr_revoke_hits;
+                                               goto skip_write;
+                                       }
+
+                                       /* Find a buffer for the new
+                                        * data being restored */
+                                       nbh = getblk(journal->j_fs_dev, blocknr,
+                                                    journal->j_blocksize);
+                                       if (nbh == NULL) {
+                                               printk(KERN_ERR
+                                                      "JBD: Out of memory "
+                                                      "during recovery.\n");
+                                               err = -ENOMEM;
+                                               brelse(bh);
+                                               brelse(obh);
+                                               goto failed;
+                                       }
+
+                                       memcpy(nbh->b_data, obh->b_data,
+                                                       journal->j_blocksize);
+                                       if (flags & JFS_FLAG_ESCAPE) {
+                                               /* Restore the escaped
+                                                * magic in the copy we
+                                                * are writing back, not
+                                                * in the log buffer. */
+                                               *((unsigned int *)nbh->b_data) =
+                                                       htonl(JFS_MAGIC_NUMBER);
+                                       }
+
+                                       BUFFER_TRACE(nbh, "marking dirty");
+                                       mark_buffer_dirty(nbh);
+                                       BUFFER_TRACE(nbh, "marking uptodate");
+                                       mark_buffer_uptodate(nbh, 1);
+                                       ++info->nr_replays;
+                                       /* ll_rw_block(WRITE, 1, &nbh); */
+                                       brelse(obh);
+                                       brelse(nbh);
+                               }
+
+                       skip_write:
+                               tagp += sizeof(journal_block_tag_t);
+                               if (!(flags & JFS_FLAG_SAME_UUID))
+                                       tagp += 16;
+
+                               if (flags & JFS_FLAG_LAST_TAG)
+                                       break;
+                       }
+
+                       brelse(bh);
+                       continue;
+
+               case JFS_COMMIT_BLOCK:
+                       /* Found an expected commit block: not much to
+                        * do other than move on to the next sequence
+                        * number. */
+                       brelse(bh);
+                       next_commit_ID++;
+                       continue;
+
+               case JFS_REVOKE_BLOCK:
+                       /* If we aren't in the REVOKE pass, then we can
+                        * just skip over this block. */
+                       if (pass != PASS_REVOKE) {
+                               brelse(bh);
+                               continue;
+                       }
+
+                       err = scan_revoke_records(journal, bh,
+                                                 next_commit_ID, info);
+                       brelse(bh);
+                       if (err)
+                               goto failed;
+                       continue;
+
+               default:
+                       jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+                                 blocktype);
+                       goto done;
+               }
+       }
+
+  done:
+       /*
+        * We broke out of the log scan loop: either we came to the
+        * known end of the log or we found an unexpected block in the
+        * log.  If the latter happened, then we know that the "current"
+        * transaction marks the end of the valid log.
+        */
+
+       if (pass == PASS_SCAN)
+               info->end_transaction = next_commit_ID;
+       else {
+               /* It's really bad news if different passes end up at
+                * different places (but possible due to IO errors). */
+               if (info->end_transaction != next_commit_ID) {
+                       printk (KERN_ERR "JBD: recovery pass %d ended at "
+                               "transaction %u, expected %u\n",
+                               pass, next_commit_ID, info->end_transaction);
+                       if (!success)
+                               success = -EIO;
+               }
+       }
+
+       return success;
+
+  failed:
+       return err;
+ }
+
+
+ /* Scan a revoke record, marking all blocks mentioned as revoked. */
+
+ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
+                              tid_t sequence, struct recovery_info *info)
+ {
+       journal_revoke_header_t *header;
+       int offset, max;
+
+       header = (journal_revoke_header_t *) bh->b_data;
+       offset = sizeof(journal_revoke_header_t);
+       max = ntohl(header->r_count);
+
+       while (offset < max) {
+               unsigned long blocknr;
+               int err;
+
+               blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset)));
+               offset += 4;
+               err = journal_set_revoke(journal, blocknr, sequence);
+               if (err)
+                       return err;
+               ++info->nr_revokes;
+       }
+       return 0;
+ }
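+
+ /* A quick worked example of the format consumed above (a sketch,
+  * assuming the usual 16-byte journal_revoke_header_t, i.e. a 12-byte
+  * journal_header_t plus r_count): a revoke block is the header
+  * followed by packed 32-bit big-endian block numbers, and r_count is
+  * the total byte count including the header.  So with r_count == 24,
+  * the loop reads records at offsets 16 and 20: exactly two revoked
+  * block numbers. */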
diff -rc2P linux/fs/jbd/revoke.c linux-2.4.13/fs/jbd/revoke.c
*** linux/fs/jbd/revoke.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/revoke.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,631 ----
+ /*
+  * linux/fs/jbd/revoke.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 2000
+  *
+  * Copyright 2000 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal revoke routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  *
+  * Revoke is the mechanism used to prevent old log records for deleted
+  * metadata from being replayed on top of newer data using the same
+  * blocks.  The revoke mechanism is used in two separate places:
+  *
+  * + Commit: during commit we write the entire list of the current
+  *   transaction's revoked blocks to the journal
+  *
+  * + Recovery: during recovery we record the transaction ID of all
+  *   revoked blocks.  If there are multiple revoke records in the log
+  *   for a single block, only the last one counts, and if there is a log
+  *   entry for a block beyond the last revoke, then that log entry still
+  *   gets replayed.
+  *
+  * We can get interactions between revokes and new log data within a
+  * single transaction:
+  *
+  * Block is revoked and then journaled:
+  *   The desired end result is the journaling of the new block, so we
+  *   cancel the revoke before the transaction commits.
+  *
+  * Block is journaled and then revoked:
+  *   The revoke must take precedence over the write of the block, so we
+  *   need either to cancel the journal entry or to write the revoke
+  *   later in the log than the log block.  In this case, we choose the
+  *   latter: journaling a block cancels any revoke record for that block
+  *   in the current transaction, so any revoke for that block in the
+  *   transaction must have happened after the block was journaled and so
+  *   the revoke must take precedence.
+  *
+  * Block is revoked and then written as data:
+  *   The data write is allowed to succeed, but the revoke is _not_
+  *   cancelled.  We still need to prevent old log records from
+  *   overwriting the new data.  We don't even need to clear the revoke
+  *   bit here.
+  *
+  * Revoke information on buffers is a tri-state value:
+  *
+  * RevokeValid clear: no cached revoke status, need to look it up
+  * RevokeValid set, Revoked clear:
+  *                    buffer has not been revoked, and cancel_revoke
+  *                    need do nothing.
+  * RevokeValid set, Revoked set:
+  *                    buffer has been revoked.
+  */
+
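+ /* A minimal sketch of how the tri-state is read back (illustrative
+  * only: the helper below is hypothetical, and the real logic lives in
+  * journal_cancel_revoke further down this file): */
+ #if 0
+ static int revoke_status_cached(struct buffer_head *bh)
+ {
+       if (!test_bit(BH_RevokeValid, &bh->b_state))
+               return -1;      /* unknown: must search the revoke table */
+       return test_bit(BH_Revoked, &bh->b_state) ? 1 : 0;
+ }
+ #endif
+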
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/list.h>
+ #include <linux/smp_lock.h>
+ #include <linux/init.h>
+ #endif
+
+ static kmem_cache_t *revoke_record_cache;
+ static kmem_cache_t *revoke_table_cache;
+
+ /* Each revoke record represents one single revoked block.  During
+    journal replay, this involves recording the transaction ID of the
+    last transaction to revoke this block. */
+
+ struct jbd_revoke_record_s
+ {
+       struct list_head  hash;
+       tid_t             sequence;     /* Used for recovery only */
+       unsigned long     blocknr;
+ };
+
+
+ /* The revoke table is just a simple hash table of revoke records. */
+ struct jbd_revoke_table_s
+ {
+       /* It is conceivable that we might want a larger hash table
+        * for recovery.  Must be a power of two. */
+       int               hash_size;
+       int               hash_shift;
+       struct list_head *hash_table;
+ };
+
+
+ #ifdef __KERNEL__
+ static void write_one_revoke_record(journal_t *, transaction_t *,
+                                   struct journal_head **, int *,
+                                   struct jbd_revoke_record_s *);
+ static void flush_descriptor(journal_t *, struct journal_head *, int);
+ #endif
+
+ /* Utility functions to maintain the revoke table */
+
+ /* Borrowed from buffer.c: this is a tried and tested block hash function */
+ static inline int hash(journal_t *journal, unsigned long block)
+ {
+       struct jbd_revoke_table_s *table = journal->j_revoke;
+       int hash_shift = table->hash_shift;
+
+       return ((block << (hash_shift - 6)) ^
+               (block >> 13) ^
+               (block << (hash_shift - 12))) & (table->hash_size - 1);
+ }
+
+ int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
+ {
+       struct list_head *hash_list;
+       struct jbd_revoke_record_s *record;
+
+ repeat:
+       record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
+       if (!record)
+               goto oom;
+
+       record->sequence = seq;
+       record->blocknr = blocknr;
+       hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+       list_add(&record->hash, hash_list);
+       return 0;
+
+ oom:
+       if (!journal_oom_retry)
+               return -ENOMEM;
+       jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
+       current->policy |= SCHED_YIELD;
+       schedule();
+       goto repeat;
+ }
+
+ /* Find a revoke record in the journal's hash table. */
+
+ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
+                                                     unsigned long blocknr)
+ {
+       struct list_head *hash_list;
+       struct jbd_revoke_record_s *record;
+
+       hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+
+       record = (struct jbd_revoke_record_s *) hash_list->next;
+       while (&(record->hash) != hash_list) {
+               if (record->blocknr == blocknr)
+                       return record;
+               record = (struct jbd_revoke_record_s *) record->hash.next;
+       }
+       return NULL;
+ }
+
+ int __init journal_init_revoke_caches(void)
+ {
+       revoke_record_cache = kmem_cache_create("revoke_record",
+                                          sizeof(struct jbd_revoke_record_s),
+                                          0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+       if (revoke_record_cache == 0)
+               return -ENOMEM;
+
+       revoke_table_cache = kmem_cache_create("revoke_table",
+                                          sizeof(struct jbd_revoke_table_s),
+                                          0, 0, NULL, NULL);
+       if (revoke_table_cache == 0) {
+               kmem_cache_destroy(revoke_record_cache);
+               revoke_record_cache = NULL;
+               return -ENOMEM;
+       }
+       return 0;
+ }
+
+ void journal_destroy_revoke_caches(void)
+ {
+       kmem_cache_destroy(revoke_record_cache);
+       revoke_record_cache = 0;
+       kmem_cache_destroy(revoke_table_cache);
+       revoke_table_cache = 0;
+ }
+
+ /* Initialise the revoke table for a given journal to a given size. */
+
+ int journal_init_revoke(journal_t *journal, int hash_size)
+ {
+       int shift, tmp;
+
+       J_ASSERT (journal->j_revoke == NULL);
+
+       journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+       if (!journal->j_revoke)
+               return -ENOMEM;
+
+       /* Check that the hash_size is a power of two */
+       J_ASSERT ((hash_size & (hash_size-1)) == 0);
+
+       journal->j_revoke->hash_size = hash_size;
+
+       shift = 0;
+       tmp = hash_size;
+       while((tmp >>= 1UL) != 0UL)
+               shift++;
+       journal->j_revoke->hash_shift = shift;
+
+       journal->j_revoke->hash_table =
+               kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+       if (!journal->j_revoke->hash_table) {
+               kmem_cache_free(revoke_table_cache, journal->j_revoke);
+               journal->j_revoke = NULL;
+               return -ENOMEM;
+       }
+
+       for (tmp = 0; tmp < hash_size; tmp++)
+               INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+
+       return 0;
+ }
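+
+ /* Typical setup, as a hedged sketch: the journal core is expected to
+  * call this once per journal, e.g. with the default power-of-two
+  * table size from the jbd headers (JOURNAL_REVOKE_DEFAULT_HASH, at
+  * the time of writing):
+  *
+  *    err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+  */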
+
+ /* Destroy a journal's revoke table.  The table must already be empty! */
+
+ void journal_destroy_revoke(journal_t *journal)
+ {
+       struct jbd_revoke_table_s *table;
+       struct list_head *hash_list;
+       int i;
+
+       table = journal->j_revoke;
+       if (!table)
+               return;
+
+       for (i=0; i<table->hash_size; i++) {
+               hash_list = &table->hash_table[i];
+               J_ASSERT (list_empty(hash_list));
+       }
+
+       kfree(table->hash_table);
+       kmem_cache_free(revoke_table_cache, table);
+       journal->j_revoke = NULL;
+ }
+
+
+ #ifdef __KERNEL__
+
+ /*
+  * journal_revoke: revoke a given buffer_head from the journal.  This
+  * prevents the block from being replayed during recovery if we take a
+  * crash after this current transaction commits.  Any subsequent
+  * metadata writes of the buffer in this transaction cancel the
+  * revoke.
+  *
+  * Note that this call may block --- it is up to the caller to make
+  * sure that there are no further calls to journal_write_metadata
+  * before the revoke is complete.  In ext3, this implies calling the
+  * revoke before clearing the block bitmap when we are deleting
+  * metadata.
+  *
+  * Revoke performs a journal_forget on any buffer_head passed in as a
+  * parameter, but does _not_ forget the buffer_head if the bh was only
+  * found implicitly.
+  *
+  * bh_in may not be a journalled buffer - it may have come off
+  * the hash tables without an attached journal_head.
+  *
+  * If bh_in is non-zero, journal_revoke() will decrement its b_count
+  * by one.
+  */
+
+ int journal_revoke(handle_t *handle, unsigned long blocknr,
+                  struct buffer_head *bh_in)
+ {
+       struct buffer_head *bh = NULL;
+       journal_t *journal;
+       kdev_t dev;
+       int err;
+
+       if (bh_in)
+               BUFFER_TRACE(bh_in, "enter");
+
+       journal = handle->h_transaction->t_journal;
+       if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
+               J_ASSERT (!"Cannot set revoke feature!");
+               return -EINVAL;
+       }
+
+       dev = journal->j_fs_dev;
+       bh = bh_in;
+
+       if (!bh) {
+               bh = get_hash_table(dev, blocknr, journal->j_blocksize);
+               if (bh)
+                       BUFFER_TRACE(bh, "found on hash");
+       }
+ #ifdef JBD_EXPENSIVE_CHECKING
+       else {
+               struct buffer_head *bh2;
+
+               /* If there is a different buffer_head lying around in
+                * memory anywhere... */
+               bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
+               if (bh2) {
+                       /* ... and it has RevokeValid status... */
+                       if ((bh2 != bh) &&
+                           test_bit(BH_RevokeValid, &bh2->b_state))
+                               /* ...then it better be revoked too,
+                                * since it's illegal to create a revoke
+                                * record against a buffer_head which is
+                                * not marked revoked --- that would
+                                * risk missing a subsequent revoke
+                                * cancel. */
+                               J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
+                                                         bh2->b_state));
+                       __brelse(bh2);
+               }
+       }
+ #endif
+
+       /* We really ought not ever to revoke twice in a row without
+            first having the revoke cancelled: it's illegal to free a
+            block twice without allocating it in between! */
+       if (bh) {
+               J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
+               set_bit(BH_Revoked, &bh->b_state);
+               set_bit(BH_RevokeValid, &bh->b_state);
+               if (bh_in) {
+                       BUFFER_TRACE(bh_in, "call journal_forget");
+                       journal_forget(handle, bh_in);
+               } else {
+                       BUFFER_TRACE(bh, "call brelse");
+                       __brelse(bh);
+               }
+       }
+
+       lock_journal(journal);
+       jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+       err = insert_revoke_hash(journal, blocknr,
+                               handle->h_transaction->t_tid);
+       unlock_journal(journal);
+       BUFFER_TRACE(bh_in, "exit");
+       return err;
+ }
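+
+ /* An illustrative caller, as a sketch only (the function and flow
+  * below are hypothetical, loosely modelled on how a filesystem frees
+  * metadata): revoke the block *before* clearing its bitmap bit, per
+  * the ordering requirement documented above. */
+ #if 0
+ static int example_free_metadata(handle_t *handle, struct buffer_head *bh,
+                                unsigned long blocknr)
+ {
+       int err = journal_revoke(handle, blocknr, bh);
+       if (err)
+               return err;
+       /* ... only now is it safe to clear the block bitmap bit ... */
+       return 0;
+ }
+ #endif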
+
+ /*
+  * Cancel an outstanding revoke.  For use only internally by the
+  * journaling code (called from journal_get_write_access).
+  *
+  * We trust the BH_Revoked bit on the buffer if the buffer is already
+  * being journaled: if there is no revoke pending on the buffer, then we
+  * don't do anything here.
+  *
+  * This would break if it were possible for a buffer to be revoked and
+  * discarded, and then reallocated within the same transaction.  In such
+  * a case we would have lost the revoked bit, but when we arrived here
+  * the second time we would still have a pending revoke to cancel.  So,
+  * do not trust the Revoked bit on buffers unless RevokeValid is also
+  * set.
+  *
+  * The caller must have the journal locked.
+  */
+ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+ {
+       struct jbd_revoke_record_s *record;
+       journal_t *journal = handle->h_transaction->t_journal;
+       int need_cancel;
+       int did_revoke = 0;     /* akpm: debug */
+       struct buffer_head *bh = jh2bh(jh);
+
+       jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+
+       /* Is the existing Revoke bit valid?  If so, we trust it, and
+        * only perform the full cancel if the revoke bit is set.  If
+        * not, we can't trust the revoke bit, and we need to do the
+        * full search for a revoke record. */
+       if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
+               need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
+       else {
+               need_cancel = 1;
+               clear_bit(BH_Revoked, &bh->b_state);
+       }
+
+       if (need_cancel) {
+               record = find_revoke_record(journal, bh->b_blocknr);
+               if (record) {
+                       jbd_debug(4, "cancelled existing revoke on "
+                                 "blocknr %lu\n", bh->b_blocknr);
+                       list_del(&record->hash);
+                       kmem_cache_free(revoke_record_cache, record);
+                       did_revoke = 1;
+               }
+       }
+
+ #ifdef JBD_EXPENSIVE_CHECKING
+       /* There better not be one left behind by now! */
+       record = find_revoke_record(journal, bh->b_blocknr);
+       J_ASSERT_JH(jh, record == NULL);
+ #endif
+
+       /* Finally, have we just cleared revoke on an unhashed
+        * buffer_head?  If so, we'd better make sure we clear the
+        * revoked status on any hashed alias too, otherwise the revoke
+        * state machine will get very upset later on. */
+       if (need_cancel && !bh->b_pprev) {
+               struct buffer_head *bh2;
+               bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+               if (bh2) {
+                       clear_bit(BH_Revoked, &bh2->b_state);
+                       __brelse(bh2);
+               }
+       }
+
+       return did_revoke;
+ }
+
+
+ /*
+  * Write revoke records to the journal for all entries in the current
+  * revoke hash, deleting the entries as we go.
+  *
+  * Called with the journal lock held.
+  */
+
+ void journal_write_revoke_records(journal_t *journal,
+                                 transaction_t *transaction)
+ {
+       struct journal_head *descriptor;
+       struct jbd_revoke_record_s *record;
+       struct jbd_revoke_table_s *revoke;
+       struct list_head *hash_list;
+       int i, offset, count;
+
+       descriptor = NULL;
+       offset = 0;
+       count = 0;
+       revoke = journal->j_revoke;
+
+       for (i = 0; i < revoke->hash_size; i++) {
+               hash_list = &revoke->hash_table[i];
+
+               while (!list_empty(hash_list)) {
+                       record = (struct jbd_revoke_record_s *)
+                               hash_list->next;
+                       write_one_revoke_record(journal, transaction,
+                                               &descriptor, &offset,
+                                               record);
+                       count++;
+                       list_del(&record->hash);
+                       kmem_cache_free(revoke_record_cache, record);
+               }
+       }
+       if (descriptor)
+               flush_descriptor(journal, descriptor, offset);
+       jbd_debug(1, "Wrote %d revoke records\n", count);
+ }
+
+ /*
+  * Write out one revoke record.  We need to create a new descriptor
+  * block if the old one is full or if we have not already created one.
+  */
+
+ static void write_one_revoke_record(journal_t *journal,
+                                   transaction_t *transaction,
+                                   struct journal_head **descriptorp,
+                                   int *offsetp,
+                                   struct jbd_revoke_record_s *record)
+ {
+       struct journal_head *descriptor;
+       int offset;
+       journal_header_t *header;
+
+       /* If we are already aborting, this all becomes a noop.  We
+            still need to go round the loop in
+            journal_write_revoke_records in order to free all of the
+            revoke records: only the IO to the journal is omitted. */
+       if (is_journal_aborted(journal))
+               return;
+
+       descriptor = *descriptorp;
+       offset = *offsetp;
+
+       /* Make sure we have a descriptor with space left for the record */
+       if (descriptor) {
+               if (offset == journal->j_blocksize) {
+                       flush_descriptor(journal, descriptor, offset);
+                       descriptor = NULL;
+               }
+       }
+
+       if (!descriptor) {
+               descriptor = journal_get_descriptor_buffer(journal);
+               header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+               header->h_magic     = htonl(JFS_MAGIC_NUMBER);
+               header->h_blocktype = htonl(JFS_REVOKE_BLOCK);
+               header->h_sequence  = htonl(transaction->t_tid);
+
+               /* Record it so that we can wait for IO completion later */
+               JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
+               journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+
+               offset = sizeof(journal_revoke_header_t);
+               *descriptorp = descriptor;
+       }
+
+       * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) =
+               htonl(record->blocknr);
+       offset += 4;
+       *offsetp = offset;
+ }
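+
+ /* Capacity, as a worked example (assuming a 4096-byte journal block
+  * and the usual 16-byte journal_revoke_header_t): one descriptor
+  * holds (4096 - 16) / 4 == 1020 revoke records before the offset test
+  * above forces a flush and a fresh descriptor. */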
+
+ /*
+  * Flush a revoke descriptor out to the journal.  If we are aborting,
+  * this is a noop; otherwise we are generating a buffer which needs to
+  * be waited for during commit, so it has to go onto the appropriate
+  * journal buffer list.
+  */
+
+ static void flush_descriptor(journal_t *journal,
+                            struct journal_head *descriptor,
+                            int offset)
+ {
+       journal_revoke_header_t *header;
+
+       if (is_journal_aborted(journal)) {
+               JBUFFER_TRACE(descriptor, "brelse");
+               __brelse(jh2bh(descriptor));
+               return;
+       }
+
+       header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+       header->r_count = htonl(offset);
+       set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
+       {
+               struct buffer_head *bh = jh2bh(descriptor);
+               BUFFER_TRACE(bh, "write");
+               ll_rw_block (WRITE, 1, &bh);
+       }
+ }
+
+ #endif
+
+ /*
+  * Revoke support for recovery.
+  *
+  * Recovery needs to be able to:
+  *
+  *  record all revoke records, including the tid of the latest instance
+  *  of each revoke in the journal
+  *
+  *  check whether a given block in a given transaction should be replayed
+  *  (ie. has not been revoked by a revoke record in that or a subsequent
+  *  transaction)
+  *
+  *  empty the revoke table after recovery.
+  */
+
+ /*
+  * First, setting revoke records.  We create a new revoke record for
+  * every block ever revoked in the log as we scan it for recovery, and
+  * we update the existing records if we find multiple revokes for a
+  * single block.
+  */
+
+ int journal_set_revoke(journal_t *journal,
+                      unsigned long blocknr,
+                      tid_t sequence)
+ {
+       struct jbd_revoke_record_s *record;
+
+       record = find_revoke_record(journal, blocknr);
+       if (record) {
+               /* If we have multiple occurrences, only record the
+                * latest sequence number in the hashed record */
+               if (tid_gt(sequence, record->sequence))
+                       record->sequence = sequence;
+               return 0;
+       }
+       return insert_revoke_hash(journal, blocknr, sequence);
+ }
+
+ /*
+  * Test revoke records.  For a given block referenced in the log, has
+  * that block been revoked?  A revoke record with a given transaction
+  * sequence number revokes all blocks in that transaction and earlier
+  * ones, but later transactions still need to be replayed.
+  */
+
+ int journal_test_revoke(journal_t *journal,
+                       unsigned long blocknr,
+                       tid_t sequence)
+ {
+       struct jbd_revoke_record_s *record;
+
+       record = find_revoke_record(journal, blocknr);
+       if (!record)
+               return 0;
+       if (tid_gt(sequence, record->sequence))
+               return 0;
+       return 1;
+ }
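+
+ /* A sketch of the replay-time check (illustrative; the real caller is
+  * the replay loop in recovery.c, which jumps to skip_write for blocks
+  * that this test reports as revoked): */
+ #if 0
+       if (journal_test_revoke(journal, blocknr, next_commit_ID))
+               goto skip_write;        /* do not replay the logged copy */
+ #endif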
+
+ /*
+  * Finally, once recovery is over, we need to clear the revoke table so
+  * that it can be reused by the running filesystem.
+  */
+
+ void journal_clear_revoke(journal_t *journal)
+ {
+       int i;
+       struct list_head *hash_list;
+       struct jbd_revoke_record_s *record;
+       struct jbd_revoke_table_s *revoke;
+
+       revoke = journal->j_revoke;
+
+       for (i = 0; i < revoke->hash_size; i++) {
+               hash_list = &revoke->hash_table[i];
+               while (!list_empty(hash_list)) {
+                       record = (struct jbd_revoke_record_s*) hash_list->next;
+                       list_del(&record->hash);
+                       kmem_cache_free(revoke_record_cache, record);
+               }
+       }
+ }
+
diff -rc2P linux/fs/jbd/transaction.c linux-2.4.13/fs/jbd/transaction.c
*** linux/fs/jbd/transaction.c  Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/transaction.c   Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,2078 ----
+ /*
+  * linux/fs/jbd/transaction.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Generic filesystem transaction handling code; part of the ext2fs
+  * journaling system.
+  *
+  * This file manages transactions (compound commits managed by the
+  * journaling code) and handles (individual atomic operations by the
+  * filesystem).
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/timer.h>
+ #include <linux/smp_lock.h>
+ #include <linux/mm.h>
+ #include <linux/swap.h>       /* Uggh... needed for buffermem_pages */
+
+
+ extern spinlock_t journal_datalist_lock;
+
+ /*
+  * get_transaction: obtain a new transaction_t object.
+  *
+  * Simply allocate and initialise a new transaction.  Create it in
+  * RUNNING state and add it to the current journal (which should not
+  * have an existing running transaction: we only make a new transaction
+  * once we have started to commit the old one).
+  *
+  * Preconditions:
+  *    The journal MUST be locked.  We don't perform atomic mallocs on the
+  *    new transaction and we can't block without protecting against other
+  *    processes trying to touch the journal while it is in transition.
+  */
+
+ static transaction_t * get_transaction (journal_t * journal, int is_try)
+ {
+       transaction_t * transaction;
+
+       transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS);
+       if (!transaction)
+               return NULL;
+
+       memset (transaction, 0, sizeof (transaction_t));
+
+       transaction->t_journal = journal;
+       transaction->t_state = T_RUNNING;
+       transaction->t_tid = journal->j_transaction_sequence++;
+       transaction->t_expires = jiffies + journal->j_commit_interval;
+
+       /* Set up the commit timer for the new transaction. */
+       J_ASSERT (!journal->j_commit_timer_active);
+       journal->j_commit_timer_active = 1;
+       journal->j_commit_timer->expires = transaction->t_expires;
+       add_timer(journal->j_commit_timer);
+
+       J_ASSERT (journal->j_running_transaction == NULL);
+       journal->j_running_transaction = transaction;
+
+       return transaction;
+ }
+
+ /*
+  * Handle management.
+  *
+  * A handle_t is an object which represents a single atomic update to a
+  * filesystem, and which tracks all of the modifications which form part
+  * of that one update.
+  */
+
+ /*
+  * start_this_handle: Given a handle, deal with any locking or stalling
+  * needed to make sure that there is enough journal space for the handle
+  * to begin.  Attach the handle to a transaction and set up the
+  * transaction's buffer credits.
+  */
+
+ static int start_this_handle(journal_t *journal, handle_t *handle)
+ {
+       transaction_t *transaction;
+       int needed;
+       int nblocks = handle->h_buffer_credits;
+
+       jbd_debug(3, "New handle %p going live.\n", handle);
+
+ repeat:
+
+       lock_journal(journal);
+
+       if (is_journal_aborted(journal) ||
+           (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+               unlock_journal(journal);
+               return -EROFS;
+       }
+
+       /* Wait on the journal's transaction barrier if necessary */
+       if (journal->j_barrier_count) {
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_transaction_locked);
+               goto repeat;
+       }
+
+ repeat_locked:
+       if (!journal->j_running_transaction)
+               get_transaction(journal, 0);
+       /* @@@ Error? */
+       J_ASSERT(journal->j_running_transaction);
+
+       transaction = journal->j_running_transaction;
+
+       /* If the current transaction is locked down for commit, wait
+        * for the lock to be released. */
+
+       if (transaction->t_state == T_LOCKED) {
+               unlock_journal(journal);
+               jbd_debug(3, "Handle %p stalling...\n", handle);
+               sleep_on(&journal->j_wait_transaction_locked);
+               goto repeat;
+       }
+
+       /* If there is not enough space left in the log to write all
+        * potential buffers requested by this operation, we need to
+        * stall pending a log checkpoint to free some more log
+        * space. */
+
+       needed = transaction->t_outstanding_credits + nblocks;
+
+       if (needed > journal->j_max_transaction_buffers) {
+               /* If the current transaction is already too large, then
+                * start to commit it: we can then go back and attach
+                * this handle to a new transaction. */
+
+               jbd_debug(2, "Handle %p starting new commit...\n", handle);
+               log_start_commit(journal, transaction);
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_transaction_locked);
+               lock_journal(journal);
+               goto repeat_locked;
+       }
+
+       /*
+        * The commit code assumes that it can get enough log space
+        * without forcing a checkpoint.  This is *critical* for
+        * correctness: a checkpoint of a buffer which is also
+        * associated with a committing transaction creates a deadlock,
+        * so commit simply cannot force through checkpoints.
+        *
+        * We must therefore ensure the necessary space in the journal
+        * *before* starting to dirty potentially checkpointed buffers
+        * in the new transaction.
+        *
+        * The worst part is, any transaction currently committing can
+        * reduce the free space arbitrarily.  Be careful to account for
+        * those buffers when checkpointing.
+        */
+
+       /*
+        * @@@ AKPM: This seems rather over-defensive.  We're giving commit
+        * a _lot_ of headroom: 1/4 of the journal plus the size of
+        * the committing transaction.  Really, we only need to give it
+        * committing_transaction->t_outstanding_credits plus "enough" for
+        * the log control blocks.
+        * Also, this test is inconsistent with the matching one in
+        * journal_extend().
+        */
+       needed = journal->j_max_transaction_buffers;
+       if (journal->j_committing_transaction)
+               needed += journal->j_committing_transaction->
+                                       t_outstanding_credits;
+
+       if (log_space_left(journal) < needed) {
+               jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+               log_wait_for_space(journal, needed);
+               goto repeat_locked;
+       }
+
+       /* OK, account for the buffers that this operation expects to
+        * use and add the handle to the running transaction. */
+
+       handle->h_transaction = transaction;
+       transaction->t_outstanding_credits += nblocks;
+       transaction->t_updates++;
+       transaction->t_handle_count++;
+       jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+                 handle, nblocks, transaction->t_outstanding_credits,
+                 log_space_left(journal));
+
+       unlock_journal(journal);
+
+       return 0;
+ }
+
+ /*
+  * Obtain a new handle.
+  *
+  * We make sure that the transaction can guarantee at least nblocks of
+  * modified buffers in the log.  We block until the log can guarantee
+  * that much space.
+  *
+  * This function is visible to journal users (like ext2fs), so is not
+  * called with the journal already locked.
+  *
+  * Return a pointer to a newly allocated handle, or an ERR_PTR value
+  * on failure.
+  */
+
+ handle_t *journal_start(journal_t *journal, int nblocks)
+ {
+       handle_t *handle = journal_current_handle();
+       int err;
+
+       if (!journal)
+               return ERR_PTR(-EROFS);
+
+       if (handle) {
+               J_ASSERT(handle->h_transaction->t_journal == journal);
+               handle->h_ref++;
+               return handle;
+       }
+
+       handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+       if (!handle)
+               return ERR_PTR(-ENOMEM);
+       memset (handle, 0, sizeof (handle_t));
+
+       handle->h_buffer_credits = nblocks;
+       handle->h_ref = 1;
+       current->journal_info = handle;
+
+       err = start_this_handle(journal, handle);
+       if (err < 0) {
+               kfree(handle);
+               current->journal_info = NULL;
+               return ERR_PTR(err);
+       }
+
+       return handle;
+ }
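+
+ /* Typical use by a filesystem, as a hedged sketch (journal_stop() is
+  * the usual counterpart, defined later in this file): */
+ #if 0
+       handle_t *handle = journal_start(journal, 3);   /* 3 buffer credits */
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+       /* ... journal_get_write_access() / journal_dirty_metadata() ... */
+       journal_stop(handle);
+ #endif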
+
+ /*
+  * Return zero on success
+  */
+ static int try_start_this_handle(journal_t *journal, handle_t *handle)
+ {
+       transaction_t *transaction;
+       int needed;
+       int nblocks = handle->h_buffer_credits;
+       int ret = 0;
+
+       jbd_debug(3, "New handle %p maybe going live.\n", handle);
+
+       lock_journal(journal);
+
+       if (is_journal_aborted(journal) ||
+           (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+               ret = -EROFS;
+               goto fail_unlock;
+       }
+
+       if (journal->j_barrier_count)
+               goto fail_unlock;
+
+       if (!journal->j_running_transaction && get_transaction(journal, 1) == 0)
+               goto fail_unlock;
+
+       transaction = journal->j_running_transaction;
+       if (transaction->t_state == T_LOCKED)
+               goto fail_unlock;
+
+       needed = transaction->t_outstanding_credits + nblocks;
+       /* We could run log_start_commit here */
+       if (needed > journal->j_max_transaction_buffers)
+               goto fail_unlock;
+
+       needed = journal->j_max_transaction_buffers;
+       if (journal->j_committing_transaction)
+               needed += journal->j_committing_transaction->
+                                               t_outstanding_credits;
+
+       if (log_space_left(journal) < needed)
+               goto fail_unlock;
+
+       handle->h_transaction = transaction;
+       transaction->t_outstanding_credits += nblocks;
+       transaction->t_updates++;
+       jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+                 handle, nblocks, transaction->t_outstanding_credits,
+                 log_space_left(journal));
+       unlock_journal(journal);
+       return 0;
+
+ fail_unlock:
+       unlock_journal(journal);
+       if (ret >= 0)
+               ret = -1;
+       return ret;
+ }
+
+ /*
+  * Try to start a handle without blocking.  If we weren't able to,
+  * return an ERR_PTR value.
+  */
+ handle_t *journal_try_start(journal_t *journal, int nblocks)
+ {
+       handle_t *handle = journal_current_handle();
+       int err;
+
+       if (!journal)
+               return ERR_PTR(-EROFS);
+
+       if (handle) {
+               jbd_debug(4, "h_ref %d -> %d\n",
+                               handle->h_ref,
+                               handle->h_ref + 1);
+               J_ASSERT(handle->h_transaction->t_journal == journal);
+               if (is_handle_aborted(handle))
+                       return ERR_PTR(-EIO);
+               handle->h_ref++;
+               return handle;
+       } else {
+               jbd_debug(4, "no current transaction\n");
+       }
+
+       if (is_journal_aborted(journal))
+               return ERR_PTR(-EIO);
+
+       handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+       if (!handle)
+               return ERR_PTR(-ENOMEM);
+       memset (handle, 0, sizeof (handle_t));
+
+       handle->h_buffer_credits = nblocks;
+       handle->h_ref = 1;
+       current->journal_info = handle;
+
+       err = try_start_this_handle(journal, handle);
+       if (err < 0) {
+               kfree(handle);
+               current->journal_info = NULL;
+               return ERR_PTR(err);
+       }
+
+       return handle;
+ }
+
+ /*
+  * journal_extend: extend buffer credits.
+  *
+  * Some transactions, such as large extends and truncates, can be done
+  * atomically all at once or in several stages.  The operation requests
+  * a credit for a number of buffer modifications in advance, but can
+  * extend its credit if it needs more.
+  *
+  * journal_extend tries to give the running handle more buffer credits.
+  * It does not guarantee the allocation: this is best-effort only.
+  * The calling process MUST be able to deal cleanly with a failure to
+  * extend here.
+  *
+  * Return 0 on success, non-zero on failure.
+  *
+  * return code < 0 implies an error
+  * return code > 0 implies normal transaction-full status.
+  */
+
+ int journal_extend (handle_t *handle, int nblocks)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int result;
+       int wanted;
+
+       lock_journal (journal);
+
+       result = -EIO;
+       if (is_handle_aborted(handle))
+               goto error_out;
+
+       result = 1;
+
+       /* Don't extend a locked-down transaction! */
+       if (handle->h_transaction->t_state != T_RUNNING) {
+               jbd_debug(3, "denied handle %p %d blocks: "
+                         "transaction not running\n", handle, nblocks);
+               goto error_out;
+       }
+
+       wanted = transaction->t_outstanding_credits + nblocks;
+
+       if (wanted > journal->j_max_transaction_buffers) {
+               jbd_debug(3, "denied handle %p %d blocks: "
+                         "transaction too large\n", handle, nblocks);
+               goto error_out;
+       }
+
+       if (wanted > log_space_left(journal)) {
+               jbd_debug(3, "denied handle %p %d blocks: "
+                         "insufficient log space\n", handle, nblocks);
+               goto error_out;
+       }
+
+       handle->h_buffer_credits += nblocks;
+       transaction->t_outstanding_credits += nblocks;
+       result = 0;
+
+       jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+
+ error_out:
+       unlock_journal (journal);
+       return result;
+ }
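+
+ /* The expected fallback pattern, sketched (see journal_restart just
+  * below): if extend fails with "transaction full", commit what we
+  * have and reattach the handle to a fresh transaction. */
+ #if 0
+       err = journal_extend(handle, nblocks);
+       if (err > 0)
+               err = journal_restart(handle, nblocks);
+ #endif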
+
+
+ /*
+  * journal_restart: restart a handle for a multi-transaction filesystem
+  * operation.
+  *
+  * If the journal_extend() call above fails to grant new buffer credits
+  * to a running handle, a call to journal_restart will commit the
+  * handle's transaction so far and reattach the handle to a new
+  * transaction capable of guaranteeing the requested number of
+  * credits.
+  */
+
+ int journal_restart(handle_t *handle, int nblocks)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int ret;
+
+       /* If we've had an abort of any type, don't even think about
+        * actually doing the restart! */
+       if (is_handle_aborted(handle))
+               return 0;
+
+       /* First unlink the handle from its current transaction, and
+        * start the commit on that. */
+
+       J_ASSERT (transaction->t_updates > 0);
+       J_ASSERT (journal_current_handle() == handle);
+
+       transaction->t_outstanding_credits -= handle->h_buffer_credits;
+       transaction->t_updates--;
+
+       if (!transaction->t_updates)
+               wake_up(&journal->j_wait_updates);
+
+       jbd_debug(2, "restarting handle %p\n", handle);
+       log_start_commit(journal, transaction);
+
+       handle->h_buffer_credits = nblocks;
+       ret = start_this_handle(journal, handle);
+       return ret;
+ }
+
+
+ /*
+  * Barrier operation: establish a transaction barrier.
+  *
+  * This locks out any further updates from being started, and blocks
+  * until all existing updates have completed, returning only once the
+  * journal is in a quiescent state with no updates running.
+  *
+  * The journal lock should not be held on entry.
+  */
+
+ void journal_lock_updates (journal_t *journal)
+ {
+       lock_journal(journal);
+       ++journal->j_barrier_count;
+
+       /* Wait until there are no running updates */
+       while (1) {
+               transaction_t *transaction = journal->j_running_transaction;
+               if (!transaction)
+                       break;
+               if (!transaction->t_updates)
+                       break;
+
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_updates);
+               lock_journal(journal);
+       }
+
+       unlock_journal(journal);
+
+       /* We have now established a barrier against other normal
+        * updates, but we also need to barrier against other
+        * journal_lock_updates() calls to make sure that we serialise
+        * special journal-locked operations too. */
+       down(&journal->j_barrier);
+ }
+
+ /*
+  * Release a transaction barrier obtained with journal_lock_updates().
+  *
+  * Should be called without the journal lock held.
+  */
+
+ void journal_unlock_updates (journal_t *journal)
+ {
+       lock_journal(journal);
+
+       J_ASSERT (journal->j_barrier_count != 0);
+
+       up(&journal->j_barrier);
+       --journal->j_barrier_count;
+       wake_up(&journal->j_wait_transaction_locked);
+       unlock_journal(journal);
+ }
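+
+ /* Barrier usage, sketched (illustrative only): quiesce all updates
+  * around a journal-wide operation, then let them resume. */
+ #if 0
+       journal_lock_updates(journal);
+       /* ... perform the special journal-locked operation ... */
+       journal_unlock_updates(journal);
+ #endif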
+
+ /*
+  * journal_get_write_access: notify intent to modify a buffer for metadata
+  * (not data) update.
+  *
+  * If the buffer is already part of the current transaction, then there
+  * is nothing we need to do.  If it is already part of a prior
+  * transaction which we are still committing to disk, then we need to
+  * make sure that we do not overwrite the old copy: we do copy-out to
+  * preserve the copy going to disk.  We also account the buffer against
+  * the handle's metadata buffer credits (unless the buffer is already
+  * part of the transaction, that is).
+  *
+  * Returns an error code or 0 on success.
+  *
+  * In full data journalling mode the buffer may be of type BJ_AsyncData,
+  * because we're write()ing a buffer which is also part of a shared mapping.
+  */
+
+ static int
+ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int error;
+       char *frozen_buffer = NULL;
+       int need_copy = 0;
+
+       jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+
+       JBUFFER_TRACE(jh, "entry");
+ repeat:
+       /* @@@ Need to check for errors here at some point. */
+
+       /*
+        * AKPM: neither bdflush nor kupdate run with the BKL.   There's
+        * nothing we can do to prevent them from starting writeout of a
+        * BUF_DIRTY buffer at any time.  And checkpointing buffers are on
+        * BUF_DIRTY.  So.  We no longer assert that the buffer is unlocked.
+        *
+        * However.  It is very wrong for us to allow ext3 to start directly
+        * altering the ->b_data of buffers which may at that very time be
+        * undergoing writeout to the client filesystem.  This can leave
+        * the filesystem in an inconsistent, transient state if we crash.
+        * So what we do is to steal the buffer if it is in checkpoint
+        * mode and dirty.  The journal lock will keep out checkpoint-mode
+        * state transitions within journal_remove_checkpoint() and the buffer
+        * is locked to keep bdflush/kupdate/whoever away from it as well.
+        *
+        * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
+        * simple lock_journal().  The code here takes care of locked buffers.
+        */
+       /*
+        * The buffer_locked() || buffer_dirty() tests here are simply an
+        * optimisation tweak.  If anyone else in the system decides to
+        * lock this buffer later on, we'll blow up.  There doesn't seem
+        * to be a good reason why they should do this.
+        */
+       if (jh->b_cp_transaction &&
+           (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
+               unlock_journal(journal);
+               lock_buffer(jh2bh(jh));
+               spin_lock(&journal_datalist_lock);
+               if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
+                       /* OK, we need to steal it */
+                       JBUFFER_TRACE(jh, "stealing from checkpoint mode");
+                       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+                       J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
+
+                       J_ASSERT(handle->h_buffer_credits > 0);
+                       handle->h_buffer_credits--;
+
+                       /* This will clear BH_Dirty and set BH_JBDDirty. */
+                       JBUFFER_TRACE(jh, "file as BJ_Reserved");
+                       __journal_file_buffer(jh, transaction, BJ_Reserved);
+
+                       /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
+                       refile_buffer(jh2bh(jh));
+
+                       /*
+                        * The buffer is now hidden from bdflush.   It is
+                        * metadata against the current transaction.
+                        */
+                       JBUFFER_TRACE(jh, "steal from cp mode is complete");
+               }
+               spin_unlock(&journal_datalist_lock);
+               unlock_buffer(jh2bh(jh));
+               lock_journal(journal);
+       }
+
+       J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
+
+       error = -EROFS;
+       if (is_handle_aborted(handle))
+               goto out_unlocked;
+       error = 0;
+
+       spin_lock(&journal_datalist_lock);
+
+       /* The buffer is already part of this transaction if
+        * b_transaction or b_next_transaction points to it. */
+
+       if (jh->b_transaction == transaction ||
+           jh->b_next_transaction == transaction)
+               goto done_locked;
+
+       /* If there is already a copy-out version of this buffer, then
+        * we don't need to make another one. */
+
+       if (jh->b_frozen_data) {
+               JBUFFER_TRACE(jh, "has frozen data");
+               J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+               jh->b_next_transaction = transaction;
+
+               J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+               handle->h_buffer_credits--;
+               goto done_locked;
+       }
+
+       /* Is there data here we need to preserve? */
+
+       if (jh->b_transaction && jh->b_transaction != transaction) {
+               JBUFFER_TRACE(jh, "owned by older transaction");
+               J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+               J_ASSERT_JH(jh, jh->b_transaction ==
+                                       journal->j_committing_transaction);
+
+               /* There is one case we have to be very careful about.
+                * If the committing transaction is currently writing
+                * this buffer out to disk and has NOT made a copy-out,
+                * then we cannot modify the buffer contents at all
+                * right now.  The essence of copy-out is that it is the
+                * extra copy, not the primary copy, which gets
+                * journaled.  If the primary copy is already going to
+                * disk then we cannot do copy-out here. */
+
+               if (jh->b_jlist == BJ_Shadow) {
+                       JBUFFER_TRACE(jh, "on shadow: sleep");
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       /* commit wakes up all shadow buffers after IO */
+                       sleep_on(&jh2bh(jh)->b_wait);
+                       lock_journal(journal);
+                       goto repeat;
+               }
+
+               /* Only do the copy if the currently-owning transaction
+                * still needs it.  If it is on the Forget list, the
+                * committing transaction is past that stage.  The
+                * buffer had better remain locked during the kmalloc,
+                * but that should be true --- we hold the journal lock
+                * still and the buffer is already on the BUF_JOURNAL
+                * list so won't be flushed.
+                *
+                * Subtle point, though: if this is a get_undo_access,
+                * then we will be relying on the frozen_data to contain
+                * the new value of the committed_data record after the
+                * transaction, so we HAVE to force the frozen_data copy
+                * in that case. */
+
+               if (jh->b_jlist != BJ_Forget || force_copy) {
+                       JBUFFER_TRACE(jh, "generate frozen data");
+                       if (!frozen_buffer) {
+                               JBUFFER_TRACE(jh, "allocate memory for buffer");
+                               spin_unlock(&journal_datalist_lock);
+                               unlock_journal(journal);
+                               frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
+                                                           GFP_NOFS);
+                               lock_journal(journal);
+                               if (!frozen_buffer) {
+                                       printk(KERN_EMERG __FUNCTION__
+                                               ": OOM for frozen_buffer\n");
+                                       JBUFFER_TRACE(jh, "oom!");
+                                       error = -ENOMEM;
+                                       spin_lock(&journal_datalist_lock);
+                                       goto done_locked;
+                               }
+                               goto repeat;
+                       }
+
+                       jh->b_frozen_data = frozen_buffer;
+                       frozen_buffer = NULL;
+                       need_copy = 1;
+               }
+               jh->b_next_transaction = transaction;
+       }
+
+       J_ASSERT(handle->h_buffer_credits > 0);
+       handle->h_buffer_credits--;
+
+       /* Finally, if the buffer is not journaled right now, we need to
+        * make sure it doesn't get written to disk before the caller
+        * actually commits the new data. */
+
+       if (!jh->b_transaction) {
+               JBUFFER_TRACE(jh, "no transaction");
+               J_ASSERT_JH(jh, !jh->b_next_transaction);
+               jh->b_transaction = transaction;
+               JBUFFER_TRACE(jh, "file as BJ_Reserved");
+               __journal_file_buffer(jh, transaction, BJ_Reserved);
+       }
+
+ done_locked:
+       spin_unlock(&journal_datalist_lock);
+       if (need_copy) {
+               struct page *page;
+               int offset;
+               char *source;
+
+               J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh)));
+               page = jh2bh(jh)->b_page;
+               offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+               source = kmap(page);
+               memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
+               kunmap(page);
+       }
+
+
+       /* If we are about to journal a buffer, then any revoke pending
+            on it is no longer valid. */
+       journal_cancel_revoke(handle, jh);
+
+ out_unlocked:
+       if (frozen_buffer)
+               kfree(frozen_buffer);
+
+       JBUFFER_TRACE(jh, "exit");
+       return error;
+ }
+
+ int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = journal_add_journal_head(bh);
+       int rc;
+
+       /* We do not want to get caught playing with fields which the
+        * log thread also manipulates.  Make sure that the buffer
+        * completes any outstanding IO before proceeding. */
+       lock_journal(journal);
+       rc = do_get_write_access(handle, jh, 0);
+       journal_unlock_journal_head(jh);
+       unlock_journal(journal);
+       return rc;
+ }
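+
+ /* The canonical update sequence, as a sketch (journal_dirty_metadata()
+  * is defined later in this file): */
+ #if 0
+       err = journal_get_write_access(handle, bh);
+       if (!err) {
+               /* ... modify bh->b_data ... */
+               err = journal_dirty_metadata(handle, bh);
+       }
+ #endif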
+
+
+ /*
+  * When the user wants to journal a newly created buffer_head
+  * (ie. getblk() returned a new buffer and we are going to populate it
+  * manually rather than reading off disk), then we need to keep the
+  * buffer_head locked until it has been completely filled with new
+  * data.  In this case, we should be able to make the assertion that
+  * the bh is not already part of an existing transaction.
+  *
+  * The buffer should already be locked by the caller by this point.
+  * There is no lock ranking violation: it was a newly created,
+  * unlocked buffer beforehand. */
+
+ int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = journal_add_journal_head(bh);
+       int err;
+
+       jbd_debug(5, "journal_head %p\n", jh);
+       lock_journal(journal);
+       err = -EROFS;
+       if (is_handle_aborted(handle))
+               goto out;
+       err = 0;
+
+       JBUFFER_TRACE(jh, "entry");
+       /* The buffer may already belong to this transaction due to
+        * pre-zeroing in the filesystem's new_block code.  It may also
+        * be on the previous, committing transaction's lists, but it
+        * HAS to be in Forget state in that case: the transaction must
+        * have deleted the buffer for it to be reused here. */
+       J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+                        jh->b_transaction == NULL ||
+                        (jh->b_transaction == journal->j_committing_transaction &&
+                         jh->b_jlist == BJ_Forget)));
+
+       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+       J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+       J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+       handle->h_buffer_credits--;
+
+       spin_lock(&journal_datalist_lock);
+       if (jh->b_transaction == NULL) {
+               jh->b_transaction = transaction;
+               JBUFFER_TRACE(jh, "file as BJ_Reserved");
+               __journal_file_buffer(jh, transaction, BJ_Reserved);
+               JBUFFER_TRACE(jh, "refile");
+               refile_buffer(jh2bh(jh));
+       } else if (jh->b_transaction == journal->j_committing_transaction) {
+               JBUFFER_TRACE(jh, "set next transaction");
+               jh->b_next_transaction = transaction;
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       /*
+        * akpm: I added this.  ext3_alloc_branch can pick up new indirect
+        * blocks which contain freed but then revoked metadata.  We need
+        * to cancel the revoke in case we end up freeing it yet again
+        * and then reallocating it as data - this would cause a second revoke,
+        * which hits an assertion error.
+        */
+       JBUFFER_TRACE(jh, "cancelling revoke");
+       journal_cancel_revoke(handle, jh);
+       journal_unlock_journal_head(jh);
+ out:
+       unlock_journal(journal);
+       return err;
+ }
+
+
+
+ /*
+  * journal_get_undo_access: Notify intent to modify metadata with non-
+  * rewindable consequences
+  *
+  * Sometimes there is a need to distinguish between metadata which has
+  * been committed to disk and that which has not.  The ext3fs code uses
+  * this for freeing and allocating space: we have to make sure that we
+  * do not reuse freed space until the deallocation has been committed,
+  * since if we overwrote that space we would make the delete
+  * un-rewindable in case of a crash.
+  *
+  * To deal with that, journal_get_undo_access requests write access to a
+  * buffer for parts of non-rewindable operations such as delete
+  * operations on the bitmaps.  The journaling code must keep a copy of
+  * the buffer's contents prior to the undo_access call until such time
+  * as we know that the buffer has definitely been committed to disk.
+  *
+  * We never need to know which transaction the committed data is part
+  * of: buffers touched here are guaranteed to be dirtied later and so
+  * will be committed to a new transaction in due course, at which point
+  * we can discard the old committed data pointer.
+  *
+  * Returns error number or 0 on success.
+  */
+
+ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
+ {
+       journal_t *journal = handle->h_transaction->t_journal;
+       int err;
+       struct journal_head *jh = journal_add_journal_head(bh);
+
+       JBUFFER_TRACE(jh, "entry");
+       lock_journal(journal);
+
+       /* Do this first --- it can drop the journal lock, so we want to
+        * make sure that obtaining the committed_data is done
+        * atomically wrt. completion of any outstanding commits. */
+       err = do_get_write_access (handle, jh, 1);
+       if (err)
+               goto out;
+
+       if (!jh->b_committed_data) {
+               /* Copy out the current buffer contents into the
+                * preserved, committed copy. */
+               JBUFFER_TRACE(jh, "generate b_committed data");
+               jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size,
+                                                  GFP_NOFS);
+               if (!jh->b_committed_data) {
+                       printk(KERN_EMERG __FUNCTION__
+                               ": No memory for committed data!\n");
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
+                               jh2bh(jh)->b_size);
+       }
+
+ out:
+       if (!err)
+               J_ASSERT_JH(jh, jh->b_committed_data);
+       journal_unlock_journal_head(jh);
+       unlock_journal(journal);
+       return err;
+ }
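+
+ /*
+  * Illustrative sketch, not part of this patch: a hypothetical caller
+  * in an ext3-style bitmap-freeing path (bitmap_bh and bit are assumed
+  * names) would pair this call with journal_dirty_metadata(), roughly:
+  *
+  *    err = journal_get_undo_access(handle, bitmap_bh);
+  *    if (!err) {
+  *            clear_bit(bit, bitmap_bh->b_data);   (* free the block *)
+  *            err = journal_dirty_metadata(handle, bitmap_bh);
+  *    }
+  *
+  * The preserved committed copy is what lets the allocator refuse to
+  * reuse the block until the deallocation has committed.
+  */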
+
+ /*
+  * journal_dirty_data: mark a buffer as containing dirty data which
+  * needs to be flushed before we can commit the current transaction.
+  *
+  * The buffer is placed on the transaction's data list and is marked as
+  * belonging to the transaction.
+  *
+  * If `async' is set then the writeback will be initiated by the caller
+  * using submit_bh -> end_buffer_io_async.  We put the buffer onto
+  * t_async_datalist.
+  *
+  * Returns error number or 0 on success.
+  *
+  * journal_dirty_data() can be called via page_launder->ext3_writepage
+  * by kswapd.  So it cannot block.  Happily, there's nothing here
+  * which needs lock_journal if `async' is set.
+  *
+  * When the buffer is on the current transaction we freely move it
+  * between BJ_AsyncData and BJ_SyncData according to who tried to
+  * change its state last.
+  */
+
+ int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
+ {
+       journal_t *journal = handle->h_transaction->t_journal;
+       int need_brelse = 0;
+       int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
+       struct journal_head *jh;
+
+       if (is_handle_aborted(handle))
+               return 0;
+
+       jh = journal_add_journal_head(bh);
+       JBUFFER_TRACE(jh, "entry");
+
+       /*
+        * The buffer could *already* be dirty.  Writeout can start
+        * at any time.
+        */
+       jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
+
+       /*
+        * What if the buffer is already part of a running transaction?
+        *
+        * There are two cases:
+        * 1) It is part of the current running transaction.  Refile it,
+        *    just in case we have allocated it as metadata, deallocated
+        *    it, then reallocated it as data.
+        * 2) It is part of the previous, still-committing transaction.
+        *    If all we want to do is to guarantee that the buffer will be
+        *    written to disk before this new transaction commits, then
+        *    being sure that the *previous* transaction has this same
+        *    property is sufficient for us!  Just leave it on its old
+        *    transaction.
+        *
+        * In case (2), the buffer must not already exist as metadata
+        * --- that would violate write ordering (a transaction is free
+        * to write its data at any point, even before the previous
+        * committing transaction has committed).  The caller must
+        * never, ever allow this to happen: there's nothing we can do
+        * about it in this layer.
+        */
+       spin_lock(&journal_datalist_lock);
+       if (jh->b_transaction) {
+               JBUFFER_TRACE(jh, "has transaction");
+               if (jh->b_transaction != handle->h_transaction) {
+                       JBUFFER_TRACE(jh, "belongs to older transaction");
+                       J_ASSERT_JH(jh, jh->b_transaction ==
+                                       journal->j_committing_transaction);
+
+                       /* @@@ IS THIS TRUE  ? */
+                       /*
+                        * Not any more.  Scenario: someone does a write()
+                        * in data=journal mode.  The buffer's transaction has
+                        * moved into commit.  Then someone does another
+                        * write() to the file.  We do the frozen data copyout
+                        * and set b_next_transaction to point to j_running_t.
+                        * And while we're in that state, someone does a
+                        * writepage() in an attempt to pageout the same area
+                        * of the file via a shared mapping.  At present that
+                        * calls journal_dirty_data(), and we get right here.
+                        * It may be too late to journal the data.  Simply
+                        * falling through to the next test will suffice: the
+                        * data will be dirty and will be checkpointed.  The
+                        * ordering comments in the next comment block still
+                        * apply.
+                        */
+                       //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
+                       /*
+                        * If we're journalling data, and this buffer was
+                        * subject to a write(), it could be metadata, forget
+                        * or shadow against the committing transaction.  Now,
+                        * someone has dirtied the same darn page via a mapping
+                        * and it is being writepage()'d.
+                        * We *could* just steal the page from commit, with some
+                        * fancy locking there.  Instead, we just skip it -
+                        * don't tie the page's buffers to the new transaction
+                        * at all.
+                        * Implication: if we crash before the writepage() data
+                        * is written into the filesystem, recovery will replay
+                        * the write() data.
+                        */
+                       if (jh->b_jlist != BJ_None &&
+                                       jh->b_jlist != BJ_SyncData &&
+                                       jh->b_jlist != BJ_AsyncData) {
+                               JBUFFER_TRACE(jh, "Not stealing");
+                               goto no_journal;
+                       }
+
+                       /*
+                        * This buffer may be undergoing writeout in commit.  We
+                        * can't return from here and let the caller dirty it
+                        * again because that can cause the write-out loop in
+                        * commit to never terminate.
+                        */
+                       if (!async && buffer_dirty(bh)) {
+                               atomic_inc(&bh->b_count);
+                               spin_unlock(&journal_datalist_lock);
+                               need_brelse = 1;
+                               ll_rw_block(WRITE, 1, &bh);
+                               wait_on_buffer(bh);
+                               spin_lock(&journal_datalist_lock);
+                               /* The buffer may become locked again at any
+                                  time if it is redirtied */
+                       }
+
+                       /* journal_clean_data_list() may have got there first */
+                       if (jh->b_transaction != NULL) {
+                               JBUFFER_TRACE(jh, "unfile from commit");
+                               __journal_unfile_buffer(jh);
+                               jh->b_transaction = NULL;
+                       }
+                       /* The buffer will be refiled below */
+
+               }
+               /*
+                * Special case --- the buffer might actually have been
+                * allocated and then immediately deallocated in the previous,
+                * committing transaction, so might still be left on that
+                * transaction's metadata lists.
+                */
+               if (jh->b_jlist != wanted_jlist) {
+                       JBUFFER_TRACE(jh, "not on correct data list: unfile");
+                       J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       JBUFFER_TRACE(jh, "file as data");
+                       __journal_file_buffer(jh, handle->h_transaction,
+                                               wanted_jlist);
+               }
+       } else {
+               JBUFFER_TRACE(jh, "not on a transaction");
+               __journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
+       }
+       /*
+        * We need to mark the buffer dirty and refile it inside the lock to
+        * protect it from release by journal_try_to_free_buffer()
+        *
+        * We set ->b_flushtime to something small enough to typically keep
+        * kupdate away from the buffer.
+        *
+        * We don't need to do a balance_dirty() - __block_commit_write()
+        * does that.
+        */
+       if (!async && !atomic_set_buffer_dirty(jh2bh(jh))) {
+               jh2bh(jh)->b_flushtime =
+                       jiffies + journal->j_commit_interval + 1 * HZ;
+               refile_buffer(jh2bh(jh));
+       }
+ no_journal:
+       spin_unlock(&journal_datalist_lock);
+       if (need_brelse) {
+               BUFFER_TRACE(bh, "brelse");
+               __brelse(bh);
+       }
+       JBUFFER_TRACE(jh, "exit");
+       journal_unlock_journal_head(jh);
+       return 0;
+ }
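+
+ /*
+  * Illustrative sketch, not part of this patch: a writepage-style
+  * caller might walk the page's buffer ring and file each buffer for
+  * writeout ahead of commit, roughly:
+  *
+  *    struct buffer_head *b = page->buffers;
+  *    do {
+  *            journal_dirty_data(handle, b, 1);   (* async writeback *)
+  *            b = b->b_this_page;
+  *    } while (b != page->buffers);
+  *
+  * With `async' set the buffers are filed as BJ_AsyncData and the
+  * actual submit_bh() is left to the caller, as described above.
+  */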
+
+ /*
+  * journal_dirty_metadata: mark a buffer as containing dirty metadata
+  * which needs to be journaled as part of the current transaction.
+  *
+  * The buffer is placed on the transaction's metadata list and is marked
+  * as belonging to the transaction.
+  *
+  * Special care needs to be taken if the buffer already belongs to the
+  * current committing transaction (in which case we should have frozen
+  * data present for that commit).  In that case, we don't relink the
+  * buffer: that only gets done when the old transaction finally
+  * completes its commit.
+  *
+  * Returns error number or 0 on success.
+  */
+
+ int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = bh2jh(bh);
+
+       jbd_debug(5, "journal_head %p\n", jh);
+       JBUFFER_TRACE(jh, "entry");
+       lock_journal(journal);
+       if (is_handle_aborted(handle))
+               goto out_unlock;
+
+       spin_lock(&journal_datalist_lock);
+       set_bit(BH_JBDDirty, &bh->b_state);
+       set_buffer_flushtime(bh);
+
+       J_ASSERT_JH(jh, jh->b_transaction != NULL);
+
+       /*
+        * Metadata already on the current transaction list doesn't
+        * need to be filed.  Metadata on another transaction's list must
+        * be committing, and will be refiled once the commit completes:
+        * leave it alone for now.
+        */
+
+       if (jh->b_transaction != transaction) {
+               JBUFFER_TRACE(jh, "already on other transaction");
+               J_ASSERT_JH(jh, jh->b_transaction ==
+                                       journal->j_committing_transaction);
+               J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+               /* And this case is illegal: we can't reuse another
+                * transaction's data buffer, ever. */
+               /* FIXME: writepage() should be journalled */
+               J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData);
+               goto done_locked;
+       }
+
+       /* That test should have eliminated the following case: */
+       J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+
+       JBUFFER_TRACE(jh, "file as BJ_Metadata");
+       __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+
+ done_locked:
+       spin_unlock(&journal_datalist_lock);
+       JBUFFER_TRACE(jh, "exit");
+ out_unlock:
+       unlock_journal(journal);
+       return 0;
+ }
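+
+ /*
+  * Illustrative sketch, not part of this patch: the usual metadata
+  * update sequence brackets the modification between a write-access
+  * request and this call, roughly:
+  *
+  *    err = journal_get_write_access(handle, bh);
+  *    if (!err) {
+  *            (* ... modify bh->b_data ... *)
+  *            err = journal_dirty_metadata(handle, bh);
+  *    }
+  */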
+
+ #if 0
+ /*
+  * journal_release_buffer: undo a get_write_access without any buffer
+  * updates, if the update decided in the end that it didn't need access.
+  *
+  * journal_get_write_access() can block, so it is quite possible for a
+  * journaling component to decide after the write access is returned
+  * that global state has changed and the update is no longer required.  */
+
+ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = bh2jh(bh);
+
+       lock_journal(journal);
+       JBUFFER_TRACE(jh, "entry");
+
+       /* If the buffer is reserved but not modified by this
+        * transaction, then it is safe to release it.  In all other
+        * cases, just leave the buffer as it is. */
+
+       spin_lock(&journal_datalist_lock);
+       if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction &&
+           !buffer_jdirty(jh2bh(jh))) {
+               JBUFFER_TRACE(jh, "unused: refiling it");
+               handle->h_buffer_credits++;
+               __journal_refile_buffer(jh);
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       JBUFFER_TRACE(jh, "exit");
+       unlock_journal(journal);
+ }
+ #endif
+
+ /*
+  * journal_forget: bforget() for potentially-journaled buffers.  We can
+  * only do the bforget if there are no commits pending against the
+  * buffer.  If the buffer is dirty in the current running transaction we
+  * can safely unlink it.
+  *
+  * bh may not be a journalled buffer at all - it may be a non-JBD
+  * buffer which came off the hashtable.  Check for this.
+  *
+  * Decrements bh->b_count by one.
+  *
+  * Allow this call even if the handle has aborted --- it may be part of
+  * the caller's cleanup after an abort.
+  */
+
+ void journal_forget (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh;
+
+       BUFFER_TRACE(bh, "entry");
+
+       lock_journal(journal);
+       spin_lock(&journal_datalist_lock);
+
+       if (!buffer_jbd(bh))
+               goto not_jbd;
+       jh = bh2jh(bh);
+
+       if (jh->b_transaction == handle->h_transaction) {
+               J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+               /* If we are forgetting a buffer which is already part
+                * of this transaction, then we can just drop it from
+                * the transaction immediately. */
+               clear_bit(BH_Dirty, &bh->b_state);
+               clear_bit(BH_JBDDirty, &bh->b_state);
+
+               JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+               J_ASSERT_JH(jh, !jh->b_committed_data);
+
+               __journal_unfile_buffer(jh);
+               jh->b_transaction = 0;
+
+               /*
+                * We are no longer going to journal this buffer.
+                * However, the commit of this transaction is still
+                * important to the buffer: the delete that we are now
+                * processing might obsolete an old log entry, so by
+                * committing, we can satisfy the buffer's checkpoint.
+                *
+                * So, if we have a checkpoint on the buffer, we should
+                * now refile the buffer on our BJ_Forget list so that
+                * we know to remove the checkpoint after we commit.
+                */
+
+               if (jh->b_cp_transaction) {
+                       __journal_file_buffer(jh, transaction, BJ_Forget);
+               } else {
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+                       if (!buffer_jbd(bh)) {
+                               spin_unlock(&journal_datalist_lock);
+                               unlock_journal(journal);
+                               __bforget(bh);
+                               return;
+                       }
+               }
+
+       } else if (jh->b_transaction) {
+               J_ASSERT_JH(jh, (jh->b_transaction ==
+                                journal->j_committing_transaction));
+               /* However, if the buffer is still owned by a prior
+                * (committing) transaction, we can't drop it yet... */
+               JBUFFER_TRACE(jh, "belongs to older transaction");
+               /* ... but we CAN drop it from the new transaction if we
+                * have also modified it since the original commit. */
+
+               if (jh->b_next_transaction) {
+                       J_ASSERT(jh->b_next_transaction == transaction);
+                       jh->b_next_transaction = NULL;
+               }
+       }
+
+ not_jbd:
+       spin_unlock(&journal_datalist_lock);
+       unlock_journal(journal);
+       __brelse(bh);
+       return;
+ }
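+
+ /*
+  * Illustrative sketch, not part of this patch: a delete path which has
+  * just freed a metadata block, and which holds a b_count reference on
+  * its buffer, would discard it with
+  *
+  *    journal_forget(handle, bh);   (* consumes one b_count ref *)
+  *
+  * where a non-journaling filesystem would have called bforget(bh).
+  */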
+
+ #if 0 /* Unused */
+ /*
+  * journal_sync_buffer: flush a potentially-journaled buffer to disk.
+  *
+  * Used for O_SYNC filesystem operations.  If the buffer is journaled,
+  * we need to complete the O_SYNC by waiting for the transaction to
+  * complete.  It is an error to call journal_sync_buffer before
+  * journal_stop!
+  */
+
+ void journal_sync_buffer(struct buffer_head *bh)
+ {
+       transaction_t *transaction;
+       journal_t *journal;
+       long sequence;
+       struct journal_head *jh;
+
+       /* If the buffer isn't journaled, this is easy: just sync it to
+        * disk.  */
+       BUFFER_TRACE(bh, "entry");
+
+       spin_lock(&journal_datalist_lock);
+       if (!buffer_jbd(bh)) {
+               spin_unlock(&journal_datalist_lock);
+               return;
+       }
+       jh = bh2jh(bh);
+       if (jh->b_transaction == NULL) {
+               /* If the buffer has already been journaled, then this
+                * is a noop. */
+               if (jh->b_cp_transaction == NULL) {
+                       spin_unlock(&journal_datalist_lock);
+                       return;
+               }
+               atomic_inc(&bh->b_count);
+               spin_unlock(&journal_datalist_lock);
+               ll_rw_block (WRITE, 1, &bh);
+               wait_on_buffer(bh);
+               __brelse(bh);
+               goto out;
+       }
+
+       /* Otherwise, just wait until the transaction is synced to disk. */
+       transaction = jh->b_transaction;
+       journal = transaction->t_journal;
+       sequence = transaction->t_tid;
+       spin_unlock(&journal_datalist_lock);
+
+       jbd_debug(2, "requesting commit for jh %p\n", jh);
+       log_start_commit (journal, transaction);
+
+       while (tid_gt(sequence, journal->j_commit_sequence)) {
+               wake_up(&journal->j_wait_done_commit);
+               sleep_on(&journal->j_wait_done_commit);
+       }
+       JBUFFER_TRACE(jh, "exit");
+ out:
+       return;
+ }
+ #endif
+
+ /*
+  * All done for a particular handle.
+  *
+  * There is not much action needed here.  We just return any remaining
+  * buffer credits to the transaction and remove the handle.  The only
+  * complication is that we need to start a commit operation if the
+  * filesystem is marked for synchronous update.
+  *
+  * journal_stop itself will not usually return an error, but it may
+  * do so in unusual circumstances.  In particular, expect it to
+  * return -EIO if a journal_abort has been executed since the
+  * transaction began.
+  */
+
+ int journal_stop(handle_t *handle)
+ {
+       transaction_t *transaction;
+       journal_t *journal;
+       int old_handle_count, err;
+
+       if (!handle)
+               return 0;
+
+       /* handle may legitimately be NULL: check before dereferencing */
+       transaction = handle->h_transaction;
+       journal = transaction->t_journal;
+
+       J_ASSERT (transaction->t_updates > 0);
+       J_ASSERT (journal_current_handle() == handle);
+
+       if (is_handle_aborted(handle))
+               err = -EIO;
+       else
+               err = 0;
+
+       if (--handle->h_ref > 0) {
+               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+                         handle->h_ref);
+               return err;
+       }
+
+       jbd_debug(4, "Handle %p going down\n", handle);
+
+       /*
+        * Implement synchronous transaction batching.  If the handle
+        * was synchronous, don't force a commit immediately.  Let's
+        * yield and let another thread piggyback onto this transaction.
+        * Keep doing that while new threads continue to arrive.
+        * It doesn't cost much - we're about to run a commit and sleep
+        * on IO anyway.  Speeds up many-threaded, many-dir operations
+        * by 30x or more...
+        */
+       if (handle->h_sync) {
+               do {
+                       old_handle_count = transaction->t_handle_count;
+                       set_current_state(TASK_RUNNING);
+                       current->policy |= SCHED_YIELD;
+                       schedule();
+               } while (old_handle_count != transaction->t_handle_count);
+       }
+
+       current->journal_info = NULL;
+       transaction->t_outstanding_credits -= handle->h_buffer_credits;
+       transaction->t_updates--;
+       if (!transaction->t_updates) {
+               wake_up(&journal->j_wait_updates);
+               if (journal->j_barrier_count)
+                       wake_up(&journal->j_wait_transaction_locked);
+       }
+
+       /*
+        * If the handle is marked SYNC, we need to set another commit
+        * going!  We also want to force a commit if the current
+        * transaction is occupying too much of the log, or if the
+        * transaction is too old now.
+        */
+       if (handle->h_sync ||
+                       transaction->t_outstanding_credits >
+                               journal->j_max_transaction_buffers ||
+                       time_after_eq(jiffies, transaction->t_expires)) {
+               /* Do this even for aborted journals: an abort still
+                * completes the commit thread, it just doesn't write
+                * anything to disk. */
+               tid_t tid = transaction->t_tid;
+
+               jbd_debug(2, "transaction too old, requesting commit for "
+                                       "handle %p\n", handle);
+               /* This is non-blocking */
+               log_start_commit(journal, transaction);
+
+               /*
+                * Special case: JFS_SYNC synchronous updates require us
+                * to wait for the commit to complete.
+                */
+               if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+                       log_wait_commit(journal, tid);
+       }
+       kfree(handle);
+       return err;
+ }
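+
+ /*
+  * Illustrative sketch, not part of this patch: a complete handle
+  * lifecycle, assuming a journal_t *journal and a credit estimate of
+  * NBLOCKS modified buffers (both assumed names):
+  *
+  *    handle_t *handle = journal_start(journal, NBLOCKS);
+  *    if (IS_ERR(handle))
+  *            return PTR_ERR(handle);
+  *    (* ... journal_get_write_access() / journal_dirty_metadata() ... *)
+  *    if (do_sync)
+  *            handle->h_sync = 1;   (* commit, batched as above *)
+  *    err = journal_stop(handle);
+  */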
+
+ /*
+  * For synchronous operations: force any uncommitted transactions
+  * to disk.  May seem kludgy, but it reuses all the handle batching
+  * code in a very simple manner.
+  */
+ int journal_force_commit(journal_t *journal)
+ {
+       handle_t *handle;
+       int ret = 0;
+
+       lock_kernel();
+       handle = journal_start(journal, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+       handle->h_sync = 1;
+       journal_stop(handle);
+ out:
+       unlock_kernel();
+       return ret;
+ }
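+
+ /*
+  * Illustrative sketch, not part of this patch: an O_SYNC or
+  * fsync-style path with no metadata updates of its own could simply do
+  *
+  *    err = journal_force_commit(journal);
+  *
+  * to push the running transaction, and anything batched into it, out
+  * to disk.
+  */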
+
+ /*
+  *
+  * List management code snippets: various functions for manipulating the
+  * transaction buffer lists.
+  *
+  */
+
+ /*
+  * Append a buffer to a transaction list, given the transaction's list head
+  * pointer.
+  * journal_datalist_lock is held.
+  */
+
+ static inline void
+ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+       if (!*list) {
+               jh->b_tnext = jh->b_tprev = jh;
+               *list = jh;
+       } else {
+               /* Insert at the tail of the list to preserve order */
+               struct journal_head *first = *list, *last = first->b_tprev;
+               jh->b_tprev = last;
+               jh->b_tnext = first;
+               last->b_tnext = first->b_tprev = jh;
+       }
+ }
+
+ /*
+  * Remove a buffer from a transaction list, given the transaction's list
+  * head pointer.
+  *
+  * Called with journal_datalist_lock held, and the journal may not
+  * be locked.
+  */
+
+ static inline void
+ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+       if (*list == jh) {
+               *list = jh->b_tnext;
+               if (*list == jh)
+                       *list = 0;
+       }
+       jh->b_tprev->b_tnext = jh->b_tnext;
+       jh->b_tnext->b_tprev = jh->b_tprev;
+ }
+
+ /*
+  * Remove a buffer from the appropriate transaction list.
+  *
+  * Note that this function can *change* the value of
+  * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
+  * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+  * is holding onto a copy of one of these pointers, it could go bad.
+  * Generally the caller needs to re-read the pointer from the transaction_t.
+  *
+  * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
+  * via journal_try_to_free_buffer() or journal_clean_data_list().  In that
+  * case, journal_datalist_lock will be held, and the journal may not be locked.
+  */
+ void __journal_unfile_buffer(struct journal_head *jh)
+ {
+       struct journal_head **list = 0;
+       transaction_t * transaction;
+
+       assert_spin_locked(&journal_datalist_lock);
+       transaction = jh->b_transaction;
+
+ #ifdef __SMP__
+       J_ASSERT (current->lock_depth >= 0);
+ #endif
+       J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+
+       if (jh->b_jlist != BJ_None)
+               J_ASSERT_JH(jh, transaction != 0);
+
+       switch (jh->b_jlist) {
+       case BJ_None:
+               return;
+       case BJ_SyncData:
+               list = &transaction->t_sync_datalist;
+               break;
+       case BJ_AsyncData:
+               list = &transaction->t_async_datalist;
+               break;
+       case BJ_Metadata:
+               transaction->t_nr_buffers--;
+               J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+               list = &transaction->t_buffers;
+               break;
+       case BJ_Forget:
+               list = &transaction->t_forget;
+               break;
+       case BJ_IO:
+               list = &transaction->t_iobuf_list;
+               break;
+       case BJ_Shadow:
+               list = &transaction->t_shadow_list;
+               break;
+       case BJ_LogCtl:
+               list = &transaction->t_log_list;
+               break;
+       case BJ_Reserved:
+               list = &transaction->t_reserved_list;
+               break;
+       }
+
+       __blist_del_buffer(list, jh);
+       jh->b_jlist = BJ_None;
+       if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) {
+               set_bit(BH_Dirty, &jh2bh(jh)->b_state);
+       }
+ }
+
+ void journal_unfile_buffer(struct journal_head *jh)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_unfile_buffer(jh);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * Called from journal_try_to_free_buffers().  The journal is not
+  * locked. lru_list_lock is not held.
+  *
+  * Here we see why journal_datalist_lock is global and not per-journal.
+  * We cannot get back to this buffer's journal pointer without locking
+  * out journal_clean_data_list() in some manner.
+  *
+  * One could use journal_datalist_lock to get race-free access to a
+  * per-journal lock.
+  *
+  * Called with journal_datalist_lock held.
+  *
+  * Returns non-zero iff we were able to free the journal_head.
+  */
+ static int __journal_try_to_free_buffer(struct buffer_head *bh,
+                                       int *locked_or_dirty)
+ {
+       struct journal_head *jh;
+
+       assert_spin_locked(&journal_datalist_lock);
+
+       if (!buffer_jbd(bh))
+               return 1;
+       jh = bh2jh(bh);
+
+       if (buffer_locked(bh) || buffer_dirty(bh)) {
+               *locked_or_dirty = 1;
+               goto out;
+       }
+
+       if (!buffer_uptodate(bh))
+               goto out;
+
+       if (jh->b_next_transaction != 0)
+               goto out;
+
+       if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+               if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
+                       /* A written-back ordered data buffer */
+                       JBUFFER_TRACE(jh, "release data");
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = 0;
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+               }
+       }
+       else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+               /* written-back checkpointed metadata buffer */
+               if (jh->b_jlist == BJ_None) {
+                       JBUFFER_TRACE(jh, "remove from checkpoint list");
+                       __journal_remove_checkpoint(jh);
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+               }
+       }
+       return !buffer_jbd(bh);
+
+ out:
+       return 0;
+ }
+
+ /*
+  * journal_try_to_free_buffers().  For all the buffers on this page,
+  * if they are fully written out ordered data, move them onto BUF_CLEAN
+  * so try_to_free_buffers() can reap them.  Called with lru_list_lock
+  * not held.  Does its own locking.
+  *
+  * This complicates JBD locking somewhat.  We aren't protected by the
+  * BKL here.  We wish to remove the buffer from its committing or
+  * running transaction's ->t_datalist via __journal_unfile_buffer.
+  *
+  * This may *change* the value of transaction_t->t_datalist, so anyone
+  * who looks at t_datalist needs to lock against this function.
+  *
+  * Even worse, someone may be doing a journal_dirty_data on this
+  * buffer.  So we need to lock against that.  journal_dirty_data()
+  * will come out of the lock with the buffer dirty, which makes it
+  * ineligible for release here.
+  *
+  * Who else is affected by this?  hmm...  Really the only contender
+  * is do_get_write_access() - it could be looking at the buffer while
+  * journal_try_to_free_buffer() is changing its state.  But that
+  * cannot happen because we never reallocate freed data as metadata
+  * while the data is part of a transaction.  Yes?
+  *
+  * This function returns non-zero if we wish try_to_free_buffers()
+  * to be called.  We do this if the page is releasable by try_to_free_buffers().
+  * We also do it if the page has locked or dirty buffers and the caller wants
+  * us to perform sync or async writeout.
+  */
+ int journal_try_to_free_buffers(journal_t *journal,
+                               struct page *page, int gfp_mask)
+ {
+       struct buffer_head *bh;
+       struct buffer_head *tmp;
+       int locked_or_dirty = 0;
+       int call_ttfb = 1;
+
+       J_ASSERT(PageLocked(page));
+
+       bh = page->buffers;
+       tmp = bh;
+       spin_lock(&journal_datalist_lock);
+       do {
+               struct buffer_head *p = tmp;
+
+               tmp = tmp->b_this_page;
+               if (buffer_jbd(p))
+                       if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
+                               call_ttfb = 0;
+       } while (tmp != bh);
+       spin_unlock(&journal_datalist_lock);
+
+       if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
+               goto out;
+       if (!locked_or_dirty)
+               goto out;
+       /*
+        * The VM wants us to do writeout, or to block on IO, or both.
+        * So we allow try_to_free_buffers to be called even if the page
+        * still has journalled buffers.
+        */
+       call_ttfb = 1;
+ out:
+       return call_ttfb;
+ }
+
+ /*
+  * This buffer is no longer needed.  If it is on an older transaction's
+  * checkpoint list we need to record it on this transaction's forget list
+  * to pin this buffer (and hence its checkpointing transaction) down until
+  * this transaction commits.  If the buffer isn't on a checkpoint list, we
+  * release it.
+  * Returns non-zero if JBD no longer has an interest in the buffer.
+  */
+ static int dispose_buffer(struct journal_head *jh,
+               transaction_t *transaction)
+ {
+       int may_free = 1;
+       struct buffer_head *bh = jh2bh(jh);
+
+       spin_lock(&journal_datalist_lock);
+       __journal_unfile_buffer(jh);
+       jh->b_transaction = 0;
+
+       if (jh->b_cp_transaction) {
+               JBUFFER_TRACE(jh, "on running+cp transaction");
+               __journal_file_buffer(jh, transaction, BJ_Forget);
+               clear_bit(BH_JBDDirty, &bh->b_state);
+               may_free = 0;
+       } else {
+               JBUFFER_TRACE(jh, "on running transaction");
+               __journal_remove_journal_head(bh);
+               __brelse(bh);
+       }
+       spin_unlock(&journal_datalist_lock);
+       return may_free;
+ }
+
+ /*
+  * journal_flushpage
+  *
+  * This code is tricky.  It has a number of cases to deal with.
+  *
+  * There are two invariants which this code relies on:
+  *
+  * i_size must be updated on disk before we start calling flushpage on the
+  * data.
+  *
+  *  This is done in ext3 by defining an ext3_setattr method which
+  *  updates i_size before truncate gets going.  By maintaining this
+  *  invariant, we can be sure that it is safe to throw away any buffers
+  *  attached to the current transaction: once the transaction commits,
+  *  we know that the data will not be needed.
+  *
+  *  Note however that we can *not* throw away data belonging to the
+  *  previous, committing transaction!
+  *
+  * Any disk blocks which *are* part of the previous, committing
+  * transaction (and which therefore cannot be discarded immediately) are
+  * not going to be reused in the new running transaction.
+  *
+  *  The bitmap committed_data images guarantee this: any block which is
+  *  allocated in one transaction and removed in the next will be marked
+  *  as in-use in the committed_data bitmap, so cannot be reused until
+  *  the next transaction to delete the block commits.  This means that
+  *  leaving committing buffers dirty is quite safe: the disk blocks
+  *  cannot be reallocated to a different file and so buffer aliasing is
+  *  not possible.
+  *
+  * The above applies mainly to ordered data mode.  In writeback mode we
+  * don't make guarantees about the order in which data hits disk --- in
+  * particular we don't guarantee that new dirty data is flushed before
+  * transaction commit --- so it is always safe just to discard data
+  * immediately in that mode.  --sct
+  */
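+
+ /*
+  * Illustrative sketch, not part of this patch: the first invariant
+  * above is what an ext3_setattr-style method provides by journaling
+  * the new i_size before the truncate proper (credits and the inode
+  * journaling step are assumed details):
+  *
+  *    handle = journal_start(journal, credits);
+  *    inode->i_size = attr->ia_size;
+  *    (* ... journal the updated inode ... *)
+  *    journal_stop(handle);
+  *    vmtruncate(inode, attr->ia_size);   (* flushpage runs in here *)
+  */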
+
+ /*
+  * The journal_unmap_buffer helper function returns zero if the buffer
+  * concerned remains pinned as an anonymous buffer belonging to an older
+  * transaction.
+  *
+  * We're outside-transaction here.  Either or both of j_running_transaction
+  * and j_committing_transaction may be NULL.
+  */
+ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+ {
+       transaction_t *transaction;
+       struct journal_head *jh;
+       int may_free = 1;
+
+       BUFFER_TRACE(bh, "entry");
+
+       if (!buffer_mapped(bh))
+               return 1;
+
+       /* It is safe to proceed here without the
+        * journal_datalist_spinlock because the buffers cannot be
+        * stolen by try_to_free_buffers as long as we are holding the
+        * page lock. --sct */
+
+       if (!buffer_jbd(bh))
+               goto zap_buffer;
+
+       jh = bh2jh(bh);
+       transaction = jh->b_transaction;
+       if (transaction == NULL) {
+               /* First case: not on any transaction.  If it
+                * has no checkpoint link, then we can zap it:
+                * it's a writeback-mode buffer so we don't care
+                * if it hits disk safely. */
+               if (!jh->b_cp_transaction) {
+                       JBUFFER_TRACE(jh, "not on any transaction: zap");
+                       goto zap_buffer;
+               }
+
+               if (!buffer_dirty(bh)) {
+                       /* bdflush has written it.  We can drop it now */
+                       goto zap_buffer;
+               }
+
+               /* OK, it must be in the journal but still not
+                * written fully to disk: it's metadata or
+                * journaled data... */
+
+               if (journal->j_running_transaction) {
+                       /* ... and once the current transaction has
+                        * committed, the buffer won't be needed any
+                        * longer. */
+                       JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+                       return dispose_buffer(jh,
+                                       journal->j_running_transaction);
+               } else {
+                       /* There is no currently-running transaction. So the
+                        * orphan record which we wrote for this file must have
+                        * passed into commit.  We must attach this buffer to
+                        * the committing transaction, if it exists. */
+                       if (journal->j_committing_transaction) {
+                               JBUFFER_TRACE(jh, "give to committing trans");
+                               return dispose_buffer(jh,
+                                       journal->j_committing_transaction);
+                       } else {
+                               /* The orphan record's transaction has
+                                * committed.  We can cleanse this buffer */
+                               clear_bit(BH_JBDDirty, &bh->b_state);
+                               goto zap_buffer;
+                       }
+               }
+       } else if (transaction == journal->j_committing_transaction) {
+               /* If it is committing, we simply cannot touch it.  We
+                * can remove its next_transaction pointer from the
+                * running transaction if that is set, but nothing
+                * else. */
+               JBUFFER_TRACE(jh, "on committing transaction");
+               if (jh->b_next_transaction) {
+                       J_ASSERT(jh->b_next_transaction ==
+                                       journal->j_running_transaction);
+                       jh->b_next_transaction = NULL;
+               }
+               return 0;
+       } else {
+               /* Good, the buffer belongs to the running transaction.
+                * We are writing our own transaction's data, not any
+                * previous one's, so it is safe to throw it away
+                * (remember that we expect the filesystem to have set
+                * i_size already for this truncate so recovery will not
+                * expose the disk blocks we are discarding here.) */
+               J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+               may_free = dispose_buffer(jh, transaction);
+       }
+
+ zap_buffer:
+       if (buffer_dirty(bh))
+               mark_buffer_clean(bh);
+       J_ASSERT_BH(bh, !buffer_jdirty(bh));
+       clear_bit(BH_Uptodate, &bh->b_state);
+       clear_bit(BH_Mapped, &bh->b_state);
+       clear_bit(BH_Req, &bh->b_state);
+       clear_bit(BH_New, &bh->b_state);
+       return may_free;
+ }
+
+ /*
+  * Return non-zero if the page's buffers were successfully reaped
+  */
+ int journal_flushpage(journal_t *journal,
+                     struct page *page,
+                     unsigned long offset)
+ {
+       struct buffer_head *head, *bh, *next;
+       unsigned int curr_off = 0;
+       int may_free = 1;
+
+       if (!PageLocked(page))
+               BUG();
+       if (!page->buffers)
+               return 1;
+
+       /* We will potentially be playing with lists other than just the
+        * data lists (especially for journaled data mode), so be
+        * cautious in our locking. */
+       lock_journal(journal);
+
+       head = bh = page->buffers;
+       do {
+               unsigned int next_off = curr_off + bh->b_size;
+               next = bh->b_this_page;
+
+               /* AKPM: doing lock_buffer here may be overly paranoid */
+               if (offset <= curr_off) {
+                       /* This block is wholly outside the truncation point */
+                       lock_buffer(bh);
+                       may_free &= journal_unmap_buffer(journal, bh);
+                       unlock_buffer(bh);
+               }
+               curr_off = next_off;
+               bh = next;
+
+       } while (bh != head);
+
+       unlock_journal(journal);
+
+       if (!offset) {
+               if (!may_free || !try_to_free_buffers(page, 0)) {
+                       atomic_inc(&buffermem_pages);
+                       return 0;
+               }
+               J_ASSERT(page->buffers == NULL);
+       }
+
+       return 1;
+ }
+
+
+
+ /*
+  * File a buffer on the given transaction list.
+  */
+
+ void __journal_file_buffer(struct journal_head *jh,
+                       transaction_t *transaction, int jlist)
+ {
+       struct journal_head **list = 0;
+
+       assert_spin_locked(&journal_datalist_lock);
+
+ #ifdef __SMP__
+       J_ASSERT (current->lock_depth >= 0);
+ #endif
+       J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+       J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+                               jh->b_transaction == 0);
+
+       if (jh->b_transaction) {
+               if (jh->b_jlist == jlist)
+                       return;
+               __journal_unfile_buffer(jh);
+       } else {
+               jh->b_transaction = transaction;
+       }
+
+       switch (jlist) {
+       case BJ_None:
+               J_ASSERT_JH(jh, !jh->b_committed_data);
+               J_ASSERT_JH(jh, !jh->b_frozen_data);
+               return;
+       case BJ_SyncData:
+               list = &transaction->t_sync_datalist;
+               break;
+       case BJ_AsyncData:
+               list = &transaction->t_async_datalist;
+               break;
+       case BJ_Metadata:
+               transaction->t_nr_buffers++;
+               list = &transaction->t_buffers;
+               break;
+       case BJ_Forget:
+               list = &transaction->t_forget;
+               break;
+       case BJ_IO:
+               list = &transaction->t_iobuf_list;
+               break;
+       case BJ_Shadow:
+               list = &transaction->t_shadow_list;
+               break;
+       case BJ_LogCtl:
+               list = &transaction->t_log_list;
+               break;
+       case BJ_Reserved:
+               list = &transaction->t_reserved_list;
+               break;
+       }
+
+       __blist_add_buffer(list, jh);
+       jh->b_jlist = jlist;
+
+       if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+           jlist == BJ_Shadow || jlist == BJ_Forget) {
+               if (atomic_set_buffer_clean(jh2bh(jh))) {
+                       set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
+               }
+       }
+ }
+
+ void journal_file_buffer(struct journal_head *jh,
+                               transaction_t *transaction, int jlist)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_file_buffer(jh, transaction, jlist);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * Remove a buffer from its current buffer list in preparation for
+  * dropping it from its current transaction entirely.  If the buffer has
+  * already started to be used by a subsequent transaction, refile the
+  * buffer on that transaction's metadata list.
+  */
+
+ void __journal_refile_buffer(struct journal_head *jh)
+ {
+       assert_spin_locked(&journal_datalist_lock);
+ #ifdef __SMP__
+       J_ASSERT_JH(jh, current->lock_depth >= 0);
+ #endif
+       __journal_unfile_buffer(jh);
+
+       /* If the buffer is now unused, just drop it.  If it has been
+          modified by a later transaction, add it to the new
+          transaction's metadata list. */
+
+       jh->b_transaction = jh->b_next_transaction;
+       jh->b_next_transaction = NULL;
+
+       if (jh->b_transaction != NULL) {
+               __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+               J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+       } else {
+               /* Onto BUF_DIRTY for writeback */
+               refile_buffer(jh2bh(jh));
+       }
+ }
+
+ /*
+  * For the unlocked version of this call, also make sure that any
+  * hanging journal_head is cleaned up if necessary.
+  *
+  * __journal_refile_buffer is usually called as part of a single locked
+  * operation on a buffer_head, in which the caller is probably going to
+  * be hooking the journal_head onto other lists.  In that case it is up
+  * to the caller to remove the journal_head if necessary.  For the
+  * unlocked journal_refile_buffer call, the caller isn't going to be
+  * doing anything else to the buffer so we need to do the cleanup
+  * ourselves to avoid a jh leak.
+  *
+  * *** The journal_head may be freed by this call! ***
+  */
+ void journal_refile_buffer(struct journal_head *jh)
+ {
+       struct buffer_head *bh;
+
+       spin_lock(&journal_datalist_lock);
+       bh = jh2bh(jh);
+
+       __journal_refile_buffer(jh);
+       __journal_remove_journal_head(bh);
+
+       spin_unlock(&journal_datalist_lock);
+       __brelse(bh);
+ }
diff -rc2P linux/fs/jbd-kernel.c linux-2.4.13/fs/jbd-kernel.c
*** linux/fs/jbd-kernel.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd-kernel.c        Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,336 ----
+ /*
+  * fs/jbd-kernel.c
+  *
+  * Support code for the Journalling Block Device layer.
+  * This file contains things which have to be in-kernel when
+  * JBD is a module.
+  *
+  * 15 May 2001        Andrew Morton <[email protected]>
+  *    Created
+  */
+
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/module.h>
+ #include <linux/sched.h>
+
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+
+ /*
+  * jh_splice_lock needs explanation.
+  *
+  * In a number of places we want to do things like:
+  *
+  *    if (buffer_jbd(bh) && bh2jh(bh)->foo)
+  *
+  * This is racy on SMP, because another CPU could remove the journal_head
+  * in the middle of this expression.  We need locking.
+  *
+  * But we can greatly optimise the locking cost by testing BH_JBD
+  * outside the lock.  So, effectively:
+  *
+  *    ret = 0;
+  *    if (buffer_jbd(bh)) {
+  *            spin_lock(&jh_splice_lock);
+  *            if (buffer_jbd(bh)) {    (* Still there? *)
+  *                    ret = bh2jh(bh)->foo;
+  *            }
+  *            spin_unlock(&jh_splice_lock);
+  *    }
+  *    return ret;
+  *
+  * Now, that protects us from races where another CPU can remove the
+  * journal_head.  But it doesn't defend us from the situation where another
+  * CPU can *add* a journal_head.  This is a correctness issue.  But it's not
+  * a problem because a) the calling code was *already* racy and b) it often
+  * can't happen at the call site and c) the places where we add journal_heads
+  * tend to be under external locking.
+  */
+ spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED;
+ EXPORT_SYMBOL(jh_splice_lock);
+
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Some sanity testing which is called from mark_buffer_clean(),
+  * and must be present in the main kernel.
+  */
+
+ void jbd_preclean_buffer_check(struct buffer_head *bh)
+ {
+       if (buffer_jbd(bh)) {
+               struct journal_head *jh = bh2jh(bh);
+
+               transaction_t *transaction = jh->b_transaction;
+               journal_t *journal;
+
+               if (jh->b_jlist == 0 && transaction == NULL)
+                       return;
+
+               J_ASSERT_JH(jh, (jh->b_jlist == 0 ||
+                                jh->b_jlist == BJ_LogCtl ||
+                                jh->b_jlist == BJ_IO ||
+                                jh->b_jlist == BJ_Forget ||
+                                buffer_jbd_data(bh)));
+               J_ASSERT_JH(jh, transaction != NULL);
+               /* The kernel may be unmapping old data.  We expect it
+                * to be dirty in that case, unless the buffer has
+                * already been forgotten by a transaction. */
+               if (jh->b_jlist != BJ_Forget) {
+ #if 1
+                       if (!buffer_dirty(bh)) {
+                               printk(__FUNCTION__": clean of clean buffer\n");
+                               print_buffer_trace(bh);
+                               return;
+                       }
+ #endif
+                       J_ASSERT_BH(bh, buffer_dirty(bh));
+                       if (!buffer_jbd_data(bh)) {
+                               J_ASSERT_JH(jh,
+                                           test_bit(BH_JWrite,
+                                                    &jh2bh(jh)->b_state));
+                       }
+               }
+
+               journal = transaction->t_journal;
+               J_ASSERT_JH(jh,
+                           transaction == journal->j_running_transaction ||
+                           transaction == journal->j_committing_transaction);
+       }
+ }
+ EXPORT_SYMBOL(jbd_preclean_buffer_check);
+ #endif                /* CONFIG_JBD_DEBUG */
+
+ /*
+  * Entries in /proc/sys/fs
+  */
+
+ int journal_oom_retry = 1;
+ EXPORT_SYMBOL(journal_oom_retry);
+ #if defined(CONFIG_JBD_DEBUG)
+ int journal_enable_debug;
+ int journal_no_write[2];
+ EXPORT_SYMBOL(journal_enable_debug);
+ EXPORT_SYMBOL(journal_no_write);
+ #endif
+
+ #endif        /* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */
+
+ /*
+  * Support functions for BUFFER_TRACE()
+  */
+ #ifdef CONFIG_BUFFER_DEBUG
+
+ static spinlock_t trace_lock = SPIN_LOCK_UNLOCKED;
+
+ void buffer_trace(struct buffer_head *dest,
+               struct buffer_head *src, char *info)
+ {
+       struct buffer_history_item *bhist_i;
+       unsigned long flags;
+
+       if (dest == 0 || src == 0)
+               return;
+
+       spin_lock_irqsave(&trace_lock, flags);
+
+       /*
+        * Sometimes we don't initialise the ring pointers (locally declared
+        * temporary buffer_heads).  Feebly attempt to detect and correct
+        * that here.
+        */
+       if ((dest->b_history.b_history_head - dest->b_history.b_history_tail >
+                               BUFFER_HISTORY_SIZE)) {
+               dest->b_history.b_history_head = 0;
+               dest->b_history.b_history_tail = 0;
+       }
+       bhist_i = dest->b_history.b +
+               (dest->b_history.b_history_head & (BUFFER_HISTORY_SIZE - 1));
+       bhist_i->info = info;
+       bhist_i->b_state = src->b_state;
+       bhist_i->b_list = src->b_list;
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       bhist_i->b_trans_is_running = 0;
+       bhist_i->b_trans_is_committing = 0;
+       bhist_i->b_blocknr = src->b_blocknr;
+       if (buffer_jbd(src)) {
+               struct journal_head *jh;
+               journal_t *journal;
+               transaction_t *transaction;
+
+               /* Footwork to avoid racing with journal_remove_journal_head */
+               jh = src->b_private;
+               if (jh == 0)
+                       goto raced;
+               transaction = jh->b_transaction;
+               if (src->b_private == 0)
+                       goto raced;
+               bhist_i->b_jcount = jh->b_jcount;
+               bhist_i->b_jbd = 1;
+               bhist_i->b_jlist = jh->b_jlist;
+               bhist_i->b_frozen_data = jh->b_frozen_data;
+               bhist_i->b_committed_data = jh->b_committed_data;
+               bhist_i->b_transaction = !!jh->b_transaction;
+               bhist_i->b_next_transaction = !!jh->b_next_transaction;
+               bhist_i->b_cp_transaction = !!jh->b_cp_transaction;
+
+               if (transaction) {
+                       journal = transaction->t_journal;
+                       bhist_i->b_trans_is_running = transaction ==
+                                       journal->j_running_transaction;
+                       bhist_i->b_trans_is_committing = transaction ==
+                                       journal->j_committing_transaction;
+               }
+       } else {
+ raced:
+               bhist_i->b_jcount = 0;
+               bhist_i->b_jbd = 0;
+               bhist_i->b_jlist = 0;
+               bhist_i->b_frozen_data = 0;
+               bhist_i->b_committed_data = 0;
+               bhist_i->b_transaction = 0;
+               bhist_i->b_next_transaction = 0;
+               bhist_i->b_cp_transaction = 0;
+       }
+ #endif        /* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */
+
+       bhist_i->on_lru = (src->b_prev_free != 0 && src->b_next_free != 0);
+       bhist_i->on_hash = (src->b_pprev != 0);
+       bhist_i->cpu = smp_processor_id();
+       bhist_i->b_count = atomic_read(&src->b_count);
+
+       dest->b_history.b_history_head++;
+       if (dest->b_history.b_history_head - dest->b_history.b_history_tail >
+                               BUFFER_HISTORY_SIZE)
+               dest->b_history.b_history_tail =
+                       dest->b_history.b_history_head - BUFFER_HISTORY_SIZE;
+
+       spin_unlock_irqrestore(&trace_lock, flags);
+ }
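+
+ /*
+  * Note, not from this patch: because BUFFER_HISTORY_SIZE is a power of
+  * two, the monotonically increasing head counter maps onto a ring slot
+  * with a cheap mask rather than a division, e.g. for a size of 32:
+  *
+  *    slot = head & (BUFFER_HISTORY_SIZE - 1);   (* head 33 -> slot 1 *)
+  */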
+
+ static const char *b_list_to_string(unsigned int b_list)
+ {
+       switch (b_list) {
+       case BUF_CLEAN:         return "BUF_CLEAN";
+       case BUF_LOCKED:        return "BUF_LOCKED";
+       case BUF_DIRTY:         return "BUF_DIRTY";
+       default:                return "Bad b_list";
+       }
+ }
+
+ static const char *b_jlist_to_string(unsigned int b_list)
+ {
+       switch (b_list) {
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       case BJ_None:           return "BJ_None";
+       case BJ_SyncData:       return "BJ_SyncData";
+       case BJ_AsyncData:      return "BJ_AsyncData";
+       case BJ_Metadata:       return "BJ_Metadata";
+       case BJ_Forget:         return "BJ_Forget";
+       case BJ_IO:             return "BJ_IO";
+       case BJ_Shadow:         return "BJ_Shadow";
+       case BJ_LogCtl:         return "BJ_LogCtl";
+       case BJ_Reserved:       return "BJ_Reserved";
+ #endif
+       default:                return "Bad b_jlist";
+       }
+ }
+
+ static void print_one_hist(struct buffer_history_item *bhist_i)
+ {
+       printk(" %s\n", bhist_i->info);
+       printk("     b_state:0x%lx b_list:%s b_jlist:%s on_lru:%d\n",
+                       bhist_i->b_state,
+                       b_list_to_string(bhist_i->b_list),
+                       b_jlist_to_string(bhist_i->b_jlist),
+                       bhist_i->on_lru);
+       printk("     cpu:%d on_hash:%d b_count:%d b_blocknr:%lu\n",
+                       bhist_i->cpu,
+                       bhist_i->on_hash,
+                       bhist_i->b_count,
+                       bhist_i->b_blocknr);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       printk("     b_jbd:%u b_frozen_data:%p b_committed_data:%p\n",
+                       bhist_i->b_jbd,
+                       bhist_i->b_frozen_data,
+                       bhist_i->b_committed_data);
+       printk("     b_transaction:%u b_next_transaction:%u "
+                       "b_cp_transaction:%u b_trans_is_running:%u\n",
+                       bhist_i->b_transaction,
+                       bhist_i->b_next_transaction,
+                       bhist_i->b_cp_transaction,
+                       bhist_i->b_trans_is_running);
+       printk("     b_trans_is_committing:%u b_jcount:%u ",
+                       bhist_i->b_trans_is_committing,
+                       bhist_i->b_jcount);
+ #endif
+       printk("\n");
+ }
+
+ void print_buffer_fields(struct buffer_head *bh)
+ {
+       printk("b_next:%p, b_blocknr:%lu b_count:%d b_flushtime:%lu\n",
+               bh->b_next, bh->b_blocknr, atomic_read(&bh->b_count),
+                       bh->b_flushtime);
+       printk("b_next_free:%p b_prev_free:%p b_this_page:%p b_reqnext:%p\n",
+               bh->b_next_free, bh->b_prev_free, bh->b_this_page,
+                       bh->b_reqnext);
+       printk("b_pprev:%p b_data:%p b_page:%p b_inode:%p b_list:%d\n",
+               bh->b_pprev, bh->b_data, bh->b_page, bh->b_inode, bh->b_list);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       if (buffer_jbd(bh)) {
+               struct journal_head *jh = bh2jh(bh);
+
+               printk("b_jlist:%u b_frozen_data:%p b_committed_data:%p\n",
+                       jh->b_jlist, jh->b_frozen_data, jh->b_committed_data);
+               printk(" b_transaction:%p b_next_transaction:%p "
+                               "b_cp_transaction:%p\n",
+                       jh->b_transaction, jh->b_next_transaction,
+                       jh->b_cp_transaction);
+               printk("b_cpnext:%p b_cpprev:%p\n",
+                       jh->b_cpnext, jh->b_cpprev);
+       }
+ #endif
+ }
+
+ void print_buffer_trace(struct buffer_head *bh)
+ {
+ #ifdef CONFIG_X86
+       extern void show_stack(unsigned long * esp);
+ #endif
+
+       unsigned long idx, count;
+       unsigned long flags;
+
+       printk("buffer trace for buffer at 0x%p (I am CPU %d)\n",
+                       bh, smp_processor_id());
+       BUFFER_TRACE(bh, "");           /* Record state now */
+
+       spin_lock_irqsave(&trace_lock, flags);
+       for (   idx = bh->b_history.b_history_tail, count = 0;
+               idx < bh->b_history.b_history_head &&
+                       count < BUFFER_HISTORY_SIZE;
+               idx++, count++)
+               print_one_hist(bh->b_history.b +
+                       (idx & (BUFFER_HISTORY_SIZE - 1)));
+
+       print_buffer_fields(bh);
+       spin_unlock_irqrestore(&trace_lock, flags);
+ #ifdef CONFIG_X86
+       show_stack(NULL);
+ #endif
+       printk("\n");
+ }
+
+ static struct buffer_head *failed_buffer_head;        /* For access with debuggers */
+
+ void buffer_assertion_failure(struct buffer_head *bh)
+ {
+       failed_buffer_head = bh;
+       print_buffer_trace(bh);
+ }
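+
+ /* A debugging assertion would typically reach buffer_assertion_failure()
+  * via a check of this shape (sketch only; the real assertion macros are
+  * defined elsewhere, e.g. in the JBD headers):
+  *
+  *     if (!buffer_jbd(bh))
+  *             buffer_assertion_failure(bh);
+  */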
+ EXPORT_SYMBOL(buffer_trace);
+ EXPORT_SYMBOL(print_buffer_trace);
+ EXPORT_SYMBOL(buffer_assertion_failure);
+ EXPORT_SYMBOL(print_buffer_fields);
+ #endif        /* CONFIG_BUFFER_DEBUG */
+
diff -rc2P linux/fs/open.c linux-2.4.13/fs/open.c
*** linux/fs/open.c     Fri Nov  9 16:15:08 2001
--- linux-2.4.13/fs/open.c      Fri Nov  9 16:57:59 2001
***************
*** 72,75 ****
--- 72,81 ----
 }

+ /*
+  * i_sem is taken outside i_truncate_sem because that is the
+  * order in which these locks are taken on the path
+  * generic_file_write->copy_from_user->handle_mm_fault->do_no_page
+  */
+
 int do_truncate(struct dentry *dentry, loff_t length)
 {
***************
*** 83,89 ****
--- 89,97 ----

       down(&inode->i_sem);
+       down_write(&inode->i_truncate_sem);
       newattrs.ia_size = length;
       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
       error = notify_change(dentry, &newattrs);
+       up_write(&inode->i_truncate_sem);
       up(&inode->i_sem);
       return error;
diff -rc2P linux/include/linux/buffer-trace.h linux-2.4.13/include/linux/buffer-trace.h
*** linux/include/linux/buffer-trace.h  Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/buffer-trace.h   Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,84 ----
+ /*
+  * include/linux/buffer-trace.h
+  *
+  * Debugging support for recording buffer_head state transitions
+  *
+  * May 2001, akpm
+  *    Created
+  */
+
+ #ifndef BUFFER_TRACE_H_INCLUDED
+ #define BUFFER_TRACE_H_INCLUDED
+
+ #include <linux/config.h>
+
+ #ifdef CONFIG_BUFFER_DEBUG
+
+ /* The number of records per buffer_head.  Must be a power of two */
+ #define BUFFER_HISTORY_SIZE   32
+
+ struct buffer_head;
+
+ /* This gets embedded in struct buffer_head */
+ struct buffer_history {
+       struct buffer_history_item {
+               char *info;
+               unsigned long b_state;
+               unsigned b_list:3;
+               unsigned b_jlist:4;
+               unsigned on_lru:1;
+               unsigned on_hash:1;
+               unsigned cpu:3;
+               unsigned b_count:8;
+               unsigned long b_blocknr;        /* For src != dest */
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+               unsigned b_jcount:4;
+               unsigned b_jbd:1;
+               unsigned b_transaction:1;
+               unsigned b_next_transaction:1;
+               unsigned b_cp_transaction:1;
+               unsigned b_trans_is_running:1;
+               unsigned b_trans_is_committing:1;
+               void *b_frozen_data;
+               void *b_committed_data;
+ #endif
+       } b[BUFFER_HISTORY_SIZE];
+       unsigned long b_history_head;   /* Next place to write */
+       unsigned long b_history_tail;   /* Oldest valid entry */
+ };
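+
+ /* Because BUFFER_HISTORY_SIZE is a power of two, b_history_head and
+  * b_history_tail can be free-running counters and a slot is found by
+  * masking, which is how print_buffer_trace() replays the log:
+  *
+  *     item = bhist->b + (idx & (BUFFER_HISTORY_SIZE - 1));
+  */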
+
+ static inline void buffer_trace_init(struct buffer_history *bhist)
+ {
+       bhist->b_history_head = 0;
+       bhist->b_history_tail = 0;
+ }
+ extern void buffer_trace(struct buffer_head *dest,
+                       struct buffer_head *src, char *info);
+ extern void print_buffer_fields(struct buffer_head *bh);
+ extern void print_buffer_trace(struct buffer_head *bh);
+
+ #define BUFFER_STRINGIFY2(X)          #X
+ #define BUFFER_STRINGIFY(X)           BUFFER_STRINGIFY2(X)
+
+ #define BUFFER_TRACE2(dest, src, info)                                \
+       do {                                                    \
+               buffer_trace((dest), (src),                     \
+                       __FUNCTION__"() ["__FILE__":"           \
+                       BUFFER_STRINGIFY(__LINE__)"] " info);   \
+       } while (0)
+
+ #define BUFFER_TRACE(bh, info) BUFFER_TRACE2(bh, bh, info)
+ #define JBUFFER_TRACE(jh, info)       BUFFER_TRACE(jh2bh(jh), info)
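+
+ /* With GCC's string-literal __FUNCTION__, a call such as
+  * BUFFER_TRACE(bh, "enter") builds its info string entirely at
+  * compile time, giving something of the form (file and line here are
+  * illustrative):
+  *
+  *     "journal_dirty_data() [fs/jbd/transaction.c:1024] enter"
+  */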
+
+ #else         /* CONFIG_BUFFER_DEBUG */
+
+ #define buffer_trace_init(bh) do {} while (0)
+ #define print_buffer_fields(bh)       do {} while (0)
+ #define print_buffer_trace(bh)        do {} while (0)
+ #define BUFFER_TRACE(bh, info)        do {} while (0)
+ #define BUFFER_TRACE2(bh, bh2, info)  do {} while (0)
+ #define JBUFFER_TRACE(jh, info)       do {} while (0)
+
+ #endif                /* CONFIG_BUFFER_DEBUG */
+
+ #endif                /* BUFFER_TRACE_H_INCLUDED */
diff -rc2P linux/include/linux/capability.h linux-2.4.13/include/linux/capability.h
*** linux/include/linux/capability.h    Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/capability.h     Fri Nov  9 16:58:00 2001
***************
*** 251,254 ****
--- 251,256 ----
 /* Override quota limits. */
 /* Override reserved space on ext2 filesystem */
+ /* Modify data journaling mode on ext3 filesystem (uses journaling
+    resources) */
 /* NOTE: ext2 honors fsuid when checking for resource overrides, so
    you can override using fsuid too */
diff -rc2P linux/include/linux/ext3_fs.h linux-2.4.13/include/linux/ext3_fs.h
*** linux/include/linux/ext3_fs.h       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs.h        Fri Nov  9 17:05:34 2001
***************
*** 0 ****
--- 1,716 ----
+ /*
+  *  linux/include/linux/ext3_fs.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+
+ #ifndef _LINUX_EXT3_FS_H
+ #define _LINUX_EXT3_FS_H
+
+ #include <linux/types.h>
+
+ /*
+  * The second extended filesystem constants/structures
+  */
+
+ /*
+  * Define EXT3FS_DEBUG to produce debug messages
+  */
+ #undef EXT3FS_DEBUG
+
+ /*
+  * Define EXT3_PREALLOCATE to preallocate data blocks for expanding files
+  */
+ #undef  EXT3_PREALLOCATE /* @@@ Fix this! */
+ #define EXT3_DEFAULT_PREALLOC_BLOCKS  8
+
+ /*
+  * The second extended file system version
+  */
+ #define EXT3FS_DATE           "21 Oct 2001"
+ #define EXT3FS_VERSION                "2.4-0.9.13"
+
+ /*
+  * Debug code
+  */
+ #ifdef EXT3FS_DEBUG
+ #define ext3_debug(f, a...)                                           \
+       do {                                                            \
+               printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:",       \
+                       __FILE__, __LINE__, __FUNCTION__);              \
+               printk (KERN_DEBUG f, ## a);                            \
+       } while (0)
+ #else
+ #define ext3_debug(f, a...)   do {} while (0)
+ #endif
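+
+ /* A typical call site would look like, for example:
+  *
+  *     ext3_debug("allocating block %lu for inode %lu\n",
+  *                block, inode->i_ino);
+  */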
+
+ /*
+  * Special inodes numbers
+  */
+ #define       EXT3_BAD_INO             1      /* Bad blocks inode */
+ #define EXT3_ROOT_INO          2      /* Root inode */
+ #define EXT3_ACL_IDX_INO       3      /* ACL inode */
+ #define EXT3_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO     6      /* Undelete directory inode */
+ #define EXT3_RESIZE_INO                7      /* Reserved group descriptors inode */
+ #define EXT3_JOURNAL_INO       8      /* Journal inode */
+
+ /* First non-reserved inode for old ext3 filesystems */
+ #define EXT3_GOOD_OLD_FIRST_INO       11
+
+ /*
+  * The second extended file system magic number
+  */
+ #define EXT3_SUPER_MAGIC      0xEF53
+
+ /*
+  * Maximal count of links to a file
+  */
+ #define EXT3_LINK_MAX         32000
+
+ /*
+  * Macro-instructions used to manage several block sizes
+  */
+ #define EXT3_MIN_BLOCK_SIZE           1024
+ #define       EXT3_MAX_BLOCK_SIZE             4096
+ #define EXT3_MIN_BLOCK_LOG_SIZE                 10
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE(s)           ((s)->s_blocksize)
+ #else
+ # define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+ #define EXT3_ACLE_PER_BLOCK(s)                (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+ #else
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_log_block_size + 10)
+ #endif
+ #ifdef __KERNEL__
+ #define       EXT3_ADDR_PER_BLOCK_BITS(s)     ((s)->u.ext3_sb.s_addr_per_block_bits)
+ #define EXT3_INODE_SIZE(s)            ((s)->u.ext3_sb.s_inode_size)
+ #define EXT3_FIRST_INO(s)             ((s)->u.ext3_sb.s_first_ino)
+ #else
+ #define EXT3_INODE_SIZE(s)    (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
+                                EXT3_GOOD_OLD_INODE_SIZE : \
+                                (s)->s_inode_size)
+ #define EXT3_FIRST_INO(s)     (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
+                                EXT3_GOOD_OLD_FIRST_INO : \
+                                (s)->s_first_ino)
+ #endif
+
+ /*
+  * Macro-instructions used to manage fragments
+  */
+ #define EXT3_MIN_FRAG_SIZE            1024
+ #define       EXT3_MAX_FRAG_SIZE              4096
+ #define EXT3_MIN_FRAG_LOG_SIZE                  10
+ #ifdef __KERNEL__
+ # define EXT3_FRAG_SIZE(s)            ((s)->u.ext3_sb.s_frag_size)
+ # define EXT3_FRAGS_PER_BLOCK(s)      ((s)->u.ext3_sb.s_frags_per_block)
+ #else
+ # define EXT3_FRAG_SIZE(s)            (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size)
+ # define EXT3_FRAGS_PER_BLOCK(s)      (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s))
+ #endif
+
+ /*
+  * ACL structures
+  */
+ struct ext3_acl_header        /* Header of Access Control Lists */
+ {
+       __u32   aclh_size;
+       __u32   aclh_file_count;
+       __u32   aclh_acle_count;
+       __u32   aclh_first_acle;
+ };
+
+ struct ext3_acl_entry /* Access Control List Entry */
+ {
+       __u32   acle_size;
+       __u16   acle_perms;     /* Access permissions */
+       __u16   acle_type;      /* Type of entry */
+       __u16   acle_tag;       /* User or group identity */
+       __u16   acle_pad1;
+       __u32   acle_next;      /* Pointer on next entry for the */
+                                       /* same inode or on next free entry */
+ };
+
+ /*
+  * Structure of a blocks group descriptor
+  */
+ struct ext3_group_desc
+ {
+       __u32   bg_block_bitmap;                /* Blocks bitmap block */
+       __u32   bg_inode_bitmap;                /* Inodes bitmap block */
+       __u32   bg_inode_table;         /* Inodes table block */
+       __u16   bg_free_blocks_count;   /* Free blocks count */
+       __u16   bg_free_inodes_count;   /* Free inodes count */
+       __u16   bg_used_dirs_count;     /* Directories count */
+       __u16   bg_pad;
+       __u32   bg_reserved[3];
+ };
+
+ /*
+  * Macro-instructions used to manage group descriptors
+  */
+ #ifdef __KERNEL__
+ # define EXT3_BLOCKS_PER_GROUP(s)     ((s)->u.ext3_sb.s_blocks_per_group)
+ # define EXT3_DESC_PER_BLOCK(s)               ((s)->u.ext3_sb.s_desc_per_block)
+ # define EXT3_INODES_PER_GROUP(s)     ((s)->u.ext3_sb.s_inodes_per_group)
+ # define EXT3_DESC_PER_BLOCK_BITS(s)  ((s)->u.ext3_sb.s_desc_per_block_bits)
+ #else
+ # define EXT3_BLOCKS_PER_GROUP(s)     ((s)->s_blocks_per_group)
+ # define EXT3_DESC_PER_BLOCK(s)               (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc))
+ # define EXT3_INODES_PER_GROUP(s)     ((s)->s_inodes_per_group)
+ #endif
+
+ /*
+  * Constants relative to the data blocks
+  */
+ #define       EXT3_NDIR_BLOCKS                12
+ #define       EXT3_IND_BLOCK                  EXT3_NDIR_BLOCKS
+ #define       EXT3_DIND_BLOCK                 (EXT3_IND_BLOCK + 1)
+ #define       EXT3_TIND_BLOCK                 (EXT3_DIND_BLOCK + 1)
+ #define       EXT3_N_BLOCKS                   (EXT3_TIND_BLOCK + 1)
+
+ /*
+  * Inode flags
+  */
+ #define       EXT3_SECRM_FL                   0x00000001 /* Secure deletion */
+ #define       EXT3_UNRM_FL                    0x00000002 /* Undelete */
+ #define       EXT3_COMPR_FL                   0x00000004 /* Compress file */
+ #define EXT3_SYNC_FL                  0x00000008 /* Synchronous updates */
+ #define EXT3_IMMUTABLE_FILE_FL                0x00000010 /* Immutable file */
+ #define EXT3_APPEND_FL                        0x00000020 /* writes to file may only append */
+ #define EXT3_NODUMP_FL                        0x00000040 /* do not dump file */
+ #define EXT3_NOATIME_FL                       0x00000080 /* do not update atime */
+ /* Reserved for compression usage... */
+ #define EXT3_DIRTY_FL                 0x00000100
+ #define EXT3_COMPRBLK_FL              0x00000200 /* One or more compressed clusters */
+ #define EXT3_NOCOMPR_FL                       0x00000400 /* Don't compress */
+ #define EXT3_ECOMPR_FL                        0x00000800 /* Compression error */
+ /* End compression flags --- maybe not all used */
+ #define EXT3_INDEX_FL                 0x00001000 /* hash-indexed directory */
+ #define EXT3_IMAGIC_FL                        0x00002000 /* AFS directory */
+ #define EXT3_JOURNAL_DATA_FL          0x00004000 /* file data should be journaled */
+ #define EXT3_IMMUTABLE_LINK_FL          0x00008000 /* Immutable link */
+ #define EXT3_RESERVED_FL              0x80000000 /* reserved for ext3 lib */
+
+ #define EXT3_FL_USER_VISIBLE          0x00009FFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE               0x000080FF /* User modifiable flags */
+
+ /*
+  * Inode dynamic state flags
+  */
+ #define EXT3_STATE_JDATA              0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW                        0x00000002 /* inode is newly created */
+
+ /*
+  * ioctl commands
+  */
+ #define       EXT3_IOC_GETFLAGS               _IOR('f', 1, long)
+ #define       EXT3_IOC_SETFLAGS               _IOW('f', 2, long)
+ #define       EXT3_IOC_GETVERSION             _IOR('f', 3, long)
+ #define       EXT3_IOC_SETVERSION             _IOW('f', 4, long)
+ #define       EXT3_IOC_GETVERSION_OLD         _IOR('v', 1, long)
+ #define       EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+ #define EXT3_IOC_WAIT_FOR_READONLY    _IOR('f', 99, long)
+ #endif
+
+ /*
+  * Structure of an inode on the disk
+  */
+ struct ext3_inode {
+       __u16   i_mode;         /* File mode */
+       __u16   i_uid;          /* Low 16 bits of Owner Uid */
+       __u32   i_size;         /* Size in bytes */
+       __u32   i_atime;        /* Access time */
+       __u32   i_ctime;        /* Inode change time */
+       __u32   i_mtime;        /* Modification time */
+       __u32   i_dtime;        /* Deletion Time */
+       __u16   i_gid;          /* Low 16 bits of Group Id */
+       __u16   i_links_count;  /* Links count */
+       __u32   i_blocks;       /* Blocks count */
+       __u32   i_flags;        /* File flags */
+       union {
+               struct {
+                       __u32  l_i_reserved1;
+               } linux1;
+               struct {
+                       __u32  h_i_translator;
+               } hurd1;
+               struct {
+                       __u32  m_i_reserved1;
+               } masix1;
+       } osd1;                         /* OS dependent 1 */
+       __u32   i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
+       __u32   i_generation;   /* File version (for NFS) */
+       __u32   i_file_acl;     /* File ACL */
+       __u32   i_dir_acl;      /* Directory ACL */
+       __u32   i_faddr;        /* Fragment address */
+       union {
+               struct {
+                       __u8    l_i_frag;       /* Fragment number */
+                       __u8    l_i_fsize;      /* Fragment size */
+                       __u16   i_pad1;
+                       __u16   l_i_uid_high;   /* these 2 fields    */
+                       __u16   l_i_gid_high;   /* were reserved2[0] */
+                       __u32   l_i_reserved2;
+               } linux2;
+               struct {
+                       __u8    h_i_frag;       /* Fragment number */
+                       __u8    h_i_fsize;      /* Fragment size */
+                       __u16   h_i_mode_high;
+                       __u16   h_i_uid_high;
+                       __u16   h_i_gid_high;
+                       __u32   h_i_author;
+               } hurd2;
+               struct {
+                       __u8    m_i_frag;       /* Fragment number */
+                       __u8    m_i_fsize;      /* Fragment size */
+                       __u16   m_pad1;
+                       __u32   m_i_reserved2[2];
+               } masix2;
+       } osd2;                         /* OS dependent 2 */
+ };
+
+ #define i_size_high   i_dir_acl
+
+ #if defined(__KERNEL__) || defined(__linux__)
+ #define i_reserved1   osd1.linux1.l_i_reserved1
+ #define i_frag                osd2.linux2.l_i_frag
+ #define i_fsize               osd2.linux2.l_i_fsize
+ #define i_uid_low     i_uid
+ #define i_gid_low     i_gid
+ #define i_uid_high    osd2.linux2.l_i_uid_high
+ #define i_gid_high    osd2.linux2.l_i_gid_high
+ #define i_reserved2   osd2.linux2.l_i_reserved2
+
+ #elif defined(__GNU__)
+
+ #define i_translator  osd1.hurd1.h_i_translator
+ #define i_frag                osd2.hurd2.h_i_frag
+ #define i_fsize               osd2.hurd2.h_i_fsize
+ #define i_uid_high    osd2.hurd2.h_i_uid_high
+ #define i_gid_high    osd2.hurd2.h_i_gid_high
+ #define i_author      osd2.hurd2.h_i_author
+
+ #elif defined(__masix__)
+
+ #define i_reserved1   osd1.masix1.m_i_reserved1
+ #define i_frag                osd2.masix2.m_i_frag
+ #define i_fsize               osd2.masix2.m_i_fsize
+ #define i_reserved2   osd2.masix2.m_i_reserved2
+
+ #endif /* defined(__KERNEL__) || defined(__linux__) */
+
+ /*
+  * File system states
+  */
+ #define       EXT3_VALID_FS                   0x0001  /* Unmounted cleanly */
+ #define       EXT3_ERROR_FS                   0x0002  /* Errors detected */
+ #define       EXT3_ORPHAN_FS                  0x0004  /* Orphans being recovered */
+
+ /*
+  * Mount flags
+  */
+ #define EXT3_MOUNT_CHECK              0x0001  /* Do mount-time checks */
+ #define EXT3_MOUNT_GRPID              0x0004  /* Create files with directory's group */
+ #define EXT3_MOUNT_DEBUG              0x0008  /* Some debugging messages */
+ #define EXT3_MOUNT_ERRORS_CONT                0x0010  /* Continue on errors */
+ #define EXT3_MOUNT_ERRORS_RO          0x0020  /* Remount fs ro on errors */
+ #define EXT3_MOUNT_ERRORS_PANIC               0x0040  /* Panic on errors */
+ #define EXT3_MOUNT_MINIX_DF           0x0080  /* Mimics the Minix statfs */
+ #define EXT3_MOUNT_NOLOAD             0x0100  /* Don't use existing journal*/
+ #define EXT3_MOUNT_ABORT              0x0200  /* Fatal error detected */
+ #define EXT3_MOUNT_DATA_FLAGS         0x0C00  /* Mode for data writes: */
+   #define EXT3_MOUNT_JOURNAL_DATA     0x0400  /* Write data to journal */
+   #define EXT3_MOUNT_ORDERED_DATA     0x0800  /* Flush data before commit */
+   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+
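+ /* The data mode is a two-bit field, so the active mode is read by
+  * masking with EXT3_MOUNT_DATA_FLAGS rather than by a single-bit
+  * test.  A sketch (the helper name is hypothetical):
+  *
+  *     static inline int ext3_is_journaled_data(struct super_block *sb)
+  *     {
+  *             return (sb->u.ext3_sb.s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
+  *                     == EXT3_MOUNT_JOURNAL_DATA;
+  *     }
+  */
+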
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+ #define clear_opt(o, opt)             o &= ~EXT3_MOUNT_##opt
+ #define set_opt(o, opt)                       o |= EXT3_MOUNT_##opt
+ #define test_opt(sb, opt)             ((sb)->u.ext3_sb.s_mount_opt & \
+                                        EXT3_MOUNT_##opt)
+ #else
+ #define EXT2_MOUNT_NOLOAD             EXT3_MOUNT_NOLOAD
+ #define EXT2_MOUNT_ABORT              EXT3_MOUNT_ABORT
+ #endif
+
+ #define ext3_set_bit                  ext2_set_bit
+ #define ext3_clear_bit                        ext2_clear_bit
+ #define ext3_test_bit                 ext2_test_bit
+ #define ext3_find_first_zero_bit      ext2_find_first_zero_bit
+ #define ext3_find_next_zero_bit               ext2_find_next_zero_bit
+
+ /*
+  * Maximal mount counts between two filesystem checks
+  */
+ #define EXT3_DFL_MAX_MNT_COUNT                20      /* Allow 20 mounts */
+ #define EXT3_DFL_CHECKINTERVAL                0       /* Don't use interval check */
+
+ /*
+  * Behaviour when detecting errors
+  */
+ #define EXT3_ERRORS_CONTINUE          1       /* Continue execution */
+ #define EXT3_ERRORS_RO                        2       /* Remount fs read-only */
+ #define EXT3_ERRORS_PANIC             3       /* Panic */
+ #define EXT3_ERRORS_DEFAULT           EXT3_ERRORS_CONTINUE
+
+ /*
+  * Structure of the super block
+  */
+ struct ext3_super_block {
+ /*00*/        __u32   s_inodes_count;         /* Inodes count */
+       __u32   s_blocks_count;         /* Blocks count */
+       __u32   s_r_blocks_count;       /* Reserved blocks count */
+       __u32   s_free_blocks_count;    /* Free blocks count */
+ /*10*/        __u32   s_free_inodes_count;    /* Free inodes count */
+       __u32   s_first_data_block;     /* First Data Block */
+       __u32   s_log_block_size;       /* Block size */
+       __s32   s_log_frag_size;        /* Fragment size */
+ /*20*/        __u32   s_blocks_per_group;     /* # Blocks per group */
+       __u32   s_frags_per_group;      /* # Fragments per group */
+       __u32   s_inodes_per_group;     /* # Inodes per group */
+       __u32   s_mtime;                /* Mount time */
+ /*30*/        __u32   s_wtime;                /* Write time */
+       __u16   s_mnt_count;            /* Mount count */
+       __s16   s_max_mnt_count;        /* Maximal mount count */
+       __u16   s_magic;                /* Magic signature */
+       __u16   s_state;                /* File system state */
+       __u16   s_errors;               /* Behaviour when detecting errors */
+       __u16   s_minor_rev_level;      /* minor revision level */
+ /*40*/        __u32   s_lastcheck;            /* time of last check */
+       __u32   s_checkinterval;        /* max. time between checks */
+       __u32   s_creator_os;           /* OS */
+       __u32   s_rev_level;            /* Revision level */
+ /*50*/        __u16   s_def_resuid;           /* Default uid for reserved blocks */
+       __u16   s_def_resgid;           /* Default gid for reserved blocks */
+       /*
+        * These fields are for EXT3_DYNAMIC_REV superblocks only.
+        *
+        * Note: the difference between the compatible feature set and
+        * the incompatible feature set is that if there is a bit set
+        * in the incompatible feature set that the kernel doesn't
+        * know about, it should refuse to mount the filesystem.
+        *
+        * e2fsck's requirements are more strict; if it doesn't know
+        * about a feature in either the compatible or incompatible
+        * feature set, it must abort and not try to meddle with
+        * things it doesn't understand...
+        */
+       __u32   s_first_ino;            /* First non-reserved inode */
+       __u16   s_inode_size;           /* size of inode structure */
+       __u16   s_block_group_nr;       /* block group # of this superblock */
+       __u32   s_feature_compat;       /* compatible feature set */
+ /*60*/        __u32   s_feature_incompat;     /* incompatible feature set */
+       __u32   s_feature_ro_compat;    /* readonly-compatible feature set */
+ /*68*/        __u8    s_uuid[16];             /* 128-bit uuid for volume */
+ /*78*/        char    s_volume_name[16];      /* volume name */
+ /*88*/        char    s_last_mounted[64];     /* directory where last mounted */
+ /*C8*/        __u32   s_algorithm_usage_bitmap; /* For compression */
+       /*
+        * Performance hints.  Directory preallocation should only
+        * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+        */
+       __u8    s_prealloc_blocks;      /* Nr of blocks to try to preallocate*/
+       __u8    s_prealloc_dir_blocks;  /* Nr to preallocate for dirs */
+       __u16   s_padding1;
+       /*
+        * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+        */
+ /*D0*/        __u8    s_journal_uuid[16];     /* uuid of journal superblock */
+ /*E0*/        __u32   s_journal_inum;         /* inode number of journal file */
+       __u32   s_journal_dev;          /* device number of journal file */
+       __u32   s_last_orphan;          /* start of list of inodes to delete */
+
+ /*EC*/        __u32   s_reserved[197];        /* Padding to the end of the block */
+ };
+
+ #ifdef __KERNEL__
+ #define EXT3_SB(sb)   (&((sb)->u.ext3_sb))
+ #define EXT3_I(inode) (&((inode)->u.ext3_i))
+ #else
+ /* Assume that user mode programs are passing in an ext3fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+  * macros from user land. */
+ #define EXT3_SB(sb)   (sb)
+ #endif
+
+ #define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime
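+
+ /* The on-disk orphan list is threaded through i_dtime: s_last_orphan
+  * in the super block names the first orphan inode, and each orphan's
+  * i_dtime holds the inode number of the next one, with zero ending
+  * the list.  i_dtime is free for this because an orphan is, by
+  * definition, not yet deleted. */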
+
+ /*
+  * Codes for operating systems
+  */
+ #define EXT3_OS_LINUX         0
+ #define EXT3_OS_HURD          1
+ #define EXT3_OS_MASIX         2
+ #define EXT3_OS_FREEBSD               3
+ #define EXT3_OS_LITES         4
+
+ /*
+  * Revision levels
+  */
+ #define EXT3_GOOD_OLD_REV     0       /* The good old (original) format */
+ #define EXT3_DYNAMIC_REV      1       /* V2 format w/ dynamic inode sizes */
+
+ #define EXT3_CURRENT_REV      EXT3_GOOD_OLD_REV
+ #define EXT3_MAX_SUPP_REV     EXT3_DYNAMIC_REV
+
+ #define EXT3_GOOD_OLD_INODE_SIZE 128
+
+ /*
+  * Feature set definitions
+  */
+
+ #define EXT3_HAS_COMPAT_FEATURE(sb,mask)                      \
+       ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+ #define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask)                   \
+       ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+ #define EXT3_HAS_INCOMPAT_FEATURE(sb,mask)                    \
+       ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+ #define EXT3_SET_COMPAT_FEATURE(sb,mask)                      \
+       EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+ #define EXT3_SET_RO_COMPAT_FEATURE(sb,mask)                   \
+       EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+ #define EXT3_SET_INCOMPAT_FEATURE(sb,mask)                    \
+       EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+ #define EXT3_CLEAR_COMPAT_FEATURE(sb,mask)                    \
+       EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+ #define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask)                 \
+       EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+ #define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask)                  \
+       EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+ #define EXT3_FEATURE_COMPAT_DIR_PREALLOC      0x0001
+ #define EXT3_FEATURE_COMPAT_IMAGIC_INODES     0x0002
+ #define EXT3_FEATURE_COMPAT_HAS_JOURNAL               0x0004
+ #define EXT3_FEATURE_COMPAT_EXT_ATTR          0x0008
+ #define EXT3_FEATURE_COMPAT_RESIZE_INODE      0x0010
+ #define EXT3_FEATURE_COMPAT_DIR_INDEX         0x0020
+
+ #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER   0x0001
+ #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE     0x0002
+ #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR      0x0004
+
+ #define EXT3_FEATURE_INCOMPAT_COMPRESSION     0x0001
+ #define EXT3_FEATURE_INCOMPAT_FILETYPE                0x0002
+ #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
+
+ #define EXT3_FEATURE_COMPAT_SUPP      0
+ #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+                                        EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
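+
+ /* The mount-time policy implied by the superblock comments above, as
+  * a sketch (the checks themselves belong in super.c; the code below
+  * is illustrative only):
+  *
+  *     if (EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+  *             goto refuse_mount;
+  *     if (!(sb->s_flags & MS_RDONLY) &&
+  *         EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+  *             goto refuse_rw_mount;
+  */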
+
+ /*
+  * Default values for user and/or group using reserved blocks
+  */
+ #define       EXT3_DEF_RESUID         0
+ #define       EXT3_DEF_RESGID         0
+
+ /*
+  * Structure of a directory entry
+  */
+ #define EXT3_NAME_LEN 255
+
+ struct ext3_dir_entry {
+       __u32   inode;                  /* Inode number */
+       __u16   rec_len;                /* Directory entry length */
+       __u16   name_len;               /* Name length */
+       char    name[EXT3_NAME_LEN];    /* File name */
+ };
+
+ /*
+  * The new version of the directory entry.  Since EXT3 structures are
+  * stored in Intel (little-endian) byte order, and name_len could never be
+  * bigger than 255 chars, it's safe to reclaim the extra byte for the
+  * file_type field.
+  */
+ struct ext3_dir_entry_2 {
+       __u32   inode;                  /* Inode number */
+       __u16   rec_len;                /* Directory entry length */
+       __u8    name_len;               /* Name length */
+       __u8    file_type;
+       char    name[EXT3_NAME_LEN];    /* File name */
+ };
+
+ /*
+  * Ext3 directory file types.  Only the low 3 bits are used.  The
+  * other bits are reserved for now.
+  */
+ #define EXT3_FT_UNKNOWN               0
+ #define EXT3_FT_REG_FILE      1
+ #define EXT3_FT_DIR           2
+ #define EXT3_FT_CHRDEV                3
+ #define EXT3_FT_BLKDEV                4
+ #define EXT3_FT_FIFO          5
+ #define EXT3_FT_SOCK          6
+ #define EXT3_FT_SYMLINK               7
+
+ #define EXT3_FT_MAX           8
+
+ /*
+  * EXT3_DIR_PAD defines the directory entries boundaries
+  *
+  * NOTE: It must be a multiple of 4
+  */
+ #define EXT3_DIR_PAD                  4
+ #define EXT3_DIR_ROUND                        (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len)    (((name_len) + 8 + EXT3_DIR_ROUND) & \
+                                        ~EXT3_DIR_ROUND)
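+
+ /* For example, EXT3_DIR_REC_LEN(1) = (1 + 8 + 3) & ~3 = 12, and
+  * EXT3_DIR_REC_LEN(5) = (5 + 8 + 3) & ~3 = 16: the 8-byte fixed
+  * header (inode, rec_len, name_len, file_type) plus the name,
+  * rounded up to the next 4-byte boundary. */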
+
+ #ifdef __KERNEL__
+
+ /* Filesize hard limits for 64-bit file offsets */
+ extern long long ext3_max_sizes[];
+
+ /*
+  * Describe an inode's exact location on disk and in memory
+  */
+ struct ext3_iloc
+ {
+       struct buffer_head *bh;
+       struct ext3_inode *raw_inode;
+       unsigned long block_group;
+ };
+
+ /*
+  * Function prototypes
+  */
+
+ /*
+  * Ok, these declarations are also in <linux/kernel.h> but none of the
+  * ext3 source files needs to include it, so they are duplicated here.
+  */
+ # define NORET_TYPE    /**/
+ # define ATTRIB_NORET  __attribute__((noreturn))
+ # define NORET_AND     noreturn,
+
+ /* acl.c */
+ extern int ext3_permission (struct inode *, int);
+
+ /* balloc.c */
+ extern int ext3_bg_has_super(struct super_block *sb, int group);
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long,
+                                           __u32 *, __u32 *, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+                             unsigned long);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+ extern void ext3_check_blocks_bitmap (struct super_block *);
+ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+                                                   unsigned int block_group,
+                                                   struct buffer_head ** bh);
+
+ /* bitmap.c */
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+                               struct ext3_dir_entry_2 *, struct buffer_head *,
+                               unsigned long);
+
+ /* file.c */
+
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
+
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, ino_t);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+ extern void ext3_check_inodes_bitmap (struct super_block *);
+
+ /* inode.c */
+
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+
+ extern int  ext3_get_inode_loc (struct inode *, struct ext3_iloc *);
+ extern void ext3_read_inode (struct inode *);
+ extern void ext3_write_inode (struct inode *, int);
+ extern int  ext3_setattr (struct dentry *, struct iattr *);
+ extern void ext3_put_inode (struct inode *);
+ extern void ext3_delete_inode (struct inode *);
+ extern int  ext3_sync_inode (handle_t *, struct inode *);
+ extern void ext3_discard_prealloc (struct inode *);
+ extern void ext3_dirty_inode(struct inode *);
+ extern int ext3_change_inode_journal_flag(struct inode *, int);
+
+ /* ioctl.c */
+ extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
+                      unsigned long);
+
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
+
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+ extern void __ext3_std_error (struct super_block *, const char *, int);
+ extern void ext3_abort (struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+ extern NORET_TYPE void ext3_panic (struct super_block *, const char *,
+                                  const char *, ...)
+       __attribute__ ((NORET_AND format (printf, 3, 4)));
+ extern void ext3_warning (struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+ extern void ext3_update_dynamic_rev (struct super_block *sb);
+ extern void ext3_put_super (struct super_block *);
+ extern void ext3_write_super (struct super_block *);
+ extern void ext3_write_super_lockfs (struct super_block *);
+ extern void ext3_unlockfs (struct super_block *);
+ extern int ext3_remount (struct super_block *, int *, char *);
+ extern struct super_block * ext3_read_super (struct super_block *,void *,int);
+ extern int ext3_statfs (struct super_block *, struct statfs *);
+
+ /* truncate.c */
+ extern void ext3_truncate (struct inode *);
+
+ #define ext3_std_error(sb, errno)                             \
+ do {                                                          \
+       if ((errno))                                            \
+               __ext3_std_error((sb), __FUNCTION__, (errno));  \
+ } while (0)
+ extern const char *ext3_decode_error(struct super_block *sb, int errno, char nbuf[16]);
+
+ /*
+  * Inodes and files operations
+  */
+
+ /* dir.c */
+ extern struct file_operations ext3_dir_operations;
+
+ /* file.c */
+ extern struct inode_operations ext3_file_inode_operations;
+ extern struct file_operations ext3_file_operations;
+
+ /* symlink.c */
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
+ extern struct address_space_operations ext3_aops;
+
+ #endif        /* __KERNEL__ */
+
+ #endif        /* _LINUX_EXT3_FS_H */
diff -rc2P linux/include/linux/ext3_fs_i.h linux-2.4.13/include/linux/ext3_fs_i.h
*** linux/include/linux/ext3_fs_i.h     Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs_i.h      Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,78 ----
+ /*
+  *  linux/include/linux/ext3_fs_i.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs_i.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+
+ #ifndef _LINUX_EXT3_FS_I
+ #define _LINUX_EXT3_FS_I
+
+ #include <linux/rwsem.h>
+
+ /*
+  * second extended file system inode data in memory
+  */
+ struct ext3_inode_info {
+       __u32   i_data[15];
+       __u32   i_flags;
+ #ifdef EXT3_FRAGMENTS
+       __u32   i_faddr;
+       __u8    i_frag_no;
+       __u8    i_frag_size;
+       __u16   unused;                 /* formerly i_osync */
+ #endif
+       __u32   i_file_acl;
+       __u32   i_dir_acl;
+       __u32   i_dtime;
+       __u32   i_block_group;
+       __u32   i_state;                /* Dynamic state flags for ext3 */
+       __u32   i_next_alloc_block;
+       __u32   i_next_alloc_goal;
+ #ifdef EXT3_PREALLOCATE
+       __u32   i_prealloc_block;
+       __u32   i_prealloc_count;
+ #endif
+       __u32   i_dir_start_lookup;
+
+       struct list_head i_orphan;      /* unlinked but open inodes */
+
+       /*
+        * i_disksize keeps track of what the inode size is ON DISK, not
+        * in memory.  During truncate, i_size is set to the new size by
+        * the VFS prior to calling ext3_truncate(), but the filesystem won't
+        * set i_disksize to 0 until the truncate is actually under way.
+        *
+        * The intent is that i_disksize always represents the blocks which
+        * are used by this file.  This allows recovery to restart truncate
+        * on orphans if we crash during truncate.  We actually write i_disksize
+        * into the on-disk inode when writing inodes out, instead of i_size.
+        *
+        * The only time when i_disksize and i_size may be different is when
+        * a truncate is in progress.  The only things which change i_disksize
+        * are ext3_get_block (growth) and ext3_truncate (shrinkth).
+        */
+       loff_t  i_disksize;
+
+       /*
+        * truncate_sem is for serialising ext3_truncate() against
+  * ext3_get_block().  In the 2.4 ext2 design, great chunks of an inode's
+        * data tree are chopped off during truncate. We can't do that in
+        * ext3 because whenever we perform intermediate commits during
+        * truncate, the inode and all the metadata blocks *must* be in a
+        * consistent state which allows truncation of the orphans to restart
+        * during recovery.  Hence we must fix the get_block-vs-truncate race
+        * by other means, so we have truncate_sem.
+        */
+       struct rw_semaphore truncate_sem;
+ };
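+
+ /* Sketch of the intended locking discipline (assumed; only the
+  * exclusive side is visible in this patch): the truncate path takes
+  * the semaphore exclusively while the block-mapping path takes it
+  * shared, e.g.:
+  *
+  *     down_write(&EXT3_I(inode)->truncate_sem);       truncate path
+  *     down_read(&EXT3_I(inode)->truncate_sem);        get_block path
+  */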
+
+ #endif        /* _LINUX_EXT3_FS_I */
diff -rc2P linux/include/linux/ext3_fs_sb.h linux-2.4.13/include/linux/ext3_fs_sb.h
*** linux/include/linux/ext3_fs_sb.h    Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs_sb.h     Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,77 ----
+ /*
+  *  linux/include/linux/ext3_fs_sb.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs_sb.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+
+ #ifndef _LINUX_EXT3_FS_SB
+ #define _LINUX_EXT3_FS_SB
+
+ #ifdef __KERNEL__
+ #include <linux/timer.h>
+ #include <linux/wait.h>
+ #endif
+
+ /*
+  * The following is not needed anymore since the descriptors buffer
+  * heads are now dynamically allocated
+  */
+ /* #define EXT3_MAX_GROUP_DESC        8 */
+
+ #define EXT3_MAX_GROUP_LOADED 8
+
+ /*
+  * third extended-fs super-block data in memory
+  */
+ struct ext3_sb_info {
+       unsigned long s_frag_size;      /* Size of a fragment in bytes */
+       unsigned long s_frags_per_block;/* Number of fragments per block */
+       unsigned long s_inodes_per_block;/* Number of inodes per block */
+       unsigned long s_frags_per_group;/* Number of fragments in a group */
+       unsigned long s_blocks_per_group;/* Number of blocks in a group */
+       unsigned long s_inodes_per_group;/* Number of inodes in a group */
+       unsigned long s_itb_per_group;  /* Number of inode table blocks per group */
+       unsigned long s_gdb_count;      /* Number of group descriptor blocks */
+       unsigned long s_desc_per_block; /* Number of group descriptors per block */
+       unsigned long s_groups_count;   /* Number of groups in the fs */
+       struct buffer_head * s_sbh;     /* Buffer containing the super block */
+       struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
+       struct buffer_head ** s_group_desc;
+       unsigned short s_loaded_inode_bitmaps;
+       unsigned short s_loaded_block_bitmaps;
+       unsigned long s_inode_bitmap_number[EXT3_MAX_GROUP_LOADED];
+       struct buffer_head * s_inode_bitmap[EXT3_MAX_GROUP_LOADED];
+       unsigned long s_block_bitmap_number[EXT3_MAX_GROUP_LOADED];
+       struct buffer_head * s_block_bitmap[EXT3_MAX_GROUP_LOADED];
+       unsigned long  s_mount_opt;
+       uid_t s_resuid;
+       gid_t s_resgid;
+       unsigned short s_mount_state;
+       unsigned short s_pad;
+       int s_addr_per_block_bits;
+       int s_desc_per_block_bits;
+       int s_inode_size;
+       int s_first_ino;
+
+       /* Journaling */
+       struct inode * s_journal_inode;
+       struct journal_s * s_journal;
+       struct list_head s_orphan;
+       unsigned long s_commit_interval;
+       struct block_device *journal_bdev;
+ #ifdef CONFIG_JBD_DEBUG
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+ #endif
+ };
+
+ #endif        /* _LINUX_EXT3_FS_SB */
diff -rc2P linux/include/linux/ext3_jbd.h linux-2.4.13/include/linux/ext3_jbd.h
*** linux/include/linux/ext3_jbd.h      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_jbd.h       Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,290 ----
+ /*
+  * linux/include/linux/ext3_jbd.h
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1999
+  *
+  * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Ext3-specific journaling extensions.
+  */
+
+ #ifndef _LINUX_EXT3_JBD_H
+ #define _LINUX_EXT3_JBD_H
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+
+ #define EXT3_JOURNAL(inode)   (EXT3_SB((inode)->i_sb)->s_journal)
+
+ /* Define the number of blocks we need to account to a transaction to
+  * modify one block of data.
+  *
+  * We may have to touch one inode, one bitmap buffer, up to three
+  * indirection blocks, the group and superblock summaries, and the data
+  * block to complete the transaction.  */
+
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS  8
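+ /* (That is: 1 inode + 1 bitmap + 3 indirection blocks + 1 group
+  *  descriptor + 1 superblock + 1 data block = 8.) */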
+
+ /* Define the minimum size for a transaction which modifies data.  This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota).  The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+
+ #define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
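+ /* (One data update plus two quota-file updates at
+  *  EXT3_SINGLEDATA_TRANS_BLOCKS each, minus the two redundant
+  *  superblock credits: 3 * 8 - 2 = 22.) */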
+
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+
+ /* Delete operations potentially hit one directory's namespace plus an
+  * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
+  * generous.  We can grow the delete transaction later if necessary. */
+
+ #define EXT3_DELETE_TRANS_BLOCKS      (2 * EXT3_DATA_TRANS_BLOCKS + 64)
+
+ /* Define an arbitrary limit for the amount of data we will anticipate
+  * writing to any given transaction.  For unbounded transactions such as
+  * write(2) and truncate(2) we can write more than this, but we always
+  * start off at the maximum transaction size and grow the transaction
+  * optimistically as we go. */
+
+ #define EXT3_MAX_TRANS_DATA           64
+
+ /* We break up a large truncate or write transaction once the handle's
+  * buffer credits get this low; we then need either to extend the
+  * transaction or to start a new one.  Reserve enough space here for
+  * inode, bitmap, superblock, group and indirection updates for at least
+  * one block, plus two quota updates.  Quota allocations are not
+  * needed. */
+
+ #define EXT3_RESERVE_TRANS_BLOCKS     12
+
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+                    struct inode *inode,
+                    struct ext3_iloc *iloc);
+
+ /*
+  * On success, we end up with an outstanding reference count against
+  * iloc->bh.  This _must_ be cleaned up later.
+  */
+
+ int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+                       struct ext3_iloc *iloc);
+
+ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+ /*
+  * Wrapper functions with which ext3 calls into JBD.  The intent here is
+  * to allow these to be turned into appropriate stubs so ext3 can control
+  * ext2 filesystems, so ext2+ext3 systems need only one fs driver.  This
+  * work hasn't
+  * been done yet.
+  */
+
+ static inline void ext3_journal_abort_handle(const char *caller,
+                                            const char *err_fn,
+                                            struct buffer_head *bh,
+                                            handle_t *handle,
+                                            int err)
+ {
+       char nbuf[16];
+       const char *errstr = ext3_decode_error(NULL, err, nbuf);
+
+       printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
+              caller, errstr, err_fn);
+
+       if (bh)
+               BUFFER_TRACE(bh, "abort");
+       journal_abort_handle(handle);
+       if (!handle->h_err)
+               handle->h_err = err;
+ }
+
+ static inline int
+ __ext3_journal_get_undo_access(const char *where,
+                              handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_get_undo_access(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_get_write_access(const char *where,
+                               handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_get_write_access(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_dirty_data(const char *where,
+                         handle_t *handle, struct buffer_head *bh, int async)
+ {
+       int err = journal_dirty_data(handle, bh, async);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline void
+ ext3_journal_forget(handle_t *handle, struct buffer_head *bh)
+ {
+       journal_forget(handle, bh);
+ }
+
+ static inline int
+ __ext3_journal_revoke(const char *where, handle_t *handle,
+                     unsigned long blocknr, struct buffer_head *bh)
+ {
+       int err = journal_revoke(handle, blocknr, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_get_create_access(const char *where,
+                                handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_get_create_access(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_dirty_metadata(const char *where,
+                             handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_dirty_metadata(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+
+ #define ext3_journal_get_undo_access(handle, bh) \
+       __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_get_write_access(handle, bh) \
+       __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_dirty_data(handle, bh, async) \
+       __ext3_journal_dirty_data(__FUNCTION__, (handle), (bh), (async))
+ #define ext3_journal_revoke(handle, blocknr, bh) \
+       __ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ #define ext3_journal_get_create_access(handle, bh) \
+       __ext3_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_dirty_metadata(handle, bh) \
+       __ext3_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
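+
+ /*
+  * Illustrative call pattern (a sketch, not code from this patch):
+  *
+  *     err = ext3_journal_get_write_access(handle, bh);
+  *     if (err)
+  *             return err;
+  *     ... modify bh->b_data under the handle ...
+  *     err = ext3_journal_dirty_metadata(handle, bh);
+  *
+  * On failure the wrappers have already called
+  * ext3_journal_abort_handle() to abort the handle, so callers need
+  * only propagate err.
+  */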
+
+
+
+ /*
+  * Wrappers for journal_start/stop.
+  *
+  * The start wrappers simply refuse to open a transaction on a
+  * read-only filesystem.  The special handling needed by the stop
+  * wrapper is described before __ext3_journal_stop() below.
+  */
+ static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
+ {
+       if (inode->i_sb->s_flags & MS_RDONLY)
+               return ERR_PTR(-EROFS);
+       return journal_start(EXT3_JOURNAL(inode), nblocks);
+ }
+
+ static inline handle_t *
+ ext3_journal_try_start(struct inode *inode, int nblocks)
+ {
+       if (inode->i_sb->s_flags & MS_RDONLY)
+               return ERR_PTR(-EROFS);
+       return journal_try_start(EXT3_JOURNAL(inode), nblocks);
+ }
+
+ /*
+  * The only special thing we need to do here is to make sure that all
+  * journal_stop calls result in the superblock being marked dirty, so
+  * that sync() will call the filesystem's write_super callback if
+  * appropriate.
+  */
+ static inline int __ext3_journal_stop(const char *where,
+                                     handle_t *handle, struct inode *inode)
+ {
+       int err = handle->h_err;
+       int rc = journal_stop(handle);
+
+       inode->i_sb->s_dirt = 1;
+       if (!err)
+               err = rc;
+       if (err)
+               __ext3_std_error(inode->i_sb, where, err);
+       return err;
+ }
+ #define ext3_journal_stop(handle, inode) \
+       __ext3_journal_stop(__FUNCTION__, (handle), (inode))
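+
+ /*
+  * Illustrative transaction bracket (a sketch, not code from this patch):
+  *
+  *     handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
+  *     if (IS_ERR(handle))
+  *             return PTR_ERR(handle);
+  *     ... journalled updates ...
+  *     return ext3_journal_stop(handle, inode);
+  */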
+
+ static inline handle_t *ext3_journal_current_handle(void)
+ {
+       return journal_current_handle();
+ }
+
+ static inline void
+ ext3_log_start_commit(journal_t *journal, transaction_t *transaction)
+ {
+       log_start_commit(journal, transaction);
+ }
+
+ static inline void ext3_log_wait_commit(journal_t *journal, tid_t tid)
+ {
+       log_wait_commit(journal, tid);
+ }
+
+ static inline int ext3_journal_extend(handle_t *handle, int nblocks)
+ {
+       return journal_extend(handle, nblocks);
+ }
+
+ static inline int ext3_journal_restart(handle_t *handle, int nblocks)
+ {
+       return journal_restart(handle, nblocks);
+ }
+
+ static inline int ext3_journal_blocks_per_page(struct inode *inode)
+ {
+       return journal_blocks_per_page(inode);
+ }
+
+ static inline int ext3_journal_force_commit(journal_t *journal)
+ {
+       return journal_force_commit(journal);
+ }
+
+ /* super.c */
+ int ext3_force_commit(struct super_block *sb);
+
+ static inline int ext3_should_journal_data(struct inode *inode)
+ {
+       if (!S_ISREG(inode->i_mode))
+               return 1;
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+               return 1;
+       if (inode->u.ext3_i.i_flags & EXT3_JOURNAL_DATA_FL)
+               return 1;
+       return 0;
+ }
+
+ static inline int ext3_should_order_data(struct inode *inode)
+ {
+       return (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA);
+ }
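+
+ /*
+  * (Note: an inode for which neither helper above returns true is
+  * handled in writeback mode -- only its metadata goes through the
+  * journal.)
+  */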
+
+
+ #endif        /* _LINUX_EXT3_JBD_H */
diff -rc2P linux/include/linux/fs.h linux-2.4.13/include/linux/fs.h
*** linux/include/linux/fs.h    Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/fs.h     Fri Nov  9 16:58:00 2001
***************
*** 22,25 ****
--- 22,26 ----
 #include <linux/stddef.h>
 #include <linux/string.h>
+ #include <linux/buffer-trace.h>

 #include <asm/atomic.h>
***************
*** 219,222 ****
--- 220,224 ----
       BH_Wait_IO,     /* 1 if we should write out this buffer */
       BH_launder,     /* 1 if we should throttle on this buffer */
+       BH_JBD,         /* 1 if it has an attached journal_head */

       BH_PrivateStart,/* not a state bit, but the first bit available
***************
*** 265,268 ****
--- 267,274 ----
       struct inode *       b_inode;
       struct list_head     b_inode_buffers;   /* doubly linked list of inode dirty buffers */
+
+ #ifdef CONFIG_BUFFER_DEBUG
+       struct buffer_history b_history;
+ #endif
 };

***************
*** 290,293 ****
--- 296,300 ----
 #include <linux/minix_fs_i.h>
 #include <linux/ext2_fs_i.h>
+ #include <linux/ext3_fs_i.h>
 #include <linux/hpfs_fs_i.h>
 #include <linux/ntfs_fs_i.h>
***************
*** 380,387 ****
--- 387,400 ----
       int (*readpage)(struct file *, struct page *);
       int (*sync_page)(struct page *);
+       /*
+        * ext3 requires that a successful prepare_write() call be followed
+        * by a commit_write() call - they must be balanced
+        */
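+       /* (For illustration: generic_file_write() pairs each successful
+        * prepare_write() with exactly one commit_write() call per page.) */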
       int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
       int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
       /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
       int (*bmap)(struct address_space *, long);
+       int (*flushpage) (struct page *, unsigned long);
+       int (*releasepage) (struct page *, int);
 #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
       int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
***************
*** 445,448 ****
--- 458,462 ----
       unsigned long           i_version;
       struct semaphore        i_sem;
+       struct rw_semaphore     i_truncate_sem; /* Nests inside i_sem */
       struct semaphore        i_zombie;
       struct inode_operations *i_op;
***************
*** 474,477 ****
--- 488,492 ----
               struct minix_inode_info         minix_i;
               struct ext2_inode_info          ext2_i;
+               struct ext3_inode_info          ext3_i;
               struct hpfs_inode_info          hpfs_i;
               struct ntfs_inode_info          ntfs_i;
***************
*** 662,665 ****
--- 677,681 ----
 #include <linux/minix_fs_sb.h>
 #include <linux/ext2_fs_sb.h>
+ #include <linux/ext3_fs_sb.h>
 #include <linux/hpfs_fs_sb.h>
 #include <linux/ntfs_fs_sb.h>
***************
*** 718,721 ****
--- 734,738 ----
               struct minix_sb_info    minix_sb;
               struct ext2_sb_info     ext2_sb;
+               struct ext3_sb_info     ext3_sb;
               struct hpfs_sb_info     hpfs_sb;
               struct ntfs_sb_info     ntfs_sb;
***************
*** 1091,1094 ****
--- 1108,1112 ----
 extern int try_to_free_buffers(struct page *, unsigned int);
 extern void refile_buffer(struct buffer_head * buf);
+ extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
 extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);

***************
*** 1132,1135 ****
--- 1150,1157 ----
 static inline void mark_buffer_clean(struct buffer_head * bh)
 {
+ #if defined(CONFIG_JBD_DEBUG)
+       extern void jbd_preclean_buffer_check(struct buffer_head *);
+       jbd_preclean_buffer_check(bh); /* @@@ Expensive debugging */
+ #endif
       if (atomic_set_buffer_clean(bh))
               __mark_buffer_clean(bh);
***************
*** 1173,1176 ****
--- 1195,1199 ----
 }

+ extern void set_buffer_flushtime(struct buffer_head *);
 extern void balance_dirty(void);
 extern int check_disk_change(kdev_t);
***************
*** 1352,1355 ****
--- 1375,1380 ----
 extern struct buffer_head * bread(kdev_t, int, int);
 extern void wakeup_bdflush(void);
+ extern void put_unused_buffer_head(struct buffer_head * bh);
+ extern struct buffer_head * get_unused_buffer_head(int async);

 extern int brw_page(int, struct page *, kdev_t, int [], int);
***************
*** 1358,1361 ****
--- 1383,1387 ----

 /* Generic buffer handling for block filesystems.. */
+ extern int try_to_release_page(struct page * page, int gfp_mask);
 extern int discard_bh_page(struct page *, unsigned long, int);
 #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
diff -rc2P linux/include/linux/fs.h.orig linux-2.4.13/include/linux/fs.h.orig
*** linux/include/linux/fs.h.orig       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/fs.h.orig        Fri Nov  9 16:15:08 2001
***************
*** 0 ****
--- 1,1569 ----
+ #ifndef _LINUX_FS_H
+ #define _LINUX_FS_H
+
+ /*
+  * This file has definitions for some important file table
+  * structures etc.
+  */
+
+ #include <linux/config.h>
+ #include <linux/linkage.h>
+ #include <linux/limits.h>
+ #include <linux/wait.h>
+ #include <linux/types.h>
+ #include <linux/vfs.h>
+ #include <linux/net.h>
+ #include <linux/kdev_t.h>
+ #include <linux/ioctl.h>
+ #include <linux/list.h>
+ #include <linux/dcache.h>
+ #include <linux/stat.h>
+ #include <linux/cache.h>
+ #include <linux/stddef.h>
+ #include <linux/string.h>
+
+ #include <asm/atomic.h>
+ #include <asm/bitops.h>
+
+ struct poll_table_struct;
+
+
+ /*
+  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
+  * the file limit at runtime and only root can increase the per-process
+  * nr_file rlimit, so it's safe to set up a ridiculously high absolute
+  * upper limit on files-per-process.
+  *
+  * Some programs (notably those using select()) may have to be
+  * recompiled to take full advantage of the new limits..
+  */
+
+ /* Fixed constants first: */
+ #undef NR_OPEN
+ #define NR_OPEN (1024*1024)   /* Absolute upper limit on fd num */
+ #define INR_OPEN 1024         /* Initial setting for nfile rlimits */
+
+ #define BLOCK_SIZE_BITS 10
+ #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
+
+ /* And dynamically-tunable limits and defaults: */
+ struct files_stat_struct {
+       int nr_files;           /* read only */
+       int nr_free_files;      /* read only */
+       int max_files;          /* tunable */
+ };
+ extern struct files_stat_struct files_stat;
+
+ struct inodes_stat_t {
+       int nr_inodes;
+       int nr_unused;
+       int dummy[5];
+ };
+ extern struct inodes_stat_t inodes_stat;
+
+ extern int leases_enable, dir_notify_enable, lease_break_time;
+
+ #define NR_FILE  8192 /* this can well be larger on a larger system */
+ #define NR_RESERVED_FILES 10 /* reserved for root */
+ #define NR_SUPER 256
+
+ #define MAY_EXEC 1
+ #define MAY_WRITE 2
+ #define MAY_READ 4
+
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
+
+ #define READ 0
+ #define WRITE 1
+ #define READA 2               /* read-ahead  - don't block if no resources */
+ #define SPECIAL 4     /* For non-blockdevice requests in request queue */
+
+ #define SEL_IN                1
+ #define SEL_OUT               2
+ #define SEL_EX                4
+
+ /* public flags for file_system_type */
+ #define FS_REQUIRES_DEV 1
+ #define FS_NO_DCACHE  2 /* Only dcache the necessary things. */
+ #define FS_NO_PRELIM  4 /* prevent preloading of dentries, even if
+                          * FS_NO_DCACHE is not set.
+                          */
+ #define FS_SINGLE     8 /* Filesystem that can have only one superblock */
+ #define FS_NOMOUNT    16 /* Never mount from userland */
+ #define FS_LITTER     32 /* Keeps the tree in dcache */
+ #define FS_ODD_RENAME 32768   /* Temporary stuff; will go away as soon
+                                 * as nfs_rename() will be cleaned up
+                                 */
+ /*
+  * These are the fs-independent mount-flags: up to 32 flags are supported
+  */
+ #define MS_RDONLY      1      /* Mount read-only */
+ #define MS_NOSUID      2      /* Ignore suid and sgid bits */
+ #define MS_NODEV       4      /* Disallow access to device special files */
+ #define MS_NOEXEC      8      /* Disallow program execution */
+ #define MS_SYNCHRONOUS        16      /* Writes are synced at once */
+ #define MS_REMOUNT    32      /* Alter flags of a mounted FS */
+ #define MS_MANDLOCK   64      /* Allow mandatory locks on an FS */
+ #define MS_NOATIME    1024    /* Do not update access times. */
+ #define MS_NODIRATIME 2048    /* Do not update directory access times */
+ #define MS_BIND               4096
+ #define MS_REC                16384
+ #define MS_VERBOSE    32768
+ #define MS_NOUSER     (1<<31)
+
+ /*
+  * Superblock flags that can be altered by MS_REMOUNT
+  */
+ #define MS_RMT_MASK   (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\
+                        MS_NODIRATIME)
+
+ /*
+  * Old magic mount flag and mask
+  */
+ #define MS_MGC_VAL 0xC0ED0000
+ #define MS_MGC_MSK 0xffff0000
+
+ /* Inode flags - they have nothing to superblock flags now */
+
+ #define S_SYNC                        1       /* Writes are synced at once */
+ #define S_NOATIME             2       /* Do not update access times */
+ #define S_QUOTA                       4       /* Quota initialized for file */
+ #define S_APPEND              8       /* Append-only file */
+ #define S_IMMUTABLE_FILE      16      /* Immutable file */
+ #define S_DEAD                        32      /* removed, but still open directory */
+ #define S_NOQUOTA             64      /* Inode is not counted to quota */
+ #define S_IMMUTABLE_LINK      128     /* Immutable links */
+
+ /*
+  * Note that nosuid etc flags are inode-specific: setting some file-system
+  * flags just means all the inodes inherit those flags by default. It might be
+  * possible to override it selectively if you really wanted to with some
+  * ioctl() that is not currently implemented.
+  *
+  * Exception: MS_RDONLY is always applied to the entire file system.
+  *
+  * Unfortunately, it is possible to change a filesystems flags with it mounted
+  * with files in use.  This means that all of the inodes will not have their
+  * i_flags updated.  Hence, i_flags no longer inherit the superblock mount
+  * flags, so these have to be checked separately. -- [email protected]
+  */
+ #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))
+
+ #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
+ #define IS_SYNC(inode)                (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
+ #define IS_MANDLOCK(inode)    __IS_FLG(inode, MS_MANDLOCK)
+
+ #define IS_QUOTAINIT(inode)   ((inode)->i_flags & S_QUOTA)
+ #define IS_NOQUOTA(inode)     ((inode)->i_flags & S_NOQUOTA)
+ #define IS_APPEND(inode)      ((inode)->i_flags & S_APPEND)
+ #define IS_IMMUTABLE_FILE(inode)      ((inode)->i_flags & S_IMMUTABLE_FILE)
+ #define IS_IMMUTABLE_LINK(inode) ((((inode)->i_flags & S_IMMUTABLE_FILE) << 3) ^ ((inode)->i_flags & S_IMMUTABLE_LINK) )
+ #define IS_NOATIME(inode)     (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME))
+ #define IS_NODIRATIME(inode)  __IS_FLG(inode, MS_NODIRATIME)
+
+ #define IS_DEADDIR(inode)     ((inode)->i_flags & S_DEAD)
+
+ /* the read-only stuff doesn't really belong here, but any other place is
+    probably as bad and I don't want to create yet another include file. */
+
+ #define BLKROSET   _IO(0x12,93)       /* set device read-only (0 = read-write) */
+ #define BLKROGET   _IO(0x12,94)       /* get read-only status (0 = read_write) */
+ #define BLKRRPART  _IO(0x12,95)       /* re-read partition table */
+ #define BLKGETSIZE _IO(0x12,96)       /* return device size /512 (long *arg) */
+ #define BLKFLSBUF  _IO(0x12,97)       /* flush buffer cache */
+ #define BLKRASET   _IO(0x12,98)       /* Set read ahead for block device */
+ #define BLKRAGET   _IO(0x12,99)       /* get current read ahead setting */
+ #define BLKFRASET  _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
+ #define BLKFRAGET  _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
+ #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
+ #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
+ #define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
+ #if 0
+ #define BLKPG      _IO(0x12,105)/* See blkpg.h */
+ #define BLKELVGET  _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))/* elevator get */
+ #define BLKELVSET  _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))/* elevator set */
+ /* This was here just to show that the number is taken -
+    probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
+ #endif
+ /* A jump here: 108-111 have been used for various private purposes. */
+ #define BLKBSZGET  _IOR(0x12,112,sizeof(int))
+ #define BLKBSZSET  _IOW(0x12,113,sizeof(int))
+ #define BLKGETSIZE64 _IOR(0x12,114,sizeof(u64))       /* return device size in bytes (u64 *arg) */
+
+ #define BMAP_IOCTL 1          /* obsolete - kept for compatibility */
+ #define FIBMAP           _IO(0x00,1)  /* bmap access */
+ #define FIGETBSZ   _IO(0x00,2)        /* get the block size used for bmap */
+
+ #ifdef __KERNEL__
+
+ #include <asm/semaphore.h>
+ #include <asm/byteorder.h>
+
+ extern void update_atime (struct inode *);
+ #define UPDATE_ATIME(inode) update_atime (inode)
+
+ extern void buffer_init(unsigned long);
+ extern void inode_init(unsigned long);
+ extern void mnt_init(unsigned long);
+
+ /* bh state bits */
+ enum bh_state_bits {
+       BH_Uptodate,    /* 1 if the buffer contains valid data */
+       BH_Dirty,       /* 1 if the buffer is dirty */
+       BH_Lock,        /* 1 if the buffer is locked */
+       BH_Req,         /* 0 if the buffer has been invalidated */
+       BH_Mapped,      /* 1 if the buffer has a disk mapping */
+       BH_New,         /* 1 if the buffer is new and not yet written out */
+       BH_Async,       /* 1 if the buffer is under end_buffer_io_async I/O */
+       BH_Wait_IO,     /* 1 if we should write out this buffer */
+       BH_launder,     /* 1 if we should throttle on this buffer */
+
+       BH_PrivateStart,/* not a state bit, but the first bit available
+                        * for private allocation by other entities
+                        */
+ };
+
+ /*
+  * Try to keep the most commonly used fields in single cache lines (16
+  * bytes) to improve performance.  This ordering should be
+  * particularly beneficial on 32-bit processors.
+  *
+  * We use the first 16 bytes for the data which is used in searches
+  * over the block hash lists (ie. getblk() and friends).
+  *
+  * The second 16 bytes we use for lru buffer scans, as used by
+  * sync_buffers() and refill_freelist().  -- sct
+  */
+ struct buffer_head {
+       /* First cache line: */
+       struct buffer_head *b_next;     /* Hash queue list */
+       unsigned long b_blocknr;        /* block number */
+       unsigned short b_size;          /* block size */
+       unsigned short b_list;          /* List that this buffer appears */
+       kdev_t b_dev;                   /* device (B_FREE = free) */
+
+       atomic_t b_count;               /* users using this block */
+       kdev_t b_rdev;                  /* Real device */
+       unsigned long b_state;          /* buffer state bitmap (see above) */
+       unsigned long b_flushtime;      /* Time when (dirty) buffer should be written */
+
+       struct buffer_head *b_next_free;/* lru/free list linkage */
+       struct buffer_head *b_prev_free;/* doubly linked list of buffers */
+       struct buffer_head *b_this_page;/* circular list of buffers in one page */
+       struct buffer_head *b_reqnext;  /* request queue */
+
+       struct buffer_head **b_pprev;   /* doubly linked list of hash-queue */
+       char * b_data;                  /* pointer to data block */
+       struct page *b_page;            /* the page this bh is mapped to */
+       void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
+       void *b_private;                /* reserved for b_end_io */
+
+       unsigned long b_rsector;        /* Real buffer location on disk */
+       wait_queue_head_t b_wait;
+
+       struct inode *       b_inode;
+       struct list_head     b_inode_buffers;   /* doubly linked list of inode dirty buffers */
+ };
+
+ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
+ void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
+
+ #define __buffer_state(bh, state)     (((bh)->b_state & (1UL << BH_##state)) != 0)
+
+ #define buffer_uptodate(bh)   __buffer_state(bh,Uptodate)
+ #define buffer_dirty(bh)      __buffer_state(bh,Dirty)
+ #define buffer_locked(bh)     __buffer_state(bh,Lock)
+ #define buffer_req(bh)                __buffer_state(bh,Req)
+ #define buffer_mapped(bh)     __buffer_state(bh,Mapped)
+ #define buffer_new(bh)                __buffer_state(bh,New)
+ #define buffer_async(bh)      __buffer_state(bh,Async)
+
+ #define bh_offset(bh)         ((unsigned long)(bh)->b_data & ~PAGE_MASK)
+
+ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset);
+
+ #define touch_buffer(bh)      mark_page_accessed(bh->b_page)
+
+
+ #include <linux/pipe_fs_i.h>
+ #include <linux/minix_fs_i.h>
+ #include <linux/ext2_fs_i.h>
+ #include <linux/hpfs_fs_i.h>
+ #include <linux/ntfs_fs_i.h>
+ #include <linux/msdos_fs_i.h>
+ #include <linux/umsdos_fs_i.h>
+ #include <linux/iso_fs_i.h>
+ #include <linux/nfs_fs_i.h>
+ #include <linux/sysv_fs_i.h>
+ #include <linux/affs_fs_i.h>
+ #include <linux/ufs_fs_i.h>
+ #include <linux/efs_fs_i.h>
+ #include <linux/coda_fs_i.h>
+ #include <linux/romfs_fs_i.h>
+ #include <linux/shmem_fs.h>
+ #include <linux/smb_fs_i.h>
+ #include <linux/hfs_fs_i.h>
+ #include <linux/adfs_fs_i.h>
+ #include <linux/qnx4_fs_i.h>
+ #include <linux/reiserfs_fs_i.h>
+ #include <linux/bfs_fs_i.h>
+ #include <linux/udf_fs_i.h>
+ #include <linux/ncp_fs_i.h>
+ #include <linux/proc_fs_i.h>
+ #include <linux/usbdev_fs_i.h>
+ #include <linux/jffs2_fs_i.h>
+ #include <linux/cramfs_fs_sb.h>
+
+ /*
+  * Attribute flags.  These should be or-ed together to figure out what
+  * has been changed!
+  */
+ #define ATTR_MODE     1
+ #define ATTR_UID      2
+ #define ATTR_GID      4
+ #define ATTR_SIZE     8
+ #define ATTR_ATIME    16
+ #define ATTR_MTIME    32
+ #define ATTR_CTIME    64
+ #define ATTR_ATIME_SET        128
+ #define ATTR_MTIME_SET        256
+ #define ATTR_FORCE    512     /* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG        1024
+
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+  * uses the above definitions as flags, to know which values have changed.
+  * Also, in this manner, a Filesystem can look at only the values it cares
+  * about.  Basically, these are the attributes that the VFS layer can
+  * request to change from the FS layer.
+  *
+  * Derek Atkins <[email protected]> 94-10-20
+  */
+ struct iattr {
+       unsigned int    ia_valid;
+       umode_t         ia_mode;
+       uid_t           ia_uid;
+       gid_t           ia_gid;
+       loff_t          ia_size;
+       time_t          ia_atime;
+       time_t          ia_mtime;
+       time_t          ia_ctime;
+       unsigned int    ia_attr_flags;
+ };
+
+ /*
+  * This is the inode attributes flag definitions
+  */
+ #define ATTR_FLAG_SYNCRONOUS          1       /* Syncronous write */
+ #define ATTR_FLAG_NOATIME             2       /* Don't update atime */
+ #define ATTR_FLAG_APPEND              4       /* Append-only file */
+ #define ATTR_FLAG_IMMUTABLE_FILE      8       /* Immutable file */
+ #define ATTR_FLAG_NODIRATIME          16      /* Don't update atime for directory */
+ #define ATTR_FLAG_IMMUTABLE_LINK      32      /* Immutable file */
+
+ /*
+  * Includes for diskquotas and mount structures.
+  */
+ #include <linux/quota.h>
+ #include <linux/mount.h>
+
+ /*
+  * oh the beauties of C type declarations.
+  */
+ struct page;
+ struct address_space;
+ struct kiobuf;
+
+ struct address_space_operations {
+       int (*writepage)(struct page *);
+       int (*readpage)(struct file *, struct page *);
+       int (*sync_page)(struct page *);
+       int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
+       int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+       /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
+       int (*bmap)(struct address_space *, long);
+ #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
+       int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+ };
+
+ struct address_space {
+       struct list_head        clean_pages;    /* list of clean pages */
+       struct list_head        dirty_pages;    /* list of dirty pages */
+       struct list_head        locked_pages;   /* list of locked pages */
+       unsigned long           nrpages;        /* number of total pages */
+       struct address_space_operations *a_ops; /* methods */
+       struct inode            *host;          /* owner: inode, block_device */
+       struct vm_area_struct   *i_mmap;        /* list of private mappings */
+       struct vm_area_struct   *i_mmap_shared; /* list of shared mappings */
+       spinlock_t              i_shared_lock;  /* and spinlock protecting it */
+       int                     gfp_mask;       /* how to allocate the pages */
+ };
+
+ struct char_device {
+       struct list_head        hash;
+       atomic_t                count;
+       dev_t                   dev;
+       atomic_t                openers;
+       struct semaphore        sem;
+ };
+
+ struct block_device {
+       struct list_head        bd_hash;
+       atomic_t                bd_count;
+       struct inode *          bd_inode;
+       dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
+       int                     bd_openers;
+       const struct block_device_operations *bd_op;
+       struct semaphore        bd_sem; /* open/close mutex */
+       struct list_head        bd_inodes;
+ };
+
+ struct inode {
+       struct list_head        i_hash;
+       struct list_head        i_list;
+       struct list_head        i_dentry;
+
+       struct list_head        i_dirty_buffers;
+       struct list_head        i_dirty_data_buffers;
+
+       unsigned long           i_ino;
+       atomic_t                i_count;
+       kdev_t                  i_dev;
+       umode_t                 i_mode;
+       nlink_t                 i_nlink;
+       uid_t                   i_uid;
+       gid_t                   i_gid;
+       kdev_t                  i_rdev;
+       loff_t                  i_size;
+       time_t                  i_atime;
+       time_t                  i_mtime;
+       time_t                  i_ctime;
+       unsigned int            i_blkbits;
+       unsigned long           i_blksize;
+       unsigned long           i_blocks;
+       unsigned long           i_version;
+       struct semaphore        i_sem;
+       struct semaphore        i_zombie;
+       struct inode_operations *i_op;
+       struct file_operations  *i_fop; /* former ->i_op->default_file_ops */
+       struct super_block      *i_sb;
+       wait_queue_head_t       i_wait;
+       struct file_lock        *i_flock;
+       struct address_space    *i_mapping;
+       struct address_space    i_data;
+       struct dquot            *i_dquot[MAXQUOTAS];
+       /* These three should probably be a union */
+       struct list_head        i_devices;
+       struct pipe_inode_info  *i_pipe;
+       struct block_device     *i_bdev;
+       struct char_device      *i_cdev;
+
+       unsigned long           i_dnotify_mask; /* Directory notify events */
+       struct dnotify_struct   *i_dnotify; /* for directory notifications */
+
+       unsigned long           i_state;
+
+       unsigned int            i_flags;
+       unsigned char           i_sock;
+
+       atomic_t                i_writecount;
+       unsigned int            i_attr_flags;
+       __u32                   i_generation;
+       union {
+               struct minix_inode_info         minix_i;
+               struct ext2_inode_info          ext2_i;
+               struct hpfs_inode_info          hpfs_i;
+               struct ntfs_inode_info          ntfs_i;
+               struct msdos_inode_info         msdos_i;
+               struct umsdos_inode_info        umsdos_i;
+               struct iso_inode_info           isofs_i;
+               struct nfs_inode_info           nfs_i;
+               struct sysv_inode_info          sysv_i;
+               struct affs_inode_info          affs_i;
+               struct ufs_inode_info           ufs_i;
+               struct efs_inode_info           efs_i;
+               struct romfs_inode_info         romfs_i;
+               struct shmem_inode_info         shmem_i;
+               struct coda_inode_info          coda_i;
+               struct smb_inode_info           smbfs_i;
+               struct hfs_inode_info           hfs_i;
+               struct adfs_inode_info          adfs_i;
+               struct qnx4_inode_info          qnx4_i;
+               struct reiserfs_inode_info      reiserfs_i;
+               struct bfs_inode_info           bfs_i;
+               struct udf_inode_info           udf_i;
+               struct ncp_inode_info           ncpfs_i;
+               struct proc_inode_info          proc_i;
+               struct socket                   socket_i;
+               struct usbdev_inode_info        usbdev_i;
+               struct jffs2_inode_info         jffs2_i;
+               void                            *generic_ip;
+       } u;
+ };
+
+ struct fown_struct {
+       int pid;                /* pid or -pgrp where SIGIO should be sent */
+       uid_t uid, euid;        /* uid/euid of process setting the owner */
+       int signum;             /* posix.1b rt signal to be delivered on IO */
+ };
+
+ struct file {
+       struct list_head        f_list;
+       struct dentry           *f_dentry;
+       struct vfsmount         *f_vfsmnt;
+       struct file_operations  *f_op;
+       atomic_t                f_count;
+       unsigned int            f_flags;
+       mode_t                  f_mode;
+       loff_t                  f_pos;
+       unsigned long           f_reada, f_ramax, f_raend, f_ralen, f_rawin;
+       struct fown_struct      f_owner;
+       unsigned int            f_uid, f_gid;
+       int                     f_error;
+
+       unsigned long           f_version;
+
+       /* needed for tty driver, and maybe others */
+       void                    *private_data;
+
+       /* preallocated helper kiobuf to speedup O_DIRECT */
+       struct kiobuf           *f_iobuf;
+       long                    f_iobuf_lock;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+ #define file_list_unlock() spin_unlock(&files_lock);
+
+ #define get_file(x)   atomic_inc(&(x)->f_count)
+ #define file_count(x) atomic_read(&(x)->f_count)
+
+ extern int init_private_file(struct file *, struct dentry *, int);
+
+ #define       MAX_NON_LFS     ((1UL<<31) - 1)
+
+ #define FL_POSIX      1
+ #define FL_FLOCK      2
+ #define FL_BROKEN     4       /* broken flock() emulation */
+ #define FL_ACCESS     8       /* for processes suspended by mandatory locking */
+ #define FL_LOCKD      16      /* lock held by rpc.lockd */
+ #define FL_LEASE      32      /* lease held on this file */
+
+ /*
+  * The POSIX file lock owner is determined by
+  * the "struct files_struct" in the thread group
+  * (or NULL for no owner - BSD locks).
+  *
+  * Lockd stuffs a "host" pointer into this.
+  */
+ typedef struct files_struct *fl_owner_t;
+
+ struct file_lock {
+       struct file_lock *fl_next;      /* singly linked list for this inode  */
+       struct list_head fl_link;       /* doubly linked list of all locks */
+       struct list_head fl_block;      /* circular list of blocked processes */
+       fl_owner_t fl_owner;
+       unsigned int fl_pid;
+       wait_queue_head_t fl_wait;
+       struct file *fl_file;
+       unsigned char fl_flags;
+       unsigned char fl_type;
+       loff_t fl_start;
+       loff_t fl_end;
+
+       void (*fl_notify)(struct file_lock *);  /* unblock callback */
+       void (*fl_insert)(struct file_lock *);  /* lock insertion callback */
+       void (*fl_remove)(struct file_lock *);  /* lock removal callback */
+
+       struct fasync_struct *  fl_fasync; /* for lease break notifications */
+
+       union {
+               struct nfs_lock_info    nfs_fl;
+       } fl_u;
+ };
+
+ /* The following constant reflects the upper bound of the file/locking space */
+ #ifndef OFFSET_MAX
+ #define INT_LIMIT(x)  (~((x)1 << (sizeof(x)*8 - 1)))
+ #define OFFSET_MAX    INT_LIMIT(loff_t)
+ #define OFFT_OFFSET_MAX       INT_LIMIT(off_t)
+ #endif
+
+ extern struct list_head file_lock_list;
+
+ #include <linux/fcntl.h>
+
+ extern int fcntl_getlk(unsigned int, struct flock *);
+ extern int fcntl_setlk(unsigned int, unsigned int, struct flock *);
+
+ extern int fcntl_getlk64(unsigned int, struct flock64 *);
+ extern int fcntl_setlk64(unsigned int, unsigned int, struct flock64 *);
+
+ /* fs/locks.c */
+ extern void locks_init_lock(struct file_lock *);
+ extern void locks_copy_lock(struct file_lock *, struct file_lock *);
+ extern void locks_remove_posix(struct file *, fl_owner_t);
+ extern void locks_remove_flock(struct file *);
+ extern struct file_lock *posix_test_lock(struct file *, struct file_lock *);
+ extern int posix_lock_file(struct file *, struct file_lock *, unsigned int);
+ extern void posix_block_lock(struct file_lock *, struct file_lock *);
+ extern void posix_unblock_lock(struct file_lock *);
+ extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
+ extern int __get_lease(struct inode *inode, unsigned int flags);
+ extern time_t lease_get_mtime(struct inode *);
+ extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
+ extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+
+ struct fasync_struct {
+       int     magic;
+       int     fa_fd;
+       struct  fasync_struct   *fa_next; /* singly linked list */
+       struct  file            *fa_file;
+ };
+
+ #define FASYNC_MAGIC 0x4601
+
+ /* SMP safe fasync helpers: */
+ extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
+ /* can be called from interrupts */
+ extern void kill_fasync(struct fasync_struct **, int, int);
+ /* only for net: no internal synchronization */
+ extern void __kill_fasync(struct fasync_struct *, int, int);
+
+ struct nameidata {
+       struct dentry *dentry;
+       struct vfsmount *mnt;
+       struct qstr last;
+       unsigned int flags;
+       int last_type;
+ };
+
+ #define DQUOT_USR_ENABLED     0x01            /* User diskquotas enabled */
+ #define DQUOT_GRP_ENABLED     0x02            /* Group diskquotas enabled */
+
+ struct quota_mount_options
+ {
+       unsigned int flags;                     /* Flags for diskquotas on this device */
+       struct semaphore dqio_sem;              /* lock device while I/O in progress */
+       struct semaphore dqoff_sem;             /* serialize quota_off() and quota_on() on device */
+       struct file *files[MAXQUOTAS];          /* fp's to quotafiles */
+       time_t inode_expire[MAXQUOTAS];         /* expiretime for inode-quota */
+       time_t block_expire[MAXQUOTAS];         /* expiretime for block-quota */
+       char rsquash[MAXQUOTAS];                /* for quotas threat root as any other user */
+ };
+
+ /*
+  *    Umount options
+  */
+
+ #define MNT_FORCE     0x00000001      /* Attempt to forcibily umount */
+ #define MNT_DETACH    0x00000002      /* Just detach from the tree */
+
+ #include <linux/minix_fs_sb.h>
+ #include <linux/ext2_fs_sb.h>
+ #include <linux/hpfs_fs_sb.h>
+ #include <linux/ntfs_fs_sb.h>
+ #include <linux/msdos_fs_sb.h>
+ #include <linux/iso_fs_sb.h>
+ #include <linux/nfs_fs_sb.h>
+ #include <linux/sysv_fs_sb.h>
+ #include <linux/affs_fs_sb.h>
+ #include <linux/ufs_fs_sb.h>
+ #include <linux/efs_fs_sb.h>
+ #include <linux/romfs_fs_sb.h>
+ #include <linux/smb_fs_sb.h>
+ #include <linux/hfs_fs_sb.h>
+ #include <linux/adfs_fs_sb.h>
+ #include <linux/qnx4_fs_sb.h>
+ #include <linux/reiserfs_fs_sb.h>
+ #include <linux/bfs_fs_sb.h>
+ #include <linux/udf_fs_sb.h>
+ #include <linux/ncp_fs_sb.h>
+ #include <linux/usbdev_fs_sb.h>
+ #include <linux/cramfs_fs_sb.h>
+ #include <linux/jffs2_fs_sb.h>
+
+ extern struct list_head super_blocks;
+ extern spinlock_t sb_lock;
+
+ #define sb_entry(list)        list_entry((list), struct super_block, s_list)
+ #define S_BIAS (1<<30)
+ struct super_block {
+       struct list_head        s_list;         /* Keep this first */
+       kdev_t                  s_dev;
+       unsigned long           s_blocksize;
+       unsigned char           s_blocksize_bits;
+       unsigned char           s_dirt;
+       unsigned long long      s_maxbytes;     /* Max file size */
+       struct file_system_type *s_type;
+       struct super_operations *s_op;
+       struct dquot_operations *dq_op;
+       unsigned long           s_flags;
+       unsigned long           s_magic;
+       struct dentry           *s_root;
+       struct rw_semaphore     s_umount;
+       struct semaphore        s_lock;
+       int                     s_count;
+       atomic_t                s_active;
+
+       struct list_head        s_dirty;        /* dirty inodes */
+       struct list_head        s_locked_inodes;/* inodes being synced */
+       struct list_head        s_files;
+
+       struct block_device     *s_bdev;
+       struct list_head        s_instances;
+       struct quota_mount_options s_dquot;     /* Diskquota specific options */
+
+       union {
+               struct minix_sb_info    minix_sb;
+               struct ext2_sb_info     ext2_sb;
+               struct hpfs_sb_info     hpfs_sb;
+               struct ntfs_sb_info     ntfs_sb;
+               struct msdos_sb_info    msdos_sb;
+               struct isofs_sb_info    isofs_sb;
+               struct nfs_sb_info      nfs_sb;
+               struct sysv_sb_info     sysv_sb;
+               struct affs_sb_info     affs_sb;
+               struct ufs_sb_info      ufs_sb;
+               struct efs_sb_info      efs_sb;
+               struct shmem_sb_info    shmem_sb;
+               struct romfs_sb_info    romfs_sb;
+               struct smb_sb_info      smbfs_sb;
+               struct hfs_sb_info      hfs_sb;
+               struct adfs_sb_info     adfs_sb;
+               struct qnx4_sb_info     qnx4_sb;
+               struct reiserfs_sb_info reiserfs_sb;
+               struct bfs_sb_info      bfs_sb;
+               struct udf_sb_info      udf_sb;
+               struct ncp_sb_info      ncpfs_sb;
+               struct usbdev_sb_info   usbdevfs_sb;
+               struct jffs2_sb_info    jffs2_sb;
+               struct cramfs_sb_info   cramfs_sb;
+               void                    *generic_sbp;
+       } u;
+       /*
+        * The next field is for VFS *only*. No filesystems have any business
+        * even looking at it. You had been warned.
+        */
+       struct semaphore s_vfs_rename_sem;      /* Kludge */
+
+       /* The next field is used by knfsd when converting a (inode number based)
+        * file handle into a dentry. As it builds a path in the dcache tree from
+        * the bottom up, there may for a time be a subpath of dentrys which is not
+        * connected to the main tree.  This semaphore ensure that there is only ever
+        * one such free path per filesystem.  Note that unconnected files (or other
+        * non-directories) are allowed, but not unconnected diretories.
+        */
+       struct semaphore s_nfsd_free_path_sem;
+ };
+
+ /*
+  * VFS helper functions..
+  */
+ extern int vfs_create(struct inode *, struct dentry *, int);
+ extern int vfs_mkdir(struct inode *, struct dentry *, int);
+ extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
+ extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+ extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
+ extern int vfs_rmdir(struct inode *, struct dentry *);
+ extern int vfs_unlink(struct inode *, struct dentry *);
+ extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
+
+ /*
+  * File types
+  */
+ #define DT_UNKNOWN    0
+ #define DT_FIFO               1
+ #define DT_CHR                2
+ #define DT_DIR                4
+ #define DT_BLK                6
+ #define DT_REG                8
+ #define DT_LNK                10
+ #define DT_SOCK               12
+ #define DT_WHT                14
+
+ /*
+  * This is the "filldir" function type, used by readdir() to let
+  * the kernel specify what kind of dirent layout it wants to have.
+  * This allows the kernel to read directories into kernel space or
+  * to have different dirent layouts depending on the binary type.
+  */
+ typedef int (*filldir_t)(void *, const char *, int, loff_t, ino_t, unsigned);
+
+ struct block_device_operations {
+       int (*open) (struct inode *, struct file *);
+       int (*release) (struct inode *, struct file *);
+       int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+       int (*check_media_change) (kdev_t);
+       int (*revalidate) (kdev_t);
+ };
+
+ /*
+  * NOTE:
+  * read, write, poll, fsync, readv, writev can be called
+  *   without the big kernel lock held in all filesystems.
+  */
+ struct file_operations {
+       struct module *owner;
+       loff_t (*llseek) (struct file *, loff_t, int);
+       ssize_t (*read) (struct file *, char *, size_t, loff_t *);
+       ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
+       int (*readdir) (struct file *, void *, filldir_t);
+       unsigned int (*poll) (struct file *, struct poll_table_struct *);
+       int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+       int (*mmap) (struct file *, struct vm_area_struct *);
+       int (*open) (struct inode *, struct file *);
+       int (*flush) (struct file *);
+       int (*release) (struct inode *, struct file *);
+       int (*fsync) (struct file *, struct dentry *, int datasync);
+       int (*fasync) (int, struct file *, int);
+       int (*lock) (struct file *, int, struct file_lock *);
+       ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
+       ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+       ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
+       unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ };
+
+ struct inode_operations {
+       int (*create) (struct inode *,struct dentry *,int);
+       struct dentry * (*lookup) (struct inode *,struct dentry *);
+       int (*link) (struct dentry *,struct inode *,struct dentry *);
+       int (*unlink) (struct inode *,struct dentry *);
+       int (*symlink) (struct inode *,struct dentry *,const char *);
+       int (*mkdir) (struct inode *,struct dentry *,int);
+       int (*rmdir) (struct inode *,struct dentry *);
+       int (*mknod) (struct inode *,struct dentry *,int,int);
+       int (*rename) (struct inode *, struct dentry *,
+                       struct inode *, struct dentry *);
+       int (*readlink) (struct dentry *, char *,int);
+       int (*follow_link) (struct dentry *, struct nameidata *);
+       void (*truncate) (struct inode *);
+       int (*permission) (struct inode *, int);
+       int (*revalidate) (struct dentry *);
+       int (*setattr) (struct dentry *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+ };
+
+ /*
+  * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
+  * without the big kernel lock held in all filesystems.
+  */
+ struct super_operations {
+       void (*read_inode) (struct inode *);
+
+       /* reiserfs kludge.  reiserfs needs 64 bits of information to
+       ** find an inode.  We are using the read_inode2 call to get
+       ** that information.  We don't like this, and are waiting on some
+       ** VFS changes for the real solution.
+       ** iget4 calls read_inode2, iff it is defined
+       */
+       void (*read_inode2) (struct inode *, void *) ;
+       void (*dirty_inode) (struct inode *);
+       void (*write_inode) (struct inode *, int);
+       void (*put_inode) (struct inode *);
+       void (*delete_inode) (struct inode *);
+       void (*put_super) (struct super_block *);
+       void (*write_super) (struct super_block *);
+       void (*write_super_lockfs) (struct super_block *);
+       void (*unlockfs) (struct super_block *);
+       int (*statfs) (struct super_block *, struct statfs *);
+       int (*remount_fs) (struct super_block *, int *, char *);
+       void (*clear_inode) (struct inode *);
+       void (*umount_begin) (struct super_block *);
+
+       /* Following are for knfsd to interact with "interesting" filesystems
+        * Currently just reiserfs, but possibly FAT and others later
+        *
+        * fh_to_dentry is given a filehandle fragement with length, and a type flag
+        *   and must return a dentry for the referenced object or, if "parent" is
+        *   set, a dentry for the parent of the object.
+        *   If a dentry cannot be found, a "root" dentry should be created and
+        *   flaged as DCACHE_NFSD_DISCONNECTED. nfsd_iget is an example implementation.
+        *
+        * dentry_to_fh is given a dentry and must generate the filesys specific
+        *   part of the file handle.  Available length is passed in *lenp and used
+        *   length should be returned therein.
+        *   If need_parent is set, then dentry_to_fh should encode sufficient information
+        *   to find the (current) parent.
+        *   dentry_to_fh should return a 1byte "type" which will be passed back in
+        *   the fhtype arguement to fh_to_dentry.  Type of 0 is reserved.
+        *   If filesystem was exportable before the introduction of fh_to_dentry,
+        *   types 1 and 2 should be used is that same way as the generic code.
+        *   Type 255 means error.
+        *
+        * Lengths are in units of 4bytes, not bytes.
+        */
+       struct dentry * (*fh_to_dentry)(struct super_block *sb, __u32 *fh, int len, int fhtype, int parent);
+       int (*dentry_to_fh)(struct dentry *, __u32 *fh, int *lenp, int need_parent);
+ };
+
+ /* Inode state bits.. */
+ #define I_DIRTY_SYNC          1 /* Not dirty enough for O_DATASYNC */
+ #define I_DIRTY_DATASYNC      2 /* Data-related inode changes pending */
+ #define I_DIRTY_PAGES         4 /* Data-related inode changes pending */
+ #define I_LOCK                        8
+ #define I_FREEING             16
+ #define I_CLEAR                       32
+
+ #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+
+ extern void __mark_inode_dirty(struct inode *, int);
+ static inline void mark_inode_dirty(struct inode *inode)
+ {
+       __mark_inode_dirty(inode, I_DIRTY);
+ }
+
+ static inline void mark_inode_dirty_sync(struct inode *inode)
+ {
+       __mark_inode_dirty(inode, I_DIRTY_SYNC);
+ }
+
+ static inline void mark_inode_dirty_pages(struct inode *inode)
+ {
+       __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ }
+
+ struct dquot_operations {
+       void (*initialize) (struct inode *, short);
+       void (*drop) (struct inode *);
+       int (*alloc_block) (struct inode *, unsigned long, char);
+       int (*alloc_inode) (const struct inode *, unsigned long);
+       void (*free_block) (struct inode *, unsigned long);
+       void (*free_inode) (const struct inode *, unsigned long);
+       int (*transfer) (struct inode *, struct iattr *);
+ };
+
+ struct file_system_type {
+       const char *name;
+       int fs_flags;
+       struct super_block *(*read_super) (struct super_block *, void *, int);
+       struct module *owner;
+       struct file_system_type * next;
+       struct list_head fs_supers;
+ };
+
+ #define DECLARE_FSTYPE(var,type,read,flags) \
+ struct file_system_type var = { \
+       name:           type, \
+       read_super:     read, \
+       fs_flags:       flags, \
+       owner:          THIS_MODULE, \
+ }
+
+ #define DECLARE_FSTYPE_DEV(var,type,read) \
+       DECLARE_FSTYPE(var,type,read,FS_REQUIRES_DEV)
+
+ /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
+ #define fops_get(fops) \
+       (((fops) && (fops)->owner)      \
+               ? ( try_inc_mod_count((fops)->owner) ? (fops) : NULL ) \
+               : (fops))
+
+ #define fops_put(fops) \
+ do {  \
+       if ((fops) && (fops)->owner) \
+               __MOD_DEC_USE_COUNT((fops)->owner);     \
+ } while(0)
+
+ extern int register_filesystem(struct file_system_type *);
+ extern int unregister_filesystem(struct file_system_type *);
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount(struct vfsmount *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+
+ #define kern_umount mntput
+
+ extern int vfs_statfs(struct super_block *, struct statfs *);
+
+ /* Return value for VFS lock functions - tells locks.c to lock conventionally
+  * REALLY kosha for root NFS and nfs_lock
+  */
+ #define LOCK_USE_CLNT 1
+
+ #define FLOCK_VERIFY_READ  1
+ #define FLOCK_VERIFY_WRITE 2
+
+ extern int locks_mandatory_locked(struct inode *);
+ extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
+
+ /*
+  * Candidates for mandatory locking have the setgid bit set
+  * but no group execute bit -  an otherwise meaningless combination.
+  */
+ #define MANDATORY_LOCK(inode) \
+       (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+
+ static inline int locks_verify_locked(struct inode *inode)
+ {
+       if (MANDATORY_LOCK(inode))
+               return locks_mandatory_locked(inode);
+       return 0;
+ }
+
+ static inline int locks_verify_area(int read_write, struct inode *inode,
+                                   struct file *filp, loff_t offset,
+                                   size_t count)
+ {
+       if (inode->i_flock && MANDATORY_LOCK(inode))
+               return locks_mandatory_area(read_write, inode, filp, offset, count);
+       return 0;
+ }
+
+ static inline int locks_verify_truncate(struct inode *inode,
+                                   struct file *filp,
+                                   loff_t size)
+ {
+       if (inode->i_flock && MANDATORY_LOCK(inode))
+               return locks_mandatory_area(
+                       FLOCK_VERIFY_WRITE, inode, filp,
+                       size < inode->i_size ? size : inode->i_size,
+                       (size < inode->i_size ? inode->i_size - size
+                        : size - inode->i_size)
+               );
+       return 0;
+ }
+
+ static inline int get_lease(struct inode *inode, unsigned int mode)
+ {
+       if (inode->i_flock && (inode->i_flock->fl_flags & FL_LEASE))
+               return __get_lease(inode, mode);
+       return 0;
+ }
+
+ /* fs/open.c */
+
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int);      /* yes, it's really unsigned */
+ extern int do_truncate(struct dentry *, loff_t start);
+
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+
+ /* fs/dcache.c */
+ extern void vfs_caches_init(unsigned long);
+
+ #define __getname()   kmem_cache_alloc(names_cachep, SLAB_KERNEL)
+ #define putname(name) kmem_cache_free(names_cachep, (void *)(name))
+
+ enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW};
+ extern int register_blkdev(unsigned int, const char *, struct block_device_operations *);
+ extern int unregister_blkdev(unsigned int, const char *);
+ extern struct block_device *bdget(dev_t);
+ extern int bd_acquire(struct inode *inode);
+ extern void bd_forget(struct inode *inode);
+ extern void bdput(struct block_device *);
+ extern struct char_device *cdget(dev_t);
+ extern void cdput(struct char_device *);
+ extern int blkdev_open(struct inode *, struct file *);
+ extern int blkdev_close(struct inode *, struct file *);
+ extern struct file_operations def_blk_fops;
+ extern struct address_space_operations def_blk_aops;
+ extern struct file_operations def_fifo_fops;
+ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
+ extern int blkdev_get(struct block_device *, mode_t, unsigned, int);
+ extern int blkdev_put(struct block_device *, int);
+
+ /* fs/devices.c */
+ extern const struct block_device_operations *get_blkfops(unsigned int);
+ extern int register_chrdev(unsigned int, const char *, struct file_operations *);
+ extern int unregister_chrdev(unsigned int, const char *);
+ extern int chrdev_open(struct inode *, struct file *);
+ extern const char * bdevname(kdev_t);
+ extern const char * cdevname(kdev_t);
+ extern const char * kdevname(kdev_t);
+ extern void init_special_inode(struct inode *, umode_t, int);
+
+ /* Invalid inode operations -- fs/bad_inode.c */
+ extern void make_bad_inode(struct inode *);
+ extern int is_bad_inode(struct inode *);
+
+ extern struct file_operations read_fifo_fops;
+ extern struct file_operations write_fifo_fops;
+ extern struct file_operations rdwr_fifo_fops;
+ extern struct file_operations read_pipe_fops;
+ extern struct file_operations write_pipe_fops;
+ extern struct file_operations rdwr_pipe_fops;
+
+ extern int fs_may_remount_ro(struct super_block *);
+
+ extern int try_to_free_buffers(struct page *, unsigned int);
+ extern void refile_buffer(struct buffer_head * buf);
+ extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+
+ /* reiserfs_writepage needs this */
+ extern void set_buffer_async_io(struct buffer_head *bh);
+
+ #define BUF_CLEAN     0
+ #define BUF_LOCKED    1       /* Buffers scheduled for write */
+ #define BUF_DIRTY     2       /* Dirty buffers, not yet scheduled for write */
+ #define NR_LIST               3
+
+ static inline void get_bh(struct buffer_head * bh)
+ {
+         atomic_inc(&(bh)->b_count);
+ }
+
+ static inline void put_bh(struct buffer_head *bh)
+ {
+         smp_mb__before_atomic_dec();
+         atomic_dec(&bh->b_count);
+ }
+
+ /*
+  * This is called by bh->b_end_io() handlers when I/O has completed.
+  */
+ static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
+ {
+       if (on)
+               set_bit(BH_Uptodate, &bh->b_state);
+       else
+               clear_bit(BH_Uptodate, &bh->b_state);
+ }
+
+ #define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)
+
+ static inline void __mark_buffer_clean(struct buffer_head *bh)
+ {
+       refile_buffer(bh);
+ }
+
+ static inline void mark_buffer_clean(struct buffer_head * bh)
+ {
+       if (atomic_set_buffer_clean(bh))
+               __mark_buffer_clean(bh);
+ }
+
+ extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
+ extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
+ extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
+ extern void FASTCALL(buffer_insert_inode_data_queue(struct buffer_head *, struct inode *));
+
+ #define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
+
+ static inline void mark_buffer_async(struct buffer_head * bh, int on)
+ {
+       if (on)
+               set_bit(BH_Async, &bh->b_state);
+       else
+               clear_bit(BH_Async, &bh->b_state);
+ }
+
+ /*
+  * If an error happens during make_request, this function
+  * has to be called. It marks the buffer as clean and not
+  * uptodate, and it notifies the upper layer about the end
+  * of the I/O.
+  */
+ static inline void buffer_IO_error(struct buffer_head * bh)
+ {
+       mark_buffer_clean(bh);
+       /*
+        * b_end_io has to clear the BH_Uptodate bitflag in the error case!
+        */
+       bh->b_end_io(bh, 0);
+ }
+
+ extern void buffer_insert_inode_queue(struct buffer_head *, struct inode *);
+ static inline void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
+ {
+       mark_buffer_dirty(bh);
+       buffer_insert_inode_queue(bh, inode);
+ }
+
+ extern void balance_dirty(void);
+ extern int check_disk_change(kdev_t);
+ extern int invalidate_inodes(struct super_block *);
+ extern int invalidate_device(kdev_t, int);
+ extern void invalidate_inode_pages(struct inode *);
+ extern void invalidate_inode_pages2(struct address_space *);
+ extern void invalidate_inode_buffers(struct inode *);
+ #define invalidate_buffers(dev)       __invalidate_buffers((dev), 0)
+ #define destroy_buffers(dev)  __invalidate_buffers((dev), 1)
+ extern void invalidate_bdev(struct block_device *, int);
+ extern void __invalidate_buffers(kdev_t dev, int);
+ extern void sync_inodes(kdev_t);
+ extern void sync_unlocked_inodes(void);
+ extern void write_inode_now(struct inode *, int);
+ extern int sync_buffers(kdev_t, int);
+ extern void sync_dev(kdev_t);
+ extern int fsync_dev(kdev_t);
+ extern int fsync_super(struct super_block *);
+ extern int fsync_no_super(kdev_t);
+ extern void sync_inodes_sb(struct super_block *);
+ extern int osync_inode_buffers(struct inode *);
+ extern int osync_inode_data_buffers(struct inode *);
+ extern int fsync_inode_buffers(struct inode *);
+ extern int fsync_inode_data_buffers(struct inode *);
+ extern int inode_has_buffers(struct inode *);
+ extern void filemap_fdatasync(struct address_space *);
+ extern void filemap_fdatawait(struct address_space *);
+ extern void sync_supers(kdev_t);
+ extern int bmap(struct inode *, int);
+ extern int notify_change(struct dentry *, struct iattr *);
+ extern int permission(struct inode *, int);
+ extern int vfs_permission(struct inode *, int);
+ extern int get_write_access(struct inode *);
+ extern int deny_write_access(struct file *);
+ static inline void put_write_access(struct inode * inode)
+ {
+       atomic_dec(&inode->i_writecount);
+ }
+ static inline void allow_write_access(struct file *file)
+ {
+       if (file)
+               atomic_inc(&file->f_dentry->d_inode->i_writecount);
+ }
+ extern int do_pipe(int *);
+
+ extern int open_namei(const char *, int, int, struct nameidata *);
+
+ extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
+ extern struct file * open_exec(const char *);
+
+ /* fs/dcache.c -- generic fs support functions */
+ extern int is_subdir(struct dentry *, struct dentry *);
+ extern ino_t find_inode_number(struct dentry *, struct qstr *);
+
+ /*
+  * Kernel pointers have redundant information, so we can use a
+  * scheme where we can return either an error code or a dentry
+  * pointer with the same return value.
+  *
+  * This should be a per-architecture thing, to allow different
+  * error and pointer decisions.
+  */
+ static inline void *ERR_PTR(long error)
+ {
+       return (void *) error;
+ }
+
+ static inline long PTR_ERR(const void *ptr)
+ {
+       return (long) ptr;
+ }
+
+ static inline long IS_ERR(const void *ptr)
+ {
+       return (unsigned long)ptr > (unsigned long)-1000L;
+ }
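+
+ /*
+  * Example (sketch): a lookup routine can return ERR_PTR(-ENOENT) on
+  * failure; the caller tests with IS_ERR() and decodes the error with
+  * PTR_ERR():
+  *
+  *	struct dentry *de = lookup_one_len(name, base, len);
+  *	if (IS_ERR(de))
+  *		return PTR_ERR(de);
+  */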
+
+ /*
+  * The bitmask for a lookup event:
+  *  - follow links at the end
+  *  - require a directory
+  *  - ending slashes ok even for nonexistent files
+  *  - internal "there are more path components" flag
+  */
+ #define LOOKUP_FOLLOW         (1)
+ #define LOOKUP_DIRECTORY      (2)
+ #define LOOKUP_CONTINUE               (4)
+ #define LOOKUP_POSITIVE               (8)
+ #define LOOKUP_PARENT         (16)
+ #define LOOKUP_NOALT          (32)
+ /*
+  * Type of the last component on LOOKUP_PARENT
+  */
+ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
+
+ /*
+  * "descriptor" for what we're up to with a read for sendfile().
+  * This allows us to use the same read code yet
+  * have multiple different users of the data that
+  * we read from a file.
+  *
+  * The simplest case just copies the data to user
+  * mode.
+  */
+ typedef struct {
+       size_t written;
+       size_t count;
+       char * buf;
+       int error;
+ } read_descriptor_t;
+
+ typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long);
+
+ /* needed for stackable file system support */
+ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
+
+ extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
+ extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
+ extern void path_release(struct nameidata *);
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+ #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
+ #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
+
+ extern void iput(struct inode *);
+ extern void force_delete(struct inode *);
+ extern struct inode * igrab(struct inode *);
+ extern ino_t iunique(struct super_block *, ino_t);
+
+ typedef int (*find_inode_t)(struct inode *, unsigned long, void *);
+ extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *);
+ static inline struct inode *iget(struct super_block *sb, unsigned long ino)
+ {
+       return iget4(sb, ino, NULL, NULL);
+ }
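+
+ /*
+  * Example (sketch; ROOT_INO is illustrative): a filesystem's
+  * read_super() typically obtains its root inode this way:
+  *
+  *	struct inode *root = iget(sb, ROOT_INO);
+  *	sb->s_root = d_alloc_root(root);
+  */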
+
+ extern void clear_inode(struct inode *);
+ extern struct inode * get_empty_inode(void);
+
+ static inline struct inode * new_inode(struct super_block *sb)
+ {
+       struct inode *inode = get_empty_inode();
+       if (inode) {
+               inode->i_sb = sb;
+               inode->i_dev = sb->s_dev;
+               inode->i_blkbits = sb->s_blocksize_bits;
+       }
+       return inode;
+ }
+ extern void remove_suid(struct inode *inode);
+
+ extern void insert_inode_hash(struct inode *);
+ extern void remove_inode_hash(struct inode *);
+ extern struct file * get_empty_filp(void);
+ extern void file_move(struct file *f, struct list_head *list);
+ extern struct buffer_head * get_hash_table(kdev_t, int, int);
+ extern struct buffer_head * getblk(kdev_t, int, int);
+ extern void ll_rw_block(int, int, struct buffer_head * bh[]);
+ extern void submit_bh(int, struct buffer_head *);
+ extern int is_read_only(kdev_t);
+ extern void __brelse(struct buffer_head *);
+ static inline void brelse(struct buffer_head *buf)
+ {
+       if (buf)
+               __brelse(buf);
+ }
+ extern void __bforget(struct buffer_head *);
+ static inline void bforget(struct buffer_head *buf)
+ {
+       if (buf)
+               __bforget(buf);
+ }
+ extern int set_blocksize(kdev_t, int);
+ extern struct buffer_head * bread(kdev_t, int, int);
+ extern void wakeup_bdflush(void);
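+
+ /*
+  * Typical usage (sketch): read a block, examine the data, then drop
+  * the buffer reference:
+  *
+  *	struct buffer_head *bh = bread(dev, block, size);
+  *	if (bh) {
+  *		... look at bh->b_data ...
+  *		brelse(bh);
+  *	}
+  */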
+
+ extern int brw_page(int, struct page *, kdev_t, int [], int);
+
+ typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
+
+ /* Generic buffer handling for block filesystems.. */
+ extern int discard_bh_page(struct page *, unsigned long, int);
+ #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
+ #define block_invalidate_page(page) discard_bh_page(page, 0, 0)
+ extern int block_symlink(struct inode *, const char *, int);
+ extern int block_write_full_page(struct page*, get_block_t*);
+ extern int block_read_full_page(struct page*, get_block_t*);
+ extern int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
+ extern int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
+                               unsigned long *);
+ extern int block_commit_write(struct page *page, unsigned from, unsigned to);
+ extern int block_sync_page(struct page *);
+
+ int generic_block_bmap(struct address_space *, long, get_block_t *);
+ int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
+ int block_truncate_page(struct address_space *, loff_t, get_block_t *);
+ extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
+
+ extern int waitfor_one_page(struct page*);
+ extern int generic_file_mmap(struct file *, struct vm_area_struct *);
+ extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
+ extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
+ extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
+ extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+ extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
+ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
+ extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *);
+ extern int generic_file_open(struct inode * inode, struct file * filp);
+
+ extern struct file_operations generic_ro_fops;
+
+ extern int vfs_readlink(struct dentry *, char *, int, const char *);
+ extern int vfs_follow_link(struct nameidata *, const char *);
+ extern int page_readlink(struct dentry *, char *, int);
+ extern int page_follow_link(struct dentry *, struct nameidata *);
+ extern struct inode_operations page_symlink_inode_operations;
+
+ extern int vfs_readdir(struct file *, filldir_t, void *);
+ extern int dcache_readdir(struct file *, void *, filldir_t);
+
+ extern struct file_system_type *get_fs_type(const char *name);
+ extern struct super_block *get_super(kdev_t);
+ extern void drop_super(struct super_block *sb);
+ static inline int is_mounted(kdev_t dev)
+ {
+       struct super_block *sb = get_super(dev);
+       if (sb) {
+               drop_super(sb);
+               return 1;
+       }
+       return 0;
+ }
+ unsigned long generate_cluster(kdev_t, int b[], int);
+ unsigned long generate_cluster_swab32(kdev_t, int b[], int);
+ extern kdev_t ROOT_DEV;
+ extern char root_device_name[];
+
+
+ extern void show_buffers(void);
+ extern void mount_root(void);
+
+ #ifdef CONFIG_BLK_DEV_INITRD
+ extern kdev_t real_root_dev;
+ extern int change_root(kdev_t, const char *);
+ #endif
+
+ extern ssize_t char_read(struct file *, char *, size_t, loff_t *);
+ extern ssize_t block_read(struct file *, char *, size_t, loff_t *);
+ extern int read_ahead[];
+
+ extern ssize_t char_write(struct file *, const char *, size_t, loff_t *);
+ extern ssize_t block_write(struct file *, const char *, size_t, loff_t *);
+
+ extern int file_fsync(struct file *, struct dentry *, int);
+ extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx);
+ extern int generic_osync_inode(struct inode *, int);
+ #define OSYNC_METADATA (1<<0)
+ #define OSYNC_DATA (1<<1)
+ #define OSYNC_INODE (1<<2)
+
+ extern int inode_change_ok(struct inode *, struct iattr *);
+ extern int inode_setattr(struct inode *, struct iattr *);
+
+ /*
+  * Common dentry functions for inclusion in the VFS
+  * or in other stackable file systems.  Some of these
+  * functions were in linux/fs/ C (VFS) files.
+  *
+  */
+
+ /*
+  * Locking the parent is needed to:
+  *  - serialize directory operations
+  *  - make sure the parent doesn't change from
+  *    under us in the middle of an operation.
+  *
+  * NOTE! Right now we'd rather use a "struct inode"
+  * for this, but as I expect things to move toward
+  * using dentries instead for most things it is
+  * probably better to start with the conceptually
+  * better interface of relying on a path of dentries.
+  */
+ static inline struct dentry *lock_parent(struct dentry *dentry)
+ {
+       struct dentry *dir = dget(dentry->d_parent);
+
+       down(&dir->d_inode->i_sem);
+       return dir;
+ }
+
+ static inline struct dentry *get_parent(struct dentry *dentry)
+ {
+       return dget(dentry->d_parent);
+ }
+
+ static inline void unlock_dir(struct dentry *dir)
+ {
+       up(&dir->d_inode->i_sem);
+       dput(dir);
+ }
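+
+ /*
+  * Usage sketch: directory modifications lock the parent for the
+  * duration of the operation:
+  *
+  *	struct dentry *dir = lock_parent(dentry);
+  *	... operate on dir->d_inode ...
+  *	unlock_dir(dir);
+  */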
+
+ /*
+  * Whee.. Deadlock country. Happily there are only two VFS
+  * operations that do this..
+  */
+ static inline void double_down(struct semaphore *s1, struct semaphore *s2)
+ {
+       if (s1 != s2) {
+               if ((unsigned long) s1 < (unsigned long) s2) {
+                       struct semaphore *tmp = s2;
+                       s2 = s1; s1 = tmp;
+               }
+               down(s1);
+       }
+       down(s2);
+ }
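+
+ /*
+  * Note: ordering the two semaphores by address gives a global lock
+  * order, so double_down(&a, &b) and double_down(&b, &a) both take
+  * the higher-addressed semaphore first and cannot deadlock against
+  * each other.
+  */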
+
+ /*
+  * Ewwwwwwww... _triple_ lock. We are guaranteed that the 3rd argument is
+  * not equal to the 1st and not equal to the 2nd - the first case (target is
+  * parent of source) would already have been caught, and the second is plain
+  * impossible (target is its own parent, which would be caught even earlier).
+  * Very messy. I _think_ it works, but no warranties - please look it through.
+  * Pox on bloody lusers who mandated overwriting rename() for directories...
+  */
+
+ static inline void triple_down(struct semaphore *s1,
+                              struct semaphore *s2,
+                              struct semaphore *s3)
+ {
+       if (s1 != s2) {
+               if ((unsigned long) s1 < (unsigned long) s2) {
+                       if ((unsigned long) s1 < (unsigned long) s3) {
+                               struct semaphore *tmp = s3;
+                               s3 = s1; s1 = tmp;
+                       }
+                       if ((unsigned long) s1 < (unsigned long) s2) {
+                               struct semaphore *tmp = s2;
+                               s2 = s1; s1 = tmp;
+                       }
+               } else {
+                       if ((unsigned long) s1 < (unsigned long) s3) {
+                               struct semaphore *tmp = s3;
+                               s3 = s1; s1 = tmp;
+                       }
+                       if ((unsigned long) s2 < (unsigned long) s3) {
+                               struct semaphore *tmp = s3;
+                               s3 = s2; s2 = tmp;
+                       }
+               }
+               down(s1);
+       } else if ((unsigned long) s2 < (unsigned long) s3) {
+               struct semaphore *tmp = s3;
+               s3 = s2; s2 = tmp;
+       }
+       down(s2);
+       down(s3);
+ }
+
+ static inline void double_up(struct semaphore *s1, struct semaphore *s2)
+ {
+       up(s1);
+       if (s1 != s2)
+               up(s2);
+ }
+
+ static inline void triple_up(struct semaphore *s1,
+                            struct semaphore *s2,
+                            struct semaphore *s3)
+ {
+       up(s1);
+       if (s1 != s2)
+               up(s2);
+       up(s3);
+ }
+
+ static inline void double_lock(struct dentry *d1, struct dentry *d2)
+ {
+       double_down(&d1->d_inode->i_sem, &d2->d_inode->i_sem);
+ }
+
+ static inline void double_unlock(struct dentry *d1, struct dentry *d2)
+ {
+       double_up(&d1->d_inode->i_sem,&d2->d_inode->i_sem);
+       dput(d1);
+       dput(d2);
+ }
+
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_FS_H */
diff -rc2P linux/include/linux/jbd.h linux-2.4.13/include/linux/jbd.h
*** linux/include/linux/jbd.h   Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/jbd.h    Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,878 ----
+ /*
+  * linux/include/linux/jbd.h
+  *
+  * Written by Stephen C. Tweedie <[email protected]>
+  *
+  * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Definitions for transaction data structures for the buffer cache
+  * filesystem journaling support.
+  */
+
+ #ifndef _LINUX_JBD_H
+ #define _LINUX_JBD_H
+
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || !defined(__KERNEL__)
+
+ /* Allow this file to be included directly into e2fsprogs */
+ #ifndef __KERNEL__
+ #include "jfs_compat.h"
+ #define JFS_DEBUG
+ #define jfs_debug jbd_debug
+ #else
+
+ #include <linux/journal-head.h>
+ #include <linux/stddef.h>
+ #include <asm/semaphore.h>
+ #endif
+
+ extern int journal_oom_retry;
+
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
+  * consistency checks.  By default we don't do this unless
+  * CONFIG_JBD_DEBUG is on.
+  */
+ #define JBD_EXPENSIVE_CHECKING
+
+ extern int journal_enable_debug;
+ extern int journal_no_write[2];
+
+ #define jbd_debug(n, f, a...)                                         \
+       do {                                                            \
+               if ((n) <= journal_enable_debug) {                      \
+                       printk (KERN_DEBUG "(%s, %d): %s: ",            \
+                               __FILE__, __LINE__, __FUNCTION__);      \
+                       printk (f, ## a);                               \
+               }                                                       \
+       } while (0)
+ #else
+ #define jbd_debug(f, a...)    /**/
+ #endif
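+
+ /*
+  * Example (sketch): with CONFIG_JBD_DEBUG enabled, a call such as
+  *
+  *	jbd_debug(2, "committing transaction %d\n", tid);
+  *
+  * produces output once journal_enable_debug (settable through
+  * /proc/sys/fs/jbd-debug) is at least 2.
+  */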
+
+ extern void * __jbd_kmalloc (char *where, size_t size, int flags, int retry);
+ #define jbd_kmalloc(size, flags) \
+       __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
+ #define jbd_rep_kmalloc(size, flags) \
+       __jbd_kmalloc(__FUNCTION__, (size), (flags), 1)
+
+ #define JFS_MIN_JOURNAL_BLOCKS 1024
+
+ #ifdef __KERNEL__
+ typedef struct handle_s               handle_t;       /* Atomic operation type */
+ typedef struct journal_s      journal_t;      /* Journal control structure */
+ #endif
+
+ /*
+  * Internal structures used by the logging mechanism:
+  */
+
+ #define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */
+
+ /*
+  * On-disk structures
+  */
+
+ /*
+  * Descriptor block types:
+  */
+
+ #define JFS_DESCRIPTOR_BLOCK  1
+ #define JFS_COMMIT_BLOCK      2
+ #define JFS_SUPERBLOCK_V1     3
+ #define JFS_SUPERBLOCK_V2     4
+ #define JFS_REVOKE_BLOCK      5
+
+ /*
+  * Standard header for all descriptor blocks:
+  */
+ typedef struct journal_header_s
+ {
+       __u32           h_magic;
+       __u32           h_blocktype;
+       __u32           h_sequence;
+ } journal_header_t;
+
+
+ /*
+  * The block tag: used to describe a single buffer in the journal
+  */
+ typedef struct journal_block_tag_s
+ {
+       __u32           t_blocknr;      /* The on-disk block number */
+       __u32           t_flags;        /* See below */
+ } journal_block_tag_t;
+
+ /*
+  * The revoke descriptor: used on disk to describe a series of blocks to
+  * be revoked from the log
+  */
+ typedef struct journal_revoke_header_s
+ {
+       journal_header_t r_header;
+       int              r_count;       /* Count of bytes used in the block */
+ } journal_revoke_header_t;
+
+
+ /* Definitions for the journal tag flags word: */
+ #define JFS_FLAG_ESCAPE               1       /* on-disk block is escaped */
+ #define JFS_FLAG_SAME_UUID    2       /* block has same uuid as previous */
+ #define JFS_FLAG_DELETED      4       /* block deleted by this transaction */
+ #define JFS_FLAG_LAST_TAG     8       /* last tag in this descriptor block */
+
+
+ /*
+  * The journal superblock.  All fields are in big-endian byte order.
+  */
+ typedef struct journal_superblock_s
+ {
+ /* 0x0000 */
+       journal_header_t s_header;
+
+ /* 0x000C */
+       /* Static information describing the journal */
+       __u32   s_blocksize;            /* journal device blocksize */
+       __u32   s_maxlen;               /* total blocks in journal file */
+       __u32   s_first;                /* first block of log information */
+
+ /* 0x0018 */
+       /* Dynamic information describing the current state of the log */
+       __u32   s_sequence;             /* first commit ID expected in log */
+       __u32   s_start;                /* blocknr of start of log */
+
+ /* 0x0020 */
+       /* Error value, as set by journal_abort(). */
+       __s32   s_errno;
+
+ /* 0x0024 */
+       /* Remaining fields are only valid in a version-2 superblock */
+       __u32   s_feature_compat;       /* compatible feature set */
+       __u32   s_feature_incompat;     /* incompatible feature set */
+       __u32   s_feature_ro_compat;    /* readonly-compatible feature set */
+ /* 0x0030 */
+       __u8    s_uuid[16];             /* 128-bit uuid for journal */
+
+ /* 0x0040 */
+       __u32   s_nr_users;             /* Nr of filesystems sharing log */
+
+       __u32   s_dynsuper;             /* Blocknr of dynamic superblock copy*/
+
+ /* 0x0048 */
+       __u32   s_max_transaction;      /* Limit of journal blocks per trans.*/
+       __u32   s_max_trans_data;       /* Limit of data blocks per trans. */
+
+ /* 0x0050 */
+       __u32   s_padding[44];
+
+ /* 0x0100 */
+       __u8    s_users[16*48];         /* ids of all fs'es sharing the log */
+ /* 0x0400 */
+ } journal_superblock_t;
+
+ #define JFS_HAS_COMPAT_FEATURE(j,mask)                                        \
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
+ #define JFS_HAS_RO_COMPAT_FEATURE(j,mask)                             \
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
+ #define JFS_HAS_INCOMPAT_FEATURE(j,mask)                              \
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
+
+ #define JFS_FEATURE_INCOMPAT_REVOKE   0x00000001
+
+ /* Features known to this kernel version: */
+ #define JFS_KNOWN_COMPAT_FEATURES     0
+ #define JFS_KNOWN_ROCOMPAT_FEATURES   0
+ #define JFS_KNOWN_INCOMPAT_FEATURES   JFS_FEATURE_INCOMPAT_REVOKE
+
+ #ifdef __KERNEL__
+
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+
+ #define JBD_ASSERTIONS
+ #ifdef JBD_ASSERTIONS
+ #define J_ASSERT(assert)                                              \
+ do {                                                                  \
+       if (!(assert)) {                                                \
+               printk (KERN_EMERG                                      \
+                       "Assertion failure in %s() at %s:%d: \"%s\"\n", \
+                       __FUNCTION__, __FILE__, __LINE__, # assert);    \
+               BUG();                                                  \
+       }                                                               \
+ } while (0)
+
+ #if defined(CONFIG_BUFFER_DEBUG)
+ void buffer_assertion_failure(struct buffer_head *bh);
+ #define J_ASSERT_BH(bh, expr)                                         \
+       do {                                                            \
+               if (!(expr))                                            \
+                       buffer_assertion_failure(bh);                   \
+               J_ASSERT(expr);                                         \
+       } while (0)
+ #define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr)
+ #else
+ #define J_ASSERT_BH(bh, expr) J_ASSERT(expr)
+ #define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
+ #endif
+
+ #else
+ #define J_ASSERT(assert)
+ #endif                /* JBD_ASSERTIONS */
+
+ enum jbd_state_bits {
+       BH_JWrite
+         = BH_PrivateStart,    /* 1 if being written to log (@@@ DEBUGGING) */
+       BH_Freed,               /* 1 if buffer has been freed (truncated) */
+       BH_Revoked,             /* 1 if buffer has been revoked from the log */
+       BH_RevokeValid,         /* 1 if buffer revoked flag is valid */
+       BH_JBDDirty,            /* 1 if buffer is dirty but journaled */
+ };
+
+ /* Return true if the buffer is one which JBD is managing */
+ static inline int buffer_jbd(struct buffer_head *bh)
+ {
+       return __buffer_state(bh, JBD);
+ }
+
+ static inline struct buffer_head *jh2bh(struct journal_head *jh)
+ {
+       return jh->b_bh;
+ }
+
+ static inline struct journal_head *bh2jh(struct buffer_head *bh)
+ {
+       return bh->b_private;
+ }
+
+ struct jbd_revoke_table_s;
+
+ /* The handle_t type represents a single atomic update being performed
+  * by some process.  All filesystem modifications made by the process go
+  * through this handle.  Recursive operations (such as quota operations)
+  * are gathered into a single update.
+  *
+  * The buffer credits field is used to account for journaled buffers
+  * being modified by the running process.  To ensure that there is
+  * enough log space for all outstanding operations, we need to limit the
+  * number of outstanding buffers possible at any time.  When the
+  * operation completes, any buffer credits not used are credited back to
+  * the transaction, so that at all times we know how many buffers the
+  * outstanding updates on a transaction might possibly touch. */
+
+ struct handle_s
+ {
+       /* Which compound transaction is this update a part of? */
+       transaction_t         * h_transaction;
+
+       /* Number of remaining buffers we are allowed to dirty: */
+       int                     h_buffer_credits;
+
+       /* Reference count on this handle */
+       int                     h_ref;
+
+       /* Field for caller's use to track errors through large fs
+          operations */
+       int                     h_err;
+
+       /* Flags */
+       unsigned int    h_sync:         1;      /* sync-on-close */
+       unsigned int    h_jdata:        1;      /* force data journaling */
+       unsigned int    h_aborted:      1;      /* fatal error on handle */
+ };
+
+
+ /* The transaction_t type is the guts of the journaling mechanism.  It
+  * tracks a compound transaction through its various states:
+  *
+  * RUNNING:   accepting new updates
+  * LOCKED:    Updates still running but we don't accept new ones
+  * RUNDOWN:   Updates are tidying up but have finished requesting
+  *            new buffers to modify (state not used for now)
+  * FLUSH:       All updates complete, but we are still writing to disk
+  * COMMIT:      All data on disk, writing commit record
+  * FINISHED:  We still have to keep the transaction for checkpointing.
+  *
+  * The transaction keeps track of all of the buffers modified by a
+  * running transaction, and all of the buffers committed but not yet
+  * flushed to home for finished transactions.
+  */
+
+ struct transaction_s
+ {
+       /* Pointer to the journal for this transaction. */
+       journal_t *             t_journal;
+
+       /* Sequence number for this transaction */
+       tid_t                   t_tid;
+
+       /* Transaction's current state */
+       enum {
+               T_RUNNING,
+               T_LOCKED,
+               T_RUNDOWN,
+               T_FLUSH,
+               T_COMMIT,
+               T_FINISHED
+       }                       t_state;
+
+       /* Where in the log does this transaction's commit start? */
+       unsigned long           t_log_start;
+
+       /* Doubly-linked circular list of all inodes owned by this
+            transaction */     /* AKPM: unused */
+       struct inode *          t_ilist;
+
+       /* Number of buffers on the t_buffers list */
+       int                     t_nr_buffers;
+
+       /* Doubly-linked circular list of all buffers reserved but not
+            yet modified by this transaction */
+       struct journal_head *   t_reserved_list;
+
+       /* Doubly-linked circular list of all metadata buffers owned by this
+            transaction */
+       struct journal_head *   t_buffers;
+
+       /*
+        * Doubly-linked circular list of all data buffers still to be
+        * flushed before this transaction can be committed.
+        * Protected by journal_datalist_lock.
+        */
+       struct journal_head *   t_sync_datalist;
+
+       /*
+        * Doubly-linked circular list of all writepage data buffers
+        * still to be written before this transaction can be committed.
+        * Protected by journal_datalist_lock.
+        */
+       struct journal_head *   t_async_datalist;
+
+       /* Doubly-linked circular list of all forget buffers (superseded
+            buffers which we can un-checkpoint once this transaction
+            commits) */
+       struct journal_head *   t_forget;
+
+       /*
+        * Doubly-linked circular list of all buffers still to be
+        * flushed before this transaction can be checkpointed.
+        */
+       /* Protected by journal_datalist_lock */
+       struct journal_head *   t_checkpoint_list;
+
+       /* Doubly-linked circular list of temporary buffers currently
+            undergoing IO in the log */
+       struct journal_head *   t_iobuf_list;
+
+       /* Doubly-linked circular list of metadata buffers being
+            shadowed by log IO.  The IO buffers on the iobuf list and the
+            shadow buffers on this list match each other one for one at
+            all times. */
+       struct journal_head *   t_shadow_list;
+
+       /* Doubly-linked circular list of control buffers being written
+            to the log. */
+       struct journal_head *   t_log_list;
+
+       /* Number of outstanding updates running on this transaction */
+       int                     t_updates;
+
+       /* Number of buffers reserved for use by all handles in this
+        * transaction but not yet modified. */
+       int                     t_outstanding_credits;
+
+       /*
+        * Forward and backward links for the circular list of all
+        * transactions awaiting checkpoint.
+        */
+       /* Protected by journal_datalist_lock */
+       transaction_t           *t_cpnext, *t_cpprev;
+
+       /* When will the transaction expire (become due for commit), in
+        * jiffies ? */
+       unsigned long           t_expires;
+
+       /* How many handles used this transaction? */
+       int t_handle_count;
+ };
+
+
+ /* The journal_t maintains all of the journaling state information for a
+  * single filesystem.  It is linked to from the fs superblock structure.
+  *
+  * We use the journal_t to keep track of all outstanding transaction
+  * activity on the filesystem, and to manage the state of the log
+  * writing process. */
+
+ struct journal_s
+ {
+       /* General journaling state flags */
+       unsigned long           j_flags;
+
+       /* Is there an outstanding uncleared error on the journal (from
+        * a prior abort)? */
+       int                     j_errno;
+
+       /* The superblock buffer */
+       struct buffer_head *    j_sb_buffer;
+       journal_superblock_t *  j_superblock;
+
+       /* Version of the superblock format */
+       int                     j_format_version;
+
+       /* Number of processes waiting to create a barrier lock */
+       int                     j_barrier_count;
+
+       /* The barrier lock itself */
+       struct semaphore        j_barrier;
+
+       /* Transactions: The current running transaction... */
+       transaction_t *         j_running_transaction;
+
+       /* ... the transaction we are pushing to disk ... */
+       transaction_t *         j_committing_transaction;
+
+       /* ... and a linked circular list of all transactions waiting
+        * for checkpointing. */
+       /* Protected by journal_datalist_lock */
+       transaction_t *         j_checkpoint_transactions;
+
+       /* Wait queue for waiting for a locked transaction to start
+            committing, or for a barrier lock to be released */
+       wait_queue_head_t       j_wait_transaction_locked;
+
+       /* Wait queue for waiting for checkpointing to complete */
+       wait_queue_head_t       j_wait_logspace;
+
+       /* Wait queue for waiting for commit to complete */
+       wait_queue_head_t       j_wait_done_commit;
+
+       /* Wait queue to trigger checkpointing */
+       wait_queue_head_t       j_wait_checkpoint;
+
+       /* Wait queue to trigger commit */
+       wait_queue_head_t       j_wait_commit;
+
+       /* Wait queue to wait for updates to complete */
+       wait_queue_head_t       j_wait_updates;
+
+       /* Semaphore for locking against concurrent checkpoints */
+       struct semaphore        j_checkpoint_sem;
+
+       /* The main journal lock, used by lock_journal() */
+       struct semaphore        j_sem;
+
+       /* Journal head: identifies the first unused block in the journal. */
+       unsigned long           j_head;
+
+       /* Journal tail: identifies the oldest still-used block in the
+        * journal. */
+       unsigned long           j_tail;
+
+       /* Journal free: how many free blocks are there in the journal? */
+       unsigned long           j_free;
+
+       /* Journal start and end: the block numbers of the first usable
+        * block and one beyond the last usable block in the journal. */
+       unsigned long           j_first, j_last;
+
+       /* Device, blocksize and starting block offset for the location
+        * where we store the journal. */
+       kdev_t                  j_dev;
+       int                     j_blocksize;
+       unsigned int            j_blk_offset;
+
+       /* Device which holds the client fs.  For internal journal this
+        * will be equal to j_dev. */
+       kdev_t                  j_fs_dev;
+
+       /* Total maximum capacity of the journal region on disk. */
+       unsigned int            j_maxlen;
+
+       /* Optional inode where we store the journal.  If present, all
+        * journal block numbers are mapped into this inode via
+        * bmap(). */
+       struct inode *          j_inode;
+
+       /* Sequence number of the oldest transaction in the log */
+       tid_t                   j_tail_sequence;
+       /* Sequence number of the next transaction to grant */
+       tid_t                   j_transaction_sequence;
+       /* Sequence number of the most recently committed transaction */
+       tid_t                   j_commit_sequence;
+       /* Sequence number of the most recent transaction wanting commit */
+       tid_t                   j_commit_request;
+
+       /* Journal uuid: identifies the object (filesystem, LVM volume
+        * etc) backed by this journal.  This will eventually be
+        * replaced by an array of uuids, allowing us to index multiple
+        * devices within a single journal and to perform atomic updates
+        * across them.  */
+
+       __u8                    j_uuid[16];
+
+       /* Pointer to the current commit thread for this journal */
+       struct task_struct *    j_task;
+
+       /* Maximum number of metadata buffers to allow in a single
+        * compound commit transaction */
+       int                     j_max_transaction_buffers;
+
+       /* What is the maximum transaction lifetime before we begin a
+        * commit? */
+       unsigned long           j_commit_interval;
+
+       /* The timer used to wakeup the commit thread: */
+       struct timer_list *     j_commit_timer;
+       int                     j_commit_timer_active;
+
+       /* Link all journals together - system-wide */
+       struct list_head        j_all_journals;
+
+       /* The revoke table: maintains the list of revoked blocks in the
+            current transaction. */
+       struct jbd_revoke_table_s *j_revoke;
+ };
+
+ /*
+  * Journal flag definitions
+  */
+ #define JFS_UNMOUNT   0x001   /* Journal thread is being destroyed */
+ #define JFS_ABORT     0x002   /* Journaling has been aborted for errors. */
+ #define JFS_ACK_ERR   0x004   /* The errno in the sb has been acked */
+ #define JFS_FLUSHED   0x008   /* The journal superblock has been flushed */
+ #define JFS_LOADED    0x010   /* The journal superblock has been loaded */
+
+ /*
+  * Function declarations for the journaling transaction and buffer
+  * management
+  */
+
+ /* Filing buffers */
+ extern void __journal_unfile_buffer(struct journal_head *);
+ extern void journal_unfile_buffer(struct journal_head *);
+ extern void __journal_refile_buffer(struct journal_head *);
+ extern void journal_refile_buffer(struct journal_head *);
+ extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
+ extern void __journal_free_buffer(struct journal_head *bh);
+ extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
+ extern void __journal_clean_data_list(transaction_t *transaction);
+
+ /* Log buffer allocation */
+ extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
+ extern unsigned long journal_next_log_block(journal_t *);
+
+ /* Commit management */
+ extern void journal_commit_transaction(journal_t *);
+
+ /* Checkpoint list management */
+ int __journal_clean_checkpoint_list(journal_t *journal);
+ extern void journal_remove_checkpoint(struct journal_head *);
+ extern void __journal_remove_checkpoint(struct journal_head *);
+ extern void journal_insert_checkpoint(struct journal_head *, transaction_t *);
+ extern void __journal_insert_checkpoint(struct journal_head *,transaction_t *);
+
+ /* Buffer IO */
+ extern int
+ journal_write_metadata_buffer(transaction_t     *transaction,
+                             struct journal_head  *jh_in,
+                             struct journal_head **jh_out,
+                             int                  blocknr);
+
+ /* Transaction locking */
+ extern void           __wait_on_journal (journal_t *);
+
+ /*
+  * Journal locking.
+  *
+  * We need to lock the journal during transaction state changes so that
+  * nobody ever tries to take a handle on the running transaction while
+  * we are in the middle of moving it to the commit phase.
+  *
+  * Note that the locking is completely interrupt unsafe.  We never touch
+  * journal structures from interrupts.
+  *
+  * In 2.2, the BKL was required for lock_journal.  This is no longer
+  * the case.
+  */
+
+ static inline void lock_journal(journal_t *journal)
+ {
+       down(&journal->j_sem);
+ }
+
+ /* This returns zero if we acquired the semaphore */
+ static inline int try_lock_journal(journal_t * journal)
+ {
+       return down_trylock(&journal->j_sem);
+ }
+
+ static inline void unlock_journal(journal_t * journal)
+ {
+       up(&journal->j_sem);
+ }
+
+
+ static inline handle_t *journal_current_handle(void)
+ {
+       return current->journal_info;
+ }
+
+ /* The journaling code user interface:
+  *
+  * Create and destroy handles
+  * Register buffer modifications against the current transaction.
+  */
+
+ extern handle_t *journal_start(journal_t *, int nblocks);
+ extern handle_t *journal_try_start(journal_t *, int nblocks);
+ extern int     journal_restart (handle_t *, int nblocks);
+ extern int     journal_extend (handle_t *, int nblocks);
+ extern int     journal_get_write_access (handle_t *, struct buffer_head *);
+ extern int     journal_get_create_access (handle_t *, struct buffer_head *);
+ extern int     journal_get_undo_access (handle_t *, struct buffer_head *);
+ extern int     journal_dirty_data (handle_t *,
+                               struct buffer_head *, int async);
+ extern int     journal_dirty_metadata (handle_t *, struct buffer_head *);
+ extern void    journal_release_buffer (handle_t *, struct buffer_head *);
+ extern void    journal_forget (handle_t *, struct buffer_head *);
+ extern void    journal_sync_buffer (struct buffer_head *);
+ extern int     journal_flushpage(journal_t *, struct page *, unsigned long);
+ extern int     journal_try_to_free_buffers(journal_t *, struct page *, int);
+ extern int     journal_stop(handle_t *);
+ extern int     journal_flush (journal_t *);
+
+ extern void    journal_lock_updates (journal_t *);
+ extern void    journal_unlock_updates (journal_t *);
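+
+ /*
+  * Sketch of the usual update sequence (error handling omitted;
+  * journal_start() returns an ERR_PTR value on failure):
+  *
+  *	handle_t *handle = journal_start(journal, nblocks);
+  *	err = journal_get_write_access(handle, bh);
+  *	... modify bh->b_data ...
+  *	err = journal_dirty_metadata(handle, bh);
+  *	err = journal_stop(handle);
+  */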
+
+ extern journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
+                               int start, int len, int bsize);
+ extern journal_t * journal_init_inode (struct inode *);
+ extern int       journal_update_format (journal_t *);
+ extern int       journal_check_used_features
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_check_available_features
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_set_features
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_create     (journal_t *);
+ extern int       journal_load       (journal_t *journal);
+ extern void      journal_destroy    (journal_t *);
+ extern int       journal_recover    (journal_t *journal);
+ extern int       journal_wipe       (journal_t *, int);
+ extern int       journal_skip_recovery (journal_t *);
+ extern void      journal_update_superblock (journal_t *, int);
+ extern void      __journal_abort      (journal_t *);
+ extern void      journal_abort      (journal_t *, int);
+ extern int       journal_errno      (journal_t *);
+ extern void      journal_ack_err    (journal_t *);
+ extern int       journal_clear_err  (journal_t *);
+ extern unsigned long journal_bmap(journal_t *journal, unsigned long blocknr);
+ extern int        journal_force_commit(journal_t *journal);
+
+ /*
+  * journal_head management
+  */
+ extern struct journal_head
+               *journal_add_journal_head(struct buffer_head *bh);
+ extern void   journal_remove_journal_head(struct buffer_head *bh);
+ extern void   __journal_remove_journal_head(struct buffer_head *bh);
+ extern void   journal_unlock_journal_head(struct journal_head *jh);
+
+ /* Primary revoke support */
+ #define JOURNAL_REVOKE_DEFAULT_HASH 256
+ extern int       journal_init_revoke(journal_t *, int);
+ extern void      journal_destroy_revoke_caches(void);
+ extern int       journal_init_revoke_caches(void);
+
+ extern void      journal_destroy_revoke(journal_t *);
+ extern int       journal_revoke (handle_t *,
+                               unsigned long, struct buffer_head *);
+ extern int       journal_cancel_revoke(handle_t *, struct journal_head *);
+ extern void      journal_write_revoke_records(journal_t *, transaction_t *);
+
+ /* Recovery revoke support */
+ extern int       journal_set_revoke(journal_t *, unsigned long, tid_t);
+ extern int       journal_test_revoke(journal_t *, unsigned long, tid_t);
+ extern void      journal_clear_revoke(journal_t *);
+ extern void      journal_brelse_array(struct buffer_head *b[], int n);
+
+ /* The log thread user interface:
+  *
+  * Request space in the current transaction, and force transaction commit
+  * transitions on demand.
+  */
+
+ extern int    log_space_left (journal_t *); /* Called with journal locked */
+ extern tid_t  log_start_commit (journal_t *, transaction_t *);
+ extern void   log_wait_commit (journal_t *, tid_t);
+ extern int    log_do_checkpoint (journal_t *, int);
+
+ extern void   log_wait_for_space(journal_t *, int nblocks);
+ extern void   __journal_drop_transaction(journal_t *, transaction_t *);
+ extern int    cleanup_journal_tail(journal_t *);
+
+ /* Reduce journal memory usage by flushing */
+ extern void shrink_journal_memory(void);
+
+ /* Debugging code only: */
+
+ #define jbd_ENOSYS() \
+ do {                                                                \
+       printk (KERN_ERR "JBD unimplemented function " __FUNCTION__); \
+       current->state = TASK_UNINTERRUPTIBLE;                        \
+       schedule();                                                   \
+ } while (1)
+
+ /*
+  * is_journal_aborted
+  *
+  * Simple test wrapper function to test the JFS_ABORT state flag.  This
+  * bit, when set, indicates that we have had a fatal error somewhere,
+  * either inside the journaling layer or indicated to us by the client
+  * (eg. ext3), and that we should not commit any further
+  * transactions.
+  */
+
+ static inline int is_journal_aborted(journal_t *journal)
+ {
+       return journal->j_flags & JFS_ABORT;
+ }
+
+ static inline int is_handle_aborted(handle_t *handle)
+ {
+       if (handle->h_aborted)
+               return 1;
+       return is_journal_aborted(handle->h_transaction->t_journal);
+ }
+
+ static inline void journal_abort_handle(handle_t *handle)
+ {
+       handle->h_aborted = 1;
+ }
+
+ /* Not all architectures define BUG() */
+ #ifndef BUG
+  #define BUG() do { \
+         printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+       * ((char *) 0) = 0; \
+  } while (0)
+ #endif /* BUG */
+
+ #endif /* __KERNEL__   */
+
+ /* Comparison functions for transaction IDs: perform comparisons using
+  * modulo arithmetic so that they work over sequence number wraps. */
+
+ static inline int tid_gt(tid_t x, tid_t y)
+ {
+       int difference = (x - y);
+       return (difference > 0);
+ }
+
+ static inline int tid_geq(tid_t x, tid_t y)
+ {
+       int difference = (x - y);
+       return (difference >= 0);
+ }
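+
+ /*
+  * Example: with 32-bit tids, tid_gt(1, 0xfffffffe) is true - the
+  * unsigned difference (3) is positive as a signed int - so the
+  * comparison still gives the right answer after the sequence
+  * counter wraps.
+  */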
+
+ extern int journal_blocks_per_page(struct inode *inode);
+
+ /*
+  * Definitions which augment the buffer_head layer
+  */
+
+ /* JBD additions */
+
+ /* journaling buffer types */
+ #define BJ_None               0       /* Not journaled */
+ #define BJ_SyncData   1       /* Normal data: flush before commit */
+ #define BJ_AsyncData  2       /* writepage data: wait on it before commit */
+ #define BJ_Metadata   3       /* Normal journaled metadata */
+ #define BJ_Forget     4       /* Buffer superseded by this transaction */
+ #define BJ_IO         5       /* Buffer is for temporary IO use */
+ #define BJ_Shadow     6       /* Buffer contents being shadowed to the log */
+ #define BJ_LogCtl     7       /* Buffer contains log descriptors */
+ #define BJ_Reserved   8       /* Buffer is reserved for access by journal */
+ #define BJ_Types      9
+
+ extern int jbd_blocks_per_page(struct inode *inode);
+
+ #ifdef __KERNEL__
+
+ extern spinlock_t jh_splice_lock;
+ /*
+  * Once `expr1' has been found true, take jh_splice_lock
+  * and then reevaluate everything.
+  */
+ #define SPLICE_LOCK(expr1, expr2)                             \
+       ({                                                      \
+               int ret = (expr1);                              \
+               if (ret) {                                      \
+                       spin_lock(&jh_splice_lock);             \
+                       ret = (expr1) && (expr2);               \
+                       spin_unlock(&jh_splice_lock);           \
+               }                                               \
+               ret;                                            \
+       })
+
+ /*
+  * A number of buffer state predicates.  They test for
+  * buffer_jbd() because they are used in core kernel code.
+  *
+  * These will be racy on SMP unless we're *sure* that the
+  * buffer won't be detached from the journalling system
+  * in parallel.
+  */
+
+ /* Return true if the buffer is on journal list `list' */
+ static inline int buffer_jlist_eq(struct buffer_head *bh, int list)
+ {
+       return SPLICE_LOCK(buffer_jbd(bh), bh2jh(bh)->b_jlist == list);
+ }
+
+ /* Return true if this buffer is dirty wrt the journal */
+ static inline int buffer_jdirty(struct buffer_head *bh)
+ {
+       return buffer_jbd(bh) && __buffer_state(bh, JBDDirty);
+ }
+
+ /* Return true if it's a data buffer which journalling is managing */
+ static inline int buffer_jbd_data(struct buffer_head *bh)
+ {
+       return SPLICE_LOCK(buffer_jbd(bh),
+                       bh2jh(bh)->b_jlist == BJ_SyncData ||
+                       bh2jh(bh)->b_jlist == BJ_AsyncData);
+ }
+
+ #ifdef CONFIG_SMP
+ #define assert_spin_locked(lock)      J_ASSERT(spin_is_locked(lock))
+ #else
+ #define assert_spin_locked(lock)      do {} while(0)
+ #endif
+
+ #endif        /* __KERNEL__ */
+
+ #endif        /* CONFIG_JBD || CONFIG_JBD_MODULE || !__KERNEL__ */
+
+ /*
+  * Compatibility no-ops which allow the kernel to compile without CONFIG_JBD
+  * go here.
+  */
+
+ #if defined(__KERNEL__) && !(defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE))
+
+ #define J_ASSERT(expr)                        do {} while (0)
+ #define J_ASSERT_BH(bh, expr)         do {} while (0)
+ #define buffer_jbd(bh)                        0
+ #define buffer_jlist_eq(bh, val)      0
+ #define journal_buffer_journal_lru(bh)        0
+
+ #endif        /* defined(__KERNEL__) && !defined(CONFIG_JBD) */
+ #endif        /* _LINUX_JBD_H */
diff -rc2P linux/include/linux/journal-head.h linux-2.4.13/include/linux/journal-head.h
*** linux/include/linux/journal-head.h  Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/journal-head.h   Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,70 ----
+ /*
+  * include/linux/journal-head.h
+  *
+  * buffer_head fields for JBD
+  *
+  * 27 May 2001 Andrew Morton <[email protected]>
+  *    Created - pulled out of fs.h
+  */
+
+ #ifndef JOURNAL_HEAD_H_INCLUDED
+ #define JOURNAL_HEAD_H_INCLUDED
+
+ typedef unsigned int          tid_t;          /* Unique transaction ID */
+ typedef struct transaction_s  transaction_t;  /* Compound transaction type */
+ struct buffer_head;
+
+ struct journal_head {
+ #ifndef CONFIG_JBD_UNIFIED_BUFFERS
+       /* Points back to our buffer_head. */
+       struct buffer_head *b_bh;
+ #endif
+
+       /* Reference count - see description in journal.c */
+       int b_jcount;
+
+       /* Journaling list for this buffer */
+       unsigned b_jlist;
+
+       /* Copy of the buffer data frozen for writing to the log. */
+       char * b_frozen_data;
+
+       /* Pointer to a saved copy of the buffer containing no
+            uncommitted deallocation references, so that allocations can
+            avoid overwriting uncommitted deletes. */
+       char * b_committed_data;
+
+       /* Pointer to the compound transaction which owns this buffer's
+            metadata: either the running transaction or the committing
+            transaction (if there is one).  Only applies to buffers on a
+            transaction's data or metadata journaling list. */
+       /* Protected by journal_datalist_lock */
+       transaction_t * b_transaction;
+
+       /* Pointer to the running compound transaction which is
+            currently modifying the buffer's metadata, if there was
+            already a transaction committing it when the new transaction
+            touched it. */
+       transaction_t * b_next_transaction;
+
+       /* Doubly-linked list of buffers on a transaction's data,
+            metadata or forget queue. */
+       /* Protected by journal_datalist_lock */
+       struct journal_head *b_tnext, *b_tprev;
+
+       /*
+        * Pointer to the compound transaction against which this buffer
+        * is checkpointed.  Only dirty buffers can be checkpointed.
+        */
+       /* Protected by journal_datalist_lock */
+       transaction_t * b_cp_transaction;
+
+       /*
+        * Doubly-linked list of buffers still remaining to be flushed
+        * before an old transaction can be checkpointed.
+        */
+       /* Protected by journal_datalist_lock */
+       struct journal_head *b_cpnext, *b_cpprev;
+ };
+
+ #endif                /* JOURNAL_HEAD_H_INCLUDED */
diff -rc2P linux/include/linux/sched.h linux-2.4.13/include/linux/sched.h
*** linux/include/linux/sched.h Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/sched.h  Fri Nov  9 16:58:32 2001
***************
*** 420,423 ****
--- 420,425 ----
 /* Protection of (de-)allocation: mm, files, fs, tty */
       spinlock_t alloc_lock;
+ /* journalling filesystem info */
+        void *journal_info;
 /* Field to make virtual server running in chroot more  isolated */
       int s_context;  /* Process can only deal with other processes */
***************
*** 513,516 ****
--- 515,519 ----
     blocked:          {{0}},                                          \
     alloc_lock:               SPIN_LOCK_UNLOCKED,                             \
+     journal_info:             NULL,                                           \
     cap_bset:         CAP_INIT_EFF_SET,                               \
 }
diff -rc2P linux/include/linux/sched.h.orig linux-2.4.13/include/linux/sched.h.orig
*** linux/include/linux/sched.h.orig    Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/sched.h.orig     Fri Nov  9 16:15:08 2001
***************
*** 0 ****
--- 1,936 ----
+ #ifndef _LINUX_SCHED_H
+ #define _LINUX_SCHED_H
+
+ #include <asm/param.h>        /* for HZ */
+
+ extern unsigned long event;
+
+ #include <linux/config.h>
+ #include <linux/binfmts.h>
+ #include <linux/threads.h>
+ #include <linux/kernel.h>
+ #include <linux/types.h>
+ #include <linux/times.h>
+ #include <linux/timex.h>
+ #include <linux/rbtree.h>
+
+ #include <asm/system.h>
+ #include <asm/semaphore.h>
+ #include <asm/page.h>
+ #include <asm/ptrace.h>
+ #include <asm/mmu.h>
+
+ #include <linux/smp.h>
+ #include <linux/tty.h>
+ #include <linux/sem.h>
+ #include <linux/signal.h>
+ #include <linux/securebits.h>
+ #include <linux/fs_struct.h>
+
+ struct exec_domain;
+
+ /*
+  * cloning flags:
+  */
+ #define CSIGNAL               0x000000ff      /* signal mask to be sent at exit */
+ #define CLONE_VM      0x00000100      /* set if VM shared between processes */
+ #define CLONE_FS      0x00000200      /* set if fs info shared between processes */
+ #define CLONE_FILES   0x00000400      /* set if open files shared between processes */
+ #define CLONE_SIGHAND 0x00000800      /* set if signal handlers and blocked signals shared */
+ #define CLONE_PID     0x00001000      /* set if pid shared */
+ #define CLONE_PTRACE  0x00002000      /* set if we want to let tracing continue on the child too */
+ #define CLONE_VFORK   0x00004000      /* set if the parent wants the child to wake it up on mm_release */
+ #define CLONE_PARENT  0x00008000      /* set if we want to have the same parent as the cloner */
+ #define CLONE_THREAD  0x00010000      /* Same thread group? */
+
+ #define CLONE_SIGNAL  (CLONE_SIGHAND | CLONE_THREAD)
+
+ /*
+  * These are the constants used to fake the fixed-point load-average
+  * counting. Some notes:
+  *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
+  *    a load-average precision of 10 bits integer + 11 bits fractional
+  *  - if you want to count load-averages more often, you need more
+  *    precision, or rounding will get you. With 2-second counting freq,
+  *    the EXP_n values would be 1981, 2034 and 2043 if still using only
+  *    11 bit fractions.
+  */
+ extern unsigned long avenrun[];               /* Load averages */
+
+ #define FSHIFT                11              /* nr of bits of precision */
+ #define FIXED_1               (1<<FSHIFT)     /* 1.0 as fixed-point */
+ #define LOAD_FREQ     (5*HZ)          /* 5 sec intervals */
+ #define EXP_1         1884            /* 1/exp(5sec/1min) as fixed-point */
+ #define EXP_5         2014            /* 1/exp(5sec/5min) */
+ #define EXP_15                2037            /* 1/exp(5sec/15min) */
+
+ #define CALC_LOAD(load,exp,n) \
+       load *= exp; \
+       load += n*(FIXED_1-exp); \
+       load >>= FSHIFT;
+
+ #define CT_TO_SECS(x) ((x) / HZ)
+ #define CT_TO_USECS(x)        (((x) % HZ) * 1000000/HZ)
+
+ extern int nr_running, nr_threads;
+ extern int last_pid;
+
+ #include <linux/fs.h>
+ #include <linux/time.h>
+ #include <linux/param.h>
+ #include <linux/resource.h>
+ #include <linux/timer.h>
+
+ #include <asm/processor.h>
+
+ #define TASK_RUNNING          0
+ #define TASK_INTERRUPTIBLE    1
+ #define TASK_UNINTERRUPTIBLE  2
+ #define TASK_ZOMBIE           4
+ #define TASK_STOPPED          8
+
+ #define __set_task_state(tsk, state_value)            \
+       do { (tsk)->state = (state_value); } while (0)
+ #ifdef CONFIG_SMP
+ #define set_task_state(tsk, state_value)              \
+       set_mb((tsk)->state, (state_value))
+ #else
+ #define set_task_state(tsk, state_value)              \
+       __set_task_state((tsk), (state_value))
+ #endif
+
+ #define __set_current_state(state_value)                      \
+       do { current->state = (state_value); } while (0)
+ #ifdef CONFIG_SMP
+ #define set_current_state(state_value)                \
+       set_mb(current->state, (state_value))
+ #else
+ #define set_current_state(state_value)                \
+       __set_current_state(state_value)
+ #endif
+
+ /*
+  * Scheduling policies
+  */
+ #define SCHED_OTHER           0
+ #define SCHED_FIFO            1
+ #define SCHED_RR              2
+
+ /*
+  * This is an additional bit set when we want to
+  * yield the CPU for one re-schedule..
+  */
+ #define SCHED_YIELD           0x10
+
+ struct sched_param {
+       int sched_priority;
+ };
+
+ struct completion;
+
+ #ifdef __KERNEL__
+
+ #include <linux/spinlock.h>
+
+ /*
+  * This serializes "schedule()" and also protects
+  * the run-queue from deletions/modifications (but
+  * _adding_ to the beginning of the run-queue has
+  * a separate lock).
+  */
+ extern rwlock_t tasklist_lock;
+ extern spinlock_t runqueue_lock;
+ extern spinlock_t mmlist_lock;
+
+ extern void sched_init(void);
+ extern void init_idle(void);
+ extern void show_state(void);
+ extern void cpu_init (void);
+ extern void trap_init(void);
+ extern void update_process_times(int user);
+ extern void update_one_process(struct task_struct *p, unsigned long user,
+                              unsigned long system, int cpu);
+
+ #define       MAX_SCHEDULE_TIMEOUT    LONG_MAX
+ extern signed long FASTCALL(schedule_timeout(signed long timeout));
+ asmlinkage void schedule(void);
+
+ extern int schedule_task(struct tq_struct *task);
+ extern void flush_scheduled_tasks(void);
+ extern int start_context_thread(void);
+ extern int current_is_keventd(void);
+
+ /*
+  * The default fd array needs to be at least BITS_PER_LONG,
+  * as this is the granularity returned by copy_fdset().
+  */
+ #define NR_OPEN_DEFAULT BITS_PER_LONG
+
+ /*
+  * Open file table structure
+  */
+ struct files_struct {
+       atomic_t count;
+       rwlock_t file_lock;     /* Protects all the below members.  Nests inside tsk->alloc_lock */
+       int max_fds;
+       int max_fdset;
+       int next_fd;
+       struct file ** fd;      /* current fd array */
+       fd_set *close_on_exec;
+       fd_set *open_fds;
+       fd_set close_on_exec_init;
+       fd_set open_fds_init;
+       struct file * fd_array[NR_OPEN_DEFAULT];
+ };
+
+ #define INIT_FILES \
+ {                                                     \
+       count:          ATOMIC_INIT(1),                 \
+       file_lock:      RW_LOCK_UNLOCKED,               \
+       max_fds:        NR_OPEN_DEFAULT,                \
+       max_fdset:      __FD_SETSIZE,                   \
+       next_fd:        0,                              \
+       fd:             &init_files.fd_array[0],        \
+       close_on_exec:  &init_files.close_on_exec_init, \
+       open_fds:       &init_files.open_fds_init,      \
+       close_on_exec_init: { { 0, } },                 \
+       open_fds_init:  { { 0, } },                     \
+       fd_array:       { NULL, }                       \
+ }
+
+ /* Maximum number of active map areas.. This is a random (large) number */
+ #define MAX_MAP_COUNT (65536)
+
+ struct mm_struct {
+       struct vm_area_struct * mmap;           /* list of VMAs */
+       rb_root_t mm_rb;
+       struct vm_area_struct * mmap_cache;     /* last find_vma result */
+       pgd_t * pgd;
+       atomic_t mm_users;                      /* How many users with user space? */
+       atomic_t mm_count;                      /* How many references to "struct mm_struct" (users count as 1) */
+       int map_count;                          /* number of VMAs */
+       struct rw_semaphore mmap_sem;
+       spinlock_t page_table_lock;             /* Protects task page tables and mm->rss */
+
+       struct list_head mmlist;                /* List of all active mm's.  These are globally strung
+                                                * together off init_mm.mmlist, and are protected
+                                                * by mmlist_lock
+                                                */
+
+       unsigned long start_code, end_code, start_data, end_data;
+       unsigned long start_brk, brk, start_stack;
+       unsigned long arg_start, arg_end, env_start, env_end;
+       unsigned long rss, total_vm, locked_vm;
+       unsigned long def_flags;
+       unsigned long cpu_vm_mask;
+       unsigned long swap_address;
+
+       unsigned dumpable:1;
+
+       /* Architecture-specific MM context */
+       mm_context_t context;
+ };
+
+ extern int mmlist_nr;
+
+ #define INIT_MM(name) \
+ {                                                     \
+       mm_rb:          RB_ROOT,                        \
+       pgd:            swapper_pg_dir,                 \
+       mm_users:       ATOMIC_INIT(2),                 \
+       mm_count:       ATOMIC_INIT(1),                 \
+       mmap_sem:       __RWSEM_INITIALIZER(name.mmap_sem), \
+       page_table_lock: SPIN_LOCK_UNLOCKED,            \
+       mmlist:         LIST_HEAD_INIT(name.mmlist),    \
+ }
+
+ struct signal_struct {
+       atomic_t                count;
+       struct k_sigaction      action[_NSIG];
+       spinlock_t              siglock;
+ };
+
+
+ #define INIT_SIGNALS {        \
+       count:          ATOMIC_INIT(1),                 \
+       action:         { {{0,}}, },                    \
+       siglock:        SPIN_LOCK_UNLOCKED              \
+ }
+
+ /*
+  * Some day this will be a full-fledged user tracking system..
+  */
+ struct user_struct {
+       atomic_t __count;       /* reference count */
+       atomic_t processes;     /* How many processes does this user have? */
+       atomic_t files;         /* How many open files does this user have? */
+
+       /* Hash table maintenance information */
+       struct user_struct *next, **pprev;
+       uid_t uid;
+ };
+
+ #define get_current_user() ({                                 \
+       struct user_struct *__user = current->user;     \
+       atomic_inc(&__user->__count);                   \
+       __user; })
+
+
+ /*
+       We may have a different domainname and nodename for each security
+       context. By default, a security context share the same as its
+       parent, potentially the information in system_utsname
+ */
+ #define S_CTX_INFO_LOCK               1       /* Can't request a new s_context */
+ #define S_CTX_INFO_SCHED      2       /* All process in the s_context */
+                                       /* Contribute to the schedular */
+ struct context_info{
+       int refcount;
+       int s_context;
+       char nodename[65];
+       char domainname[65];
+       int flags;              /* S_CTX_INFO_xxx */
+       atomic_t ticks;         /* Number of ticks used by all process */
+                               /* in the s_context */
+ };
+
+
+ extern struct user_struct root_user;
+ #define INIT_USER (&root_user)
+
+ struct task_struct {
+       /*
+        * offsets of these are hardcoded elsewhere - touch with care
+        */
+       volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
+       unsigned long flags;    /* per process flags, defined below */
+       int sigpending;
+       mm_segment_t addr_limit;        /* thread address space:
+                                               0-0xBFFFFFFF for user-thead
+                                               0-0xFFFFFFFF for kernel-thread
+                                        */
+       struct exec_domain *exec_domain;
+       volatile long need_resched;
+       unsigned long ptrace;
+
+       int lock_depth;         /* Lock depth */
+
+ /*
+  * offset 32 begins here on 32-bit platforms. We keep
+  * all fields in a single cacheline that are needed for
+  * the goodness() loop in schedule().
+  */
+       long counter;
+       long nice;
+       unsigned long policy;
+       struct mm_struct *mm;
+       int has_cpu, processor;
+       unsigned long cpus_allowed;
+       /*
+        * (only the 'next' pointer fits into the cacheline, but
+        * that's just fine.)
+        */
+       struct list_head run_list;
+       unsigned long sleep_time;
+
+       struct task_struct *next_task, *prev_task;
+       struct mm_struct *active_mm;
+       struct list_head local_pages;
+       unsigned int allocation_order, nr_local_pages;
+
+ /* task state */
+       struct linux_binfmt *binfmt;
+       int exit_code, exit_signal;
+       int pdeath_signal;  /*  The signal sent when the parent dies  */
+       /* ??? */
+       unsigned long personality;
+       int did_exec:1;
+       pid_t pid;
+       pid_t pgrp;
+       pid_t tty_old_pgrp;
+       pid_t session;
+       pid_t tgid;
+       /* boolean value for session group leader */
+       int leader;
+       /*
+        * pointers to (original) parent process, youngest child, younger sibling,
+        * older sibling, respectively.  (p->father can be replaced with
+        * p->p_pptr->pid)
+        */
+       struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+       struct list_head thread_group;
+
+       /* PID hash table linkage. */
+       struct task_struct *pidhash_next;
+       struct task_struct **pidhash_pprev;
+
+       wait_queue_head_t wait_chldexit;        /* for wait4() */
+       struct completion *vfork_done;          /* for vfork() */
+       unsigned long rt_priority;
+       unsigned long it_real_value, it_prof_value, it_virt_value;
+       unsigned long it_real_incr, it_prof_incr, it_virt_incr;
+       struct timer_list real_timer;
+       struct tms times;
+       unsigned long start_time;
+       long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+       unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+       int swappable:1;
+ /* process credentials */
+       uid_t uid,euid,suid,fsuid;
+       gid_t gid,egid,sgid,fsgid;
+       int ngroups;
+       gid_t   groups[NGROUPS];
+       kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
+       int keep_capabilities:1;
+       struct user_struct *user;
+ /* limits */
+       struct rlimit rlim[RLIM_NLIMITS];
+       unsigned short used_math;
+       char comm[16];
+ /* file system info */
+       int link_count, total_link_count;
+       struct tty_struct *tty; /* NULL if no tty */
+       unsigned int locks; /* How many file locks are being held */
+ /* ipc stuff */
+       struct sem_undo *semundo;
+       struct sem_queue *semsleeping;
+ /* CPU-specific state of this task */
+       struct thread_struct thread;
+ /* filesystem information */
+       struct fs_struct *fs;
+ /* open file information */
+       struct files_struct *files;
+ /* signal handlers */
+       spinlock_t sigmask_lock;        /* Protects signal and blocked */
+       struct signal_struct *sig;
+
+       sigset_t blocked;
+       struct sigpending pending;
+
+       unsigned long sas_ss_sp;
+       size_t sas_ss_size;
+       int (*notifier)(void *priv);
+       void *notifier_data;
+       sigset_t *notifier_mask;
+
+ /* Thread group tracking */
+       u32 parent_exec_id;
+       u32 self_exec_id;
+ /* Protection of (de-)allocation: mm, files, fs, tty */
+       spinlock_t alloc_lock;
+ /* Field to make virtual server running in chroot more  isolated */
+       int s_context;  /* Process can only deal with other processes */
+                       /* with the same s_context */
+       __u32 cap_bset; /* Maximum capability of this process and children */
+       unsigned long ipv4root; /* Process can only bind to this iP */
+       struct context_info *s_info;
+ };
+
+ /*
+  * Per process flags
+  */
+ #define PF_ALIGNWARN  0x00000001      /* Print alignment warning msgs */
+                                       /* Not implemented yet, only for 486*/
+ #define PF_STARTING   0x00000002      /* being created */
+ #define PF_EXITING    0x00000004      /* getting shut down */
+ #define PF_FORKNOEXEC 0x00000040      /* forked but didn't exec */
+ #define PF_SUPERPRIV  0x00000100      /* used super-user privileges */
+ #define PF_DUMPCORE   0x00000200      /* dumped core */
+ #define PF_SIGNALED   0x00000400      /* killed by a signal */
+ #define PF_MEMALLOC   0x00000800      /* Allocating memory */
+ #define PF_FREE_PAGES 0x00002000      /* per process page freeing */
+
+ #define PF_USEDFPU    0x00100000      /* task used FPU this quantum (SMP) */
+
+ /*
+  * Ptrace flags
+  */
+
+ #define PT_PTRACED    0x00000001
+ #define PT_TRACESYS   0x00000002
+ #define PT_DTRACE     0x00000004      /* delayed trace (used on m68k, i386) */
+ #define PT_TRACESYSGOOD       0x00000008
+ #define PT_PTRACE_CAP 0x00000010      /* ptracer can follow suid-exec */
+
+ /*
+  * Limit the stack by to some sane default: root can always
+  * increase this limit if needed..  8MB seems reasonable.
+  */
+ #define _STK_LIM      (8*1024*1024)
+
+ #define DEF_COUNTER   (10*HZ/100)     /* 100 ms time slice */
+ #define MAX_COUNTER   (20*HZ/100)
+ #define DEF_NICE      (0)
+
+
+ /*
+  * The default (Linux) execution domain.
+  */
+ extern struct exec_domain     default_exec_domain;
+
+ /*
+  *  INIT_TASK is used to set up the first task table, touch at
+  * your own risk!. Base=0, limit=0x1fffff (=2MB)
+  */
+ #define INIT_TASK(tsk)        \
+ {                                                                     \
+     state:            0,                                              \
+     flags:            0,                                              \
+     sigpending:               0,                                              \
+     addr_limit:               KERNEL_DS,                                      \
+     exec_domain:      &default_exec_domain,                           \
+     lock_depth:               -1,                                             \
+     counter:          DEF_COUNTER,                                    \
+     nice:             DEF_NICE,                                       \
+     policy:           SCHED_OTHER,                                    \
+     mm:                       NULL,                                           \
+     active_mm:                &init_mm,                                       \
+     cpus_allowed:     -1,                                             \
+     run_list:         LIST_HEAD_INIT(tsk.run_list),                   \
+     next_task:                &tsk,                                           \
+     prev_task:                &tsk,                                           \
+     p_opptr:          &tsk,                                           \
+     p_pptr:           &tsk,                                           \
+     thread_group:     LIST_HEAD_INIT(tsk.thread_group),               \
+     wait_chldexit:    __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
+     real_timer:               {                                               \
+       function:               it_real_fn                              \
+     },                                                                        \
+     cap_effective:    CAP_INIT_EFF_SET,                               \
+     cap_inheritable:  CAP_INIT_INH_SET,                               \
+     cap_permitted:    CAP_FULL_SET,                                   \
+     keep_capabilities:        0,                                              \
+     rlim:             INIT_RLIMITS,                                   \
+     user:             INIT_USER,                                      \
+     comm:             "swapper",                                      \
+     thread:           INIT_THREAD,                                    \
+     fs:                       &init_fs,                                       \
+     files:            &init_files,                                    \
+     sigmask_lock:     SPIN_LOCK_UNLOCKED,                             \
+     sig:              &init_signals,                                  \
+     pending:          { NULL, &tsk.pending.head, {{0}}},              \
+     blocked:          {{0}},                                          \
+     alloc_lock:               SPIN_LOCK_UNLOCKED,                             \
+     cap_bset:         CAP_INIT_EFF_SET,                               \
+ }
+
+
+ #ifndef INIT_TASK_SIZE
+ # define INIT_TASK_SIZE       2048*sizeof(long)
+ #endif
+
+ union task_union {
+       struct task_struct task;
+       unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
+ };
+
+ extern union task_union init_task_union;
+
+ extern struct   mm_struct init_mm;
+ extern struct task_struct *init_tasks[NR_CPUS];
+
+ /* PID hashing. (shouldnt this be dynamic?) */
+ #define PIDHASH_SZ (4096 >> 2)
+ extern struct task_struct *pidhash[PIDHASH_SZ];
+
+ #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
+
+ static inline void hash_pid(struct task_struct *p)
+ {
+       struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
+
+       if((p->pidhash_next = *htable) != NULL)
+               (*htable)->pidhash_pprev = &p->pidhash_next;
+       *htable = p;
+       p->pidhash_pprev = htable;
+ }
+
+ static inline void unhash_pid(struct task_struct *p)
+ {
+       if(p->pidhash_next)
+               p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
+       *p->pidhash_pprev = p->pidhash_next;
+ }
+
+ static inline struct task_struct *find_task_by_pid(int pid)
+ {
+       struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
+
+       for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
+               ;
+
+       return p;
+ }
+
+ /* per-UID process charging. */
+ extern struct user_struct * alloc_uid(uid_t);
+ extern void free_uid(struct user_struct *);
+
+ #include <asm/current.h>
+
+ extern unsigned long volatile jiffies;
+ extern unsigned long itimer_ticks;
+ extern unsigned long itimer_next;
+ extern struct timeval xtime;
+ extern void do_timer(struct pt_regs *);
+
+ extern unsigned int * prof_buffer;
+ extern unsigned long prof_len;
+ extern unsigned long prof_shift;
+
+ #define CURRENT_TIME (xtime.tv_sec)
+
+ extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
+ extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
+ extern void FASTCALL(sleep_on(wait_queue_head_t *q));
+ extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
+                                     signed long timeout));
+ extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
+ extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
+                                                   signed long timeout));
+ extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+
+ #define wake_up(x)                    __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+ #define wake_up_nr(x, nr)             __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+ #define wake_up_all(x)                        __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
+ #define wake_up_sync(x)                       __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+ #define wake_up_sync_nr(x, nr)                __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+ #define wake_up_interruptible(x)      __wake_up((x),TASK_INTERRUPTIBLE, 1)
+ #define wake_up_interruptible_nr(x, nr)       __wake_up((x),TASK_INTERRUPTIBLE, nr)
+ #define wake_up_interruptible_all(x)  __wake_up((x),TASK_INTERRUPTIBLE, 0)
+ #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+ #define wake_up_interruptible_sync_nr(x) __wake_up_sync((x),TASK_INTERRUPTIBLE,  nr)
+ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
+
+ extern int in_group_p(gid_t);
+ extern int in_egroup_p(gid_t);
+
+ extern void proc_caches_init(void);
+ extern void flush_signals(struct task_struct *);
+ extern void flush_signal_handlers(struct task_struct *);
+ extern int dequeue_signal(sigset_t *, siginfo_t *);
+ extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+                             sigset_t *mask);
+ extern void unblock_all_signals(void);
+ extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+ extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+ extern int kill_pg_info(int, struct siginfo *, pid_t);
+ extern int kill_sl_info(int, struct siginfo *, pid_t);
+ extern int kill_proc_info(int, struct siginfo *, pid_t);
+ extern void notify_parent(struct task_struct *, int);
+ extern void do_notify_parent(struct task_struct *, int);
+ extern void force_sig(int, struct task_struct *);
+ extern int send_sig(int, struct task_struct *, int);
+ extern int kill_pg(pid_t, int, int);
+ extern int kill_sl(pid_t, int, int);
+ extern int kill_proc(pid_t, int, int);
+ extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
+ extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
+
+ static inline int signal_pending(struct task_struct *p)
+ {
+       return (p->sigpending != 0);
+ }
+
+ /*
+  * Re-calculate pending state from the set of locally pending
+  * signals, globally pending signals, and blocked signals.
+  */
+ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+ {
+       unsigned long ready;
+       long i;
+
+       switch (_NSIG_WORDS) {
+       default:
+               for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+                       ready |= signal->sig[i] &~ blocked->sig[i];
+               break;
+
+       case 4: ready  = signal->sig[3] &~ blocked->sig[3];
+               ready |= signal->sig[2] &~ blocked->sig[2];
+               ready |= signal->sig[1] &~ blocked->sig[1];
+               ready |= signal->sig[0] &~ blocked->sig[0];
+               break;
+
+       case 2: ready  = signal->sig[1] &~ blocked->sig[1];
+               ready |= signal->sig[0] &~ blocked->sig[0];
+               break;
+
+       case 1: ready  = signal->sig[0] &~ blocked->sig[0];
+       }
+       return ready != 0;
+ }
+
+ /* Reevaluate whether the task has signals pending delivery.
+    This is required every time the blocked sigset_t changes.
+    All callers should have t->sigmask_lock.  */
+
+ static inline void recalc_sigpending(struct task_struct *t)
+ {
+       t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+ }
+
+ /* True if we are on the alternate signal stack.  */
+
+ static inline int on_sig_stack(unsigned long sp)
+ {
+       return (sp - current->sas_ss_sp < current->sas_ss_size);
+ }
+
+ static inline int sas_ss_flags(unsigned long sp)
+ {
+       return (current->sas_ss_size == 0 ? SS_DISABLE
+               : on_sig_stack(sp) ? SS_ONSTACK : 0);
+ }
+
+ extern int request_irq(unsigned int,
+                      void (*handler)(int, void *, struct pt_regs *),
+                      unsigned long, const char *, void *);
+ extern void free_irq(unsigned int, void *);
+
+ /*
+  * This has now become a routine instead of a macro, it sets a flag if
+  * it returns true (to do BSD-style accounting where the process is flagged
+  * if it uses root privs). The implication of this is that you should do
+  * normal permissions checks first, and check suser() last.
+  *
+  * [Dec 1997 -- Chris Evans]
+  * For correctness, the above considerations need to be extended to
+  * fsuser(). This is done, along with moving fsuser() checks to be
+  * last.
+  *
+  * These will be removed, but in the mean time, when the SECURE_NOROOT
+  * flag is set, uids don't grant privilege.
+  */
+ static inline int suser(void)
+ {
+       if (!issecure(SECURE_NOROOT) && current->euid == 0) {
+               current->flags |= PF_SUPERPRIV;
+               return 1;
+       }
+       return 0;
+ }
+
+ static inline int fsuser(void)
+ {
+       if (!issecure(SECURE_NOROOT) && current->fsuid == 0) {
+               current->flags |= PF_SUPERPRIV;
+               return 1;
+       }
+       return 0;
+ }
+
+ /*
+  * capable() checks for a particular capability.
+  * New privilege checks should use this interface, rather than suser() or
+  * fsuser(). See include/linux/capability.h for defined capabilities.
+  */
+
+ static inline int capable(int cap)
+ {
+ #if 1 /* ok now */
+       if (cap_raised(current->cap_effective, cap))
+ #else
+       if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0)
+ #endif
+       {
+               current->flags |= PF_SUPERPRIV;
+               return 1;
+       }
+       return 0;
+ }
+
+ /*
+  * Routines for handling mm_structs
+  */
+ extern struct mm_struct * mm_alloc(void);
+
+ extern struct mm_struct * start_lazy_tlb(void);
+ extern void end_lazy_tlb(struct mm_struct *mm);
+
+ /* mmdrop drops the mm and the page tables */
+ extern inline void FASTCALL(__mmdrop(struct mm_struct *));
+ static inline void mmdrop(struct mm_struct * mm)
+ {
+       if (atomic_dec_and_test(&mm->mm_count))
+               __mmdrop(mm);
+ }
+
+ /* mmput gets rid of the mappings and all user-space */
+ extern void mmput(struct mm_struct *);
+ /* Remove the current tasks stale references to the old mm_struct */
+ extern void mm_release(void);
+
+ /*
+  * Routines for handling the fd arrays
+  */
+ extern struct file ** alloc_fd_array(int);
+ extern int expand_fd_array(struct files_struct *, int nr);
+ extern void free_fd_array(struct file **, int);
+
+ extern fd_set *alloc_fdset(int);
+ extern int expand_fdset(struct files_struct *, int nr);
+ extern void free_fdset(fd_set *, int);
+
+ extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+ extern void flush_thread(void);
+ extern void exit_thread(void);
+
+ extern void exit_mm(struct task_struct *);
+ extern void exit_files(struct task_struct *);
+ extern void exit_sighand(struct task_struct *);
+
+ extern void reparent_to_init(void);
+ extern void daemonize(void);
+
+ extern int do_execve(char *, char **, char **, struct pt_regs *);
+ extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
+
+ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+ extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+
+ #define __wait_event(wq, condition)                                   \
+ do {                                                                  \
+       wait_queue_t __wait;                                            \
+       init_waitqueue_entry(&__wait, current);                         \
+                                                                       \
+       add_wait_queue(&wq, &__wait);                                   \
+       for (;;) {                                                      \
+               set_current_state(TASK_UNINTERRUPTIBLE);                \
+               if (condition)                                          \
+                       break;                                          \
+               schedule();                                             \
+       }                                                               \
+       current->state = TASK_RUNNING;                                  \
+       remove_wait_queue(&wq, &__wait);                                \
+ } while (0)
+
+ #define wait_event(wq, condition)                                     \
+ do {                                                                  \
+       if (condition)                                                  \
+               break;                                                  \
+       __wait_event(wq, condition);                                    \
+ } while (0)
+
+ #define __wait_event_interruptible(wq, condition, ret)                        \
+ do {                                                                  \
+       wait_queue_t __wait;                                            \
+       init_waitqueue_entry(&__wait, current);                         \
+                                                                       \
+       add_wait_queue(&wq, &__wait);                                   \
+       for (;;) {                                                      \
+               set_current_state(TASK_INTERRUPTIBLE);                  \
+               if (condition)                                          \
+                       break;                                          \
+               if (!signal_pending(current)) {                         \
+                       schedule();                                     \
+                       continue;                                       \
+               }                                                       \
+               ret = -ERESTARTSYS;                                     \
+               break;                                                  \
+       }                                                               \
+       current->state = TASK_RUNNING;                                  \
+       remove_wait_queue(&wq, &__wait);                                \
+ } while (0)
+
+ #define wait_event_interruptible(wq, condition)                               \
+ ({                                                                    \
+       int __ret = 0;                                                  \
+       if (!(condition))                                               \
+               __wait_event_interruptible(wq, condition, __ret);       \
+       __ret;                                                          \
+ })
+
+ #define REMOVE_LINKS(p) do { \
+       (p)->next_task->prev_task = (p)->prev_task; \
+       (p)->prev_task->next_task = (p)->next_task; \
+       if ((p)->p_osptr) \
+               (p)->p_osptr->p_ysptr = (p)->p_ysptr; \
+       if ((p)->p_ysptr) \
+               (p)->p_ysptr->p_osptr = (p)->p_osptr; \
+       else \
+               (p)->p_pptr->p_cptr = (p)->p_osptr; \
+       } while (0)
+
+ #define SET_LINKS(p) do { \
+       (p)->next_task = &init_task; \
+       (p)->prev_task = init_task.prev_task; \
+       init_task.prev_task->next_task = (p); \
+       init_task.prev_task = (p); \
+       (p)->p_ysptr = NULL; \
+       if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
+               (p)->p_osptr->p_ysptr = p; \
+       (p)->p_pptr->p_cptr = p; \
+       } while (0)
+
+ #define for_each_task(p) \
+       for (p = &init_task ; (p = p->next_task) != &init_task ; )
+
+ #define next_thread(p) \
+       list_entry((p)->thread_group.next, struct task_struct, thread_group)
+
+ static inline void del_from_runqueue(struct task_struct * p)
+ {
+       nr_running--;
+       p->sleep_time = jiffies;
+       list_del(&p->run_list);
+       p->run_list.next = NULL;
+ }
+
+ static inline int task_on_runqueue(struct task_struct *p)
+ {
+       return (p->run_list.next != NULL);
+ }
+
+ static inline void unhash_process(struct task_struct *p)
+ {
+       if (task_on_runqueue(p)) BUG();
+       write_lock_irq(&tasklist_lock);
+       nr_threads--;
+       unhash_pid(p);
+       REMOVE_LINKS(p);
+       list_del(&p->thread_group);
+       write_unlock_irq(&tasklist_lock);
+ }
+
+ /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
+ static inline void task_lock(struct task_struct *p)
+ {
+       spin_lock(&p->alloc_lock);
+ }
+
+ static inline void task_unlock(struct task_struct *p)
+ {
+       spin_unlock(&p->alloc_lock);
+ }
+
+ /* write full pathname into buffer and return start of pathname */
+ static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
+                               char *buf, int buflen)
+ {
+       char *res;
+       struct vfsmount *rootmnt;
+       struct dentry *root;
+       read_lock(&current->fs->lock);
+       rootmnt = mntget(current->fs->rootmnt);
+       root = dget(current->fs->root);
+       read_unlock(&current->fs->lock);
+       spin_lock(&dcache_lock);
+       res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen);
+       spin_unlock(&dcache_lock);
+       dput(root);
+       mntput(rootmnt);
+       return res;
+ }
+
+ /* Manage the reference count of the context_info pointer */
+ void sys_release_s_info (struct task_struct *);
+ void sys_assign_s_info (struct task_struct *);
+ void sys_alloc_s_info (void);
+
+ #endif /* __KERNEL__ */
+
+ #endif
diff -rc2P linux/kernel/sysctl.c linux-2.4.13/kernel/sysctl.c
*** linux/kernel/sysctl.c       Fri Nov  9 16:15:08 2001
--- linux-2.4.13/kernel/sysctl.c        Fri Nov  9 16:58:00 2001
***************
*** 30,33 ****
--- 30,35 ----
 #include <linux/init.h>
 #include <linux/sysrq.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
 #include <linux/highuid.h>

***************
*** 303,306 ****
--- 305,316 ----
       {FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int),
        0644, NULL, &proc_dointvec},
+ #ifdef CONFIG_JBD_DEBUG
+       {FS_LEASE_TIME+1, "jbd-debug", &journal_enable_debug, sizeof (int),
+        0644, NULL, &proc_dointvec},
+ #endif
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       {FS_LEASE_TIME+2, "jbd-oom-retry", &journal_oom_retry, sizeof (int),
+        0644, NULL, &proc_dointvec},
+ #endif
       {0}
 };
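
Once these entries are registered, the knobs appear as /proc/sys/fs/jbd-debug and /proc/sys/fs/jbd-oom-retry, so "echo 2 > /proc/sys/fs/jbd-debug" raises the JBD debug level exactly as the CONFIG_JBD_DEBUG help text describes. As a sketch of the same mechanism under stated assumptions (all "my_" names are hypothetical, and FS_LEASE_TIME+3 merely continues the numbering used above), a module could export its own integer through the 2.4 sysctl API like this:

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sysctl.h>

static int my_debug;		/* appears as /proc/sys/fs/my-debug */

static ctl_table my_fs_table[] = {
	{FS_LEASE_TIME+3, "my-debug", &my_debug, sizeof(int),
	 0644, NULL, &proc_dointvec},
	{0}
};
static ctl_table my_root_table[] = {
	{CTL_FS, "fs", NULL, 0, 0555, my_fs_table},
	{0}
};
static struct ctl_table_header *my_header;

static int __init my_sysctl_init(void)
{
	my_header = register_sysctl_table(my_root_table, 0);
	return my_header ? 0 : -ENOMEM;
}

static void __exit my_sysctl_exit(void)
{
	unregister_sysctl_table(my_header);
}
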
diff -rc2P linux/mm/filemap.c linux-2.4.13/mm/filemap.c
*** linux/mm/filemap.c  Tue Oct 23 20:52:48 2001
--- linux-2.4.13/mm/filemap.c   Fri Nov  9 16:58:00 2001
***************
*** 201,211 ****
 }

 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
       memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-
       if (page->buffers)
!               block_flushpage(page, partial);
!
 }

--- 201,218 ----
 }

+ static int do_flushpage(struct page *page, unsigned long offset)
+ {
+       int (*flushpage) (struct page *, unsigned long);
+       flushpage = page->mapping->a_ops->flushpage;
+       if (flushpage)
+               return (*flushpage)(page, offset);
+       return block_flushpage(page, offset);
+ }
+
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
       memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
       if (page->buffers)
!               do_flushpage(page, partial);
 }

***************
*** 213,217 ****
 {
       /* Leave it on the LRU if it gets converted into anonymous buffers */
!       if (!page->buffers || block_flushpage(page, 0))
               lru_cache_del(page);

--- 220,224 ----
 {
       /* Leave it on the LRU if it gets converted into anonymous buffers */
!       if (!page->buffers || do_flushpage(page, 0))
               lru_cache_del(page);

***************
*** 1119,1122 ****
--- 1126,1130 ----
 }

+
 /*
  * Mark a page as having seen activity.
***************
*** 2817,2821 ****
       err = written ? written : status;
 out:
-
       up(&inode->i_sem);
       return err;
--- 2825,2828 ----
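
do_flushpage() above adds one level of indirection: if the page's mapping supplies its own flushpage operation, truncation goes through it instead of the generic block_flushpage(). That is what lets a journalled filesystem keep its journal consistent across truncates. A hedged sketch of how such a hook could be wired up (the myfs_* names are illustrative, not part of this patch; the flushpage slot in address_space_operations is added elsewhere in the patch):

#include <linux/fs.h>
#include <linux/mm.h>

/* A journalling filesystem would tell its journal that these buffers
 * are being invalidated before falling back to the generic path. */
static int myfs_flushpage(struct page *page, unsigned long offset)
{
	/* ... journal-specific bookkeeping would go here ... */
	return block_flushpage(page, offset);
}

static struct address_space_operations myfs_aops = {
	/* readpage, writepage, etc. omitted */
	flushpage:	myfs_flushpage,
};
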
diff -rc2P linux/mm/memory.c linux-2.4.13/mm/memory.c
*** linux/mm/memory.c   Mon Oct 15 15:09:50 2001
--- linux-2.4.13/mm/memory.c    Fri Nov  9 16:58:00 2001
***************
*** 1243,1250 ****
       struct page * new_page;
       pte_t entry;
!
       if (!vma->vm_ops || !vma->vm_ops->nopage)
               return do_anonymous_page(mm, vma, page_table, write_access, address);
       spin_unlock(&mm->page_table_lock);

       /*
--- 1243,1256 ----
       struct page * new_page;
       pte_t entry;
!       int ret;
!       struct inode *inode = NULL;
!
       if (!vma->vm_ops || !vma->vm_ops->nopage)
               return do_anonymous_page(mm, vma, page_table, write_access, address);
       spin_unlock(&mm->page_table_lock);
+       if (vma->vm_file && vma->vm_file->f_dentry)
+               inode = vma->vm_file->f_dentry->d_inode;
+       if (inode)
+               down_read(&inode->i_truncate_sem);

       /*
***************
*** 1256,1263 ****

       spin_lock(&mm->page_table_lock);
!       if (new_page == NULL)   /* no page was available -- SIGBUS */
!               return 0;
!       if (new_page == NOPAGE_OOM)
!               return -1;
       /*
        * This silly early PAGE_DIRTY setting removes a race
--- 1262,1275 ----

       spin_lock(&mm->page_table_lock);
!       if (new_page == NULL) { /* no page was available -- SIGBUS */
!               ret = 0;
!               goto out;
!       }
!
!       if (new_page == NOPAGE_OOM) {
!               ret =  -1;
!               goto out;
!       }
!
       /*
        * This silly early PAGE_DIRTY setting removes a race
***************
*** 1285,1294 ****
               /* One of our sibling threads was faster, back out. */
               page_cache_release(new_page);
!               return 1;
       }

       /* no need to invalidate: a not-present page shouldn't be cached */
       update_mmu_cache(vma, address, entry);
!       return 2;       /* Major fault */
 }

--- 1297,1311 ----
               /* One of our sibling threads was faster, back out. */
               page_cache_release(new_page);
!               ret = 1;
!               goto out;
       }

       /* no need to invalidate: a not-present page shouldn't be cached */
       update_mmu_cache(vma, address, entry);
!       ret = 2;        /* Major fault */
! out:
!       if (inode)
!               up_read(&inode->i_truncate_sem);
!       return ret;
 }
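
The fault path now holds the inode's i_truncate_sem for reading across the ->nopage call, so a concurrent truncate cannot free the page between the moment ->nopage returns it and the moment it is mapped. The counterpart, sketched below on the assumption that i_truncate_sem is the rw_semaphore this patch adds to struct inode, is that the truncate path takes the same semaphore for writing:

#include <linux/fs.h>
#include <linux/mm.h>

/* Writer side (sketch): exclude all faults while the file shrinks. */
static void truncate_pages_sketch(struct inode *inode, loff_t offset)
{
	down_write(&inode->i_truncate_sem);
	truncate_inode_pages(inode->i_mapping, offset);
	up_write(&inode->i_truncate_sem);
}
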

diff -rc2P linux/mm/vmscan.c linux-2.4.13/mm/vmscan.c
*** linux/mm/vmscan.c   Wed Oct 24 00:48:55 2001
--- linux-2.4.13/mm/vmscan.c    Fri Nov  9 16:58:00 2001
***************
*** 8,12 ****
  *  Removed kswapd_ctl limits, and swap out as many pages as needed
  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
-  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  *  Zone aware kswapd started 02/00, Kanoj Sarcar ([email protected]).
  *  Multiqueue VM started 5.8.00, Rik van Riel.
--- 8,11 ----
***************
*** 415,419 ****
                       page_cache_get(page);

!                       if (try_to_free_buffers(page, gfp_mask)) {
                               if (!page->mapping) {
                                       /*
--- 414,418 ----
                       page_cache_get(page);

!                       if (try_to_release_page(page, gfp_mask)) {
                               if (!page->mapping) {
                                       /*
***************
*** 436,440 ****
                                       /*
                                        * The page is still in pagecache so undo the stuff
!                                        * before the try_to_free_buffers since we've not
                                        * finished and we can now try the next step.
                                        */
--- 435,439 ----
                                       /*
                                        * The page is still in pagecache so undo the stuff
!                                        * before the try_to_release_page since we've not
                                        * finished and we can now try the next step.
                                        */
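
try_to_release_page() itself is defined elsewhere in this patch; from the call sites here its rough shape can be reconstructed as a dispatcher that gives the owning filesystem first refusal on freeing a page's buffers (so ext3 can refuse while the journal still needs them) and otherwise falls back to the old behaviour. A sketch along those lines, not copied from the patch:

#include <linux/fs.h>
#include <linux/mm.h>

int try_to_release_page(struct page *page, int gfp_mask)
{
	if (!PageLocked(page))
		BUG();

	/* Let the owning filesystem decide, if it cares... */
	if (page->mapping && page->mapping->a_ops->releasepage)
		return page->mapping->a_ops->releasepage(page, gfp_mask);
	/* ...otherwise fall back to generic buffer stripping. */
	return try_to_free_buffers(page, gfp_mask);
}
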