diff -rc2P linux/Documentation/Configure.help linux-2.4.13/Documentation/Configure.help
*** linux/Documentation/Configure.help  Sat Oct 20 22:17:19 2001
--- linux-2.4.13/Documentation/Configure.help   Fri Nov  9 16:58:00 2001
***************
*** 12059,12062 ****
--- 12059,12132 ----
   wants to say Y here.

+ Ext3 journaling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+   This is the journaling version of the Second extended file system
+   (often called ext3), the de facto standard Linux file system
+   (method to organize files on a storage device) for hard disks.
+
+   The journaling code included in this driver means you do not have
+   to run e2fsck (file system checker) on your file systems after a
+   crash.  The journal keeps track of any changes that were being made
+   at the time the system crashed, and can ensure that your file system
+   is consistent without the need for a lengthy check.
+
+   Other than adding the journal to the filesystem, the on-disk format of
+   ext3 is identical to ext2.  It is possible to freely switch between
+   using the ext3 driver and the ext2 driver, as long as the filesystem
+   has been cleanly unmounted, or e2fsck is run on the filesystem.
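+
+   For example, a cleanly unmounted ext3 partition can be mounted as
+   plain ext2 with "mount -t ext2 /dev/hda1 /mnt"; the journal is then
+   simply ignored.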
+
+   To add a journal on an existing ext2 filesystem or change the behavior
+   of ext3 file systems, you can use the tune2fs utility ("man tune2fs").
+   To modify attributes of files and directories on ext3 file systems,
+   use chattr ("man chattr").  You need to be using e2fsprogs version
+   1.20 or later in order to create ext3 journals (available at
+   <http://sourceforge.net/projects/e2fsprogs/>).
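+
+   For example, "tune2fs -j /dev/hda1" adds a journal to the existing
+   ext2 filesystem on /dev/hda1, after which it can be mounted as ext3.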
+
+   If you want to compile this file system as a module ( = code which
+   can be inserted in and removed from the running kernel whenever you
+   want), say M here and read Documentation/modules.txt. The module
+   will be called ext3.o. Be aware however that the file system of your
+   root partition (the one containing the directory /) cannot be
+   compiled as a module, and so this may be dangerous.
+
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+   This is a generic journaling layer for block devices. It is currently
+   used by the ext3 file system, but it could also be used to add journal
+   support to other file systems or block devices such as RAID or LVM.
+
+   If you are using the ext3 filesystem, you need to say Y here. If you
+   are not using ext3 then you will probably want to say N.
+
+   If you want to compile this device as a module ( = code which can be
+   inserted in and removed from the running kernel whenever you want),
+   say M here and read Documentation/modules.txt. The module will be called
+   jbd.o. If you are compiling ext3 into the kernel, you cannot compile
+   this code as a module.
+
+ JBD (ext3) debugging support
+ CONFIG_JBD_DEBUG
+   If you are using the ext3 journaled file system (or potentially any
+   other file system/device using JBD), this option allows you to enable
+   debugging output while the system is running, in order to help track
+   down any problems you are having.  By default the debugging output
+   will be turned off.
+
+   If you select Y here, then you will be able to turn on debugging with
+   "echo N > /proc/sys/fs/jbd-debug", where N is a number between 1 and 5,
+   the higher the number, the more debugging output is generated.  To turn
+   debugging off again, do "echo 0 > /proc/sys/fs/jbd-debug".
+
+ Buffer Head tracing (DEBUG)
+ CONFIG_BUFFER_DEBUG
+   If you are a kernel developer working with file systems or in the block
+   device layer, this buffer head tracing may help you to track down bugs
+   in your code.  This enables some debugging macros (BUFFER_TRACE, etc)
+   which allow you to track the state of a buffer through various layers
+   of code.  The debugging code is used primarily by ext3 and JBD code.
+
+   Because this option adds considerably to the size of each buffer
+   head, most people will want to say N here.
+
 BFS file system support (EXPERIMENTAL)
 CONFIG_BFS_FS
diff -rc2P linux/drivers/block/ll_rw_blk.c linux-2.4.13/drivers/block/ll_rw_blk.c
*** linux/drivers/block/ll_rw_blk.c     Sat Oct 13 13:30:30 2001
--- linux-2.4.13/drivers/block/ll_rw_blk.c      Fri Nov  9 16:58:00 2001
***************
*** 672,677 ****
          down by us so at this point flushpage will block and
          won't clear the mapped bit under us. */
!       if (!buffer_mapped(bh))
               BUG();

       /*
--- 672,679 ----
          down by us so at this point flushpage will block and
          won't clear the mapped bit under us. */
!       if (!buffer_mapped(bh)) {
!               print_buffer_trace(bh);
               BUG();
+       }

       /*
***************
*** 1007,1013 ****
               switch(rw) {
               case WRITE:
!                       if (!atomic_set_buffer_clean(bh))
                               /* Hmmph! Nothing to write */
                               goto end_io;
                       __mark_buffer_clean(bh);
                       break;
--- 1009,1018 ----
               switch(rw) {
               case WRITE:
!                       if (!atomic_set_buffer_clean(bh)) {
!                               BUFFER_TRACE(bh, "already clean");
                               /* Hmmph! Nothing to write */
                               goto end_io;
+                       }
+                       BUFFER_TRACE(bh, "set clean, write underway");
                       __mark_buffer_clean(bh);
                       break;
***************
*** 1032,1037 ****
 sorry:
       /* Make sure we don't get infinite dirty retries.. */
!       for (i = 0; i < nr; i++)
               mark_buffer_clean(bhs[i]);
 }

--- 1037,1044 ----
 sorry:
       /* Make sure we don't get infinite dirty retries.. */
!       for (i = 0; i < nr; i++) {
!               BUFFER_TRACE(bhs[i], "sorry");
               mark_buffer_clean(bhs[i]);
+       }
 }

***************
*** 1133,1136 ****
--- 1140,1144 ----
               queue_nr_requests = 128;

+
       /*
        * Batch frees according to queue length
diff -rc2P linux/drivers/block/loop.c linux-2.4.13/drivers/block/loop.c
*** linux/drivers/block/loop.c  Mon Oct 15 21:53:51 2001
--- linux-2.4.13/drivers/block/loop.c   Fri Nov  9 16:58:00 2001
***************
*** 187,190 ****
--- 187,192 ----
       while (len > 0) {
               int IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize;
+               int transfer_result;
+
               size = PAGE_CACHE_SIZE - offset;
               if (size > len)
***************
*** 198,205 ****
               kaddr = page_address(page);
               flush_dcache_page(page);
!               if (lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV))
!                       goto write_fail;
               if (aops->commit_write(file, page, offset, offset+size))
                       goto unlock;
               data += size;
               len -= size;
--- 200,216 ----
               kaddr = page_address(page);
               flush_dcache_page(page);
!               transfer_result = lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV);
!               if (transfer_result) {
!                       /*
!                        * The transfer failed, but we still write the data to
!                        * keep prepare/commit calls balanced.
!                        */
!                       printk(KERN_ERR "loop: transfer error block %ld\n", index);
!                       memset(kaddr + offset, 0, size);
!               }
               if (aops->commit_write(file, page, offset, offset+size))
                       goto unlock;
+               if (transfer_result)
+                       goto unlock;
               data += size;
               len -= size;
***************
*** 213,220 ****
       return 0;

- write_fail:
-       printk(KERN_ERR "loop: transfer error block %ld\n", index);
-       ClearPageUptodate(page);
-       kunmap(page);
 unlock:
       UnlockPage(page);
--- 224,227 ----
diff -rc2P linux/drivers/ide/ide-disk.c linux-2.4.13/drivers/ide/ide-disk.c
*** linux/drivers/ide/ide-disk.c        Thu Oct 11 12:14:32 2001
--- linux-2.4.13/drivers/ide/ide-disk.c Fri Nov  9 16:58:00 2001
***************
*** 368,371 ****
--- 368,392 ----
 static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
 {
+ #ifdef CONFIG_JBD_DEBUG
+       /*
+        * Silently stop writing to this disk to simulate a crash.
+        */
+       extern int journal_no_write[2];
+       int i;
+
+       if (rq->cmd != WRITE)
+               goto write_ok;
+
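+       /*
+        * A journal_no_write slot is armed when its high 16 bits carry
+        * the 0xdead tag; the low 16 bits then name the device whose
+        * writes get dropped.
+        */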
+       for (i = 0; i < 2; i++) {
+               if ((journal_no_write[i] & 0xdead0000) == 0xdead0000) {
+                       if (rq->rq_dev == (journal_no_write[i] & 0xffff)) {
+                               ide_end_request(1, HWGROUP(drive));
+                               return ide_stopped;
+                       }
+               }
+       }
+ write_ok:
+       ;
+ #endif
       if (IDE_CONTROL_REG)
               OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
diff -rc2P linux/fs/Config.in linux-2.4.13/fs/Config.in
*** linux/fs/Config.in  Thu Oct  4 18:13:18 2001
--- linux-2.4.13/fs/Config.in   Fri Nov  9 16:57:59 2001
***************
*** 21,24 ****
--- 21,32 ----
 dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL

+ tristate 'Ext3 journaling file system support (EXPERIMENTAL)' CONFIG_EXT3_FS
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+ define_bool CONFIG_JBD $CONFIG_EXT3_FS
+ dep_mbool '  JBD (ext3) debugging support' CONFIG_JBD_DEBUG $CONFIG_JBD
+ bool 'Buffer Head tracing (DEBUG)' CONFIG_BUFFER_DEBUG
+
 # msdos file systems
 tristate 'DOS FAT fs support' CONFIG_FAT_FS
diff -rc2P linux/fs/Makefile linux-2.4.13/fs/Makefile
*** linux/fs/Makefile   Thu Oct  4 18:13:18 2001
--- linux-2.4.13/fs/Makefile    Fri Nov  9 16:58:00 2001
***************
*** 8,12 ****
 O_TARGET := fs.o

! export-objs :=        filesystems.o open.o dcache.o
 mod-subdirs :=        nls

--- 8,12 ----
 O_TARGET := fs.o

! export-objs :=        filesystems.o open.o dcache.o buffer.o jbd-kernel.o
 mod-subdirs :=        nls

***************
*** 15,19 ****
               fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
               dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
!               filesystems.o namespace.o

 ifeq ($(CONFIG_QUOTA),y)
--- 15,19 ----
               fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
               dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
!               filesystems.o namespace.o jbd-kernel.o

 ifeq ($(CONFIG_QUOTA),y)
***************
*** 27,30 ****
--- 27,32 ----

 # Do not add any filesystems before this line
+ subdir-$(CONFIG_EXT3_FS)      += ext3    # Before ext2 so root fs can be ext3
+ subdir-$(CONFIG_JBD)          += jbd
 subdir-$(CONFIG_EXT2_FS)      += ext2
 subdir-$(CONFIG_CRAMFS)               += cramfs
diff -rc2P linux/fs/buffer.c linux-2.4.13/fs/buffer.c
*** linux/fs/buffer.c   Tue Oct 23 20:54:19 2001
--- linux-2.4.13/fs/buffer.c    Fri Nov  9 16:57:59 2001
***************
*** 46,49 ****
--- 46,51 ----
 #include <linux/iobuf.h>
 #include <linux/highmem.h>
+ #include <linux/jbd.h>
+ #include <linux/module.h>
 #include <linux/completion.h>

***************
*** 614,619 ****
    by the user.

!    Thus invalidate_buffers in general usage is not allwowed to trash dirty
!    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.

    NOTE: In the case where the user removed a removable-media-disk even if
--- 616,625 ----
    by the user.

!    Thus invalidate_buffers in general usage is not allowed to trash
!    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
!    be preserved.  These buffers are simply skipped.
!
!    We also skip buffers which are still in use.  For example this can
!    happen if a userspace program is reading the block device.

    NOTE: In the case where the user removed a removable-media-disk even if
***************
*** 718,721 ****
--- 724,728 ----
       bh->b_end_io = handler;
       bh->b_private = private;
+       buffer_trace_init(&bh->b_history);
 }

***************
*** 727,730 ****
--- 734,738 ----
       struct page *page;

+       BUFFER_TRACE(bh, "enter");
       mark_buffer_uptodate(bh, uptodate);

***************
*** 1093,1096 ****
--- 1101,1110 ----
 }

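+ /*
+  * Restart the flush timer on a buffer: bdflush will not consider the
+  * buffer for writeback until the configured buffer age has elapsed
+  * again.
+  */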
+ void set_buffer_flushtime(struct buffer_head *bh)
+ {
+       bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+ }
+ EXPORT_SYMBOL(set_buffer_flushtime);
+
 /*
  * A buffer may need to be moved from one buffer list to another
***************
*** 1100,1103 ****
--- 1114,1120 ----
 {
       int dispose = BUF_CLEAN;
+
+       BUFFER_TRACE(bh, "enter");
+
       if (buffer_locked(bh))
               dispose = BUF_LOCKED;
***************
*** 1111,1114 ****
--- 1128,1132 ----
               __insert_into_lru_list(bh, dispose);
       }
+       BUFFER_TRACE(bh, "exit");
 }

***************
*** 1125,1128 ****
--- 1143,1147 ----
 void __brelse(struct buffer_head * buf)
 {
+       BUFFER_TRACE(buf, "entry");
       if (atomic_read(&buf->b_count)) {
               put_bh(buf);
***************
*** 1138,1141 ****
--- 1157,1161 ----
 void __bforget(struct buffer_head * buf)
 {
+       BUFFER_TRACE(buf, "enter");
       mark_buffer_clean(buf);
       __brelse(buf);
***************
*** 1168,1175 ****
  * Note: the caller should wake up the buffer_wait list if needed.
  */
! static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
 {
       if (bh->b_inode)
               BUG();
       if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
               kmem_cache_free(bh_cachep, bh);
--- 1188,1207 ----
  * Note: the caller should wake up the buffer_wait list if needed.
  */
! static void __put_unused_buffer_head(struct buffer_head * bh)
 {
       if (bh->b_inode)
               BUG();
+
+       J_ASSERT_BH(bh, bh->b_prev_free == 0);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       if (buffer_jbd(bh)) {
+               J_ASSERT_BH(bh, bh2jh(bh)->b_transaction == 0);
+               J_ASSERT_BH(bh, bh2jh(bh)->b_next_transaction == 0);
+               J_ASSERT_BH(bh, bh2jh(bh)->b_frozen_data == 0);
+               J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data == 0);
+       }
+ #endif
+       buffer_trace_init(&bh->b_history);
+
       if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
               kmem_cache_free(bh_cachep, bh);
***************
*** 1185,1188 ****
--- 1217,1228 ----
 }

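+ /*
+  * Locked wrapper around __put_unused_buffer_head(), exported so that
+  * code outside buffer.c (such as a modular JBD) can release buffer
+  * heads obtained from get_unused_buffer_head().
+  */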
+ void put_unused_buffer_head(struct buffer_head *bh)
+ {
+       spin_lock(&unused_list_lock);
+       __put_unused_buffer_head(bh);
+       spin_unlock(&unused_list_lock);
+ }
+ EXPORT_SYMBOL(put_unused_buffer_head);
+
 /*
  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
***************
*** 1190,1194 ****
  * buffer heads is now handled in create_buffers().
  */
! static struct buffer_head * get_unused_buffer_head(int async)
 {
       struct buffer_head * bh;
--- 1230,1234 ----
  * buffer heads is now handled in create_buffers().
  */
! struct buffer_head * get_unused_buffer_head(int async)
 {
       struct buffer_head * bh;
***************
*** 1211,1214 ****
--- 1251,1255 ----
               bh->b_blocknr = -1;
               bh->b_this_page = NULL;
+               buffer_trace_init(&bh->b_history);
               return bh;
       }
***************
*** 1224,1227 ****
--- 1265,1269 ----
                       nr_unused_buffer_heads--;
                       spin_unlock(&unused_list_lock);
+                       buffer_trace_init(&bh->b_history);
                       return bh;
               }
***************
*** 1231,1234 ****
--- 1273,1277 ----
       return NULL;
 }
+ EXPORT_SYMBOL(get_unused_buffer_head);

 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
***************
*** 1245,1248 ****
--- 1288,1292 ----
               bh->b_data = page_address(page) + offset;
 }
+ EXPORT_SYMBOL(set_bh_page);

 /*
***************
*** 1328,1331 ****
--- 1372,1376 ----
 {
       if (buffer_mapped(bh)) {
+               BUFFER_TRACE(bh, "entry");
               mark_buffer_clean(bh);
               lock_buffer(bh);
***************
*** 1338,1341 ****
--- 1383,1411 ----
 }

+ /**
+  * try_to_release_page - release old fs-specific metadata on a page
+  * @page: the locked page to be released
+  * @gfp_mask: memory allocation flags
+  */
+
+ int try_to_release_page(struct page * page, int gfp_mask)
+ {
+       if (!PageLocked(page))
+               BUG();
+
+       if (!page->mapping)
+               goto try_to_free;
+       if (!page->mapping->a_ops->releasepage)
+               goto try_to_free;
+       if (page->mapping->a_ops->releasepage(page, gfp_mask))
+               goto try_to_free;
+       /*
+        * We couldn't release buffer metadata; don't even bother trying
+        * to release buffers.
+        */
+       return 0;
+ try_to_free:
+       return try_to_free_buffers(page, gfp_mask);
+ }
+
 /*
  * We don't have to release all buffers here, but
***************
*** 1381,1385 ****
        */
       if (!offset) {
!               if (!try_to_free_buffers(page, 0))
                       return 0;
       }
--- 1451,1455 ----
        */
       if (!offset) {
!               if (!try_to_release_page(page, 0))
                       return 0;
       }
***************
*** 1409,1412 ****
--- 1479,1483 ----
       page_cache_get(page);
 }
+ EXPORT_SYMBOL(create_empty_buffers);

 /*
***************
*** 1427,1431 ****
--- 1498,1505 ----

       old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+       J_ASSERT_BH(bh, old_bh != bh);
       if (old_bh) {
+               BUFFER_TRACE(old_bh, "old_bh - entry");
+               J_ASSERT_BH(old_bh, !buffer_jlist_eq(old_bh, BJ_Metadata));
               mark_buffer_clean(old_bh);
               wait_on_buffer(old_bh);
***************
*** 1449,1454 ****

 /*
!  * block_write_full_page() is SMP-safe - currently it's still
!  * being called with the kernel lock held, but the code is ready.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
--- 1523,1527 ----

 /*
!  * block_write_full_page() is SMP threaded - the kernel lock is not held.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
***************
*** 1484,1489 ****
                       if (err)
                               goto out;
!                       if (buffer_new(bh))
                               unmap_underlying_metadata(bh);
               }
               bh = bh->b_this_page;
--- 1557,1564 ----
                       if (err)
                               goto out;
!                       if (buffer_new(bh)) {
!                               BUFFER_TRACE(bh, "new: call unmap_underlying_metadata");
                               unmap_underlying_metadata(bh);
+                       }
               }
               bh = bh->b_this_page;
***************
*** 1493,1496 ****
--- 1568,1572 ----
       /* Stage 2: lock the buffers, mark them clean */
       do {
+               BUFFER_TRACE(bh, "lock it");
               lock_buffer(bh);
               set_buffer_async_io(bh);
***************
*** 1549,1554 ****
--- 1625,1632 ----
                               goto out;
                       if (buffer_new(bh)) {
+                               BUFFER_TRACE(bh, "new: call unmap_underlying_metadata");
                               unmap_underlying_metadata(bh);
                               if (Page_Uptodate(page)) {
+                                       BUFFER_TRACE(bh, "setting uptodate");
                                       set_bit(BH_Uptodate, &bh->b_state);
                                       continue;
***************
*** 1564,1567 ****
--- 1642,1646 ----
               }
               if (Page_Uptodate(page)) {
+                       BUFFER_TRACE(bh, "setting uptodate");
                       set_bit(BH_Uptodate, &bh->b_state);
                       continue;
***************
*** 1569,1572 ****
--- 1648,1652 ----
               if (!buffer_uptodate(bh) &&
                    (block_start < from || block_end > to)) {
+                       BUFFER_TRACE(bh, "reading");
                       ll_rw_block(READ, 1, &bh);
                       *wait_bh++=bh;
***************
*** 1607,1610 ****
--- 1687,1691 ----
                       set_bit(BH_Uptodate, &bh->b_state);
                       if (!atomic_set_buffer_dirty(bh)) {
+                               BUFFER_TRACE(bh, "mark dirty");
                               __mark_dirty(bh);
                               buffer_insert_inode_data_queue(bh, inode);
***************
*** 1890,1893 ****
--- 1971,1975 ----
       kunmap(page);

+       BUFFER_TRACE(bh, "zeroed end of block");
       __mark_buffer_dirty(bh);
       err = 0;
***************
*** 2447,2450 ****
--- 2529,2534 ----
       return 0;
 }
+ EXPORT_SYMBOL(try_to_free_buffers);
+ EXPORT_SYMBOL(buffermem_pages);

 /* ================== Debugging =================== */
diff -rc2P linux/fs/ext3/Makefile linux-2.4.13/fs/ext3/Makefile
*** linux/fs/ext3/Makefile      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/Makefile       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,16 ----
+ #
+ # Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+ # unless it's something special (ie not a .c file).
+ #
+ # Note 2! The CFLAGS definitions are now in the main makefile...
+
+ O_TARGET := ext3.o
+
+ obj-y    := acl.o balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+               ioctl.o namei.o super.o symlink.o
+ obj-m    := $(O_TARGET)
+
+ include $(TOPDIR)/Rules.make
diff -rc2P linux/fs/ext3/acl.c linux-2.4.13/fs/ext3/acl.c
*** linux/fs/ext3/acl.c Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/acl.c  Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,17 ----
+ /*
+  * linux/fs/ext3/acl.c
+  *
+  * Copyright (C) 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+
+
+ /*
+  * This file will contain the Access Control Lists management for the
+  * second extended file system.
+  */
diff -rc2P linux/fs/ext3/balloc.c linux-2.4.13/fs/ext3/balloc.c
*** linux/fs/ext3/balloc.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/balloc.c       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,995 ----
+ /*
+  *  linux/fs/ext3/balloc.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  Enhanced block allocation by Stephen Tweedie ([email protected]), 1993
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/config.h>
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+ /*
+  * balloc.c contains the blocks allocation and deallocation routines
+  */
+
+ /*
+  * The free blocks are managed by bitmaps.  A file system contains several
+  * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+  * block for inodes, N blocks for the inode table and data blocks.
+  *
+  * The file system contains group descriptors which are located after the
+  * super block.  Each descriptor contains the number of the bitmap block and
+  * the free blocks count in the block.  The descriptors are loaded in memory
+  * when a file system is mounted (see ext3_read_super).
+  */
+
+
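+ /* True if block b lies within [first, first + len - 1]. */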
+ #define in_range(b, first, len)       ((b) >= (first) && (b) <= (first) + (len) - 1)
+
+ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+                                            unsigned int block_group,
+                                            struct buffer_head ** bh)
+ {
+       unsigned long group_desc;
+       unsigned long desc;
+       struct ext3_group_desc * gdp;
+
+       if (block_group >= sb->u.ext3_sb.s_groups_count) {
+               ext3_error (sb, "ext3_get_group_desc",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+                           block_group, sb->u.ext3_sb.s_groups_count);
+
+               return NULL;
+       }
+
+       group_desc = block_group / EXT3_DESC_PER_BLOCK(sb);
+       desc = block_group % EXT3_DESC_PER_BLOCK(sb);
+       if (!sb->u.ext3_sb.s_group_desc[group_desc]) {
+               ext3_error (sb, "ext3_get_group_desc",
+                           "Group descriptor not loaded - "
+                           "block_group = %d, group_desc = %lu, desc = %lu",
+                            block_group, group_desc, desc);
+               return NULL;
+       }
+
+       gdp = (struct ext3_group_desc *)
+             sb->u.ext3_sb.s_group_desc[group_desc]->b_data;
+       if (bh)
+               *bh = sb->u.ext3_sb.s_group_desc[group_desc];
+       return gdp + desc;
+ }
+
+ /*
+  * Read the bitmap for a given block_group, reading into the specified
+  * slot in the superblock's bitmap cache.
+  *
+  * Return >=0 on success or a -ve error code.
+  */
+
+ static int read_block_bitmap (struct super_block * sb,
+                              unsigned int block_group,
+                              unsigned long bitmap_nr)
+ {
+       struct ext3_group_desc * gdp;
+       struct buffer_head * bh = NULL;
+       int retval = -EIO;
+
+       gdp = ext3_get_group_desc (sb, block_group, NULL);
+       if (!gdp)
+               goto error_out;
+       retval = 0;
+       bh = bread (sb->s_dev,
+                       le32_to_cpu(gdp->bg_block_bitmap), sb->s_blocksize);
+       if (!bh) {
+               ext3_error (sb, "read_block_bitmap",
+                           "Cannot read block bitmap - "
+                           "block_group = %d, block_bitmap = %lu",
+                           block_group, (unsigned long) gdp->bg_block_bitmap);
+               retval = -EIO;
+       }
+       /*
+        * On IO error, just leave a zero in the superblock's block pointer for
+        * this group.  The IO will be retried next time.
+        */
+ error_out:
+       sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group;
+       sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh;
+       return retval;
+ }
+
+ /*
+  * load_block_bitmap loads the block bitmap for a blocks group
+  *
+  * It maintains a cache for the last bitmaps loaded.  This cache is managed
+  * with a LRU algorithm.
+  *
+  * Notes:
+  * 1/ There is one cache per mounted file system.
+  * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
+  *    this function reads the bitmap without maintaining a LRU cache.
+  *
+  * Return the slot used to store the bitmap, or a -ve error code.
+  */
+ static int __load_block_bitmap (struct super_block * sb,
+                               unsigned int block_group)
+ {
+       int i, j, retval = 0;
+       unsigned long block_bitmap_number;
+       struct buffer_head * block_bitmap;
+
+       if (block_group >= sb->u.ext3_sb.s_groups_count)
+               ext3_panic (sb, "load_block_bitmap",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+                           block_group, sb->u.ext3_sb.s_groups_count);
+
+       if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) {
+               if (sb->u.ext3_sb.s_block_bitmap[block_group]) {
+                       if (sb->u.ext3_sb.s_block_bitmap_number[block_group] ==
+                           block_group)
+                               return block_group;
+                       ext3_error (sb, "__load_block_bitmap",
+                                   "block_group != block_bitmap_number");
+               }
+               retval = read_block_bitmap (sb, block_group, block_group);
+               if (retval < 0)
+                       return retval;
+               return block_group;
+       }
+
+       for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+                   sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++)
+               ;
+       if (i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+           sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) {
+               block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i];
+               block_bitmap = sb->u.ext3_sb.s_block_bitmap[i];
+               for (j = i; j > 0; j--) {
+                       sb->u.ext3_sb.s_block_bitmap_number[j] =
+                               sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+                       sb->u.ext3_sb.s_block_bitmap[j] =
+                               sb->u.ext3_sb.s_block_bitmap[j - 1];
+               }
+               sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number;
+               sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap;
+
+               /*
+                * There's still one special case here --- if block_bitmap == 0
+                * then our last attempt to read the bitmap failed and we have
+                * just ended up caching that failure.  Try again to read it.
+                */
+               if (!block_bitmap)
+                       retval = read_block_bitmap (sb, block_group, 0);
+       } else {
+               if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED)
+                       sb->u.ext3_sb.s_loaded_block_bitmaps++;
+               else
+                       brelse (sb->u.ext3_sb.s_block_bitmap
+                                       [EXT3_MAX_GROUP_LOADED - 1]);
+               for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1;
+                                       j > 0;  j--) {
+                       sb->u.ext3_sb.s_block_bitmap_number[j] =
+                               sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+                       sb->u.ext3_sb.s_block_bitmap[j] =
+                               sb->u.ext3_sb.s_block_bitmap[j - 1];
+               }
+               retval = read_block_bitmap (sb, block_group, 0);
+       }
+       return retval;
+ }
+
+ /*
+  * Load the block bitmap for a given block group.  First of all do a couple
+  * of fast lookups for common cases and then pass the request onto the guts
+  * of the bitmap loader.
+  *
+  * Return the slot number of the group in the superblock bitmap cache's on
+  * success, or a -ve error code.
+  *
+  * There is still one inconsistency here --- if the number of groups in this
+  * filesystem is <= EXT3_MAX_GROUP_LOADED, then we have no way of
+  * differentiating between a group for which we have never performed a bitmap
+  * IO request, and a group for which the last bitmap read request failed.
+  */
+ static inline int load_block_bitmap (struct super_block * sb,
+                                    unsigned int block_group)
+ {
+       int slot;
+
+       /*
+        * Do the lookup for the slot.  First of all, check if we're asking
+        * for the same slot as last time, and did we succeed that last time?
+        */
+       if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 &&
+           sb->u.ext3_sb.s_block_bitmap_number[0] == block_group &&
+           sb->u.ext3_sb.s_block_bitmap[0]) {
+               return 0;
+       }
+       /*
+        * Or can we do a fast lookup based on a loaded group on a filesystem
+        * small enough to be mapped directly into the superblock?
+        */
+       else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED &&
+                sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group
+                       && sb->u.ext3_sb.s_block_bitmap[block_group]) {
+               slot = block_group;
+       }
+       /*
+        * If not, then do a full lookup for this block group.
+        */
+       else {
+               slot = __load_block_bitmap (sb, block_group);
+       }
+
+       /*
+        * <0 means we just got an error
+        */
+       if (slot < 0)
+               return slot;
+
+       /*
+        * If it's a valid slot, we may still have cached a previous IO error,
+        * in which case the bh in the superblock cache will be zero.
+        */
+       if (!sb->u.ext3_sb.s_block_bitmap[slot])
+               return -EIO;
+
+       /*
+        * Must have been read in OK to get this far.
+        */
+       return slot;
+ }
+
+ /* Free given blocks, update quota and i_blocks field */
+ void ext3_free_blocks (handle_t *handle, struct inode * inode,
+                       unsigned long block, unsigned long count)
+ {
+       struct buffer_head *bitmap_bh;
+       struct buffer_head *gd_bh;
+       unsigned long block_group;
+       unsigned long bit;
+       unsigned long i;
+       int bitmap_nr;
+       unsigned long overflow;
+       struct super_block * sb;
+       struct ext3_group_desc * gdp;
+       struct ext3_super_block * es;
+       int err = 0, ret;
+       int dquot_freed_blocks = 0;
+
+       sb = inode->i_sb;
+       if (!sb) {
+               printk ("ext3_free_blocks: nonexistent device");
+               return;
+       }
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       if (block < le32_to_cpu(es->s_first_data_block) ||
+           (block + count) > le32_to_cpu(es->s_blocks_count)) {
+               ext3_error (sb, "ext3_free_blocks",
+                           "Freeing blocks not in datazone - "
+                           "block = %lu, count = %lu", block, count);
+               goto error_return;
+       }
+
+       ext3_debug ("freeing block %lu\n", block);
+
+ do_more:
+       overflow = 0;
+       block_group = (block - le32_to_cpu(es->s_first_data_block)) /
+                     EXT3_BLOCKS_PER_GROUP(sb);
+       bit = (block - le32_to_cpu(es->s_first_data_block)) %
+                     EXT3_BLOCKS_PER_GROUP(sb);
+       /*
+        * Check to see if we are freeing blocks across a group
+        * boundary.
+        */
+       if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
+               overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
+               count -= overflow;
+       }
+       bitmap_nr = load_block_bitmap (sb, block_group);
+       if (bitmap_nr < 0)
+               goto error_return;
+
+       bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+       gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
+       if (!gdp)
+               goto error_return;
+
+       if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
+           in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
+           in_range (block, le32_to_cpu(gdp->bg_inode_table),
+                     sb->u.ext3_sb.s_itb_per_group) ||
+           in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
+                     sb->u.ext3_sb.s_itb_per_group))
+               ext3_error (sb, "ext3_free_blocks",
+                           "Freeing blocks in system zones - "
+                           "Block = %lu, count = %lu",
+                           block, count);
+
+       /*
+        * We are about to start releasing blocks in the bitmap,
+        * so we need undo access.
+        */
+       /* @@@ check errors */
+       BUFFER_TRACE(bitmap_bh, "getting undo access");
+       err = ext3_journal_get_undo_access(handle, bitmap_bh);
+       if (err)
+               goto error_return;
+
+       /*
+        * We are about to modify some metadata.  Call the journal APIs
+        * to unshare ->b_data if a currently-committing transaction is
+        * using it
+        */
+       BUFFER_TRACE(gd_bh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, gd_bh);
+       if (err)
+               goto error_return;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (err)
+               goto error_return;
+
+       for (i = 0; i < count; i++) {
+               /*
+                * An HJ special.  This is expensive...
+                */
+ #ifdef CONFIG_JBD_DEBUG
+               {
+                       struct buffer_head *debug_bh;
+                       debug_bh = get_hash_table(sb->s_dev, block + i,
+                                                       sb->s_blocksize);
+                       if (debug_bh) {
+                               BUFFER_TRACE(debug_bh, "Deleted!");
+                               if (!bh2jh(bitmap_bh)->b_committed_data)
+                                       BUFFER_TRACE(debug_bh,
+                                               "No committed data in bitmap");
+                               BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
+                               __brelse(debug_bh);
+                       }
+               }
+ #endif
+               BUFFER_TRACE(bitmap_bh, "clear bit");
+               if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
+                       ext3_error (sb, __FUNCTION__,
+                                     "bit already cleared for block %lu",
+                                     block + i);
+                       BUFFER_TRACE(bitmap_bh, "bit already cleared");
+               } else {
+                       dquot_freed_blocks++;
+                       gdp->bg_free_blocks_count =
+                         cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
+                       es->s_free_blocks_count =
+                         cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
+               }
+               /* @@@ This prevents newly-allocated data from being
+                * freed and then reallocated within the same
+                * transaction.
+                *
+                * Ideally we would want to allow that to happen, but to
+                * do so requires making journal_forget() capable of
+                * revoking the queued write of a data block, which
+                * implies blocking on the journal lock.  *forget()
+                * cannot block due to truncate races.
+                *
+                * Eventually we can fix this by making journal_forget()
+                * return a status indicating whether or not it was able
+                * to revoke the buffer.  On successful revoke, it is
+                * safe not to set the allocation bit in the committed
+                * bitmap, because we know that there is no outstanding
+                * activity on the buffer any more and so it is safe to
+                * reallocate it.
+                */
+               BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
+               J_ASSERT_BH(bitmap_bh,
+                               bh2jh(bitmap_bh)->b_committed_data != NULL);
+               ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
+       }
+
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+
+       /* And the group descriptor block */
+       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+       ret = ext3_journal_dirty_metadata(handle, gd_bh);
+       if (!err) err = ret;
+
+       /* And the superblock */
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock");
+       ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       if (!err) err = ret;
+
+       if (overflow && !err) {
+               block += count;
+               count = overflow;
+               goto do_more;
+       }
+       sb->s_dirt = 1;
+ error_return:
+       ext3_std_error(sb, err);
+       unlock_super(sb);
+       if (dquot_freed_blocks)
+               DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+       return;
+ }
+
+ /* For ext3 allocations, we must not reuse any blocks which are
+  * allocated in the bitmap buffer's "last committed data" copy.  This
+  * prevents deletes from freeing up the page for reuse until we have
+  * committed the delete transaction.
+  *
+  * If we didn't do this, then deleting something and reallocating it as
+  * data would allow the old block to be overwritten before the
+  * transaction committed (because we force data to disk before commit).
+  * This would lead to corruption if we crashed between overwriting the
+  * data and committing the delete.
+  *
+  * @@@ We may want to make this allocation behaviour conditional on
+  * data-writes at some point, and disable it for metadata allocations or
+  * sync-data inodes.
+  */
+ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+ {
+       if (ext3_test_bit(nr, bh->b_data))
+               return 0;
+       if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
+               return 1;
+       return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
+ }
+
+ /*
+  * Find an allocatable block in a bitmap.  We honour both the bitmap and
+  * its last-committed copy (if that exists), and perform the "most
+  * appropriate allocation" algorithm of looking for a free block near
+  * the initial goal; then for a free byte somewhere in the bitmap; then
+  * for any free bit in the bitmap.
+  */
+ static int find_next_usable_block(int start,
+                       struct buffer_head *bh, int maxblocks)
+ {
+       int here, next;
+       char *p, *r;
+
+       if (start > 0) {
+               /*
+                * The goal was occupied; search forward for a free
+                * block within the next XX blocks.
+                *
+                * end_goal is more or less random, but it has to be
+                * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
+                * next 64-bit boundary is simple..
+                */
+               int end_goal = (start + 63) & ~63;
+               here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
+               if (here < end_goal && ext3_test_allocatable(here, bh))
+                       return here;
+
+               ext3_debug ("Bit not found near goal\n");
+
+       }
+
+       here = start;
+       if (here < 0)
+               here = 0;
+
+       /*
+        * There has been no free block found in the near vicinity of
+        * the goal: do a search forward through the block groups,
+        * searching in each group first for an entire free byte in the
+        * bitmap and then for any free bit.
+        *
+        * Search first in the remainder of the current group
+        */
+       p = ((char *) bh->b_data) + (here >> 3);
+       r = memscan(p, 0, (maxblocks - here + 7) >> 3);
+       next = (r - ((char *) bh->b_data)) << 3;
+
+       if (next < maxblocks && ext3_test_allocatable(next, bh))
+               return next;
+
+       /* The bitmap search --- search forward alternately
+        * through the actual bitmap and the last-committed copy
+        * until we find a bit free in both. */
+
+       while (here < maxblocks) {
+               next  = ext3_find_next_zero_bit ((unsigned long *) bh->b_data,
+                                                maxblocks, here);
+               if (next >= maxblocks)
+                       return -1;
+               if (ext3_test_allocatable(next, bh))
+                       return next;
+
+               J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
+               here = ext3_find_next_zero_bit
+                       ((unsigned long *) bh2jh(bh)->b_committed_data,
+                        maxblocks, next);
+       }
+       return -1;
+ }
+
+ /*
+  * ext3_new_block uses a goal block to assist allocation.  If the goal is
+  * free, or there is a free block within 32 blocks of the goal, that block
+  * is allocated.  Otherwise a forward search is made for a free block; within
+  * each block group the search first looks for an entire free byte in the block
+  * bitmap, and then for any free bit if that fails.
+  * This function also updates quota and i_blocks field.
+  */
+ int ext3_new_block (handle_t *handle, struct inode * inode,
+               unsigned long goal, u32 * prealloc_count,
+               u32 * prealloc_block, int * errp)
+ {
+       struct buffer_head * bh, *bhtmp;
+       struct buffer_head * bh2;
+ #if 0
+       char * p, * r;
+ #endif
+       int i, j, k, tmp, alloctmp;
+       int bitmap_nr;
+       int fatal = 0, err;
+       struct super_block * sb;
+       struct ext3_group_desc * gdp;
+       struct ext3_super_block * es;
+ #ifdef EXT3FS_DEBUG
+       static int goal_hits = 0, goal_attempts = 0;
+ #endif
+       *errp = -ENOSPC;
+       sb = inode->i_sb;
+       if (!sb) {
+               printk ("ext3_new_block: nonexistent device");
+               return 0;
+       }
+
+       /*
+        * Check quota for allocation of this block.
+        */
+       if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+               *errp = -EDQUOT;
+               return 0;
+       }
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       if (le32_to_cpu(es->s_free_blocks_count) <=
+                       le32_to_cpu(es->s_r_blocks_count) &&
+           ((sb->u.ext3_sb.s_resuid != current->fsuid) &&
+            (sb->u.ext3_sb.s_resgid == 0 ||
+             !in_group_p (sb->u.ext3_sb.s_resgid)) &&
+            !capable(CAP_SYS_RESOURCE)))
+               goto out;
+
+       ext3_debug ("goal=%lu.\n", goal);
+
+       /*
+        * First, test whether the goal block is free.
+        */
+       if (goal < le32_to_cpu(es->s_first_data_block) ||
+           goal >= le32_to_cpu(es->s_blocks_count))
+               goal = le32_to_cpu(es->s_first_data_block);
+       i = (goal - le32_to_cpu(es->s_first_data_block)) /
+                       EXT3_BLOCKS_PER_GROUP(sb);
+       gdp = ext3_get_group_desc (sb, i, &bh2);
+       if (!gdp)
+               goto io_error;
+
+       if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+               j = ((goal - le32_to_cpu(es->s_first_data_block)) %
+                               EXT3_BLOCKS_PER_GROUP(sb));
+ #ifdef EXT3FS_DEBUG
+               if (j)
+                       goal_attempts++;
+ #endif
+               bitmap_nr = load_block_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       goto io_error;
+
+               bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+
+               ext3_debug ("goal is at %d:%d.\n", i, j);
+
+               if (ext3_test_allocatable(j, bh)) {
+ #ifdef EXT3FS_DEBUG
+                       goal_hits++;
+                       ext3_debug ("goal bit allocated.\n");
+ #endif
+                       goto got_block;
+               }
+
+               j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb));
+               if (j >= 0)
+                       goto search_back;
+       }
+
+       ext3_debug ("Bit not found in block group %d.\n", i);
+
+       /*
+        * Now search the rest of the groups.  We assume that
+        * i and gdp correctly point to the last group visited.
+        */
+       for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) {
+               i++;
+               if (i >= sb->u.ext3_sb.s_groups_count)
+                       i = 0;
+               gdp = ext3_get_group_desc (sb, i, &bh2);
+               if (!gdp) {
+                       *errp = -EIO;
+                       goto out;
+               }
+               if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+                       bitmap_nr = load_block_bitmap (sb, i);
+                       if (bitmap_nr < 0)
+                               goto io_error;
+
+                       bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+                       j = find_next_usable_block(-1, bh,
+                                                  EXT3_BLOCKS_PER_GROUP(sb));
+                       if (j >= 0)
+                               goto search_back;
+               }
+       }
+
+       /* No space left on the device */
+       unlock_super (sb);
+       return 0;
+
+ search_back:
+       /*
+        * We have succeeded in finding a free byte in the block
+        * bitmap.  Now search backwards up to 7 bits to find the
+        * start of this group of free blocks.
+        */
+       for (   k = 0;
+               k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh);
+               k++, j--)
+               ;
+
+ got_block:
+
+       ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count);
+
+       /* Make sure we use undo access for the bitmap, because it is
+        * critical that we do the frozen_data COW on bitmap buffers in
+        * all cases even if the buffer is in BJ_Forget state in the
+        * committing transaction.  */
+       BUFFER_TRACE(bh, "get undo access for marking new block");
+       fatal = ext3_journal_get_undo_access(handle, bh);
+       if (fatal) goto out;
+
+       BUFFER_TRACE(bh2, "get_write_access");
+       fatal = ext3_journal_get_write_access(handle, bh2);
+       if (fatal) goto out;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (fatal) goto out;
+
+       tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb)
+                               + le32_to_cpu(es->s_first_data_block);
+
+       if (tmp == le32_to_cpu(gdp->bg_block_bitmap) ||
+           tmp == le32_to_cpu(gdp->bg_inode_bitmap) ||
+           in_range (tmp, le32_to_cpu(gdp->bg_inode_table),
+                     sb->u.ext3_sb.s_itb_per_group))
+               ext3_error (sb, "ext3_new_block",
+                           "Allocating block in system zone - "
+                           "block = %u", tmp);
+
+       /* The superblock lock should guard against anybody else beating
+        * us to this point! */
+       J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data));
+       BUFFER_TRACE(bh, "setting bitmap bit");
+       ext3_set_bit(j, bh->b_data);
+
+ #ifdef CONFIG_JBD_DEBUG
+       {
+               struct buffer_head *debug_bh;
+
+               /* Record bitmap buffer state in the newly allocated block */
+               debug_bh = get_hash_table(sb->s_dev, tmp, sb->s_blocksize);
+               if (debug_bh) {
+                       BUFFER_TRACE(debug_bh, "state when allocated");
+                       BUFFER_TRACE2(debug_bh, bh, "bitmap state");
+                       brelse(debug_bh);
+               }
+       }
+ #endif
+       if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data)
+               J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data));
+       bhtmp = bh;
+       alloctmp = j;
+
+       ext3_debug ("found bit %d\n", j);
+
+       /*
+        * Do block preallocation now if required.
+        */
+ #ifdef EXT3_PREALLOCATE
+       /*
+        * akpm: this is not enabled for ext3.  Need to use
+        * ext3_test_allocatable()
+        */
+       /* Writer: ->i_prealloc* */
+       if (prealloc_count && !*prealloc_count) {
+               int     prealloc_goal;
+               unsigned long next_block = tmp + 1;
+
+               prealloc_goal = es->s_prealloc_blocks ?
+                       es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS;
+
+               *prealloc_block = next_block;
+               /* Writer: end */
+               for (k = 1;
+                    k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb);
+                    k++, next_block++) {
+                       if (DQUOT_PREALLOC_BLOCK(inode, 1))
+                               break;
+                       /* Writer: ->i_prealloc* */
+                       if (*prealloc_block + *prealloc_count != next_block ||
+                           ext3_set_bit (j + k, bh->b_data)) {
+                               /* Writer: end */
+                               DQUOT_FREE_BLOCK(inode, 1);
+                               break;
+                       }
+                       (*prealloc_count)++;
+                       /* Writer: end */
+               }
+               /*
+                * As soon as we go for per-group spinlocks we'll need these
+                * done inside the loop above.
+                */
+               gdp->bg_free_blocks_count =
+                       cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
+                              (k - 1));
+               es->s_free_blocks_count =
+                       cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) -
+                              (k - 1));
+               ext3_debug ("Preallocated a further %lu bits.\n",
+                              (k - 1));
+       }
+ #endif
+
+       j = tmp;
+
+       BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block");
+       err = ext3_journal_dirty_metadata(handle, bh);
+       if (!fatal) fatal = err;
+
+       if (j >= le32_to_cpu(es->s_blocks_count)) {
+               ext3_error (sb, "ext3_new_block",
+                           "block(%d) >= blocks count(%d) - "
+                           "block_group = %d, es == %p ",j,
+                       le32_to_cpu(es->s_blocks_count), i, es);
+               goto out;
+       }
+
+       /*
+        * It is up to the caller to add the new buffer to a journal
+        * list of some description.  We don't know in advance whether
+        * the caller wants to use it as metadata or data.
+        */
+
+       ext3_debug ("allocating block %d. "
+                   "Goal hits %d of %d.\n", j, goal_hits, goal_attempts);
+
+       gdp->bg_free_blocks_count =
+                       cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
+       es->s_free_blocks_count =
+                       cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1);
+
+       BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor");
+       err = ext3_journal_dirty_metadata(handle, bh2);
+       if (!fatal) fatal = err;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "journal_dirty_metadata for superblock");
+       err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       if (!fatal) fatal = err;
+
+       sb->s_dirt = 1;
+       if (fatal)
+               goto out;
+
+       unlock_super (sb);
+       *errp = 0;
+       return j;
+
+ io_error:
+       *errp = -EIO;
+ out:
+       if (fatal) {
+               *errp = fatal;
+               ext3_std_error(sb, fatal);
+       }
+       unlock_super (sb);
+       return 0;
+
+ }
+
+ unsigned long ext3_count_free_blocks (struct super_block * sb)
+ {
+ #ifdef EXT3FS_DEBUG
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+               bitmap_nr = load_block_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr],
+                                    sb->s_blocksize);
+               printk ("group %d: stored = %d, counted = %lu\n",
+                       i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+               bitmap_count += x;
+       }
+       printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
+              le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+       unlock_super (sb);
+       return bitmap_count;
+ #else
+       return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count);
+ #endif
+ }
+
+ static inline int block_in_use (unsigned long block,
+                               struct super_block * sb,
+                               unsigned char * map)
+ {
+       return ext3_test_bit ((block -
+               le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) %
+                        EXT3_BLOCKS_PER_GROUP(sb), map);
+ }
+
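+ /*
+  * Return true if a is zero or an exact power of b (including b^0,
+  * i.e. a == 1).
+  */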
+ static inline int test_root(int a, int b)
+ {
+       if (a == 0)
+               return 1;
+       while (1) {
+               if (a == 1)
+                       return 1;
+               if (a % b)
+                       return 0;
+               a = a / b;
+       }
+ }
+
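+ /*
+  * With the sparse_super feature, backup superblocks and group
+  * descriptors are present only in groups 0, 1 and powers of 3, 5
+  * and 7.
+  */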
+ int ext3_group_sparse(int group)
+ {
+       return (test_root(group, 3) || test_root(group, 5) ||
+               test_root(group, 7));
+ }
+
+ /**
+  *    ext3_bg_has_super - number of blocks used by the superblock in group
+  *    @sb: superblock for filesystem
+  *    @group: group number to check
+  *
+  *    Return the number of blocks used by the superblock (primary or backup)
+  *    in this group.  Currently this will be only 0 or 1.
+  */
+ int ext3_bg_has_super(struct super_block *sb, int group)
+ {
+       if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
+           !ext3_group_sparse(group))
+               return 0;
+       return 1;
+ }
+
+ /**
+  *    ext3_bg_num_gdb - number of blocks used by the group table in group
+  *    @sb: superblock for filesystem
+  *    @group: group number to check
+  *
+  *    Return the number of blocks used by the group descriptor table
+  *    (primary or backup) in this group.  In the future there may be a
+  *    different number of descriptor blocks in each group.
+  */
+ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
+ {
+       if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
+           !ext3_group_sparse(group))
+               return 0;
+       return EXT3_SB(sb)->s_gdb_count;
+ }
+
+ #ifdef CONFIG_EXT3_CHECK
+ /* Called at mount-time, super-block is locked */
+ void ext3_check_blocks_bitmap (struct super_block * sb)
+ {
+       struct buffer_head * bh;
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x, j;
+       unsigned long desc_blocks;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+               bitmap_nr = load_block_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
+
+               if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data))
+                       ext3_error(sb, __FUNCTION__,
+                                  "Superblock in group %d is marked free", i);
+
+               desc_blocks = ext3_bg_num_gdb(sb, i);
+               for (j = 0; j < desc_blocks; j++)
+                       if (!ext3_test_bit(j + 1, bh->b_data))
+                               ext3_error(sb, __FUNCTION__,
+                                          "Descriptor block #%lu in group "
+                                          "%d is marked free", j, i);
+
+               if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
+                                               sb, bh->b_data))
+                       ext3_error (sb, "ext3_check_blocks_bitmap",
+                                   "Block bitmap for group %d is marked free",
+                                   i);
+
+               if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
+                                               sb, bh->b_data))
+                       ext3_error (sb, "ext3_check_blocks_bitmap",
+                                   "Inode bitmap for group %d is marked free",
+                                   i);
+
+               for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++)
+                       if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
+                                                       sb, bh->b_data))
+                               ext3_error (sb, "ext3_check_blocks_bitmap",
+                                           "Block #%lu of the inode table in "
+                                           "group %d is marked free", j, i);
+
+               x = ext3_count_free (bh, sb->s_blocksize);
+               if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
+                       ext3_error (sb, "ext3_check_blocks_bitmap",
+                                   "Wrong free blocks count for group %d, "
+                                   "stored = %d, counted = %lu", i,
+                                   le16_to_cpu(gdp->bg_free_blocks_count), x);
+               bitmap_count += x;
+       }
+       if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
+               ext3_error (sb, "ext3_check_blocks_bitmap",
+                       "Wrong free blocks count in super block, "
+                       "stored = %lu, counted = %lu",
+                       (unsigned long)le32_to_cpu(es->s_free_blocks_count),
+                       bitmap_count);
+ }
+ #endif
diff -rc2P linux/fs/ext3/bitmap.c linux-2.4.13/fs/ext3/bitmap.c
*** linux/fs/ext3/bitmap.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/bitmap.c       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,26 ----
+ /*
+  *  linux/fs/ext3/bitmap.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+
+ #include <linux/fs.h>
+
+
+ static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
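+ /*
+  * Each entry above is the number of zero bits in its 4-bit index,
+  * e.g. nibblemap[0x5] == 2 since 0101 has two zero bits.  A byte is
+  * counted a nibble at a time, so 0xa5 (1010 0101) contributes
+  * nibblemap[0x5] + nibblemap[0xa] = 2 + 2 = 4 free bits.
+  */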
+
+ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
+ {
+       unsigned int i;
+       unsigned long sum = 0;
+
+       if (!map)
+               return (0);
+       for (i = 0; i < numchars; i++)
+               sum += nibblemap[map->b_data[i] & 0xf] +
+                       nibblemap[(map->b_data[i] >> 4) & 0xf];
+       return (sum);
+ }
diff -rc2P linux/fs/ext3/dir.c linux-2.4.13/fs/ext3/dir.c
*** linux/fs/ext3/dir.c Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/dir.c  Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,190 ----
+ /*
+  *  linux/fs/ext3/dir.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/dir.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 directory handling functions
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+
+ static unsigned char ext3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
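+
+ /*
+  * The table above is indexed by the on-disk de->file_type value
+  * (EXT3_FT_UNKNOWN through EXT3_FT_SYMLINK) and yields the DT_*
+  * constant that readdir passes to filldir as d_type.
+  */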
+
+ static int ext3_readdir(struct file *, void *, filldir_t);
+
+ struct file_operations ext3_dir_operations = {
+       read:           generic_read_dir,
+       readdir:        ext3_readdir,           /* BKL held */
+       ioctl:          ext3_ioctl,             /* BKL held */
+       fsync:          ext3_sync_file,         /* BKL held */
+ };
+
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+                         struct ext3_dir_entry_2 * de,
+                         struct buffer_head * bh,
+                         unsigned long offset)
+ {
+       const char * error_msg = NULL;
+       const int rlen = le16_to_cpu(de->rec_len);
+
+       if (rlen < EXT3_DIR_REC_LEN(1))
+               error_msg = "rec_len is smaller than minimal";
+       else if (rlen % 4 != 0)
+               error_msg = "rec_len % 4 != 0";
+       else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+               error_msg = "rec_len is too small for name_len";
+       else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+               error_msg = "directory entry across blocks";
+       else if (le32_to_cpu(de->inode) >
+                       le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
+               error_msg = "inode out of bounds";
+
+       if (error_msg != NULL)
+               ext3_error (dir->i_sb, function,
+                       "bad entry in directory #%lu: %s - "
+                       "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+                       dir->i_ino, error_msg, offset,
+                       (unsigned long) le32_to_cpu(de->inode),
+                       rlen, de->name_len);
+       return error_msg == NULL ? 1 : 0;
+ }
+
+ static int ext3_readdir(struct file * filp,
+                        void * dirent, filldir_t filldir)
+ {
+       int error = 0;
+       unsigned long offset, blk;
+       int i, num, stored;
+       struct buffer_head * bh, * tmp, * bha[16];
+       struct ext3_dir_entry_2 * de;
+       struct super_block * sb;
+       int err;
+       struct inode *inode = filp->f_dentry->d_inode;
+
+       sb = inode->i_sb;
+
+       stored = 0;
+       bh = NULL;
+       offset = filp->f_pos & (sb->s_blocksize - 1);
+
+       while (!error && !stored && filp->f_pos < inode->i_size) {
+               blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
+               bh = ext3_bread (0, inode, blk, 0, &err);
+               if (!bh) {
+                       ext3_error (sb, "ext3_readdir",
+                               "directory #%lu contains a hole at offset %lu",
+                               inode->i_ino, (unsigned long)filp->f_pos);
+                       filp->f_pos += sb->s_blocksize - offset;
+                       continue;
+               }
+
+               /*
+                * Do the readahead
+                */
+               if (!offset) {
+                       for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
+                            i > 0; i--) {
+                               tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
+                               if (tmp && !buffer_uptodate(tmp) &&
+                                               !buffer_locked(tmp))
+                                       bha[num++] = tmp;
+                               else
+                                       brelse (tmp);
+                       }
+                       if (num) {
+                               ll_rw_block (READA, num, bha);
+                               for (i = 0; i < num; i++)
+                                       brelse (bha[i]);
+                       }
+               }
+
+ revalidate:
+               /* If the dir block has changed since the last call to
+                * readdir(2), then we might be pointing to an invalid
+                * dirent right now.  Scan from the start of the block
+                * to make sure. */
+               if (filp->f_version != inode->i_version) {
+                       for (i = 0; i < sb->s_blocksize && i < offset; ) {
+                               de = (struct ext3_dir_entry_2 *)
+                                       (bh->b_data + i);
+                               /* It's too expensive to do a full
+                                * dirent test each time round this
+                                * loop, but we do have to test at
+                                * least that it is non-zero.  A
+                                * failure will be detected in the
+                                * dirent test below. */
+                               if (le16_to_cpu(de->rec_len) <
+                                               EXT3_DIR_REC_LEN(1))
+                                       break;
+                               i += le16_to_cpu(de->rec_len);
+                       }
+                       offset = i;
+                       filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+                               | offset;
+                       filp->f_version = inode->i_version;
+               }
+
+               while (!error && filp->f_pos < inode->i_size
+                      && offset < sb->s_blocksize) {
+                       de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
+                       if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
+                                                  bh, offset)) {
+                               /* On error, skip the f_pos to the
+                                * next block. */
+                               filp->f_pos = (filp->f_pos |
+                                               (sb->s_blocksize - 1)) + 1;
+                               brelse (bh);
+                               return stored;
+                       }
+                       offset += le16_to_cpu(de->rec_len);
+                       if (le32_to_cpu(de->inode)) {
+                               /* We might block in the next section
+                                * if the data destination is
+                                * currently swapped out.  So, use a
+                                * version stamp to detect whether or
+                                * not the directory has been modified
+                                * during the copy operation.
+                                */
+                               unsigned long version = filp->f_version;
+                               unsigned char d_type = DT_UNKNOWN;
+
+                               if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+                                               EXT3_FEATURE_INCOMPAT_FILETYPE)
+                                               && de->file_type < EXT3_FT_MAX)
+                                       d_type =
+                                         ext3_filetype_table[de->file_type];
+                               error = filldir(dirent, de->name,
+                                               de->name_len,
+                                               filp->f_pos,
+                                               le32_to_cpu(de->inode),
+                                               d_type);
+                               if (error)
+                                       break;
+                               if (version != filp->f_version)
+                                       goto revalidate;
+                               stored ++;
+                       }
+                       filp->f_pos += le16_to_cpu(de->rec_len);
+               }
+               offset = 0;
+               brelse (bh);
+       }
+       UPDATE_ATIME(inode);
+       return 0;
+ }
diff -rc2P linux/fs/ext3/file.c linux-2.4.13/fs/ext3/file.c
*** linux/fs/ext3/file.c        Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/file.c Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,97 ----
+ /*
+  *  linux/fs/ext3/file.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/file.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 fs regular file handling primitives
+  *
+  *  64-bit file support on 64-bit platforms by Jakub Jelinek
+  *    ([email protected])
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+
+ /*
+  * Called when an inode is released. Note that this is different
+  * from ext3_file_open: open gets called at every open, but release
+  * gets called only when /all/ the files are closed.
+  */
+ static int ext3_release_file (struct inode * inode, struct file * filp)
+ {
+       if (filp->f_mode & FMODE_WRITE)
+               ext3_discard_prealloc (inode);
+       return 0;
+ }
+
+ /*
+  * Called when an inode is about to be opened.
+  * We use this to disallow opening RW large files on 32bit systems if
+  * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
+  * on this flag in sys_open.
+  */
+ static int ext3_open_file (struct inode * inode, struct file * filp)
+ {
+       if (!(filp->f_flags & O_LARGEFILE) &&
+           inode->i_size > 0x7FFFFFFFLL)
+               return -EFBIG;
+       return 0;
+ }
+
+ /*
+  * ext3_file_write().
+  *
+  * Most things are done in ext3_prepare_write() and ext3_commit_write().
+  */
+
+ static ssize_t
+ ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
+ {
+       int ret;
+       struct inode *inode = file->f_dentry->d_inode;
+
+       ret = generic_file_write(file, buf, count, ppos);
+       if ((ret >= 0) && IS_SYNC(inode)) {
+               if (file->f_flags & O_SYNC) {
+                       /*
+                        * generic_osync_inode() has already done the sync
+                        */
+               } else {
+                       int ret2 = ext3_force_commit(inode->i_sb);
+                       if (ret2)
+                               ret = ret2;
+               }
+       }
+       return ret;
+ }
+
+ struct file_operations ext3_file_operations = {
+       llseek:         generic_file_llseek,    /* BKL held */
+       read:           generic_file_read,      /* BKL not held.  Don't need */
+       write:          ext3_file_write,        /* BKL not held.  Don't need */
+       ioctl:          ext3_ioctl,             /* BKL held */
+       mmap:           generic_file_mmap,
+       open:           ext3_open_file,         /* BKL not held.  Don't need */
+       release:        ext3_release_file,      /* BKL not held.  Don't need */
+       fsync:          ext3_sync_file,         /* BKL held */
+ };
+
+ struct inode_operations ext3_file_inode_operations = {
+       truncate:       ext3_truncate,          /* BKL held */
+       setattr:        ext3_setattr,           /* BKL held */
+ };
+
diff -rc2P linux/fs/ext3/fsync.c linux-2.4.13/fs/ext3/fsync.c
*** linux/fs/ext3/fsync.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/fsync.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,69 ----
+ /*
+  *  linux/fs/ext3/fsync.c
+  *
+  *  Copyright (C) 1993  Stephen Tweedie ([email protected])
+  *  from
+  *  Copyright (C) 1992  Remy Card ([email protected])
+  *                      Laboratoire MASI - Institut Blaise Pascal
+  *                      Universite Pierre et Marie Curie (Paris VI)
+  *  from
+  *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3fs fsync primitive
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  *
+  *  Removed unnecessary code duplication for little endian machines
+  *  and excessive __inline__s.
+  *        Andi Kleen, 1997
+  *
+  * Major simplifications and cleanup - we only need to do the metadata, because
+  * we can depend on generic_block_fdatasync() to sync the data blocks.
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+
+ /*
+  * akpm: A new design for ext3_sync_file().
+  *
+  * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+  * There cannot be a transaction open by this task. (AKPM: quotas?)
+  * Another task could have dirtied this inode.  Its data can be in any
+  * state in the journalling system.
+  *
+  * What we do is just kick off a commit and wait on it.  This will snapshot the
+  * inode to disk.
+  *
+  * Note that there is a serious optimisation we can make here: if the current
+  * inode is not part of j_running_transaction or j_committing_transaction
+  * then we have nothing to do.  That would require implementation of t_ilist,
+  * which isn't too hard.
+  */
+
+ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
+ {
+       struct inode *inode = dentry->d_inode;
+       int ret;
+
+       J_ASSERT(ext3_journal_current_handle() == 0);
+
+       /*
+        * fsync_inode_buffers() just walks i_dirty_buffers and waits
+        * on them.  It's a no-op for full data journalling because
+        * i_dirty_buffers will be empty.
+        * Really, we only need to start I/O on the dirty buffers -
+        * we'll end up waiting on them in commit.
+        */
+       ret = fsync_inode_buffers(inode);
+
+       ext3_force_commit(inode->i_sb);
+
+       return ret;
+ }
diff -rc2P linux/fs/ext3/ialloc.c linux-2.4.13/fs/ext3/ialloc.c
*** linux/fs/ext3/ialloc.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/ialloc.c       Fri Nov  9 17:03:46 2001
***************
*** 0 ****
--- 1,664 ----
+ /*
+  *  linux/fs/ext3/ialloc.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  BSD ufs-inspired inode and directory allocation by
+  *  Stephen Tweedie ([email protected]), 1993
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+ #include <asm/bitops.h>
+ #include <asm/byteorder.h>
+
+ /*
+  * ialloc.c contains the inodes allocation and deallocation routines
+  */
+
+ /*
+  * The free inodes are managed by bitmaps.  A file system contains several
+  * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+  * block for inodes, N blocks for the inode table and data blocks.
+  *
+  * The file system contains group descriptors which are located after the
+  * super block.  Each descriptor contains the number of the bitmap block and
+  * the free blocks count in the block.  The descriptors are loaded in memory
+  * when a file system is mounted (see ext3_read_super).
+  */
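+
+ /*
+  * Schematically, each block group is laid out as below; the superblock
+  * and group-descriptor copies appear only in the groups selected by
+  * ext3_bg_has_super():
+  *
+  *  +-------+-------------+--------------+--------------+-------------+------+
+  *  | super | group descs | block bitmap | inode bitmap | inode table | data |
+  *  +-------+-------------+--------------+--------------+-------------+------+
+  */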
+
+
+ /*
+  * Read the inode allocation bitmap for a given block_group, reading
+  * into the specified slot in the superblock's bitmap cache.
+  *
+  * Return >=0 on success or a -ve error code.
+  */
+ static int read_inode_bitmap (struct super_block * sb,
+                              unsigned long block_group,
+                              unsigned int bitmap_nr)
+ {
+       struct ext3_group_desc * gdp;
+       struct buffer_head * bh = NULL;
+       int retval = 0;
+
+       gdp = ext3_get_group_desc (sb, block_group, NULL);
+       if (!gdp) {
+               retval = -EIO;
+               goto error_out;
+       }
+       bh = bread (sb->s_dev,
+                       le32_to_cpu(gdp->bg_inode_bitmap), sb->s_blocksize);
+       if (!bh) {
+               ext3_error (sb, "read_inode_bitmap",
+                           "Cannot read inode bitmap - "
+                           "block_group = %lu, inode_bitmap = %lu",
+                           block_group,
+                           (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap));
+               retval = -EIO;
+       }
+       /*
+        * On IO error, just leave a zero in the superblock's block pointer for
+        * this group.  The IO will be retried next time.
+        */
+ error_out:
+       sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group;
+       sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh;
+       return retval;
+ }
+
+ /*
+  * load_inode_bitmap loads the inode bitmap for a blocks group
+  *
+  * It maintains a cache for the last bitmaps loaded.  This cache is managed
+  * with a LRU algorithm.
+  *
+  * Notes:
+  * 1/ There is one cache per mounted file system.
+  * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
+  *    this function reads the bitmap without maintaining a LRU cache.
+  *
+  * Return the slot used to store the bitmap, or a -ve error code.
+  */
+ static int load_inode_bitmap (struct super_block * sb,
+                             unsigned int block_group)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long inode_bitmap_number;
+       struct buffer_head * inode_bitmap;
+       int i, j, retval = 0;
+
+       if (block_group >= sbi->s_groups_count)
+               ext3_panic (sb, "load_inode_bitmap",
+                           "block_group >= groups_count - "
+                           "block_group = %d, groups_count = %lu",
+                           block_group, sbi->s_groups_count);
+       if (sbi->s_loaded_inode_bitmaps > 0 &&
+           sbi->s_inode_bitmap_number[0] == block_group &&
+           sbi->s_inode_bitmap[0] != NULL)
+               return 0;
+       if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) {
+               if (sbi->s_inode_bitmap[block_group]) {
+                       if (sbi->s_inode_bitmap_number[block_group] !=
+                                               block_group)
+                               ext3_panic(sb, "load_inode_bitmap",
+                                       "block_group != inode_bitmap_number");
+                       return block_group;
+               }
+               retval = read_inode_bitmap(sb, block_group, block_group);
+               if (retval < 0)
+                       return retval;
+               return block_group;
+       }
+
+       for (i = 0; i < sbi->s_loaded_inode_bitmaps &&
+                   sbi->s_inode_bitmap_number[i] != block_group; i++)
+               /* do nothing */;
+       if (i < sbi->s_loaded_inode_bitmaps &&
+           sbi->s_inode_bitmap_number[i] == block_group) {
+               inode_bitmap_number = sbi->s_inode_bitmap_number[i];
+               inode_bitmap = sbi->s_inode_bitmap[i];
+               for (j = i; j > 0; j--) {
+                       sbi->s_inode_bitmap_number[j] =
+                               sbi->s_inode_bitmap_number[j - 1];
+                       sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
+               }
+               sbi->s_inode_bitmap_number[0] = inode_bitmap_number;
+               sbi->s_inode_bitmap[0] = inode_bitmap;
+
+               /*
+                * There's still one special case here --- if inode_bitmap == 0
+                * then our last attempt to read the bitmap failed and we have
+                * just ended up caching that failure.  Try again to read it.
+                */
+               if (!inode_bitmap)
+                       retval = read_inode_bitmap (sb, block_group, 0);
+       } else {
+               if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED)
+                       sbi->s_loaded_inode_bitmaps++;
+               else
+                       brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]);
+               for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) {
+                       sbi->s_inode_bitmap_number[j] =
+                               sbi->s_inode_bitmap_number[j - 1];
+                       sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
+               }
+               retval = read_inode_bitmap (sb, block_group, 0);
+       }
+       return retval;
+ }
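+
+ /*
+  * For illustration: once there are more than EXT3_MAX_GROUP_LOADED
+  * groups, the cache above is kept in MRU order with slot 0 the most
+  * recently used.  Loading groups 7, then 2, leaves the slots as
+  * [2, 7, ...]; a second lookup of group 7 finds it mid-array and
+  * rotates it back to slot 0 without any further I/O.
+  */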
+
+ /*
+  * NOTE! When we get the inode, we're the only people
+  * that have access to it, and as such there are no
+  * race conditions we have to worry about. The inode
+  * is not on the hash-lists, and it cannot be reached
+  * through the filesystem because the directory entry
+  * has been deleted earlier.
+  *
+  * HOWEVER: we must make sure that we get no aliases,
+  * which means that we have to call "clear_inode()"
+  * _before_ we mark the inode not in use in the inode
+  * bitmaps. Otherwise a newly created file might use
+  * the same inode number (not actually the same pointer
+  * though), and then we'd have two inodes sharing the
+  * same inode number and space on the hard disk.
+  */
+ void ext3_free_inode (handle_t *handle, struct inode * inode)
+ {
+       struct super_block * sb = inode->i_sb;
+       int is_directory;
+       unsigned long ino;
+       struct buffer_head * bh;
+       struct buffer_head * bh2;
+       unsigned long block_group;
+       unsigned long bit;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       struct ext3_super_block * es;
+       int fatal = 0, err;
+
+       if (!inode->i_dev) {
+               printk ("ext3_free_inode: inode has no device\n");
+               return;
+       }
+       if (atomic_read(&inode->i_count) > 1) {
+               printk ("ext3_free_inode: inode has count=%d\n",
+                                       atomic_read(&inode->i_count));
+               return;
+       }
+       if (inode->i_nlink) {
+               printk ("ext3_free_inode: inode has nlink=%d\n",
+                       inode->i_nlink);
+               return;
+       }
+       if (!sb) {
+               printk("ext3_free_inode: inode on nonexistent device\n");
+               return;
+       }
+
+       ino = inode->i_ino;
+       ext3_debug ("freeing inode %lu\n", ino);
+
+       /*
+        * Note: we must free any quota before locking the superblock,
+        * as writing the quota to disk may need the lock as well.
+        */
+       DQUOT_INIT(inode);
+       DQUOT_FREE_INODE(inode);
+       DQUOT_DROP(inode);
+
+       is_directory = S_ISDIR(inode->i_mode);
+
+       /* Do this BEFORE marking the inode not in use or returning an error */
+       clear_inode (inode);
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+               ext3_error (sb, "ext3_free_inode",
+                           "reserved or nonexistent inode %lu", ino);
+               goto error_return;
+       }
+       block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+       bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+       bitmap_nr = load_inode_bitmap (sb, block_group);
+       if (bitmap_nr < 0)
+               goto error_return;
+
+       bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+
+       BUFFER_TRACE(bh, "get_write_access");
+       fatal = ext3_journal_get_write_access(handle, bh);
+       if (fatal)
+               goto error_return;
+
+       /* Ok, now we can actually update the inode bitmaps.. */
+       if (!ext3_clear_bit (bit, bh->b_data))
+               ext3_error (sb, "ext3_free_inode",
+                             "bit already cleared for inode %lu", ino);
+       else {
+               gdp = ext3_get_group_desc (sb, block_group, &bh2);
+
+               BUFFER_TRACE(bh2, "get_write_access");
+               fatal = ext3_journal_get_write_access(handle, bh2);
+               if (fatal) goto error_return;
+
+               BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
+               fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+               if (fatal) goto error_return;
+
+               if (gdp) {
+                       gdp->bg_free_inodes_count = cpu_to_le16(
+                               le16_to_cpu(gdp->bg_free_inodes_count) + 1);
+                       if (is_directory)
+                               gdp->bg_used_dirs_count = cpu_to_le16(
+                                 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+               }
+               BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, bh2);
+               if (!fatal) fatal = err;
+               es->s_free_inodes_count =
+                       cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
+               BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
+                                       "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+               if (!fatal) fatal = err;
+       }
+       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+       err = ext3_journal_dirty_metadata(handle, bh);
+       if (!fatal)
+               fatal = err;
+       sb->s_dirt = 1;
+ error_return:
+       ext3_std_error(sb, fatal);
+       unlock_super(sb);
+ }
+
+ /*
+  * There are two policies for allocating an inode.  If the new inode is
+  * a directory, then a forward search is made for a block group with both
+  * free space and a low directory-to-inode ratio; if that fails, then of
+  * the groups with above-average free space, the one with the fewest
+  * directories is chosen.
+  *
+  * For other inodes, search forward from the parent directory's block
+  * group to find a free inode.
+  */
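+
+ /*
+  * For illustration, suppose a filesystem has 3 groups and 30 free
+  * inodes in total, so the per-group average is 10.  If the groups
+  * have (free inodes, free blocks) of (12, 100), (4, 500) and
+  * (14, 300), the directory loop below places a new directory in the
+  * third group: of the two groups at or above the average free-inode
+  * count, it is the one with more free blocks.
+  */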
+ struct inode * ext3_new_inode (handle_t *handle,
+                               const struct inode * dir, int mode)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh;
+       struct buffer_head * bh2;
+       int i, j, avefreei;
+       struct inode * inode;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       struct ext3_group_desc * tmp;
+       struct ext3_super_block * es;
+       int err = 0;
+
+       /* Cannot create files in a deleted directory */
+       if (!dir || !dir->i_nlink)
+               return ERR_PTR(-EPERM);
+
+       sb = dir->i_sb;
+       inode = new_inode(sb);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       init_rwsem(&inode->u.ext3_i.truncate_sem);
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+ repeat:
+       gdp = NULL;
+       i = 0;
+
+       if (S_ISDIR(mode)) {
+               avefreei = le32_to_cpu(es->s_free_inodes_count) /
+                       sb->u.ext3_sb.s_groups_count;
+               if (!gdp) {
+                       for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
+                               struct buffer_head *temp_buffer;
+                               tmp = ext3_get_group_desc (sb, j, &temp_buffer);
+                               if (tmp &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count) &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count) >=
+                                                       avefreei) {
+                                       if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) >
+                                               le16_to_cpu(gdp->bg_free_blocks_count))) {
+                                               i = j;
+                                               gdp = tmp;
+                                               bh2 = temp_buffer;
+                                       }
+                               }
+                       }
+               }
+       } else {
+               /*
+                * Try to place the inode in its parent directory
+                */
+               i = dir->u.ext3_i.i_block_group;
+               tmp = ext3_get_group_desc (sb, i, &bh2);
+               if (tmp && le16_to_cpu(tmp->bg_free_inodes_count))
+                       gdp = tmp;
+               else
+               {
+                       /*
+                        * Use a quadratic hash to find a group with a
+                        * free inode
+                        */
+                       for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
+                               i += j;
+                               if (i >= sb->u.ext3_sb.s_groups_count)
+                                       i -= sb->u.ext3_sb.s_groups_count;
+                               tmp = ext3_get_group_desc (sb, i, &bh2);
+                               if (tmp &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count)) {
+                                       gdp = tmp;
+                                       break;
+                               }
+                       }
+               }
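+               /*
+                * For illustration: starting from the parent's group g,
+                * the quadratic hash above probes groups g+1, g+3, g+7,
+                * g+15, ... (i advances by 1, 2, 4, 8, ... with
+                * wrap-around), spreading a few probes across the
+                * whole filesystem.
+                */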
+               if (!gdp) {
+                       /*
+                        * That failed: try linear search for a free inode
+                        */
+                       i = dir->u.ext3_i.i_block_group + 1;
+                       for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
+                               if (++i >= sb->u.ext3_sb.s_groups_count)
+                                       i = 0;
+                               tmp = ext3_get_group_desc (sb, i, &bh2);
+                               if (tmp &&
+                                   le16_to_cpu(tmp->bg_free_inodes_count)) {
+                                       gdp = tmp;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       err = -ENOSPC;
+       if (!gdp)
+               goto fail;
+
+       err = -EIO;
+       bitmap_nr = load_inode_bitmap (sb, i);
+       if (bitmap_nr < 0)
+               goto fail;
+
+       bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+
+       if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data,
+                                     EXT3_INODES_PER_GROUP(sb))) <
+           EXT3_INODES_PER_GROUP(sb)) {
+               BUFFER_TRACE(bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err) goto fail;
+
+               if (ext3_set_bit (j, bh->b_data)) {
+                       ext3_error (sb, "ext3_new_inode",
+                                     "bit already set for inode %d", j);
+                       goto repeat;
+               }
+               BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, bh);
+               if (err) goto fail;
+       } else {
+               if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) {
+                       ext3_error (sb, "ext3_new_inode",
+                                   "Free inodes count corrupted in group %d",
+                                   i);
+                       /* Is it really ENOSPC? */
+                       err = -ENOSPC;
+                       if (sb->s_flags & MS_RDONLY)
+                               goto fail;
+
+                       BUFFER_TRACE(bh2, "get_write_access");
+                       err = ext3_journal_get_write_access(handle, bh2);
+                       if (err) goto fail;
+                       gdp->bg_free_inodes_count = 0;
+                       BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+                       err = ext3_journal_dirty_metadata(handle, bh2);
+                       if (err) goto fail;
+               }
+               goto repeat;
+       }
+       j += i * EXT3_INODES_PER_GROUP(sb) + 1;
+       if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
+               ext3_error (sb, "ext3_new_inode",
+                           "reserved inode or inode > inodes count - "
+                           "block_group = %d, inode = %d", i, j);
+               err = -EIO;
+               goto fail;
+       }
+
+       BUFFER_TRACE(bh2, "get_write_access");
+       err = ext3_journal_get_write_access(handle, bh2);
+       if (err) goto fail;
+       gdp->bg_free_inodes_count =
+               cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+       if (S_ISDIR(mode))
+               gdp->bg_used_dirs_count =
+                       cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+       BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+       err = ext3_journal_dirty_metadata(handle, bh2);
+       if (err) goto fail;
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (err) goto fail;
+       es->s_free_inodes_count =
+               cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
+       err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       sb->s_dirt = 1;
+       if (err) goto fail;
+
+       inode->i_uid = current->fsuid;
+       if (test_opt (sb, GRPID))
+               inode->i_gid = dir->i_gid;
+       else if (dir->i_mode & S_ISGID) {
+               inode->i_gid = dir->i_gid;
+               if (S_ISDIR(mode))
+                       mode |= S_ISGID;
+       } else
+               inode->i_gid = current->fsgid;
+       inode->i_mode = mode;
+
+       inode->i_ino = j;
+       /* This is the optimal IO size (for stat), not the fs block size */
+       inode->i_blksize = PAGE_SIZE;
+       inode->i_blocks = 0;
+       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+       inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
+       if (S_ISLNK(mode))
+               inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FILE_FL |
+                       EXT3_IMMUTABLE_LINK_FL | EXT3_APPEND_FL);
+ #ifdef EXT3_FRAGMENTS
+       inode->u.ext3_i.i_faddr = 0;
+       inode->u.ext3_i.i_frag_no = 0;
+       inode->u.ext3_i.i_frag_size = 0;
+ #endif
+       inode->u.ext3_i.i_file_acl = 0;
+       inode->u.ext3_i.i_dir_acl = 0;
+       inode->u.ext3_i.i_dtime = 0;
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+ #ifdef EXT3_PREALLOCATE
+       inode->u.ext3_i.i_prealloc_count = 0;
+ #endif
+       inode->u.ext3_i.i_block_group = i;
+
+       if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
+               inode->i_flags |= S_SYNC;
+       if (IS_SYNC(inode))
+               handle->h_sync = 1;
+       insert_inode_hash(inode);
+       inode->i_generation = event++;
+
+       inode->u.ext3_i.i_state = EXT3_STATE_NEW;
+       err = ext3_mark_inode_dirty(handle, inode);
+       if (err) goto fail;
+
+       unlock_super (sb);
+       if (DQUOT_ALLOC_INODE(inode)) {
+               DQUOT_DROP(inode);
+               inode->i_flags |= S_NOQUOTA;
+               inode->i_nlink = 0;
+               iput(inode);
+               return ERR_PTR(-EDQUOT);
+       }
+       ext3_debug ("allocating inode %lu\n", inode->i_ino);
+       return inode;
+
+ fail:
+       unlock_super(sb);
+       iput(inode);
+       ext3_std_error(sb, err);
+       return ERR_PTR(err);
+ }
+
+ /* Verify that we are loading a valid orphan from disk */
+ struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
+ {
+       ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
+       unsigned long block_group;
+       int bit;
+       int bitmap_nr;
+       struct buffer_head *bh;
+       struct inode *inode = NULL;
+
+       /* Error cases - e2fsck has already cleaned up for us */
+       if (ino > max_ino) {
+               ext3_warning(sb, __FUNCTION__,
+                            "bad orphan ino %ld!  e2fsck was run?\n", ino);
+               return NULL;
+       }
+
+       block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+       bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+       if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
+           !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
+               ext3_warning(sb, __FUNCTION__,
+                            "inode bitmap error for orphan %ld\n", ino);
+               return NULL;
+       }
+
+       /* Having the inode bit set should be a 100% indicator that this
+        * is a valid orphan (no e2fsck run on fs).  Orphans also include
+        * inodes that were being truncated, so we can't check i_nlink==0.
+        */
+       if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
+           is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
+               ext3_warning(sb, __FUNCTION__,
+                            "bad orphan inode %ld!  e2fsck was run?\n", ino);
+               printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
+                      bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
+               printk(KERN_NOTICE "inode=%p\n", inode);
+               if (inode) {
+                       printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+                              is_bad_inode(inode));
+                       printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n",
+                              NEXT_ORPHAN(inode));
+                       printk(KERN_NOTICE "max_ino=%ld\n", max_ino);
+               }
+               /* Avoid freeing blocks if we got a bad deleted inode */
+               if (inode && inode->i_nlink == 0)
+                       inode->i_blocks = 0;
+               iput(inode);
+               return NULL;
+       }
+
+       return inode;
+ }
+
+ unsigned long ext3_count_free_inodes (struct super_block * sb)
+ {
+ #ifdef EXT3FS_DEBUG
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       lock_super (sb);
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+               bitmap_nr = load_inode_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+                                    EXT3_INODES_PER_GROUP(sb) / 8);
+               printk ("group %d: stored = %d, counted = %lu\n",
+                       i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+               bitmap_count += x;
+       }
+       printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
+               le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+       unlock_super (sb);
+       return desc_count;
+ #else
+       return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
+ #endif
+ }
+
+ #ifdef CONFIG_EXT3_CHECK
+ /* Called at mount-time, super-block is locked */
+ void ext3_check_inodes_bitmap (struct super_block * sb)
+ {
+       struct ext3_super_block * es;
+       unsigned long desc_count, bitmap_count, x;
+       int bitmap_nr;
+       struct ext3_group_desc * gdp;
+       int i;
+
+       es = sb->u.ext3_sb.s_es;
+       desc_count = 0;
+       bitmap_count = 0;
+       gdp = NULL;
+       for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+               gdp = ext3_get_group_desc (sb, i, NULL);
+               if (!gdp)
+                       continue;
+               desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+               bitmap_nr = load_inode_bitmap (sb, i);
+               if (bitmap_nr < 0)
+                       continue;
+
+               x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+                                    EXT3_INODES_PER_GROUP(sb) / 8);
+               if (le16_to_cpu(gdp->bg_free_inodes_count) != x)
+                       ext3_error (sb, "ext3_check_inodes_bitmap",
+                                   "Wrong free inodes count in group %d, "
+                                   "stored = %d, counted = %lu", i,
+                                   le16_to_cpu(gdp->bg_free_inodes_count), x);
+               bitmap_count += x;
+       }
+       if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
+               ext3_error (sb, "ext3_check_inodes_bitmap",
+                           "Wrong free inodes count in super block, "
+                           "stored = %lu, counted = %lu",
+                           (unsigned long)le32_to_cpu(es->s_free_inodes_count),
+                           bitmap_count);
+ }
+ #endif
diff -rc2P linux/fs/ext3/inode.c linux-2.4.13/fs/ext3/inode.c
*** linux/fs/ext3/inode.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/inode.c        Fri Nov  9 17:03:19 2001
***************
*** 0 ****
--- 1,2676 ----
+ /*
+  *  linux/fs/ext3/inode.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/inode.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Goal-directed block allocation by Stephen Tweedie
+  *    ([email protected]), 1993, 1998
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  *  64-bit file support on 64-bit platforms by Jakub Jelinek
+  *    ([email protected])
+  *
+  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
+  */
+
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/jbd.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+ #include <linux/module.h>
+
+
+ /*
+  * SEARCH_FROM_ZERO forces each block allocation to search from the start
+  * of the filesystem.  This is to force rapid reallocation of recently-freed
+  * blocks.  The file fragmentation is horrendous.
+  */
+ #undef SEARCH_FROM_ZERO
+
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (e.g. indirect blocks) must be
+  * revoked in all cases.
+  *
+  * "bh" may be NULL: a metadata block may have been freed from memory
+  * but there may still be a record of it in the journal, and that record
+  * still needs to be revoked.
+  */
+
+ static int ext3_forget(handle_t *handle, int is_metadata,
+                      struct inode *inode, struct buffer_head *bh,
+                      int blocknr)
+ {
+       int err;
+
+       BUFFER_TRACE(bh, "enter");
+
+       jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+                 "data mode %lx\n",
+                 bh, is_metadata, inode->i_mode,
+                 test_opt(inode->i_sb, DATA_FLAGS));
+
+       /* Never use the revoke function if we are doing full data
+        * journaling: there is no need to, and a V1 superblock won't
+        * support it.  Otherwise, only skip the revoke on un-journaled
+        * data blocks. */
+
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
+           (!is_metadata && !ext3_should_journal_data(inode))) {
+               if (bh) {
+                       BUFFER_TRACE(bh, "call journal_forget");
+                       ext3_journal_forget(handle, bh);
+               }
+               return 0;
+       }
+
+       /*
+        * data!=journal && (is_metadata || should_journal_data(inode))
+        */
+       BUFFER_TRACE(bh, "call ext3_journal_revoke");
+       err = ext3_journal_revoke(handle, blocknr, bh);
+       if (err)
+               ext3_abort(inode->i_sb, __FUNCTION__,
+                          "error %d when attempting revoke", err);
+       BUFFER_TRACE(bh, "exit");
+       return err;
+ }
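+
+ /*
+  * To summarise the cases above: under data=journal every block is
+  * simply forgotten; otherwise metadata, and data that was journaled
+  * for this inode, must be revoked so that a later journal replay
+  * cannot resurrect the freed block, while un-journaled data blocks
+  * only need journal_forget.
+  */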
+
+ /*
+  * Truncate transactions can be complex and absolutely huge.  So we need to
+  * be able to restart the transaction at a convenient checkpoint to make
+  * sure we don't overflow the journal.
+  *
+  * start_transaction gets us a new handle for a truncate transaction,
+  * and extend_transaction tries to extend the existing one a bit.  If
+  * extend fails, we need to propagate the failure up and restart the
+  * transaction in the top-level truncate loop. --sct
+  */
+
+ static handle_t *start_transaction(struct inode *inode)
+ {
+       long needed;
+       handle_t *result;
+
+       needed = inode->i_blocks;
+       if (needed > EXT3_MAX_TRANS_DATA)
+               needed = EXT3_MAX_TRANS_DATA;
+
+       result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
+       if (!IS_ERR(result))
+               return result;
+
+       ext3_std_error(inode->i_sb, PTR_ERR(result));
+       return result;
+ }
+
+ /*
+  * Try to extend this transaction for the purposes of truncation.
+  *
+  * Returns 0 if we managed to create more room.  If we can't create more
+  * room, and the transaction must be restarted we return 1.
+  */
+ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+ {
+       long needed;
+
+       if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+               return 0;
+       needed = inode->i_blocks;
+       if (needed > EXT3_MAX_TRANS_DATA)
+               needed = EXT3_MAX_TRANS_DATA;
+       if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
+               return 0;
+       return 1;
+ }
+
+ /*
+  * Restart the transaction associated with *handle.  This does a commit,
+  * so before we call here everything must be consistently dirtied against
+  * this transaction.
+  */
+ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+ {
+       long needed = inode->i_blocks;
+       if (needed > EXT3_MAX_TRANS_DATA)
+               needed = EXT3_MAX_TRANS_DATA;
+       jbd_debug(2, "restarting handle %p\n", handle);
+       return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed);
+ }
+
+ /*
+  * Called at each iput()
+  */
+ void ext3_put_inode (struct inode * inode)
+ {
+       ext3_discard_prealloc (inode);
+ }
+
+ /*
+  * Called at the last iput() if i_nlink is zero.
+  */
+ void ext3_delete_inode (struct inode * inode)
+ {
+       handle_t *handle;
+
+       if (is_bad_inode(inode) ||
+           inode->i_ino == EXT3_ACL_IDX_INO ||
+           inode->i_ino == EXT3_ACL_DATA_INO)
+               goto no_delete;
+
+       lock_kernel();
+       handle = start_transaction(inode);
+       if (IS_ERR(handle)) {
+               /* If we're going to skip the normal cleanup, we still
+                * need to make sure that the in-core orphan linked list
+                * is properly cleaned up. */
+               ext3_orphan_del(NULL, inode);
+
+               ext3_std_error(inode->i_sb, PTR_ERR(handle));
+               unlock_kernel();
+               goto no_delete;
+       }
+
+       if (IS_SYNC(inode))
+               handle->h_sync = 1;
+       inode->i_size = 0;
+       if (inode->i_blocks)
+               ext3_truncate(inode);
+       /*
+        * Kill off the orphan record which ext3_truncate created.
+        * AKPM: I think this can be inside the above `if'.
+        * Note that ext3_orphan_del() has to be able to cope with the
+        * deletion of a non-existent orphan - this is because we don't
+        * know if ext3_truncate() actually created an orphan record.
+        * (Well, we could do this if we need to, but heck - it works)
+        */
+       ext3_orphan_del(handle, inode);
+       inode->u.ext3_i.i_dtime = CURRENT_TIME;
+
+       /*
+        * One subtle ordering requirement: if anything has gone wrong
+        * (transaction abort, IO errors, whatever), then we can still
+        * do these next steps (the fs will already have been marked as
+        * having errors), but we can't free the inode if the mark_dirty
+        * fails.
+        */
+       if (ext3_mark_inode_dirty(handle, inode))
+               /* If that failed, just do the required in-core inode clear. */
+               clear_inode(inode);
+       else
+               ext3_free_inode(handle, inode);
+       ext3_journal_stop(handle, inode);
+       unlock_kernel();
+       return;
+ no_delete:
+       clear_inode(inode);     /* We must guarantee clearing of inode... */
+ }
+
+ void ext3_discard_prealloc (struct inode * inode)
+ {
+ #ifdef EXT3_PREALLOCATE
+       lock_kernel();
+       /* Writer: ->i_prealloc* */
+       if (inode->u.ext3_i.i_prealloc_count) {
+               unsigned short total = inode->u.ext3_i.i_prealloc_count;
+               unsigned long block = inode->u.ext3_i.i_prealloc_block;
+               inode->u.ext3_i.i_prealloc_count = 0;
+               inode->u.ext3_i.i_prealloc_block = 0;
+               /* Writer: end */
+               ext3_free_blocks (inode, block, total);
+       }
+       unlock_kernel();
+ #endif
+ }
+
+ static int ext3_alloc_block (handle_t *handle,
+                       struct inode * inode, unsigned long goal, int *err)
+ {
+ #ifdef EXT3FS_DEBUG
+       static unsigned long alloc_hits = 0, alloc_attempts = 0;
+ #endif
+       unsigned long result;
+
+ #ifdef EXT3_PREALLOCATE
+       /* Writer: ->i_prealloc* */
+       if (inode->u.ext3_i.i_prealloc_count &&
+           (goal == inode->u.ext3_i.i_prealloc_block ||
+            goal + 1 == inode->u.ext3_i.i_prealloc_block))
+       {
+               result = inode->u.ext3_i.i_prealloc_block++;
+               inode->u.ext3_i.i_prealloc_count--;
+               /* Writer: end */
+               ext3_debug ("preallocation hit (%lu/%lu).\n",
+                           ++alloc_hits, ++alloc_attempts);
+       } else {
+               ext3_discard_prealloc (inode);
+               ext3_debug ("preallocation miss (%lu/%lu).\n",
+                           alloc_hits, ++alloc_attempts);
+               if (S_ISREG(inode->i_mode))
+                       result = ext3_new_block (inode, goal,
+                                &inode->u.ext3_i.i_prealloc_count,
+                                &inode->u.ext3_i.i_prealloc_block, err);
+               else
+                       result = ext3_new_block (inode, goal, 0, 0, err);
+               /*
+                * AKPM: this is somewhat sticky.  I'm not surprised it was
+                * disabled in 2.2's ext3.  Need to integrate b_committed_data
+                * guarding with preallocation, if indeed preallocation is
+                * effective.
+                */
+       }
+ #else
+       result = ext3_new_block (handle, inode, goal, 0, 0, err);
+ #endif
+       return result;
+ }
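+
+ /*
+  * Example of the preallocation window above (assuming EXT3_PREALLOCATE
+  * is enabled): if ->i_prealloc_block == N and a sequential write asks
+  * for goal == N (or goal + 1 == N), the hit path hands out block N and
+  * advances the window to N + 1; any other goal discards what is left
+  * of the window and falls back to ext3_new_block().
+  */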
+
+
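+ /*
+  * Each Indirect describes one step of a lookup through the block tree:
+  * ->key is the block number found there (little-endian on disk), ->p is
+  * the address the number was read from (inside the inode for the root
+  * step, inside bh->b_data otherwise), and ->bh is the indirect block
+  * hosting that address (NULL for the root step).
+  */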
+ typedef struct {
+       u32     *p;
+       u32     key;
+       struct buffer_head *bh;
+ } Indirect;
+
+ static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
+ {
+       p->key = *(p->p = v);
+       p->bh = bh;
+ }
+
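+ /*
+  * verify_chain() re-checks that none of the pointers we sampled have
+  * changed since we read them - e.g. because truncate removed part of
+  * the branch while we were reading it.  Callers treat a failure here
+  * as -EAGAIN and retry the lookup from scratch.
+  */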
+ static inline int verify_chain(Indirect *from, Indirect *to)
+ {
+       while (from <= to && from->key == *from->p)
+               from++;
+       return (from > to);
+ }
+
+ /**
+  *    ext3_block_to_path - parse the block number into array of offsets
+  *    @inode: inode in question (we are only interested in its superblock)
+  *    @i_block: block number to be parsed
+  *    @offsets: array to store the offsets in
+  *
+  *    To store the locations of file's data ext3 uses a data structure common
+  *    for UNIX filesystems - tree of pointers anchored in the inode, with
+  *    data blocks at leaves and indirect blocks in intermediate nodes.
+  *    This function translates the block number into path in that tree -
+  *    return value is the path length and @offsets[n] is the offset of
+  *    pointer to (n+1)th node in the nth one. If @i_block is out of range
+  *    (negative or too large), a warning is printed and zero is returned.
+  *
+  *    Note: function doesn't find node addresses, so no IO is needed. All
+  *    we need to know is the capacity of indirect blocks (taken from the
+  *    inode->i_sb).
+  */
+
+ /*
+  * Portability note: the last comparison (check that we fit into triple
+  * indirect block) is spelled differently, because otherwise on an
+  * architecture with 32-bit longs and 8Kb pages we might get into trouble
+  * if our filesystem had 8Kb blocks. We might use long long, but that would
+  * kill us on x86. Oh, well, at least the sign propagation does not matter -
+  * i_block would have to be negative in the very beginning, so we would not
+  * get there at all.
+  */
+
+ static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
+ {
+       int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+       int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
+       const long direct_blocks = EXT3_NDIR_BLOCKS,
+               indirect_blocks = ptrs,
+               double_blocks = (1 << (ptrs_bits * 2));
+       int n = 0;
+
+       if (i_block < 0) {
+               ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
+       } else if (i_block < direct_blocks) {
+               offsets[n++] = i_block;
+       } else if ( (i_block -= direct_blocks) < indirect_blocks) {
+               offsets[n++] = EXT3_IND_BLOCK;
+               offsets[n++] = i_block;
+       } else if ((i_block -= indirect_blocks) < double_blocks) {
+               offsets[n++] = EXT3_DIND_BLOCK;
+               offsets[n++] = i_block >> ptrs_bits;
+               offsets[n++] = i_block & (ptrs - 1);
+       } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+               offsets[n++] = EXT3_TIND_BLOCK;
+               offsets[n++] = i_block >> (ptrs_bits * 2);
+               offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+               offsets[n++] = i_block & (ptrs - 1);
+       } else {
+               ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
+       }
+       return n;
+ }
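+
+ /*
+  * A worked example of the mapping above, assuming a 1KB block size
+  * (so ptrs == 256 and ptrs_bits == 8): i_block == 5 is a direct block
+  * and yields the depth-1 path {5}; i_block == 300 is past the 12
+  * direct and 256 single-indirect slots, so after the subtractions
+  * i_block == 32 and we get the depth-3 path
+  * {EXT3_DIND_BLOCK, 32 >> 8 == 0, 32 & 255 == 32}.
+  */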
+
+ /**
+  *    ext3_get_branch - read the chain of indirect blocks leading to data
+  *    @inode: inode in question
+  *    @depth: depth of the chain (1 - direct pointer, etc.)
+  *    @offsets: offsets of pointers in inode/indirect blocks
+  *    @chain: place to store the result
+  *    @err: here we store the error value
+  *
+  *    Function fills the array of triples <key, p, bh> and returns %NULL
+  *    if everything went OK or the pointer to the last filled triple
+  *    (incomplete one) otherwise. Upon return, chain[i].key contains
+  *    the number of the (i+1)-th block in the chain (as it is stored in memory,
+  *    i.e. little-endian 32-bit), chain[i].p contains the address of that
+  *    number (it points into struct inode for i==0 and into the bh->b_data
+  *    for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+  *    block for i>0 and NULL for i==0. In other words, it holds the block
+  *    numbers of the chain, addresses they were taken from (and where we can
+  *    verify that chain did not change) and buffer_heads hosting these
+  *    numbers.
+  *
+  *    Function stops when it stumbles upon zero pointer (absent block)
+  *            (pointer to last triple returned, *@err == 0)
+  *    or when it gets an IO error reading an indirect block
+  *            (ditto, *@err == -EIO)
+  *    or when it notices that chain had been changed while it was reading
+  *            (ditto, *@err == -EAGAIN)
+  *    or when it reads all @depth-1 indirect blocks successfully and finds
+  *    the whole chain, all the way to the data (returns %NULL, *err == 0).
+  */
+ static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
+                                Indirect chain[4], int *err)
+ {
+       kdev_t dev = inode->i_dev;
+       int blocksize = inode->i_sb->s_blocksize;
+       Indirect *p = chain;
+       struct buffer_head *bh;
+
+       *err = 0;
+       /* i_data is not going away, no lock needed */
+       add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
+       if (!p->key)
+               goto no_block;
+       while (--depth) {
+               bh = bread(dev, le32_to_cpu(p->key), blocksize);
+               if (!bh)
+                       goto failure;
+               /* Reader: pointers */
+               if (!verify_chain(chain, p))
+                       goto changed;
+               add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+               /* Reader: end */
+               if (!p->key)
+                       goto no_block;
+       }
+       return NULL;
+
+ changed:
+       *err = -EAGAIN;
+       goto no_block;
+ failure:
+       *err = -EIO;
+ no_block:
+       return p;
+ }
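+
+ /*
+  * To illustrate: for the depth-3 path {EXT3_DIND_BLOCK, 0, 32} the
+  * filled chain has chain[0] holding the double-indirect block number
+  * (read from the inode's i_data), chain[1] the indirect block number
+  * (read from slot 0 of the double-indirect block) and chain[2] the
+  * data block number (read from slot 32 of the indirect block).  A zero
+  * key at any level means the rest of the branch is a hole.
+  */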
+
+ /**
+  *    ext3_find_near - find a place for allocation with sufficient locality
+  *    @inode: owner
+  *    @ind: descriptor of indirect block.
+  *
+  *    This function returns the preferred place for block allocation.
+  *    It is used when the heuristic for sequential allocation fails.
+  *    Rules are:
+  *      + if there is a block to the left of our position - allocate near it.
+  *      + if pointer will live in indirect block - allocate near that block.
+  *      + if pointer will live in inode - allocate in the same
+  *        cylinder group.
+  *    Caller must make sure that @ind is valid and will stay that way.
+  */
+
+ static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+ {
+       u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
+       u32 *p;
+
+       /* Try to find previous block */
+       for (p = ind->p - 1; p >= start; p--)
+               if (*p)
+                       return le32_to_cpu(*p);
+
+       /* No such thing, so let's try location of indirect block */
+       if (ind->bh)
+               return ind->bh->b_blocknr;
+
+       /*
+        * Is it going to be referred to from the inode itself? OK, just put
+        * it into the same cylinder group then.
+        */
+       return (inode->u.ext3_i.i_block_group *
+               EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+              le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
+ }
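+
+ /*
+  * For instance, when we allocate the first data block hanging off a
+  * new indirect block, the backward scan above finds no earlier pointer,
+  * so we place the data next to the indirect block itself
+  * (ind->bh->b_blocknr), keeping data and metadata close together.
+  */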
+
+ /**
+  *    ext3_find_goal - find a preferred place for allocation.
+  *    @inode: owner
+  *    @block:  block we want
+  *    @chain:  chain of indirect blocks
+  *    @partial: pointer to the last triple within a chain
+  *    @goal:  place to store the result.
+  *
+  *    Normally this function finds the preferred place for block allocation,
+  *    stores it in *@goal and returns zero. If the branch had been changed
+  *    under us we return -EAGAIN.
+  */
+
+ static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
+                         Indirect *partial, unsigned long *goal)
+ {
+       /* Writer: ->i_next_alloc* */
+       if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
+               inode->u.ext3_i.i_next_alloc_block++;
+               inode->u.ext3_i.i_next_alloc_goal++;
+       }
+ #ifdef SEARCH_FROM_ZERO
+       inode->u.ext3_i.i_next_alloc_block = 0;
+       inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+       /* Writer: end */
+       /* Reader: pointers, ->i_next_alloc* */
+       if (verify_chain(chain, partial)) {
+               /*
+                * try the heuristic for sequential allocation,
+                * failing that at least try to get decent locality.
+                */
+               if (block == inode->u.ext3_i.i_next_alloc_block)
+                       *goal = inode->u.ext3_i.i_next_alloc_goal;
+               if (!*goal)
+                       *goal = ext3_find_near(inode, partial);
+ #ifdef SEARCH_FROM_ZERO
+               *goal = 0;
+ #endif
+               return 0;
+       }
+       /* Reader: end */
+       return -EAGAIN;
+ }
+
+ /**
+  *    ext3_alloc_branch - allocate and set up a chain of blocks.
+  *    @inode: owner
+  *    @num: depth of the chain (number of blocks to allocate)
+  *    @offsets: offsets (in the blocks) to store the pointers to next.
+  *    @branch: place to store the chain in.
+  *
+  *    This function allocates @num blocks, zeroes out all but the last one,
+  *    links them into chain and (if we are synchronous) writes them to disk.
+  *    In other words, it prepares a branch that can be spliced onto the
+  *    inode. It stores the information about that chain in the branch[], in
+  *    the same format as ext3_get_branch() would do. We call it after we
+  *    have read the existing part of the chain; partial points to the last
+  *    triple of that (the one with zero ->key). On exit we have the same
+  *    picture as after a successful ext3_get_block(), except that in one
+  *    place chain is disconnected - *branch->p is still zero (we did not
+  *    set the last link), but branch->key contains the number that should
+  *    be placed into *branch->p to fill that gap.
+  *
+  *    If allocation fails we free all blocks we've allocated (and forget
+  *    their buffer_heads) and return the error value from the failed
+  *    ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+  *    as described above and return 0.
+  */
+
+ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
+                            int num,
+                            unsigned long goal,
+                            int *offsets,
+                            Indirect *branch)
+ {
+       int blocksize = inode->i_sb->s_blocksize;
+       int n = 0, keys = 0;
+       int err = 0;
+       int i;
+       int parent = ext3_alloc_block(handle, inode, goal, &err);
+
+       branch[0].key = cpu_to_le32(parent);
+       if (parent) {
+               for (n = 1; n < num; n++) {
+                       struct buffer_head *bh;
+                       /* Allocate the next block */
+                       int nr = ext3_alloc_block(handle, inode, parent, &err);
+                       if (!nr)
+                               break;
+                       branch[n].key = cpu_to_le32(nr);
+                       keys = n+1;
+
+                       /*
+                        * Get buffer_head for parent block, zero it out
+                        * and set the pointer to new one, then send
+                        * parent to disk.
+                        */
+                       bh = getblk(inode->i_dev, parent, blocksize);
+                       branch[n].bh = bh;
+                       lock_buffer(bh);
+                       BUFFER_TRACE(bh, "call get_create_access");
+                       err = ext3_journal_get_create_access(handle, bh);
+                       if (err) {
+                               unlock_buffer(bh);
+                               brelse(bh);
+                               break;
+                       }
+
+                       memset(bh->b_data, 0, blocksize);
+                       branch[n].p = (u32*) bh->b_data + offsets[n];
+                       *branch[n].p = branch[n].key;
+                       BUFFER_TRACE(bh, "marking uptodate");
+                       mark_buffer_uptodate(bh, 1);
+                       unlock_buffer(bh);
+
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       err = ext3_journal_dirty_metadata(handle, bh);
+                       if (err)
+                               break;
+
+                       parent = nr;
+               }
+               if (IS_SYNC(inode))
+                       handle->h_sync = 1;
+       }
+       if (n == num)
+               return 0;
+
+       /* Allocation failed, free what we already allocated */
+       for (i = 1; i < keys; i++) {
+               BUFFER_TRACE(branch[i].bh, "call journal_forget");
+               ext3_journal_forget(handle, branch[i].bh);
+       }
+       for (i = 0; i < keys; i++)
+               ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
+       return err;
+ }
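+
+ /*
+  * Example: if a depth-3 lookup found only the double-indirect root, we
+  * are called with num == 2 and allocate one indirect block plus one
+  * data block.  The indirect block is zeroed and the data block number
+  * is written into it at offsets[1]; branch[0].key then holds the
+  * indirect block's number, waiting for ext3_splice_branch() to store
+  * it into the double-indirect block.
+  */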
+
+ /**
+  *    ext3_splice_branch - splice the allocated branch onto inode.
+  *    @inode: owner
+  *    @block: (logical) number of block we are adding
+  *    @chain: chain of indirect blocks (with a missing link - see
+  *            ext3_alloc_branch)
+  *    @where: location of missing link
+  *    @num:   number of blocks we are adding
+  *
+  *    This function verifies that chain (up to the missing link) had not
+  *    changed, fills the missing link and does all housekeeping needed in
+  *    inode (->i_blocks, etc.). In case of success we end up with the full
+  *    chain to new block and return 0. Otherwise (== chain had been changed)
+  *    we free the new blocks (forgetting their buffer_heads, indeed) and
+  *    return -EAGAIN.
+  */
+
+ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
+                             Indirect chain[4], Indirect *where, int num)
+ {
+       int i;
+       int err = 0;
+
+       /*
+        * If we're splicing into a [td]indirect block (as opposed to the
+        * inode) then we need to get write access to the [td]indirect block
+        * before the splice.
+        */
+       if (where->bh) {
+               BUFFER_TRACE(where->bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, where->bh);
+               if (err)
+                       goto err_out;
+       }
+       /* Verify that place we are splicing to is still there and vacant */
+
+       /* Writer: pointers, ->i_next_alloc* */
+       if (!verify_chain(chain, where-1) || *where->p)
+               /* Writer: end */
+               goto changed;
+
+       /* That's it */
+
+       *where->p = where->key;
+       inode->u.ext3_i.i_next_alloc_block = block;
+       inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+ #ifdef SEARCH_FROM_ZERO
+       inode->u.ext3_i.i_next_alloc_block = 0;
+       inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+       /* Writer: end */
+
+       /* We are done with atomic stuff, now do the rest of housekeeping */
+
+       inode->i_ctime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+
+       /* had we spliced it onto indirect block? */
+       if (where->bh) {
+               /*
+                * akpm: If we spliced it onto an indirect block, we haven't
+                * altered the inode.  Note however that if it is being spliced
+                * onto an indirect block at the very end of the file (the
+                * file is growing) then we *will* alter the inode to reflect
+                * the new i_size.  But that is not done here - it is done in
+                * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
+                */
+               jbd_debug(5, "splicing indirect only\n");
+               BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
+               err = ext3_journal_dirty_metadata(handle, where->bh);
+               if (err)
+                       goto err_out;
+       } else {
+               /*
+                * OK, we spliced it into the inode itself on a direct block.
+                * Inode was dirtied above.
+                */
+               jbd_debug(5, "splicing direct\n");
+       }
+       return err;
+
+ changed:
+       /*
+        * AKPM: if where[i].bh isn't part of the current updating
+        * transaction then we explode nastily.  Test this code path.
+        */
+       jbd_debug(1, "the chain changed: try again\n");
+       err = -EAGAIN;
+
+ err_out:
+       for (i = 1; i < num; i++) {
+               BUFFER_TRACE(where[i].bh, "call journal_forget");
+               ext3_journal_forget(handle, where[i].bh);
+       }
+       /* For the normal collision cleanup case, we free up the blocks.
+        * On genuine filesystem errors we don't even think about doing
+        * that. */
+       if (err == -EAGAIN)
+               for (i = 0; i < num; i++)
+                       ext3_free_blocks(handle, inode,
+                                        le32_to_cpu(where[i].key), 1);
+       return err;
+ }
+
+ /*
+  * Allocation strategy is simple: if we have to allocate something, we will
+  * have to go the whole way to leaf. So let's do it before attaching anything
+  * to tree, set linkage between the newborn blocks, write them if sync is
+  * required, recheck the path, free and repeat if check fails, otherwise
+  * set the last missing link (that will protect us from any truncate-generated
+  * removals - all blocks on the path are immune now) and possibly force the
+  * write on the parent block.
+  * That has a nice additional property: no special recovery from the failed
+  * allocations is needed - we simply release blocks and do not touch anything
+  * reachable from inode.
+  *
+  * akpm: `handle' can be NULL if create == 0.
+  */
+
+ static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
+                                long iblock,
+                                struct buffer_head *bh_result, int create)
+ {
+       int err = -EIO;
+       int offsets[4];
+       Indirect chain[4];
+       Indirect *partial;
+       unsigned long goal;
+       int left;
+       int depth = ext3_block_to_path(inode, iblock, offsets);
+       loff_t new_size;
+
+       J_ASSERT(handle != NULL || create == 0);
+
+       if (depth == 0)
+               goto out;
+
+       lock_kernel();
+ reread:
+       partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+
+       /* Simplest case - block found, no allocation needed */
+       if (!partial) {
+               bh_result->b_state &= ~(1UL << BH_New);
+ got_it:
+               bh_result->b_dev = inode->i_dev;
+               bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
+               bh_result->b_state |= (1UL << BH_Mapped);
+               /* Clean up and exit */
+               partial = chain+depth-1; /* the whole chain */
+               goto cleanup;
+       }
+
+       /* Next simple case - plain lookup or failed read of indirect block */
+       if (!create || err == -EIO) {
+ cleanup:
+               while (partial > chain) {
+                       BUFFER_TRACE(partial->bh, "call brelse");
+                       brelse(partial->bh);
+                       partial--;
+               }
+               BUFFER_TRACE(bh_result, "returned");
+               unlock_kernel();
+ out:
+               return err;
+       }
+
+       /*
+        * Indirect block might be removed by truncate while we were
+        * reading it. Handling of that case (forget what we've got and
+        * reread) is taken out of the main path.
+        */
+       if (err == -EAGAIN)
+               goto changed;
+
+       if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
+               goto changed;
+
+       left = (chain + depth) - partial;
+
+       /*
+        * Block out ext3_truncate while we alter the tree
+        */
+       down_read(&inode->u.ext3_i.truncate_sem);
+       err = ext3_alloc_branch(handle, inode, left, goal,
+                                       offsets+(partial-chain), partial);
+
+       /* The ext3_splice_branch call will free and forget any buffers
+        * on the new chain if there is a failure, but that risks using
+        * up transaction credits, especially for bitmaps where the
+        * credits cannot be returned.  Can we handle this somehow?  We
+        * may need to return -EAGAIN upwards in the worst case.  --sct */
+       if (!err)
+               err = ext3_splice_branch(handle, inode, iblock, chain,
+                                        partial, left);
+       up_read(&inode->u.ext3_i.truncate_sem);
+       if (err == -EAGAIN)
+               goto changed;
+       if (err)
+               goto cleanup;
+
+       new_size = inode->i_size;
+       /*
+        * This is not racy against ext3_truncate's modification of i_disksize
+        * because VM/VFS ensures that the file cannot be extended while
+        * truncate is in progress.  It is racy between multiple parallel
+        * instances of get_block, but we have the BKL.
+        */
+       if (new_size > inode->u.ext3_i.i_disksize)
+               inode->u.ext3_i.i_disksize = new_size;
+
+       bh_result->b_state |= (1UL << BH_New);
+       goto got_it;
+
+ changed:
+       while (partial > chain) {
+               jbd_debug(1, "buffer chain changed, retrying\n");
+               BUFFER_TRACE(partial->bh, "brelsing");
+               brelse(partial->bh);
+               partial--;
+       }
+       goto reread;
+ }
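+
+ /*
+  * In outline, the function above is: read the branch with
+  * ext3_get_branch(); if it is complete, just map bh_result.  For a
+  * plain lookup (or an IO error) clean up and return.  Otherwise pick
+  * an allocation goal (ext3_find_goal), build the missing tail of the
+  * branch (ext3_alloc_branch) and splice it in (ext3_splice_branch),
+  * all under truncate_sem, restarting from `reread' whenever -EAGAIN
+  * reports that the tree changed underneath us.
+  */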
+
+ static int ext3_get_block(struct inode *inode, long iblock,
+                       struct buffer_head *bh_result, int create)
+ {
+       handle_t *handle = 0;
+       int ret;
+
+       if (create) {
+               handle = ext3_journal_current_handle();
+               J_ASSERT(handle != 0);
+       }
+       ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
+       return ret;
+ }
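+
+ /*
+  * ext3_get_block() is the get_block callback handed to the generic
+  * block_* helpers (see ext3_aops below).  For create == 1 it relies on
+  * the transaction already opened by the caller - hence the assertion
+  * that a current handle exists.
+  */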
+
+ /*
+  * `handle' can be NULL if create is zero
+  */
+ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
+                               long block, int create, int * errp)
+ {
+       struct buffer_head dummy;
+       int fatal = 0, err;
+
+       J_ASSERT(handle != NULL || create == 0);
+
+       dummy.b_state = 0;
+       dummy.b_blocknr = -1000;
+       buffer_trace_init(&dummy.b_history);
+       *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
+       if (!*errp && buffer_mapped(&dummy)) {
+               struct buffer_head *bh;
+               bh = getblk(dummy.b_dev, dummy.b_blocknr,
+                                       inode->i_sb->s_blocksize);
+               if (buffer_new(&dummy)) {
+                       J_ASSERT(create != 0);
+                       J_ASSERT(handle != 0);
+
+                       /* Now that we do not always journal data, we
+                          should keep in mind whether this should
+                          always journal the new buffer as metadata.
+                          For now, regular file writes use
+                          ext3_get_block instead, so it's not a
+                          problem. */
+                       lock_kernel();
+                       lock_buffer(bh);
+                       BUFFER_TRACE(bh, "call get_create_access");
+                       fatal = ext3_journal_get_create_access(handle, bh);
+                       if (!fatal) {
+                               memset(bh->b_data, 0,
+                                      inode->i_sb->s_blocksize);
+                               mark_buffer_uptodate(bh, 1);
+                       }
+                       unlock_buffer(bh);
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       err = ext3_journal_dirty_metadata(handle, bh);
+                       if (!fatal) fatal = err;
+                       unlock_kernel();
+               } else {
+                       BUFFER_TRACE(bh, "not a new buffer");
+               }
+               if (fatal) {
+                       *errp = fatal;
+                       brelse(bh);
+                       bh = NULL;
+               }
+               return bh;
+       }
+       return NULL;
+ }
+
+ struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
+                              int block, int create, int *err)
+ {
+       struct buffer_head * bh;
+       int prev_blocks;
+
+       prev_blocks = inode->i_blocks;
+
+       bh = ext3_getblk (handle, inode, block, create, err);
+       if (!bh)
+               return bh;
+ #ifdef EXT3_PREALLOCATE
+       /*
+        * If the inode has grown, and this is a directory, then use a few
+        * more of the preallocated blocks to keep directory fragmentation
+        * down.  The preallocated blocks are guaranteed to be contiguous.
+        */
+       if (create &&
+           S_ISDIR(inode->i_mode) &&
+           inode->i_blocks > prev_blocks &&
+           EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+                                   EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
+               int i;
+               struct buffer_head *tmp_bh;
+
+               for (i = 1;
+                    inode->u.ext3_i.i_prealloc_count &&
+                    i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
+                    i++) {
+                       /*
+                        * ext3_getblk will zero out the contents of the
+                        * directory for us
+                        */
+                       tmp_bh = ext3_getblk(handle, inode,
+                                               block+i, create, err);
+                       if (!tmp_bh) {
+                               brelse (bh);
+                               return 0;
+                       }
+                       brelse (tmp_bh);
+               }
+       }
+ #endif
+       if (buffer_uptodate(bh))
+               return bh;
+       ll_rw_block (READ, 1, &bh);
+       wait_on_buffer (bh);
+       if (buffer_uptodate(bh))
+               return bh;
+       brelse (bh);
+       *err = -EIO;
+       return NULL;
+ }
+
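+ /*
+  * walk_page_buffers() applies @fn to each buffer of the page that
+  * overlaps the byte range @from..@to, stopping at the first error.
+  * For buffers outside the range it only records in *@partial (if
+  * non-NULL) whether the page would remain partially non-uptodate.
+  */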
+ static int walk_page_buffers( handle_t *handle,
+                               struct buffer_head *head,
+                               unsigned from,
+                               unsigned to,
+                               int *partial,
+                               int (*fn)(      handle_t *handle,
+                                               struct buffer_head *bh))
+ {
+       struct buffer_head *bh;
+       unsigned block_start, block_end;
+       unsigned blocksize = head->b_size;
+       int err, ret = 0;
+
+       for (   bh = head, block_start = 0;
+               ret == 0 && (bh != head || !block_start);
+               block_start = block_end, bh = bh->b_this_page)
+       {
+               block_end = block_start + blocksize;
+               if (block_end <= from || block_start >= to) {
+                       if (partial && !buffer_uptodate(bh))
+                               *partial = 1;
+                       continue;
+               }
+               err = (*fn)(handle, bh);
+               if (!ret)
+                       ret = err;
+       }
+       return ret;
+ }
+
+ /*
+  * To preserve ordering, it is essential that the hole instantiation and
+  * the data write be encapsulated in a single transaction.  We cannot
+  * close off a transaction and start a new one between the ext3_get_block()
+  * and the commit_write().  So doing the journal_start at the start of
+  * prepare_write() is the right place.
+  *
+  * Also, this function can nest inside ext3_writepage() ->
+  * block_write_full_page(). In that case, we *know* that ext3_writepage()
+  * has generated enough buffer credits to do the whole page.  So we won't
+  * block on the journal in that case, which is good, because the caller may
+  * be PF_MEMALLOC.
+  *
+  * By accident, ext3 can be reentered when a transaction is open via
+  * quota file writes.  If we were to commit the transaction while thus
+  * reentered, there can be a deadlock - we would be holding a quota
+  * lock, and the commit would never complete if another thread had a
+  * transaction open and was blocking on the quota lock - a ranking
+  * violation.
+  *
+  * So what we do is to rely on the fact that journal_stop/journal_start
+  * will _not_ run commit under these circumstances because handle->h_ref
+  * is elevated.  We'll still have enough credits for the tiny quotafile
+  * write.
+  */
+
+ static int do_journal_get_write_access(handle_t *handle,
+                                      struct buffer_head *bh)
+ {
+       return ext3_journal_get_write_access(handle, bh);
+ }
+
+ static int ext3_prepare_write(struct file *file, struct page *page,
+                             unsigned from, unsigned to)
+ {
+       struct inode *inode = page->mapping->host;
+       handle_t *handle = ext3_journal_current_handle();
+       int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+
+       lock_kernel();
+       handle = ext3_journal_start(inode, needed_blocks);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+       ret = block_prepare_write(page, from, to, ext3_get_block);
+       if (ret != 0)
+               goto prepare_write_failed;
+
+       if (ext3_should_journal_data(inode))
+               ret = walk_page_buffers(handle, page->buffers,
+                               from, to, NULL, do_journal_get_write_access);
+ prepare_write_failed:
+       if (ret)
+               ext3_journal_stop(handle, inode);
+ out:
+       unlock_kernel();
+       return ret;
+ }
+
+ static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
+ {
+       return ext3_journal_dirty_data(handle, bh, 0);
+ }
+
+ /*
+  * For ext3_writepage().  We also brelse() the buffer to account for
+  * the bget() which ext3_writepage() performs.
+  */
+ static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
+ {
+       int ret = ext3_journal_dirty_data(handle, bh, 1);
+       __brelse(bh);
+       return ret;
+ }
+
+ /* For commit_write() in data=journal mode */
+ static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+ {
+       set_bit(BH_Uptodate, &bh->b_state);
+       return ext3_journal_dirty_metadata(handle, bh);
+ }
+
+ /*
+  * We need to pick up the new inode size which generic_commit_write gave us.
+  * `file' can be NULL - eg, when called from block_symlink().
+  *
+  * ext3 inode->i_dirty_buffers policy:  If we're journalling data we
+  * definitely don't want them to appear on the inode at all - instead
+  * we need to manage them at the JBD layer and we need to intercept
+  * the relevant sync operations and translate them into journal operations.
+  *
+  * If we're not journalling data then we can just leave the buffers
+  * on ->i_dirty_buffers.  If someone writes them out for us then thanks.
+  * Otherwise we'll do it in commit, if we're using ordered data.
+  */
+
+ static int ext3_commit_write(struct file *file, struct page *page,
+                            unsigned from, unsigned to)
+ {
+       handle_t *handle = ext3_journal_current_handle();
+       struct inode *inode = page->mapping->host;
+       int ret = 0, ret2;
+
+       lock_kernel();
+       if (ext3_should_journal_data(inode)) {
+               /*
+                * Here we duplicate the generic_commit_write() functionality
+                */
+               int partial = 0;
+               loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+               ret = walk_page_buffers(handle, page->buffers,
+                       from, to, &partial, commit_write_fn);
+               if (!partial)
+                       SetPageUptodate(page);
+               kunmap(page);
+               if (pos > inode->i_size)
+                       inode->i_size = pos;
+               set_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state);
+       } else {
+               if (ext3_should_order_data(inode)) {
+                       ret = walk_page_buffers(handle, page->buffers,
+                               from, to, NULL, journal_dirty_sync_data);
+               }
+               /* Be careful here if generic_commit_write becomes a
+                * required invocation after block_prepare_write. */
+               if (ret == 0)
+                       ret = generic_commit_write(file, page, from, to);
+       }
+       if (inode->i_size > inode->u.ext3_i.i_disksize) {
+               inode->u.ext3_i.i_disksize = inode->i_size;
+               ret2 = ext3_mark_inode_dirty(handle, inode);
+               if (!ret)
+                       ret = ret2;
+       }
+       ret2 = ext3_journal_stop(handle, inode);
+       unlock_kernel();
+       if (!ret)
+               ret = ret2;
+       return ret;
+ }
+
+ /*
+  * bmap() is special.  It gets used by applications such as lilo and by
+  * the swapper to find the on-disk block of a specific piece of data.
+  *
+  * Naturally, this is dangerous if the block concerned is still in the
+  * journal.  If somebody makes a swapfile on an ext3 data-journaling
+  * filesystem and enables swap, then they may get a nasty shock when the
+  * data getting swapped to that swapfile suddenly gets overwritten by
+  * the original zeros written out previously to the journal and
+  * awaiting writeback in the kernel's buffer cache.
+  *
+  * So, if we see any bmap calls here on a modified, data-journaled file,
+  * take extra steps to flush any blocks which might be in the cache.
+  */
+ static int ext3_bmap(struct address_space *mapping, long block)
+ {
+       struct inode *inode = mapping->host;
+       journal_t *journal;
+       int err;
+
+       if (test_and_clear_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state)) {
+               /*
+                * This is a REALLY heavyweight approach, but the use of
+                * bmap on dirty files is expected to be extremely rare:
+                * only if we run lilo or swapon on a freshly made file
+                * do we expect this to happen.
+                *
+                * (bmap requires CAP_SYS_RAWIO so this does not
+                * represent an unprivileged user DOS attack --- we'd be
+                * in trouble if mortal users could trigger this path at
+                * will.)
+                *
+                * NB. EXT3_STATE_JDATA is not set on files other than
+                * regular files.  If somebody wants to bmap a directory
+                * or symlink and gets confused because the buffer
+                * hasn't yet been flushed to disk, they deserve
+                * everything they get.
+                */
+
+               journal = EXT3_JOURNAL(inode);
+               journal_lock_updates(journal);
+               err = journal_flush(journal);
+               journal_unlock_updates(journal);
+
+               if (err)
+                       return 0;
+       }
+
+       return generic_block_bmap(mapping, block, ext3_get_block);
+ }
+
+ static int bget_one(handle_t *handle, struct buffer_head *bh)
+ {
+       atomic_inc(&bh->b_count);
+       return 0;
+ }
+
+ /*
+  * Note that we always start a transaction even if we're not journalling
+  * data.  This is to preserve ordering: any hole instantiation within
+  * __block_write_full_page -> ext3_get_block() should be journalled
+  * along with the data so we don't crash and then get metadata which
+  * refers to old data.
+  *
+  * In all journalling modes block_write_full_page() will start the I/O.
+  *
+  * Problem:
+  *
+  *    ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+  *            ext3_writepage()
+  *
+  * Similar for:
+  *
+  *    ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+  *
+  * Same applies to ext3_get_block().  We will deadlock on various things like
+  * lock_journal and i_truncate_sem.
+  *
+  * Setting PF_MEMALLOC here doesn't work - too many internal memory
+  * allocations fail.
+  *
+  * 16May01: If we're reentered then journal_current_handle() will be
+  *        non-zero. We simply *return*.
+  *
+  * 1 July 2001: @@@ FIXME:
+  *   In journalled data mode, a data buffer may be metadata against the
+  *   current transaction.  But the same file is part of a shared mapping
+  *   and someone does a writepage() on it.
+  *
+  *   We will move the buffer onto the async_data list, but *after* it has
+  *   been dirtied. So there's a small window where we have dirty data on
+  *   BJ_Metadata.
+  *
+  *   Note that this only applies to the last partial page in the file.  The
+  *   bit which block_write_full_page() uses prepare/commit for.  (That's
+  *   broken code anyway: it's wrong for msync()).
+  *
+  *   It's a rare case: affects the final partial page, for journalled data
+  *   where the file is subject to both write() and writepage() in the same
+  *   transaction.  To fix it we'll need a custom block_write_full_page().
+  *   We'll probably need that anyway for journalling writepage() output.
+  *
+  * We don't honour synchronous mounts for writepage().  That would be
+  * disastrous.  Any write() or metadata operation will sync the fs for
+  * us.
+  */
+ static int ext3_writepage(struct page *page)
+ {
+       struct inode *inode = page->mapping->host;
+       struct buffer_head *page_buffers;
+       handle_t *handle = NULL;
+       int ret = 0, err;
+       int needed;
+       int order_data;
+
+       J_ASSERT(PageLocked(page));
+
+       /*
+        * We give up here if we're reentered, because it might be
+        * for a different filesystem.  One *could* look for a
+        * nested transaction opportunity.
+        */
+       lock_kernel();
+       if (ext3_journal_current_handle())
+               goto out_fail;
+
+       needed = ext3_writepage_trans_blocks(inode);
+       if (current->flags & PF_MEMALLOC)
+               handle = ext3_journal_try_start(inode, needed);
+       else
+               handle = ext3_journal_start(inode, needed);
+
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_fail;
+       }
+
+       order_data = ext3_should_order_data(inode) ||
+                       ext3_should_journal_data(inode);
+
+       unlock_kernel();
+
+       page_buffers = NULL;    /* Purely to prevent compiler warning */
+
+       /* bget() all the buffers */
+       if (order_data) {
+               if (!page->buffers)
+                       create_empty_buffers(page,
+                               inode->i_dev, inode->i_sb->s_blocksize);
+               page_buffers = page->buffers;
+               walk_page_buffers(handle, page_buffers, 0,
+                               PAGE_CACHE_SIZE, NULL, bget_one);
+       }
+
+       ret = block_write_full_page(page, ext3_get_block);
+
+       /*
+        * The page can become unlocked at any point now, and
+        * truncate can then come in and change things.  So we
+        * can't touch *page from now on.  But *page_buffers is
+        * safe due to elevated refcount.
+        */
+
+       handle = ext3_journal_current_handle();
+       lock_kernel();
+
+       /* And attach them to the current transaction */
+       if (order_data) {
+               err = walk_page_buffers(handle, page_buffers,
+                       0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
+               if (!ret)
+                       ret = err;
+       }
+
+       err = ext3_journal_stop(handle, inode);
+       if (!ret)
+               ret = err;
+       unlock_kernel();
+       return ret;
+
+ out_fail:
+
+       unlock_kernel();
+       SetPageDirty(page);
+       UnlockPage(page);
+       return ret;
+ }
+
+ static int ext3_readpage(struct file *file, struct page *page)
+ {
+       return block_read_full_page(page, ext3_get_block);
+ }
+
+
+ static int ext3_flushpage(struct page *page, unsigned long offset)
+ {
+       journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+       return journal_flushpage(journal, page, offset);
+ }
+
+ static int ext3_releasepage(struct page *page, int wait)
+ {
+       journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+       return journal_try_to_free_buffers(journal, page, wait);
+ }
+
+
+ struct address_space_operations ext3_aops = {
+       readpage:       ext3_readpage,          /* BKL not held.  Don't need */
+       writepage:      ext3_writepage,         /* BKL not held.  We take it */
+       sync_page:      block_sync_page,
+       prepare_write:  ext3_prepare_write,     /* BKL not held.  We take it */
+       commit_write:   ext3_commit_write,      /* BKL not held.  We take it */
+       bmap:           ext3_bmap,              /* BKL held */
+       flushpage:      ext3_flushpage,         /* BKL not held.  Don't need */
+       releasepage:    ext3_releasepage,       /* BKL not held.  Don't need */
+ };
+
+ /*
+  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
+  * up to the end of the block which corresponds to `from'.
+  * This is required during truncate. We need to physically zero the tail end
+  * of that block so it doesn't yield old data if the file is later grown.
+  */
+ static int ext3_block_truncate_page(handle_t *handle,
+               struct address_space *mapping, loff_t from)
+ {
+       unsigned long index = from >> PAGE_CACHE_SHIFT;
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned blocksize, iblock, length, pos;
+       struct inode *inode = mapping->host;
+       struct page *page;
+       struct buffer_head *bh;
+       int err;
+
+       blocksize = inode->i_sb->s_blocksize;
+       length = offset & (blocksize - 1);
+
+       /* Block boundary? Nothing to do */
+       if (!length)
+               return 0;
+
+       length = blocksize - length;
+       iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+       page = grab_cache_page(mapping, index);
+       err = -ENOMEM;
+       if (!page)
+               goto out;
+
+       if (!page->buffers)
+               create_empty_buffers(page, inode->i_dev, blocksize);
+
+       /* Find the buffer that contains "offset" */
+       bh = page->buffers;
+       pos = blocksize;
+       while (offset >= pos) {
+               bh = bh->b_this_page;
+               iblock++;
+               pos += blocksize;
+       }
+
+       err = 0;
+       if (!buffer_mapped(bh)) {
+               /* Hole? Nothing to do */
+               if (buffer_uptodate(bh))
+                       goto unlock;
+               ext3_get_block(inode, iblock, bh, 0);
+               /* Still unmapped? Nothing to do */
+               if (!buffer_mapped(bh))
+                       goto unlock;
+       }
+
+       /* Ok, it's mapped. Make sure it's up-to-date */
+       if (Page_Uptodate(page))
+               set_bit(BH_Uptodate, &bh->b_state);
+
+       if (!buffer_uptodate(bh)) {
+               err = -EIO;
+               ll_rw_block(READ, 1, &bh);
+               wait_on_buffer(bh);
+               /* Uhhuh. Read error. Complain and punt. */
+               if (!buffer_uptodate(bh))
+                       goto unlock;
+       }
+
+       if (ext3_should_journal_data(inode)) {
+               BUFFER_TRACE(bh, "get write access");
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err)
+                       goto unlock;
+       }
+
+       memset(kmap(page) + offset, 0, length);
+       flush_dcache_page(page);
+       kunmap(page);
+
+       BUFFER_TRACE(bh, "zeroed end of block");
+
+       err = 0;
+       if (ext3_should_journal_data(inode)) {
+               err = ext3_journal_dirty_metadata(handle, bh);
+       } else {
+               if (ext3_should_order_data(inode))
+                       err = ext3_journal_dirty_data(handle, bh, 0);
+               __mark_buffer_dirty(bh);
+       }
+
+ unlock:
+       UnlockPage(page);
+       page_cache_release(page);
+ out:
+       return err;
+ }
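+
+ /*
+  * Arithmetic example (assuming 4KB pages and a 1KB block size): for
+  * from == 1500 we get index == 0, offset == 1500 and
+  * length == 1024 - (1500 & 1023) == 548, and the buffer walk lands on
+  * the page's second block (iblock == 1); the memset then clears page
+  * bytes 1500..2047.
+  */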
+
+ /*
+  * Probably it should be a library function... search for first non-zero word
+  * or memcmp with zero_page, whatever is better for particular architecture.
+  * Linus?
+  */
+ static inline int all_zeroes(u32 *p, u32 *q)
+ {
+       while (p < q)
+               if (*p++)
+                       return 0;
+       return 1;
+ }
+
+ /**
+  *    ext3_find_shared - find the indirect blocks for partial truncation.
+  *    @inode:   inode in question
+  *    @depth:   depth of the affected branch
+  *    @offsets: offsets of pointers in that branch (see ext3_block_to_path)
+  *    @chain:   place to store the pointers to partial indirect blocks
+  *    @top:     place to the (detached) top of branch
+  *
+  *    This is a helper function used by ext3_truncate().
+  *
+  *    When we do truncate() we may have to clean the ends of several
+  *    indirect blocks but leave the blocks themselves alive. Block is
+  *    partially truncated if some data below the new i_size is referred to
+  *    from it (and it is on the path to the first completely truncated
+  *    data block, indeed).  We have to free the top of that path along
+  *    with everything to the right of the path. Since no allocation
+  *    past the truncation point is possible until ext3_truncate()
+  *    finishes, we may safely do the latter, but top of branch may
+  *    require special attention - pageout below the truncation point
+  *    might try to populate it.
+  *
+  *    We atomically detach the top of branch from the tree, store the
+  *    block number of its root in *@top, pointers to buffer_heads of
+  *    partially truncated blocks - in @chain[].bh and pointers to
+  *    their last elements that should not be removed - in
+  *    @chain[].p. Return value is the pointer to last filled element
+  *    of @chain.
+  *
+  *    The actual freeing of the subtrees is left to the caller:
+  *            a) free the subtree starting from *@top
+  *            b) free the subtrees whose roots are stored in
+  *                    (@chain[i].p+1 .. end of @chain[i].bh->b_data)
+  *            c) free the subtrees growing from the inode past the @chain[0].
+  *                    (no partially truncated stuff there).  */
+
+ static Indirect *ext3_find_shared(struct inode *inode,
+                               int depth,
+                               int offsets[4],
+                               Indirect chain[4],
+                               u32 *top)
+ {
+       Indirect *partial, *p;
+       int k, err;
+
+       *top = 0;
+       /* Make k index the deepest non-null offset + 1 */
+       for (k = depth; k > 1 && !offsets[k-1]; k--)
+               ;
+       partial = ext3_get_branch(inode, k, offsets, chain, &err);
+       /* Writer: pointers */
+       if (!partial)
+               partial = chain + k-1;
+       /*
+        * If the branch acquired continuation since we've looked at it -
+        * fine, it should all survive and (new) top doesn't belong to us.
+        */
+       if (!partial->key && *partial->p)
+               /* Writer: end */
+               goto no_top;
+       for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
+               ;
+       /*
+        * OK, we've found the last block that must survive. The rest of our
+        * branch should be detached before unlocking. However, if that rest
+        * of branch is all ours and does not grow immediately from the inode
+        * it's easier to cheat and just decrement partial->p.
+        */
+       if (p == chain + k - 1 && p > chain) {
+               p->p--;
+       } else {
+               *top = *p->p;
+               /* Nope, don't do this in ext3.  Must leave the tree intact */
+ #if 0
+               *p->p = 0;
+ #endif
+       }
+       /* Writer: end */
+
+       while(partial > p)
+       {
+               brelse(partial->bh);
+               partial--;
+       }
+ no_top:
+       return partial;
+ }
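+
+ /*
+  * Example (the common partial-truncate case): when only the tail of one
+  * indirect block is being truncated and earlier pointers in it survive,
+  * partial stays at the deepest level with live pointers to its left, so
+  * the p->p-- branch is taken: the caller's sweep of
+  * (partial->p + 1 .. end of block) then covers every slot that must go,
+  * and *top stays zero because the branch root itself survives.
+  */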
+
+ /*
+  * Zero a number of block pointers in either an inode or an indirect block.
+  * If we restart the transaction we must again get write access to the
+  * indirect block for further modification.
+  *
+  * We release `count' blocks on disk, but (last - first) may be greater
+  * than `count' because there can be holes in there.
+  */
+ static void
+ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
+               unsigned long block_to_free, unsigned long count,
+               u32 *first, u32 *last)
+ {
+       u32 *p;
+       kdev_t dev = inode->i_sb->s_dev;
+       unsigned long blocksize = inode->i_sb->s_blocksize;
+
+       if (try_to_extend_transaction(handle, inode)) {
+               if (bh) {
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       ext3_journal_dirty_metadata(handle, bh);
+               }
+               ext3_mark_inode_dirty(handle, inode);
+               ext3_journal_test_restart(handle, inode);
+               BUFFER_TRACE(bh, "get_write_access");
+               ext3_journal_get_write_access(handle, bh);
+       }
+
+       /*
+        * Any buffers which are on the journal will be in memory. We find
+        * them on the hash table so journal_revoke() will run journal_forget()
+        * on them.  We've already detached each block from the file, so
+        * bforget() in journal_forget() should be safe.
+        *
+        * AKPM: turn on bforget in journal_forget()!!!
+        */
+       for (p = first; p < last; p++) {
+               u32 nr = le32_to_cpu(*p);
+               if (nr) {
+                       struct buffer_head *bh;
+
+                       *p = 0;
+                       bh = get_hash_table(dev, nr, blocksize);
+                       ext3_forget(handle, 0, inode, bh, nr);
+               }
+       }
+
+       ext3_free_blocks(handle, inode, block_to_free, count);
+ }
+
+ /**
+  * ext3_free_data - free a list of data blocks
+  * @handle:   handle for this transaction
+  * @inode:    inode we are dealing with
+  * @this_bh:  indirect buffer_head which contains *@first and *@last
+  * @first:    array of block numbers
+  * @last:     points immediately past the end of array
+  *
+  * We are freeing all blocks referred to from that array (numbers are stored
+  * as little-endian 32-bit) and updating @inode->i_blocks appropriately.
+  *
+  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
+  * blocks are contiguous then releasing them at one time will only affect one
+  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+  * actually use a lot of journal space.
+  *
+  * @this_bh will be %NULL if @first and @last point into the inode's direct
+  * block pointers.
+  */
+ static void ext3_free_data(handle_t *handle, struct inode *inode,
+                          struct buffer_head *this_bh, u32 *first, u32 *last)
+ {
+       unsigned long block_to_free = 0;    /* Starting block # of a run */
+       unsigned long count = 0;            /* Number of blocks in the run */
+       u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
+                                              corresponding to
+                                              block_to_free */
+       unsigned long nr;                   /* Current block # */
+       u32 *p;                             /* Pointer into inode/ind
+                                              for current block */
+       int err;
+
+       if (this_bh) {                          /* For indirect block */
+               BUFFER_TRACE(this_bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, this_bh);
+               /* Important: if we can't update the indirect pointers
+                * to the blocks, we can't free them. */
+               if (err)
+                       return;
+       }
+
+       for (p = first; p < last; p++) {
+               nr = le32_to_cpu(*p);
+               if (nr) {
+                       /* accumulate blocks to free if they're contiguous */
+                       if (count == 0) {
+                               block_to_free = nr;
+                               block_to_free_p = p;
+                               count = 1;
+                       } else if (nr == block_to_free + count) {
+                               count++;
+                       } else {
+                               ext3_clear_blocks(handle, inode, this_bh,
+                                                 block_to_free,
+                                                 count, block_to_free_p, p);
+                               block_to_free = nr;
+                               block_to_free_p = p;
+                               count = 1;
+                       }
+               }
+       }
+
+       if (count > 0)
+               ext3_clear_blocks(handle, inode, this_bh, block_to_free,
+                                 count, block_to_free_p, p);
+
+       if (this_bh) {
+               BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
+               ext3_journal_dirty_metadata(handle, this_bh);
+       }
+ }
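+
+ /*
+  * Run coalescing example: for pointers {100, 101, 102, 0, 500} we make
+  * two calls to ext3_clear_blocks() - one for the run starting at 100
+  * with count == 3 (the hole is simply skipped, which is why
+  * last - first may exceed count) and one for the single block 500 at
+  * the end of the loop.
+  */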
+
+ /**
+  *    ext3_free_branches - free an array of branches
+  *    @handle: JBD handle for this transaction
+  *    @inode: inode we are dealing with
+  *    @parent_bh: the buffer_head which contains *@first and *@last
+  *    @first: array of block numbers
+  *    @last:  pointer immediately past the end of array
+  *    @depth: depth of the branches to free
+  *
+  *    We are freeing all blocks referred to from these branches (numbers are
+  *    stored as little-endian 32-bit) and updating @inode->i_blocks
+  *    appropriately.
+  */
+ static void ext3_free_branches(handle_t *handle, struct inode *inode,
+                              struct buffer_head *parent_bh,
+                              u32 *first, u32 *last, int depth)
+ {
+       unsigned long nr;
+       u32 *p;
+
+       if (is_handle_aborted(handle))
+               return;
+
+       if (depth--) {
+               struct buffer_head *bh;
+               int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+               p = last;
+               while (--p >= first) {
+                       nr = le32_to_cpu(*p);
+                       if (!nr)
+                               continue;               /* A hole */
+
+                       /* Go read the buffer for the next level down */
+                       bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize);
+
+                       /*
+                        * A read failure? Report error and clear slot
+                        * (should be rare).
+                        */
+                       if (!bh) {
+                               ext3_error(inode->i_sb, "ext3_free_branches",
+                                          "Read failure, inode=%ld, block=%ld",
+                                          inode->i_ino, nr);
+                               continue;
+                       }
+
+                       /* This zaps the entire block.  Bottom up. */
+                       BUFFER_TRACE(bh, "free child branches");
+                       ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
+                                          (u32*)bh->b_data + addr_per_block,
+                                          depth);
+
+                       /*
+                        * We've probably journalled the indirect block several
+                        * times during the truncate.  But it's no longer
+                        * needed and we now drop it from the transaction via
+                        * journal_revoke().
+                        *
+                        * That's easy if it's exclusively part of this
+                        * transaction.  But if it's part of the committing
+                        * transaction then journal_forget() will simply
+                        * brelse() it.  That means that if the underlying
+                        * block is reallocated in ext3_get_block(),
+                        * unmap_underlying_metadata() will find this block
+                        * and will try to get rid of it.  damn, damn.
+                        *
+                        * If this block has already been committed to the
+                        * journal, a revoke record will be written.  And
+                        * revoke records must be emitted *before* clearing
+                        * this block's bit in the bitmaps.
+                        */
+                       ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+
+                       /*
+                        * Everything below this pointer has been
+                        * released.  Now let this top-of-subtree go.
+                        *
+                        * We want the freeing of this indirect block to be
+                        * atomic in the journal with the updating of the
+                        * bitmap block which owns it.  So make some room in
+                        * the journal.
+                        *
+                        * We zero the parent pointer *after* freeing its
+                        * pointee in the bitmaps, so if extend_transaction()
+                        * for some reason fails to put the bitmap changes and
+                        * the release into the same transaction, recovery
+                        * will merely complain about releasing a free block,
+                        * rather than leaking blocks.
+                        */
+                       if (is_handle_aborted(handle))
+                               return;
+                       if (try_to_extend_transaction(handle, inode)) {
+                               ext3_mark_inode_dirty(handle, inode);
+                               ext3_journal_test_restart(handle, inode);
+                       }
+
+                       ext3_free_blocks(handle, inode, nr, 1);
+
+                       if (parent_bh) {
+                               /*
+                                * The block which we have just freed is
+                                * pointed to by an indirect block: journal it
+                                */
+                               BUFFER_TRACE(parent_bh, "get_write_access");
+                               if (!ext3_journal_get_write_access(handle,
+                                                                  parent_bh)){
+                                       *p = 0;
+                                       BUFFER_TRACE(parent_bh,
+                                       "call ext3_journal_dirty_metadata");
+                                       ext3_journal_dirty_metadata(handle,
+                                                                   parent_bh);
+                               }
+                       }
+               }
+       } else {
+               /* We have reached the bottom of the tree. */
+               BUFFER_TRACE(parent_bh, "free data blocks");
+               ext3_free_data(handle, inode, parent_bh, first, last);
+       }
+ }
+
+ /*
+  * ext3_truncate()
+  *
+  * We block out ext3_get_block() block instantiations across the entire
+  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
+  * simultaneously on behalf of the same inode.
+  *
+  * As we work through the truncate and commit bits of it to the journal there
+  * is one core, guiding principle: the file's tree must always be consistent on
+  * disk.  We must be able to restart the truncate after a crash.
+  *
+  * The file's tree may be transiently inconsistent in memory (although it
+  * probably isn't), but whenever we close off and commit a journal transaction,
+  * the contents of (the filesystem + the journal) must be consistent and
+  * restartable.  It's pretty simple, really: bottom up, right to left (although
+  * left-to-right works OK too).
+  *
+  * Note that at recovery time, journal replay occurs *before* the restart of
+  * truncate against the orphan inode list.
+  *
+  * The committed inode has the new, desired i_size (which is the same as
+  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
+  * that this inode's truncate did not complete and it will again call
+  * ext3_truncate() to have another go.  So there will be instantiated blocks
+  * to the right of the truncation point in a crashed ext3 filesystem.  But
+  * that's fine - as long as they are linked from the inode, the post-crash
+  * ext3_truncate() run will find them and release them.
+  */
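+
+ /*
+  * For illustration (assuming a 1K block size, so 256 block numbers fit
+  * in one indirect block): file blocks 0-11 live in the inode's direct
+  * slots, blocks 12-267 hang off the indirect block, blocks 268-65803
+  * off the double indirect block, and everything above that off the
+  * triple indirect block.  ext3_block_to_path() maps last_block onto
+  * that layout, and the depth arguments (1, 2, 3) passed to
+  * ext3_free_branches() below name those three levels of indirection.
+  */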
+
+ void ext3_truncate(struct inode * inode)
+ {
+       handle_t *handle;
+       u32 *i_data = inode->u.ext3_i.i_data;
+       int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+       int offsets[4];
+       Indirect chain[4];
+       Indirect *partial;
+       u32 nr = 0;
+       int n;
+       long last_block;
+       unsigned blocksize;
+
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode))
+               return;
+
+       ext3_discard_prealloc(inode);
+
+       handle = start_transaction(inode);
+       if (IS_ERR(handle))
+               return;         /* AKPM: return what? */
+
+       blocksize = inode->i_sb->s_blocksize;
+       last_block = (inode->i_size + blocksize-1)
+                                       >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+
+       ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
+
+       n = ext3_block_to_path(inode, last_block, offsets);
+       if (n == 0)
+               goto out_stop;  /* error */
+
+       /*
+        * OK.  This truncate is going to happen.  We add the inode to the
+        * orphan list, so that if this truncate spans multiple transactions,
+        * and we crash, we will resume the truncate when the filesystem
+        * recovers.  It also marks the inode dirty, to catch the new size.
+        *
+        * Implication: the file must always be in a sane, consistent
+        * truncatable state while each transaction commits.
+        */
+       if (ext3_orphan_add(handle, inode))
+               goto out_stop;
+
+       /*
+        * The orphan list entry will now protect us from any crash which
+        * occurs before the truncate completes, so it is now safe to propagate
+        * the new, shorter inode size (held for now in i_size) into the
+        * on-disk inode. We do this via i_disksize, which is the value which
+        * ext3 *really* writes onto the disk inode.
+        */
+       inode->u.ext3_i.i_disksize = inode->i_size;
+
+       /*
+        * From here we block out all ext3_get_block() callers who want to
+        * modify the block allocation tree.
+        */
+       down_write(&inode->u.ext3_i.truncate_sem);
+
+       if (n == 1) {           /* direct blocks */
+               ext3_free_data(handle, inode, NULL, i_data+offsets[0],
+                              i_data + EXT3_NDIR_BLOCKS);
+               goto do_indirects;
+       }
+
+       partial = ext3_find_shared(inode, n, offsets, chain, &nr);
+       /* Kill the top of shared branch (not detached) */
+       if (nr) {
+               if (partial == chain) {
+                       /* Shared branch grows from the inode */
+                       ext3_free_branches(handle, inode, NULL,
+                                          &nr, &nr+1, (chain+n-1) - partial);
+                       *partial->p = 0;
+                       /*
+                        * We mark the inode dirty prior to restart,
+                        * and prior to stop.  No need for it here.
+                        */
+               } else {
+                       /* Shared branch grows from an indirect block */
+                       BUFFER_TRACE(partial->bh, "get_write_access");
+                       ext3_free_branches(handle, inode, partial->bh,
+                                       partial->p,
+                                       partial->p+1, (chain+n-1) - partial);
+               }
+       }
+       /* Clear the ends of indirect blocks on the shared branch */
+       while (partial > chain) {
+               ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
+                                  (u32*)partial->bh->b_data + addr_per_block,
+                                  (chain+n-1) - partial);
+               BUFFER_TRACE(partial->bh, "call brelse");
+               brelse (partial->bh);
+               partial--;
+       }
+ do_indirects:
+       /* Kill the remaining (whole) subtrees */
+       switch (offsets[0]) {
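+               /*
+                * Note: these cases fall through deliberately.  If the
+                * truncation point lies among the direct blocks, all
+                * three whole indirect subtrees must be freed; if it
+                * lies under the single indirect tree, only the double
+                * and triple indirect trees remain; and so on.
+                */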
+               default:
+                       nr = i_data[EXT3_IND_BLOCK];
+                       if (nr) {
+                               ext3_free_branches(handle, inode, NULL,
+                                                  &nr, &nr+1, 1);
+                               i_data[EXT3_IND_BLOCK] = 0;
+                       }
+               case EXT3_IND_BLOCK:
+                       nr = i_data[EXT3_DIND_BLOCK];
+                       if (nr) {
+                               ext3_free_branches(handle, inode, NULL,
+                                                  &nr, &nr+1, 2);
+                               i_data[EXT3_DIND_BLOCK] = 0;
+                       }
+               case EXT3_DIND_BLOCK:
+                       nr = i_data[EXT3_TIND_BLOCK];
+                       if (nr) {
+                               ext3_free_branches(handle, inode, NULL,
+                                                  &nr, &nr+1, 3);
+                               i_data[EXT3_TIND_BLOCK] = 0;
+                       }
+               case EXT3_TIND_BLOCK:
+                       ;
+       }
+       up_write(&inode->u.ext3_i.truncate_sem);
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+
+       /* In a multi-transaction truncate, we only make the final
+        * transaction synchronous */
+       if (IS_SYNC(inode))
+               handle->h_sync = 1;
+ out_stop:
+       /*
+        * If this was a simple ftruncate(), and the file will remain alive
+        * then we need to clear up the orphan record which we created above.
+        * However, if this was a real unlink then we were called by
+        * ext3_delete_inode(), and we allow that function to clean up the
+        * orphan info for us.
+        */
+       if (inode->i_nlink)
+               ext3_orphan_del(handle, inode);
+
+       ext3_journal_stop(handle, inode);
+ }
+
+ /*
+  * ext3_get_inode_loc returns with an extra refcount against the
+  * inode's underlying buffer_head on success.
+  */
+
+ int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
+ {
+       struct buffer_head *bh = 0;
+       unsigned long block;
+       unsigned long block_group;
+       unsigned long group_desc;
+       unsigned long desc;
+       unsigned long offset;
+       struct ext3_group_desc * gdp;
+
+       if ((inode->i_ino != EXT3_ROOT_INO &&
+               inode->i_ino != EXT3_ACL_IDX_INO &&
+               inode->i_ino != EXT3_ACL_DATA_INO &&
+               inode->i_ino != EXT3_JOURNAL_INO &&
+               inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+               inode->i_ino > le32_to_cpu(
+                       inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "bad inode number: %lu", inode->i_ino);
+               goto bad_inode;
+       }
+       block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
+       if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "group >= groups count");
+               goto bad_inode;
+       }
+       group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
+       desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
+       bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
+       if (!bh) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "Descriptor not loaded");
+               goto bad_inode;
+       }
+
+       gdp = (struct ext3_group_desc *) bh->b_data;
+       /*
+        * Figure out the offset within the block group inode table
+        */
+       offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
+               EXT3_INODE_SIZE(inode->i_sb);
+       block = le32_to_cpu(gdp[desc].bg_inode_table) +
+               (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
+       if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "unable to read inode block - "
+                           "inode=%lu, block=%lu", inode->i_ino, block);
+               goto bad_inode;
+       }
+       offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
+
+       iloc->bh = bh;
+       iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
+       iloc->block_group = block_group;
+
+       return 0;
+
+  bad_inode:
+       return -EIO;
+ }
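+
+ /*
+  * A worked example of the arithmetic above, assuming a geometry of
+  * 1K blocks, 128-byte inodes and 2048 inodes per group: for inode
+  * 3000, block_group = 2999 / 2048 = 1 and offset = (2999 % 2048) * 128
+  * = 121728, so the inode lives 121728 >> 10 = 118 blocks into group
+  * 1's inode table, at byte offset 121728 & 1023 = 896 in that block.
+  */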
+
+ void ext3_read_inode(struct inode * inode)
+ {
+       struct ext3_iloc iloc;
+       struct ext3_inode *raw_inode;
+       struct buffer_head *bh;
+       int block;
+
+       if(ext3_get_inode_loc(inode, &iloc))
+               goto bad_inode;
+       bh = iloc.bh;
+       raw_inode = iloc.raw_inode;
+       init_rwsem(&inode->u.ext3_i.truncate_sem);
+       inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+       inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+       inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+       if(!(test_opt (inode->i_sb, NO_UID32))) {
+               inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+               inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+       }
+       inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+       inode->i_size = le32_to_cpu(raw_inode->i_size);
+       inode->i_atime = le32_to_cpu(raw_inode->i_atime);
+       inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
+       inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
+       inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
+       /* We now have enough fields to check if the inode was active or not.
+        * This is needed because nfsd might try to access dead inodes;
+        * the test is the same one that e2fsck uses.
+        * NeilBrown 1999oct15
+        */
+       if (inode->i_nlink == 0) {
+               if (inode->i_mode == 0 ||
+                   !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
+                       /* this inode is deleted */
+                       brelse (bh);
+                       goto bad_inode;
+               }
+               /* The only unlinked inodes we let through here have
+                * valid i_mode and are being read by the orphan
+                * recovery code: that's fine, we're about to complete
+                * the process of deleting those. */
+       }
+       inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
+                                        * (for stat), not the fs block
+                                        * size */
+       inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+       inode->i_version = ++event;
+       inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
+ #ifdef EXT3_FRAGMENTS
+       inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
+       inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
+       inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
+ #endif
+       inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+       if (!S_ISREG(inode->i_mode)) {
+               inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+       } else {
+               inode->i_size |=
+                       ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+       }
+       inode->u.ext3_i.i_disksize = inode->i_size;
+       inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+ #ifdef EXT3_PREALLOCATE
+       inode->u.ext3_i.i_prealloc_count = 0;
+ #endif
+       inode->u.ext3_i.i_block_group = iloc.block_group;
+
+       /*
+        * NOTE! The in-memory inode i_data array is in little-endian order
+        * even on big-endian machines: we do NOT byteswap the block numbers!
+        */
+       for (block = 0; block < EXT3_N_BLOCKS; block++)
+               inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+
+       brelse (iloc.bh);
+
+       if (inode->i_ino == EXT3_ACL_IDX_INO ||
+           inode->i_ino == EXT3_ACL_DATA_INO)
+               /* Nothing to do */ ;
+       else if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+       } else if (S_ISDIR(inode->i_mode)) {
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
+       } else if (S_ISLNK(inode->i_mode)) {
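+               /*
+                * Fast symlinks (i_blocks == 0) keep the target path in
+                * the inode's i_data area itself, so they need no
+                * address space operations; longer targets live in a
+                * data block and go through the page cache.
+                */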
+               if (!inode->i_blocks)
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+               else {
+                       inode->i_op = &page_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext3_aops;
+               }
+       } else
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(iloc.raw_inode->i_block[0]));
+       /* inode->i_attr_flags = 0;                             unused */
+       if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
+               inode->i_flags |= S_SYNC;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_APPEND;     unused */
+               inode->i_flags |= S_APPEND;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FILE_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE;  unused */
+               inode->i_flags |= S_IMMUTABLE_FILE;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_LINK_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_LINK; unused */
+               inode->i_flags |= S_IMMUTABLE_LINK;
+       }
+       if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_NOATIME;    unused */
+               inode->i_flags |= S_NOATIME;
+       }
+       return;
+
+ bad_inode:
+       make_bad_inode(inode);
+       return;
+ }
+
+ /*
+  * Post the struct inode info into an on-disk inode location in the
+  * buffer-cache.  This gobbles the caller's reference to the
+  * buffer_head in the inode location struct.
+  */
+
+ static int ext3_do_update_inode(handle_t *handle,
+                               struct inode *inode,
+                               struct ext3_iloc *iloc)
+ {
+       struct ext3_inode *raw_inode = iloc->raw_inode;
+       struct buffer_head *bh = iloc->bh;
+       int err = 0, rc, block;
+
+       if (handle) {
+               BUFFER_TRACE(bh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err)
+                       goto out_brelse;
+       }
+       raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+       if(!(test_opt(inode->i_sb, NO_UID32))) {
+               raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
+               raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ /*
+  * Fix up interoperability with old kernels. Otherwise, old inodes get
+  * re-used with the upper 16 bits of the uid/gid intact
+  */
+               if(!inode->u.ext3_i.i_dtime) {
+                       raw_inode->i_uid_high =
+                               cpu_to_le16(high_16_bits(inode->i_uid));
+                       raw_inode->i_gid_high =
+                               cpu_to_le16(high_16_bits(inode->i_gid));
+               } else {
+                       raw_inode->i_uid_high = 0;
+                       raw_inode->i_gid_high = 0;
+               }
+       } else {
+               raw_inode->i_uid_low =
+                       cpu_to_le16(fs_high2lowuid(inode->i_uid));
+               raw_inode->i_gid_low =
+                       cpu_to_le16(fs_high2lowgid(inode->i_gid));
+               raw_inode->i_uid_high = 0;
+               raw_inode->i_gid_high = 0;
+       }
+       raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+       raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
+       raw_inode->i_atime = cpu_to_le32(inode->i_atime);
+       raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
+       raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
+       raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+       raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
+       raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
+ #ifdef EXT3_FRAGMENTS
+       raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
+       raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
+       raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
+ #else
+       /* If we are not tracking these fields in the in-memory inode,
+        * then preserve them on disk, but still initialise them to zero
+        * for new inodes. */
+       if (inode->u.ext3_i.i_state & EXT3_STATE_NEW) {
+               raw_inode->i_faddr = 0;
+               raw_inode->i_frag = 0;
+               raw_inode->i_fsize = 0;
+       }
+ #endif
+       raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
+       if (!S_ISREG(inode->i_mode)) {
+               raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
+       } else {
+               raw_inode->i_size_high =
+                       cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
+               if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
+                       struct super_block *sb = inode->i_sb;
+                       if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
+                           EXT3_SB(sb)->s_es->s_rev_level ==
+                                       cpu_to_le32(EXT3_GOOD_OLD_REV)) {
+                              /* If this is the first large file
+                               * created, add a flag to the superblock.
+                               */
+                               err = ext3_journal_get_write_access(handle,
+                                               sb->u.ext3_sb.s_sbh);
+                               if (err)
+                                       goto out_brelse;
+                               ext3_update_dynamic_rev(sb);
+                               EXT3_SET_RO_COMPAT_FEATURE(sb,
+                                       EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
+                               sb->s_dirt = 1;
+                               handle->h_sync = 1;
+                               err = ext3_journal_dirty_metadata(handle,
+                                               sb->u.ext3_sb.s_sbh);
+                       }
+               }
+       }
+       raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+       if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+               raw_inode->i_block[0] =
+                       cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
+       else for (block = 0; block < EXT3_N_BLOCKS; block++)
+               raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
+
+       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+       rc = ext3_journal_dirty_metadata(handle, bh);
+       if (!err)
+               err = rc;
+       inode->u.ext3_i.i_state &= ~EXT3_STATE_NEW;
+
+ out_brelse:
+       brelse (bh);
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+
+ /*
+  * ext3_write_inode()
+  *
+  * We are called from a few places:
+  *
+  * - Within generic_file_write() for O_SYNC files.
+  *   Here, there will be no transaction running. We wait for any running
+  *   transaction to commit.
+  *
+  * - Within sys_sync(), kupdate and such.
+  *   We wait on commit, if told to.
+  *
+  * - Within prune_icache() (PF_MEMALLOC == true)
+  *   Here we simply return.  We can't afford to block kswapd on the
+  *   journal commit.
+  *
+  * In all cases it is actually safe for us to return without doing anything,
+  * because the inode has been copied into a raw inode buffer in
+  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
+  * knfsd.
+  *
+  * Note that we are absolutely dependent upon all inode dirtiers doing the
+  * right thing: they *must* call mark_inode_dirty() after dirtying info in
+  * which we are interested.
+  *
+  * It would be a bug for them to not do this.  The code:
+  *
+  *    mark_inode_dirty(inode)
+  *    stuff();
+  *    inode->i_size = expr;
+  *
+  * is in error because a kswapd-driven write_inode() could occur while
+  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
+  * will no longer be on the superblock's dirty inode list.
+  */
+ void ext3_write_inode(struct inode *inode, int wait)
+ {
+       if (current->flags & PF_MEMALLOC)
+               return;
+
+       if (ext3_journal_current_handle()) {
+               jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+               return;
+       }
+
+       if (!wait)
+               return;
+
+       ext3_force_commit(inode->i_sb);
+ }
+
+ /*
+  * ext3_setattr()
+  *
+  * Called from notify_change.
+  *
+  * We want to trap VFS attempts to truncate the file as soon as
+  * possible.  In particular, we want to make sure that when the VFS
+  * shrinks i_size, we put the inode on the orphan list and modify
+  * i_disksize immediately, so that during the subsequent flushing of
+  * dirty pages and freeing of disk blocks, we can guarantee that any
+  * commit will leave the blocks being flushed in an unused state on
+  * disk.  (On recovery, the inode will get truncated and the blocks will
+  * be freed, so we have a strong guarantee that no future commit will
+  * leave these blocks visible to the user.)
+  *
+  * This is only needed for regular files.  rmdir() has its own path, and
+  * we can never truncate a directory except on final unlink (at which
+  * point i_nlink is zero so recovery is easy.)
+  *
+  * Called with the BKL.
+  */
+
+ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
+ {
+       struct inode *inode = dentry->d_inode;
+       int error, rc;
+
+       error = inode_change_ok(inode, attr);
+       if (error)
+               return error;
+
+       if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+               handle_t *handle;
+
+               handle = ext3_journal_start(inode, 3);
+               if (IS_ERR(handle)) {
+                       error = PTR_ERR(handle);
+                       goto err_out;
+               }
+
+               error = ext3_orphan_add(handle, inode);
+               inode->u.ext3_i.i_disksize = attr->ia_size;
+               rc = ext3_mark_inode_dirty(handle, inode);
+               if (!error)
+                       error = rc;
+               ext3_journal_stop(handle, inode);
+       }
+
+       inode_setattr(inode, attr);
+
+       /* If inode_setattr's call to ext3_truncate failed to get a
+        * transaction handle at all, we need to clean up the in-core
+        * orphan list manually. */
+       if (inode->i_nlink)
+               ext3_orphan_del(NULL, inode);
+
+ err_out:
+       ext3_std_error(inode->i_sb, error);
+       return error;
+ }
+
+
+ /*
+  * akpm: how many blocks doth make a writepage()?
+  *
+  * With N blocks per page, it may be:
+  * N data blocks
+  * 2 indirect blocks
+  * 2 dindirect blocks
+  * 1 tindirect block
+  * N+5 bitmap blocks (from the above)
+  * N+5 group descriptor summary blocks
+  * 1 inode block
+  * 1 superblock.
+  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
+  *
+  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
+  *
+  * With ordered or writeback data it's the same, less the N data blocks.
+  *
+  * If the inode's direct blocks can hold an integral number of pages then a
+  * page cannot straddle two indirect blocks, and we can only touch one indirect
+  * and dindirect block, and the "5" above becomes "3".
+  *
+  * This still overestimates under most circumstances.  If we were to pass the
+  * start and end offsets in here as well we could do block_to_path() on each
+  * block and work out the exact number of indirects which are touched.  Pah.
+  */
+
+ int ext3_writepage_trans_blocks(struct inode *inode)
+ {
+       int bpp = ext3_journal_blocks_per_page(inode);
+       int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+       int ret;
+
+       if (ext3_should_journal_data(inode))
+               ret = 3 * (bpp + indirects) + 2;
+       else
+               ret = 2 * (bpp + indirects) + 2;
+
+ #ifdef CONFIG_QUOTA
+       ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
+ #endif
+
+       return ret;
+ }
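+
+ /*
+  * Plugging in some example numbers: on a 1K-block filesystem with 4K
+  * pages, bpp = 4 and EXT3_NDIR_BLOCKS (12) is a multiple of bpp, so
+  * indirects = 3.  Full data journaling then reserves 3 * (4 + 3) + 2 =
+  * 23 blocks per writepage, and ordered/writeback mode reserves
+  * 2 * (4 + 3) + 2 = 16, plus 2 * EXT3_SINGLEDATA_TRANS_BLOCKS when
+  * quota is configured in.
+  */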
+
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+                    struct inode *inode,
+                    struct ext3_iloc *iloc)
+ {
+       int err = 0;
+
+       if (handle) {
+               /* the do_update_inode consumes one bh->b_count */
+               atomic_inc(&iloc->bh->b_count);
+               err = ext3_do_update_inode(handle, inode, iloc);
+               /* ext3_do_update_inode() does journal_dirty_metadata */
+               brelse(iloc->bh);
+       } else {
+               printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n");
+       }
+       return err;
+ }
+
+ /*
+  * On success, we end up with an outstanding reference count against
+  * iloc->bh.  This _must_ be cleaned up later.
+  */
+
+ int
+ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+                        struct ext3_iloc *iloc)
+ {
+       int err = 0;
+       if (handle) {
+               err = ext3_get_inode_loc(inode, iloc);
+               if (!err) {
+                       BUFFER_TRACE(iloc->bh, "get_write_access");
+                       err = ext3_journal_get_write_access(handle, iloc->bh);
+                       if (err) {
+                               brelse(iloc->bh);
+                               iloc->bh = NULL;
+                       }
+               }
+       }
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+
+ /*
+  * akpm: What we do here is to mark the in-core inode as clean
+  * with respect to inode dirtiness (it may still be data-dirty).
+  * This means that the in-core inode may be reaped by prune_icache
+  * without having to perform any I/O.  This is a very good thing,
+  * because *any* task may call prune_icache - even ones which
+  * have a transaction open against a different journal.
+  *
+  * Is this cheating?  Not really.  Sure, we haven't written the
+  * inode out, but prune_icache isn't a user-visible syncing function.
+  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+  * we start and wait on commits.
+  *
+  * Is this efficient/effective?  Well, we're being nice to the system
+  * by cleaning up our inodes proactively so they can be reaped
+  * without I/O.  But we are potentially leaving up to five seconds'
+  * worth of inodes floating about which prune_icache wants us to
+  * write out.  One way to fix that would be to get prune_icache()
+  * to do a write_super() to free up some memory.  That would have the
+  * desired effect.
+  */
+ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
+ {
+       struct ext3_iloc iloc;
+       int err;
+
+       err = ext3_reserve_inode_write(handle, inode, &iloc);
+       if (!err)
+               err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+       return err;
+ }
+
+ /*
+  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
+  *
+  * We're really interested in the case where a file is being extended.
+  * i_size has been changed by generic_commit_write() and we thus need
+  * to include the updated inode in the current transaction.
+  *
+  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+  * are allocated to the file.
+  *
+  * If the inode is marked synchronous, we don't honour that here - doing
+  * so would cause a commit on atime updates, which we don't bother doing.
+  * We handle synchronous inodes at the highest possible level.
+  */
+ void ext3_dirty_inode(struct inode *inode)
+ {
+       handle_t *current_handle = ext3_journal_current_handle();
+       handle_t *handle;
+
+       lock_kernel();
+       handle = ext3_journal_start(inode, 1);
+       if (IS_ERR(handle))
+               goto out;
+       if (current_handle &&
+               current_handle->h_transaction != handle->h_transaction) {
+               /* This task has a transaction open against a different fs */
+               printk(KERN_EMERG __FUNCTION__": transactions do not match!\n");
+       } else {
+               jbd_debug(5, "marking dirty.  outer handle=%p\n",
+                               current_handle);
+               ext3_mark_inode_dirty(handle, inode);
+       }
+       ext3_journal_stop(handle, inode);
+ out:
+       unlock_kernel();
+ }
+
+ #ifdef AKPM
+ /*
+  * Bind an inode's backing buffer_head into this transaction, to prevent
+  * it from being flushed to disk early.  Unlike
+  * ext3_reserve_inode_write, this leaves behind no bh reference and
+  * returns no iloc structure, so the caller needs to repeat the iloc
+  * lookup to mark the inode dirty later.
+  */
+ static inline int
+ ext3_pin_inode(handle_t *handle, struct inode *inode)
+ {
+       struct ext3_iloc iloc;
+
+       int err = 0;
+       if (handle) {
+               err = ext3_get_inode_loc(inode, &iloc);
+               if (!err) {
+                       BUFFER_TRACE(iloc.bh, "get_write_access");
+                       err = journal_get_write_access(handle, iloc.bh);
+                       if (!err)
+                               err = ext3_journal_dirty_metadata(handle,
+                                                                 iloc.bh);
+                       brelse(iloc.bh);
+               }
+       }
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+ #endif
+
+ int ext3_change_inode_journal_flag(struct inode *inode, int val)
+ {
+       journal_t *journal;
+       handle_t *handle;
+       int err;
+
+       /*
+        * We have to be very careful here: changing a data block's
+        * journaling status dynamically is dangerous.  If we write a
+        * data block to the journal, change the status and then delete
+        * that block, we risk forgetting to revoke the old log record
+        * from the journal and so a subsequent replay can corrupt data.
+        * So, first we make sure that the journal is empty and that
+        * nobody is changing anything.
+        */
+
+       journal = EXT3_JOURNAL(inode);
+       if (is_journal_aborted(journal) || IS_RDONLY(inode))
+               return -EROFS;
+
+       journal_lock_updates(journal);
+       journal_flush(journal);
+
+       /*
+        * OK, there are no updates running now, and all cached data is
+        * synced to disk.  We are now in a completely consistent state
+        * which doesn't have anything in the journal, and we know that
+        * no filesystem updates are running, so it is safe to modify
+        * the inode's in-core data-journaling state flag now.
+        */
+
+       if (val)
+               inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
+       else
+               inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
+
+       journal_unlock_updates(journal);
+
+       /* Finally we can mark the inode as dirty. */
+
+       handle = ext3_journal_start(inode, 1);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       err = ext3_mark_inode_dirty(handle, inode);
+       handle->h_sync = 1;
+       ext3_journal_stop(handle, inode);
+       ext3_std_error(inode->i_sb, err);
+
+       return err;
+ }
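+
+ /*
+  * This is reached via EXT3_IOC_SETFLAGS (fs/ext3/ioctl.c) whenever
+  * EXT3_JOURNAL_DATA_FL changes; from user space that corresponds to
+  * toggling the 'j' attribute with a sufficiently recent chattr, e.g.
+  * "chattr +j file" or "chattr -j file".
+  */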
+
+
+ /*
+  * ext3_aops_journal_start().
+  *
+  * <This function died, but the comment lives on>
+  *
+  * We need to take the inode semaphore *outside* the
+  * journal_start/journal_stop.  Otherwise, a different task could do a
+  * wait_for_commit() while holding ->i_sem, which deadlocks.  The rule
+  * is: transaction open/closes are considered to be a locking operation
+  * and they nest *inside* ->i_sem.
+  * ----------------------------------------------------------------------------
+  * Possible problem:
+  *    ext3_file_write()
+  *    -> generic_file_write()
+  *       -> __alloc_pages()
+  *          -> page_launder()
+  *             -> ext3_writepage()
+  *
+  * And the writepage can be on a different fs while we have a
+  * transaction open against this one!  Bad.
+  *
+  * I tried making the task PF_MEMALLOC here, but that simply results in
+  * 0-order allocation failures passed back to generic_file_write().
+  * Instead, we rely on the reentrancy protection in ext3_writepage().
+  * ----------------------------------------------------------------------------
+  * When we do the journal_start() here we don't really need to reserve
+  * any blocks - we won't need any until we hit ext3_prepare_write(),
+  * which does all the needed journal extending.  However!  There is a
+  * problem with quotas:
+  *
+  * Thread 1:
+  * sys_sync
+  * ->sync_dquots
+  *   ->commit_dquot
+  *     ->lock_dquot
+  *     ->write_dquot
+  *       ->ext3_file_write
+  *         ->journal_start
+  *         ->ext3_prepare_write
+  *           ->journal_extend
+  *           ->journal_start
+  * Thread 2:
+  * ext3_create                (for example)
+  * ->ext3_new_inode
+  *   ->dquot_initialize
+  *     ->lock_dquot
+  *
+  * Deadlock.  Thread 1's journal_start blocks because thread 2 has a
+  * transaction open.  Thread 2's transaction will never close because
+  * thread 2 is stuck waiting for the dquot lock.
+  *
+  * So.  We must ensure that thread 1 *never* needs to extend the journal
+  * for quota writes.  We do that by reserving enough journal blocks
+  * here, in ext3_aops_journal_start(), to ensure that the forthcoming "see if we
+  * need to extend" test in ext3_prepare_write() succeeds.
+  */
+
+
+ MODULE_LICENSE("GPL");
diff -rc2P linux/fs/ext3/ioctl.c linux-2.4.13/fs/ext3/ioctl.c
*** linux/fs/ext3/ioctl.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/ioctl.c        Fri Nov  9 17:03:13 2001
***************
*** 0 ****
--- 1,176 ----
+ /*
+  * linux/fs/ext3/ioctl.c
+  *
+  * Copyright (C) 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/sched.h>
+ #include <asm/uaccess.h>
+
+
+ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+               unsigned long arg)
+ {
+       unsigned int flags;
+
+       ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+       switch (cmd) {
+       case EXT3_IOC_GETFLAGS:
+               flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
+               return put_user(flags, (int *) arg);
+       case EXT3_IOC_SETFLAGS: {
+               handle_t *handle = NULL;
+               int err;
+               struct ext3_iloc iloc;
+               unsigned int oldflags;
+               unsigned int jflag;
+
+               if (IS_RDONLY(inode))
+                       return -EROFS;
+
+               if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+                       return -EPERM;
+
+               if (get_user(flags, (int *) arg))
+                       return -EFAULT;
+
+               oldflags = inode->u.ext3_i.i_flags;
+
+               /* The JOURNAL_DATA flag is modifiable only by root */
+               jflag = flags & EXT3_JOURNAL_DATA_FL;
+
+               /*
+                * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+                * the relevant capability.
+                *
+                * This test looks nicer. Thanks to Pauline Middelink
+                */
+               if ((flags ^ oldflags) & (EXT3_APPEND_FL |
+                               EXT3_IMMUTABLE_FILE_FL |
+                               EXT3_IMMUTABLE_LINK_FL)) {
+                       if (!capable(CAP_LINUX_IMMUTABLE))
+                               return -EPERM;
+               }
+
+               /*
+                * The JOURNAL_DATA flag can only be changed by
+                * the relevant capability.
+                */
+               if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
+                       if (!capable(CAP_SYS_RESOURCE))
+                               return -EPERM;
+               }
+
+               handle = ext3_journal_start(inode, 1);
+               if (IS_ERR(handle))
+                       return PTR_ERR(handle);
+               if (IS_SYNC(inode))
+                       handle->h_sync = 1;
+               err = ext3_reserve_inode_write(handle, inode, &iloc);
+               if (err)
+                       goto flags_err;
+
+               flags = flags & EXT3_FL_USER_MODIFIABLE;
+               flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
+               inode->u.ext3_i.i_flags = flags;
+
+               if (flags & EXT3_SYNC_FL)
+                       inode->i_flags |= S_SYNC;
+               else
+                       inode->i_flags &= ~S_SYNC;
+               if (flags & EXT3_APPEND_FL)
+                       inode->i_flags |= S_APPEND;
+               else
+                       inode->i_flags &= ~S_APPEND;
+               if (flags & EXT3_IMMUTABLE_FILE_FL)
+                       inode->i_flags |= S_IMMUTABLE_FILE;
+               else
+                       inode->i_flags &= ~S_IMMUTABLE_FILE;
+               if (flags & EXT3_IMMUTABLE_LINK_FL)
+                       inode->i_flags |= S_IMMUTABLE_LINK;
+               else
+                       inode->i_flags &= ~S_IMMUTABLE_LINK;
+
+               if (flags & EXT3_NOATIME_FL)
+                       inode->i_flags |= S_NOATIME;
+               else
+                       inode->i_flags &= ~S_NOATIME;
+               inode->i_ctime = CURRENT_TIME;
+
+               err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ flags_err:
+               ext3_journal_stop(handle, inode);
+               if (err)
+                       return err;
+
+               if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
+                       err = ext3_change_inode_journal_flag(inode, jflag);
+               return err;
+       }
+       case EXT3_IOC_GETVERSION:
+       case EXT3_IOC_GETVERSION_OLD:
+               return put_user(inode->i_generation, (int *) arg);
+       case EXT3_IOC_SETVERSION:
+       case EXT3_IOC_SETVERSION_OLD: {
+               handle_t *handle;
+               struct ext3_iloc iloc;
+               __u32 generation;
+               int err;
+
+               if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+                       return -EPERM;
+               if (IS_RDONLY(inode))
+                       return -EROFS;
+               if (get_user(generation, (int *) arg))
+                       return -EFAULT;
+
+               handle = ext3_journal_start(inode, 1);
+               if (IS_ERR(handle))
+                       return PTR_ERR(handle);
+               err = ext3_reserve_inode_write(handle, inode, &iloc);
+               if (err == 0) {
+                       inode->i_ctime = CURRENT_TIME;
+                       inode->i_generation = generation;
+                       err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+               }
+               /* stop the handle even on error, or it would be leaked */
+               ext3_journal_stop(handle, inode);
+               return err;
+       }
+ #ifdef CONFIG_JBD_DEBUG
+       case EXT3_IOC_WAIT_FOR_READONLY:
+               /*
+                * This is racy - by the time we're woken up and running,
+                * the superblock could be released.  And the module could
+                * have been unloaded.  So sue me.
+                *
+                * Returns 1 if it slept, else zero.
+                */
+               {
+                       struct super_block *sb = inode->i_sb;
+                       DECLARE_WAITQUEUE(wait, current);
+                       int ret = 0;
+
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+                       if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) {
+                               schedule();
+                               ret = 1;
+                       }
+                       remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+                       return ret;
+               }
+ #endif
+       default:
+               return -ENOTTY;
+       }
+ }
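+
+ /*
+  * For illustration, a minimal user-space sketch of driving the flags
+  * ioctls above.  It assumes the ext2 ioctl numbers and flag names
+  * from <linux/ext2_fs.h>, which ext3 is believed to share:
+  *
+  *    int fd = open("/mnt/somefile", O_RDONLY);
+  *    int flags;
+  *
+  *    if (fd >= 0 && ioctl(fd, EXT2_IOC_GETFLAGS, &flags) == 0) {
+  *            flags |= EXT2_NOATIME_FL;
+  *            ioctl(fd, EXT2_IOC_SETFLAGS, &flags);
+  *    }
+  *
+  * Setting EXT2_NOATIME_FL this way ends up in the S_NOATIME case of
+  * EXT3_IOC_SETFLAGS above.
+  */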
diff -rc2P linux/fs/ext3/namei.c linux-2.4.13/fs/ext3/namei.c
*** linux/fs/ext3/namei.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/namei.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1125 ----
+ /*
+  *  linux/fs/ext3/namei.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/namei.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  *  Directory entry file type support and forward compatibility hooks
+  *    for B-tree directories by Theodore Ts'o ([email protected]), 1998
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+
+ /*
+  * define how far ahead to read directories while searching them.
+  */
+ #define NAMEI_RA_CHUNKS  2
+ #define NAMEI_RA_BLOCKS  4
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
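+
+ /*
+  * With the values above, ext3_find_entry() keeps up to NAMEI_RA_SIZE
+  * (2 * 4 = 8) directory blocks in flight in its readahead buffer.
+  */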
+
+ /*
+  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+  *
+  * `len <= EXT3_NAME_LEN' is guaranteed by caller.
+  * `de != NULL' is guaranteed by caller.
+  */
+ static inline int ext3_match (int len, const char * const name,
+                             struct ext3_dir_entry_2 * de)
+ {
+       if (len != de->name_len)
+               return 0;
+       if (!de->inode)
+               return 0;
+       return !memcmp(name, de->name, len);
+ }
+
+ /*
+  * Returns 0 if not found, -1 on failure, and 1 on success
+  */
+ static inline int search_dirblock(struct buffer_head * bh,
+                                 struct inode *dir,
+                                 struct dentry *dentry,
+                                 unsigned long offset,
+                                 struct ext3_dir_entry_2 ** res_dir)
+ {
+       struct ext3_dir_entry_2 * de;
+       char * dlimit;
+       int de_len;
+       const char *name = dentry->d_name.name;
+       int namelen = dentry->d_name.len;
+
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       dlimit = bh->b_data + dir->i_sb->s_blocksize;
+       while ((char *) de < dlimit) {
+               /* this code is executed quadratically often */
+               /* do minimal checking `by hand' */
+
+               if ((char *) de + namelen <= dlimit &&
+                   ext3_match (namelen, name, de)) {
+                       /* found a match - just to be sure, do a full check */
+                       if (!ext3_check_dir_entry("ext3_find_entry",
+                                                 dir, de, bh, offset))
+                               return -1;
+                       *res_dir = de;
+                       return 1;
+               }
+               /* prevent looping on a bad block */
+               de_len = le16_to_cpu(de->rec_len);
+               if (de_len <= 0)
+                       return -1;
+               offset += de_len;
+               de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
+       }
+       return 0;
+ }
+
+ /*
+  *    ext3_find_entry()
+  *
+  * finds an entry in the specified directory with the wanted name. It
+  * returns the cache buffer in which the entry was found, and the entry
+  * itself (as a parameter - res_dir). It does NOT read the inode of the
+  * entry - you'll have to do that yourself if you want to.
+  *
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+                                       struct ext3_dir_entry_2 ** res_dir)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh_use[NAMEI_RA_SIZE];
+       struct buffer_head * bh, *ret = NULL;
+       unsigned long start, block, b;
+       int ra_max = 0;         /* Number of bh's in the readahead
+                                  buffer, bh_use[] */
+       int ra_ptr = 0;         /* Current index into readahead
+                                  buffer */
+       int num = 0;
+       int nblocks, i, err;
+       struct inode *dir = dentry->d_parent->d_inode;
+
+       *res_dir = NULL;
+       sb = dir->i_sb;
+
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+       start = dir->u.ext3_i.i_dir_start_lookup;
+       if (start >= nblocks)
+               start = 0;
+       block = start;
+ restart:
+       do {
+               /*
+                * We deal with the read-ahead logic here.
+                */
+               if (ra_ptr >= ra_max) {
+                       /* Refill the readahead buffer */
+                       ra_ptr = 0;
+                       b = block;
+                       for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+                               /*
+                                * Terminate if we reach the end of the
+                                * directory and must wrap, or if our
+                                * search has finished at this block.
+                                */
+                               if (b >= nblocks || (num && block == start)) {
+                                       bh_use[ra_max] = NULL;
+                                       break;
+                               }
+                               num++;
+                               bh = ext3_getblk(NULL, dir, b++, 0, &err);
+                               bh_use[ra_max] = bh;
+                               if (bh)
+                                       ll_rw_block(READ, 1, &bh);
+                       }
+               }
+               if ((bh = bh_use[ra_ptr++]) == NULL)
+                       goto next;
+               wait_on_buffer(bh);
+               if (!buffer_uptodate(bh)) {
+                       /* read error, skip block & hope for the best */
+                       brelse(bh);
+                       goto next;
+               }
+               i = search_dirblock(bh, dir, dentry,
+                           block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+               if (i == 1) {
+                       dir->u.ext3_i.i_dir_start_lookup = block;
+                       ret = bh;
+                       goto cleanup_and_exit;
+               } else {
+                       brelse(bh);
+                       if (i < 0)
+                               goto cleanup_and_exit;
+               }
+       next:
+               if (++block >= nblocks)
+                       block = 0;
+       } while (block != start);
+
+       /*
+        * If the directory has grown while we were searching, then
+        * search the last part of the directory before giving up.
+        */
+       block = nblocks;
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+       if (block < nblocks) {
+               start = 0;
+               goto restart;
+       }
+
+ cleanup_and_exit:
+       /* Clean up the read-ahead blocks */
+       for (; ra_ptr < ra_max; ra_ptr++)
+               brelse (bh_use[ra_ptr]);
+       return ret;
+ }
+
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+       struct inode * inode;
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
+
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       bh = ext3_find_entry(dentry, &de);
+       inode = NULL;
+       if (bh) {
+               unsigned long ino = le32_to_cpu(de->inode);
+               brelse (bh);
+               inode = iget(dir->i_sb, ino);
+
+               if (!inode)
+                       return ERR_PTR(-EACCES);
+       }
+       d_add(dentry, inode);
+       return NULL;
+ }
+
+ #define S_SHIFT 12
+ static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
+       [S_IFREG >> S_SHIFT]    EXT3_FT_REG_FILE,
+       [S_IFDIR >> S_SHIFT]    EXT3_FT_DIR,
+       [S_IFCHR >> S_SHIFT]    EXT3_FT_CHRDEV,
+       [S_IFBLK >> S_SHIFT]    EXT3_FT_BLKDEV,
+       [S_IFIFO >> S_SHIFT]    EXT3_FT_FIFO,
+       [S_IFSOCK >> S_SHIFT]   EXT3_FT_SOCK,
+       [S_IFLNK >> S_SHIFT]    EXT3_FT_SYMLINK,
+ };
+
+ static inline void ext3_set_de_type(struct super_block *sb,
+                               struct ext3_dir_entry_2 *de,
+                               umode_t mode) {
+       if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
+               de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
+
+ /*
+  *    ext3_add_entry()
+  *
+  * adds a file entry to the specified directory, using the same
+  * semantics as ext3_find_entry(). It returns 0 on success, or a
+  * negative errno (for example -EEXIST or -ENOSPC) on failure.
+  *
+  * NOTE!! The inode part of 'de' is left at 0 - which means you
+  * may not sleep between calling this and putting something into
+  * the entry, as someone else might have used it while you slept.
+  */
+
+ /*
+  * AKPM: the journalling code here looks wrong on the error paths
+  */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+       struct inode *inode)
+ {
+       struct inode *dir = dentry->d_parent->d_inode;
+       const char *name = dentry->d_name.name;
+       int namelen = dentry->d_name.len;
+       unsigned long offset;
+       unsigned short rec_len;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de, * de1;
+       struct super_block * sb;
+       int     retval;
+
+       sb = dir->i_sb;
+
+       if (!namelen)
+               return -EINVAL;
+       bh = ext3_bread (handle, dir, 0, 0, &retval);
+       if (!bh)
+               return retval;
+       rec_len = EXT3_DIR_REC_LEN(namelen);
+       offset = 0;
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       while (1) {
+               if ((char *)de >= sb->s_blocksize + bh->b_data) {
+                       brelse (bh);
+                       bh = ext3_bread (handle, dir,
+                               offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+                       if (!bh)
+                               return retval;
+                       if (dir->i_size <= offset) {
+                               if (dir->i_size == 0) {
+                                       brelse(bh);
+                                       return -ENOENT;
+                               }
+
+                               ext3_debug ("creating next block\n");
+
+                               BUFFER_TRACE(bh, "get_write_access");
+                               ext3_journal_get_write_access(handle, bh);
+                               de = (struct ext3_dir_entry_2 *) bh->b_data;
+                               de->inode = 0;
+                               de->rec_len = cpu_to_le16(sb->s_blocksize);
+                               dir->u.ext3_i.i_disksize =
+                                       dir->i_size = offset + sb->s_blocksize;
+                               dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+                               ext3_mark_inode_dirty(handle, dir);
+                       } else {
+
+                               ext3_debug ("skipping to next block\n");
+
+                               de = (struct ext3_dir_entry_2 *) bh->b_data;
+                       }
+               }
+               if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+                                          offset)) {
+                       brelse (bh);
+                       return -ENOENT;
+               }
+               if (ext3_match (namelen, name, de)) {
+                               brelse (bh);
+                               return -EEXIST;
+               }
+               if ((le32_to_cpu(de->inode) == 0 &&
+                               le16_to_cpu(de->rec_len) >= rec_len) ||
+                   (le16_to_cpu(de->rec_len) >=
+                               EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+                       BUFFER_TRACE(bh, "get_write_access");
+                       ext3_journal_get_write_access(handle, bh);
+                       /* By now the buffer is marked for journaling */
+                       offset += le16_to_cpu(de->rec_len);
+                       if (le32_to_cpu(de->inode)) {
+                               de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+                                       EXT3_DIR_REC_LEN(de->name_len));
+                               de1->rec_len =
+                                       cpu_to_le16(le16_to_cpu(de->rec_len) -
+                                       EXT3_DIR_REC_LEN(de->name_len));
+                               de->rec_len = cpu_to_le16(
+                                               EXT3_DIR_REC_LEN(de->name_len));
+                               de = de1;
+                       }
+                       de->file_type = EXT3_FT_UNKNOWN;
+                       if (inode) {
+                               de->inode = cpu_to_le32(inode->i_ino);
+                               ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+                       } else
+                               de->inode = 0;
+                       de->name_len = namelen;
+                       memcpy (de->name, name, namelen);
+                       /*
+                        * XXX shouldn't update any times until successful
+                        * completion of syscall, but too many callers depend
+                        * on this.
+                        *
+                        * XXX similarly, too many callers depend on
+                        * ext3_new_inode() setting the times, but error
+                        * recovery deletes the inode, so the worst that can
+                        * happen is that the times are slightly out of date
+                        * and/or different from the directory change time.
+                        */
+                       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+                       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+                       ext3_mark_inode_dirty(handle, dir);
+                       dir->i_version = ++event;
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       ext3_journal_dirty_metadata(handle, bh);
+                       brelse(bh);
+                       return 0;
+               }
+               offset += le16_to_cpu(de->rec_len);
+               de = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       }
+       brelse (bh);
+       return -ENOSPC;
+ }
+
+ /*
+  * ext3_delete_entry deletes a directory entry by merging it with the
+  * previous entry
+  */
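+ /*
+  * Editorial sketch: the victim record is not erased; the previous
+  * record absorbs it by growing its rec_len, so directory scans step
+  * straight over the dead space:
+  *
+  *    before:    [ A rec_len=16 ][ B rec_len=16 ][ C ... ]
+  *    delete B:  [ A rec_len=32                 ][ C ... ]
+  *
+  * If the victim is the first record in the block there is no previous
+  * record to grow, so its inode field is zeroed instead (the pde ==
+  * NULL case below).
+  */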
+ static int ext3_delete_entry (handle_t *handle,
+                             struct inode * dir,
+                             struct ext3_dir_entry_2 * de_del,
+                             struct buffer_head * bh)
+ {
+       struct ext3_dir_entry_2 * de, * pde;
+       int i;
+
+       i = 0;
+       pde = NULL;
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       while (i < bh->b_size) {
+               if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
+                       return -EIO;
+               if (de == de_del)  {
+                       BUFFER_TRACE(bh, "get_write_access");
+                       ext3_journal_get_write_access(handle, bh);
+                       if (pde)
+                               pde->rec_len =
+                                       cpu_to_le16(le16_to_cpu(pde->rec_len) +
+                                                   le16_to_cpu(de->rec_len));
+                       else
+                               de->inode = 0;
+                       dir->i_version = ++event;
+                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+                       ext3_journal_dirty_metadata(handle, bh);
+                       return 0;
+               }
+               i += le16_to_cpu(de->rec_len);
+               pde = de;
+               de = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       }
+       return -ENOENT;
+ }
+
+ /*
+  * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
+  * do not perform it in these functions.  We perform it at the call site,
+  * if it is needed.
+  */
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+       inode->i_nlink++;
+ }
+
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+       inode->i_nlink--;
+ }
+
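+ /*
+  * Helper for create/mknod/symlink: add the new inode under the dentry
+  * and instantiate the dentry on success; on failure, drop the link
+  * count and iput() the inode so that it is deleted rather than leaked.
+  */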
+ static int ext3_add_nondir(handle_t *handle,
+               struct dentry *dentry, struct inode *inode)
+ {
+       int err = ext3_add_entry(handle, dentry, inode);
+       if (!err) {
+               d_instantiate(dentry, inode);
+               return 0;
+       }
+       ext3_dec_count(handle, inode);
+       iput(inode);
+       return err;
+ }
+
+ /*
+  * By the time this is called, we already have created
+  * the directory cache entry for the new file, but it
+  * is so far negative - it has no inode.
+  *
+  * If the create succeeds, we fill in the inode information
+  * with d_instantiate().
+  */
+ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode)
+ {
+       handle_t *handle;
+       struct inode * inode;
+       int err;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, mode);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               ext3_mark_inode_dirty(handle, inode);
+               err = ext3_add_nondir(handle, dentry, inode);
+       }
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
+
+ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
+                       int mode, int rdev)
+ {
+       handle_t *handle;
+       struct inode *inode;
+       int err;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, mode);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               init_special_inode(inode, mode, rdev);
+               ext3_mark_inode_dirty(handle, inode);
+               err = ext3_add_nondir(handle, dentry, inode);
+       }
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
+
+ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+ {
+       handle_t *handle;
+       struct inode * inode;
+       struct buffer_head * dir_block;
+       struct ext3_dir_entry_2 * de;
+       int err;
+
+       if (dir->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, S_IFDIR);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+       inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
+       inode->i_blocks = 0;
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+               inode->i_nlink--; /* is this nlink == 0? */
+               ext3_mark_inode_dirty(handle, inode);
+               iput (inode);
+               goto out_stop;
+       }
+       BUFFER_TRACE(dir_block, "get_write_access");
+       ext3_journal_get_write_access(handle, dir_block);
+       de = (struct ext3_dir_entry_2 *) dir_block->b_data;
+       de->inode = cpu_to_le32(inode->i_ino);
+       de->name_len = 1;
+       de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
+       strcpy (de->name, ".");
+       ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+       de = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       de->inode = cpu_to_le32(dir->i_ino);
+       de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
+       de->name_len = 2;
+       strcpy (de->name, "..");
+       ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+       inode->i_nlink = 2;
+       BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+       ext3_journal_dirty_metadata(handle, dir_block);
+       brelse (dir_block);
+       inode->i_mode = S_IFDIR | mode;
+       if (dir->i_mode & S_ISGID)
+               inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+       if (err)
+               goto out_no_entry;
+       dir->i_nlink++;
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+       d_instantiate(dentry, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+
+ out_no_entry:
+       inode->i_nlink = 0;
+       ext3_mark_inode_dirty(handle, inode);
+       iput (inode);
+       goto out_stop;
+ }
+
+ /*
+  * routine to check that the specified directory is empty (for rmdir)
+  */
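+ /*
+  * Editorial sketch: block 0 of a well-formed directory starts with
+  *
+  *    [ "." inode=self rec_len=12 ][ ".." inode=parent rec_len=rest ]
+  *
+  * (EXT3_DIR_REC_LEN(1) == 12, as laid out by ext3_mkdir() above), so
+  * empty_dir() verifies those two records and then scans the rest of
+  * the directory for any record with a nonzero inode field.
+  */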
+ static int empty_dir (struct inode * inode)
+ {
+       unsigned long offset;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de, * de1;
+       struct super_block * sb;
+       int err;
+
+       sb = inode->i_sb;
+       if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
+           !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
+               ext3_warning (inode->i_sb, "empty_dir",
+                             "bad directory (dir #%lu) - no data block",
+                             inode->i_ino);
+               return 1;
+       }
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+       de1 = (struct ext3_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       if (le32_to_cpu(de->inode) != inode->i_ino ||
+                       !le32_to_cpu(de1->inode) ||
+                       strcmp (".", de->name) ||
+                       strcmp ("..", de1->name)) {
+               ext3_warning (inode->i_sb, "empty_dir",
+                             "bad directory (dir #%lu) - no `.' or `..'",
+                             inode->i_ino);
+               brelse (bh);
+               return 1;
+       }
+       offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+       de = (struct ext3_dir_entry_2 *)
+                       ((char *) de1 + le16_to_cpu(de1->rec_len));
+       while (offset < inode->i_size) {
+               if (!bh ||
+                       (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+                       brelse (bh);
+                       bh = ext3_bread (NULL, inode,
+                               offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
+                       if (!bh) {
+ #if 0
+                               ext3_error (sb, "empty_dir",
+                               "directory #%lu contains a hole at offset %lu",
+                                       inode->i_ino, offset);
+ #endif
+                               offset += sb->s_blocksize;
+                               continue;
+                       }
+                       de = (struct ext3_dir_entry_2 *) bh->b_data;
+               }
+               if (!ext3_check_dir_entry ("empty_dir", inode, de, bh,
+                                          offset)) {
+                       brelse (bh);
+                       return 1;
+               }
+               if (le32_to_cpu(de->inode)) {
+                       brelse (bh);
+                       return 0;
+               }
+               offset += le16_to_cpu(de->rec_len);
+               de = (struct ext3_dir_entry_2 *)
+                               ((char *) de + le16_to_cpu(de->rec_len));
+       }
+       brelse (bh);
+       return 1;
+ }
+
+ /* ext3_orphan_add() links an unlinked or truncated inode into a list of
+  * such inodes, starting at the superblock, in case we crash before the
+  * file is closed/deleted, or in case the inode truncate spans multiple
+  * transactions and the last transaction is not recovered after a crash.
+  *
+  * At filesystem recovery time, we walk this list deleting unlinked
+  * inodes and truncating linked inodes in ext3_orphan_cleanup().
+  */
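+ /*
+  * Editorial sketch of the resulting on-disk chain:
+  *
+  *    es->s_last_orphan --> inode 14 --> inode 93 --> 0
+  *
+  * where each arrow is a NEXT_ORPHAN() field, i.e. an inode field that
+  * is unused while the inode is live, so the list costs no extra disk
+  * space.  New orphans are pushed onto the head of the chain.
+  */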
+ int ext3_orphan_add(handle_t *handle, struct inode *inode)
+ {
+       struct super_block *sb = inode->i_sb;
+       struct ext3_iloc iloc;
+       int err = 0, rc;
+
+       lock_super(sb);
+       if (!list_empty(&inode->u.ext3_i.i_orphan))
+               goto out_unlock;
+
+       /* Orphan handling is only valid for files with data blocks
+        * being truncated, or files being unlinked. */
+
+       /* @@@ FIXME: Observation from aviro:
+        * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
+        * here (on lock_super()), so race with ext3_link() which might bump
+        * ->i_nlink: say, a character device - not a regular file,
+        * not a directory, not a symlink - with ->i_nlink > 0.
+        */
+       J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+               S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+       err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       if (err)
+               goto out_unlock;
+
+       err = ext3_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto out_unlock;
+
+       /* Insert this inode at the head of the on-disk orphan list... */
+       NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
+       EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+       err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
+       if (!err)
+               err = rc;
+
+       /* Only add to the head of the in-memory list if all the
+        * previous operations succeeded.  If the orphan_add is going to
+        * fail (possibly taking the journal offline), we can't risk
+        * leaving the inode on the orphan list: stray orphan-list
+        * entries can cause panics at unmount time.
+        *
+        * This is safe: on error we're going to ignore the orphan list
+        * anyway on the next recovery. */
+       if (!err)
+               list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
+
+       jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+       jbd_debug(4, "orphan inode %ld will point to %d\n",
+                       inode->i_ino, NEXT_ORPHAN(inode));
+ out_unlock:
+       unlock_super(sb);
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+
+ /*
+  * ext3_orphan_del() removes an unlinked or truncated inode from the list
+  * of such inodes stored on disk, because it is finally being cleaned up.
+  */
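+ /*
+  * Editorial note: the on-disk chain is singly linked, so unlinking an
+  * element needs its predecessor.  The doubly-linked in-memory i_orphan
+  * list supplies it: prev is either the superblock list head (we were
+  * first, so s_last_orphan is repointed) or the previous orphan inode
+  * (whose NEXT_ORPHAN field is repointed).
+  */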
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+       struct list_head *prev;
+       struct ext3_sb_info *sbi;
+       ino_t ino_next;
+       struct ext3_iloc iloc;
+       int err = 0;
+
+       lock_super(inode->i_sb);
+       if (list_empty(&inode->u.ext3_i.i_orphan)) {
+               unlock_super(inode->i_sb);
+               return 0;
+       }
+
+       ino_next = NEXT_ORPHAN(inode);
+       prev = inode->u.ext3_i.i_orphan.prev;
+       sbi = EXT3_SB(inode->i_sb);
+
+       jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+
+       list_del(&inode->u.ext3_i.i_orphan);
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+
+       /* If we're on an error path, we may not have a valid
+        * transaction handle with which to update the orphan list on
+        * disk, but we still need to remove the inode from the linked
+        * list in memory. */
+       if (!handle)
+               goto out;
+
+       err = ext3_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto out_err;
+
+       if (prev == &sbi->s_orphan) {
+               jbd_debug(4, "superblock will point to %ld\n", ino_next);
+               BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+               err = ext3_journal_get_write_access(handle, sbi->s_sbh);
+               if (err)
+                       goto out_brelse;
+               sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+               err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+       } else {
+               struct ext3_iloc iloc2;
+               struct inode *i_prev =
+                       list_entry(prev, struct inode, u.ext3_i.i_orphan);
+
+               jbd_debug(4, "orphan inode %ld will point to %ld\n",
+                         i_prev->i_ino, ino_next);
+               err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
+               if (err)
+                       goto out_brelse;
+               NEXT_ORPHAN(i_prev) = ino_next;
+               err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
+       }
+       if (err)
+               goto out_brelse;
+       NEXT_ORPHAN(inode) = 0;
+       err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+       if (err)
+               goto out_brelse;
+
+ out_err:
+       ext3_std_error(inode->i_sb, err);
+ out:
+       unlock_super(inode->i_sb);
+       return err;
+
+ out_brelse:
+       brelse(iloc.bh);
+       goto out_err;
+ }
+
+ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
+ {
+       int retval;
+       struct inode * inode;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
+
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       retval = -ENOENT;
+       bh = ext3_find_entry (dentry, &de);
+       if (!bh)
+               goto end_rmdir;
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = dentry->d_inode;
+       DQUOT_INIT(inode);
+
+       retval = -EIO;
+       if (le32_to_cpu(de->inode) != inode->i_ino)
+               goto end_rmdir;
+
+       retval = -ENOTEMPTY;
+       if (!empty_dir (inode))
+               goto end_rmdir;
+
+       retval = ext3_delete_entry(handle, dir, de, bh);
+       if (retval)
+               goto end_rmdir;
+       if (inode->i_nlink != 2)
+               ext3_warning (inode->i_sb, "ext3_rmdir",
+                             "empty directory has nlink!=2 (%d)",
+                             inode->i_nlink);
+       inode->i_version = ++event;
+       inode->i_nlink = 0;
+       /* There's no need to set i_disksize: the fact that i_nlink is
+        * zero will ensure that the right thing happens during any
+        * recovery. */
+       inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+
+ end_rmdir:
+       ext3_journal_stop(handle, dir);
+       brelse (bh);
+       return retval;
+ }
+
+ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
+ {
+       int retval;
+       struct inode * inode;
+       struct buffer_head * bh;
+       struct ext3_dir_entry_2 * de;
+       handle_t *handle;
+
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       retval = -ENOENT;
+       bh = ext3_find_entry (dentry, &de);
+       if (!bh)
+               goto end_unlink;
+
+       inode = dentry->d_inode;
+       DQUOT_INIT(inode);
+
+       retval = -EIO;
+       if (le32_to_cpu(de->inode) != inode->i_ino)
+               goto end_unlink;
+
+       if (!inode->i_nlink) {
+               ext3_warning (inode->i_sb, "ext3_unlink",
+                             "Deleting nonexistent file (%lu), %d",
+                             inode->i_ino, inode->i_nlink);
+               inode->i_nlink = 1;
+       }
+       retval = ext3_delete_entry(handle, dir, de, bh);
+       if (retval)
+               goto end_unlink;
+       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+       inode->i_nlink--;
+       if (!inode->i_nlink)
+               ext3_orphan_add(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       inode->i_ctime = dir->i_ctime;
+       retval = 0;
+
+ end_unlink:
+       ext3_journal_stop(handle, dir);
+       brelse (bh);
+       return retval;
+ }
+
+ static int ext3_symlink (struct inode * dir,
+               struct dentry *dentry, const char * symname)
+ {
+       handle_t *handle;
+       struct inode * inode;
+       int l, err;
+
+       l = strlen(symname)+1;
+       if (l > dir->i_sb->s_blocksize)
+               return -ENAMETOOLONG;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+
+       if (l > sizeof (inode->u.ext3_i.i_data)) {
+               inode->i_op = &page_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               /*
+                * block_symlink() calls back into ext3_prepare/commit_write.
+                * We have a transaction open.  All is sweetness.  It also sets
+                * i_size in generic_commit_write().
+                */
+               err = block_symlink(inode, symname, l);
+               if (err)
+                       goto out_no_entry;
+       } else {
+               inode->i_op = &ext3_fast_symlink_inode_operations;
+               memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
+               inode->i_size = l-1;
+       }
+       inode->u.ext3_i.i_disksize = inode->i_size;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_nondir(handle, dentry, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+
+ out_no_entry:
+       ext3_dec_count(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       iput (inode);
+       goto out_stop;
+ }
+
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+       handle_t *handle;
+       struct inode *inode = old_dentry->d_inode;
+       int err;
+
+       if (S_ISDIR(inode->i_mode))
+               return -EPERM;
+
+       if (inode->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+
+       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+
+       inode->i_ctime = CURRENT_TIME;
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
+
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_nondir(handle, dentry, inode);
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
+
+ #define PARENT_INO(buffer) \
+       ((struct ext3_dir_entry_2 *) ((char *) buffer + \
+       le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode
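+ /*
+  * Editorial note: the first record in block 0 of a directory is always
+  * ".", so stepping over its rec_len lands on "..", whose inode field
+  * holds the parent directory's inode number.  The value is
+  * little-endian on disk: read it through le32_to_cpu() and store
+  * through cpu_to_le32(), as the rename code below does.
+  */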
+
+ /*
+  * Anybody can rename anything with this: the permission checks are left to the
+  * higher-level routines.
+  */
+ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
+                          struct inode * new_dir,struct dentry *new_dentry)
+ {
+       handle_t *handle;
+       struct inode * old_inode, * new_inode;
+       struct buffer_head * old_bh, * new_bh, * dir_bh;
+       struct ext3_dir_entry_2 * old_de, * new_de;
+       int retval;
+
+       old_bh = new_bh = dir_bh = NULL;
+
+       handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+               handle->h_sync = 1;
+
+       old_bh = ext3_find_entry (old_dentry, &old_de);
+       /*
+        *  The check on the inode number is _not_ there to guard against
+        *  possible IO errors.
+        *  We might rmdir the source, keep it as pwd of some process
+        *  and merrily kill the link to whatever was created under the
+        *  same name. Goodbye sticky bit ;-<
+        */
+       old_inode = old_dentry->d_inode;
+       retval = -ENOENT;
+       if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+               goto end_rename;
+
+       new_inode = new_dentry->d_inode;
+       new_bh = ext3_find_entry (new_dentry, &new_de);
+       if (new_bh) {
+               if (!new_inode) {
+                       brelse (new_bh);
+                       new_bh = NULL;
+               } else {
+                       DQUOT_INIT(new_inode);
+               }
+       }
+       if (S_ISDIR(old_inode->i_mode)) {
+               if (new_inode) {
+                       retval = -ENOTEMPTY;
+                       if (!empty_dir (new_inode))
+                               goto end_rename;
+               }
+               retval = -EIO;
+               dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
+               if (!dir_bh)
+                       goto end_rename;
+               if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+                       goto end_rename;
+               retval = -EMLINK;
+               if (!new_inode && new_dir!=old_dir &&
+                               new_dir->i_nlink >= EXT3_LINK_MAX)
+                       goto end_rename;
+       }
+       if (!new_bh) {
+               retval = ext3_add_entry (handle, new_dentry, old_inode);
+               if (retval)
+                       goto end_rename;
+       } else {
+               BUFFER_TRACE(new_bh, "get write access");
+               BUFFER_TRACE(new_bh, "get_write_access");
+               ext3_journal_get_write_access(handle, new_bh);
+               new_de->inode = cpu_to_le32(old_inode->i_ino);
+               if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+                                             EXT3_FEATURE_INCOMPAT_FILETYPE))
+                       new_de->file_type = old_de->file_type;
+               new_dir->i_version = ++event;
+               BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
+               ext3_journal_dirty_metadata(handle, new_bh);
+               brelse(new_bh);
+               new_bh = NULL;
+       }
+
+       /*
+        * Like most other Unix systems, set the ctime for inodes on a
+        * rename.
+        */
+       old_inode->i_ctime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, old_inode);
+
+       /*
+        * ok, that's it
+        */
+       ext3_delete_entry(handle, old_dir, old_de, old_bh);
+
+       if (new_inode) {
+               new_inode->i_nlink--;
+               new_inode->i_ctime = CURRENT_TIME;
+       }
+       old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+       old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       if (dir_bh) {
+               BUFFER_TRACE(dir_bh, "get_write_access");
+               ext3_journal_get_write_access(handle, dir_bh);
+               PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+               BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+               ext3_journal_dirty_metadata(handle, dir_bh);
+               old_dir->i_nlink--;
+               if (new_inode) {
+                       new_inode->i_nlink--;
+               } else {
+                       new_dir->i_nlink++;
+                       new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+                       ext3_mark_inode_dirty(handle, new_dir);
+               }
+       }
+       ext3_mark_inode_dirty(handle, old_dir);
+       if (new_inode) {
+               ext3_mark_inode_dirty(handle, new_inode);
+               if (!new_inode->i_nlink)
+                       ext3_orphan_add(handle, new_inode);
+       }
+       retval = 0;
+
+ end_rename:
+       brelse (dir_bh);
+       brelse (old_bh);
+       brelse (new_bh);
+       ext3_journal_stop(handle, old_dir);
+       return retval;
+ }
+
+ /*
+  * directories can handle most operations...
+  */
+ struct inode_operations ext3_dir_inode_operations = {
+       create:         ext3_create,            /* BKL held */
+       lookup:         ext3_lookup,            /* BKL held */
+       link:           ext3_link,              /* BKL held */
+       unlink:         ext3_unlink,            /* BKL held */
+       symlink:        ext3_symlink,           /* BKL held */
+       mkdir:          ext3_mkdir,             /* BKL held */
+       rmdir:          ext3_rmdir,             /* BKL held */
+       mknod:          ext3_mknod,             /* BKL held */
+       rename:         ext3_rename,            /* BKL held */
+ };
diff -rc2P linux/fs/ext3/super.c linux-2.4.13/fs/ext3/super.c
*** linux/fs/ext3/super.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/super.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1743 ----
+ /*
+  *  linux/fs/ext3/super.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/inode.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller ([email protected]), 1995
+  */
+
+ #include <linux/config.h>
+ #include <linux/module.h>
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+ #include <linux/blkdev.h>
+ #include <linux/smp_lock.h>
+ #include <asm/uaccess.h>
+
+ #ifdef CONFIG_JBD_DEBUG
+ static int ext3_ro_after; /* Make fs read-only after this many jiffies */
+ #endif
+
+ static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
+ static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
+                              int);
+ static void ext3_commit_super (struct super_block * sb,
+                              struct ext3_super_block * es,
+                              int sync);
+ static void ext3_mark_recovery_complete(struct super_block * sb,
+                                       struct ext3_super_block * es);
+ static void ext3_clear_journal_err(struct super_block * sb,
+                                  struct ext3_super_block * es);
+
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Debug code for turning filesystems "read-only" after a specified
+  * amount of time.  This is for crash/recovery testing.
+  */
+
+ static void make_rdonly(kdev_t dev, int *no_write)
+ {
+       if (dev) {
+               printk(KERN_WARNING "Turning device %s read-only\n",
+                      bdevname(dev));
+               *no_write = 0xdead0000 + dev;
+       }
+ }
+
+ static void turn_fs_readonly(unsigned long arg)
+ {
+       struct super_block *sb = (struct super_block *)arg;
+
+       make_rdonly(sb->s_dev, &journal_no_write[0]);
+       make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]);
+       wake_up(&EXT3_SB(sb)->ro_wait_queue);
+ }
+
+ static void setup_ro_after(struct super_block *sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       init_timer(&sbi->turn_ro_timer);
+       if (ext3_ro_after) {
+               printk(KERN_DEBUG "fs will go read-only in %d jiffies\n",
+                      ext3_ro_after);
+               init_waitqueue_head(&sbi->ro_wait_queue);
+               journal_no_write[0] = 0;
+               journal_no_write[1] = 0;
+               sbi->turn_ro_timer.function = turn_fs_readonly;
+               sbi->turn_ro_timer.data = (unsigned long)sb;
+               sbi->turn_ro_timer.expires = jiffies + ext3_ro_after;
+               ext3_ro_after = 0;
+               add_timer(&sbi->turn_ro_timer);
+       }
+ }
+
+ static void clear_ro_after(struct super_block *sb)
+ {
+       del_timer_sync(&EXT3_SB(sb)->turn_ro_timer);
+       journal_no_write[0] = 0;
+       journal_no_write[1] = 0;
+       ext3_ro_after = 0;
+ }
+ #else
+ #define setup_ro_after(sb)    do {} while (0)
+ #define clear_ro_after(sb)    do {} while (0)
+ #endif
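+ /*
+  * Usage sketch (editorial; the device name is illustrative): with
+  * CONFIG_JBD_DEBUG enabled, a mount such as
+  *
+  *    mount -t ext3 -o ro-after=3000 /dev/hda5 /mnt
+  *
+  * arms the timer so the device goes read-only roughly 3000 jiffies
+  * after mount, simulating a crash for recovery testing (see the
+  * "ro-after" case in parse_options() below).
+  */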
+
+
+ static char error_buf[1024];
+
+ /* Determine the appropriate response to ext3_error on a given filesystem */
+
+ static int ext3_error_behaviour(struct super_block *sb)
+ {
+       /* First check for mount-time options */
+       if (test_opt (sb, ERRORS_PANIC))
+               return EXT3_ERRORS_PANIC;
+       if (test_opt (sb, ERRORS_RO))
+               return EXT3_ERRORS_RO;
+       if (test_opt (sb, ERRORS_CONT))
+               return EXT3_ERRORS_CONTINUE;
+
+       /* If no overrides were specified on the mount, then fall back
+        * to the default behaviour set in the filesystem's superblock
+        * on disk. */
+       switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
+       case EXT3_ERRORS_PANIC:
+               return EXT3_ERRORS_PANIC;
+       case EXT3_ERRORS_RO:
+               return EXT3_ERRORS_RO;
+       default:
+               break;
+       }
+       return EXT3_ERRORS_CONTINUE;
+ }
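+ /*
+  * Example (editorial): ext3_error_behaviour() gives the mount option
+  * precedence over the on-disk default.  "mount -o errors=panic"
+  * overrides whatever default was stored in the superblock (e.g. by
+  * "tune2fs -e remount-ro"), and if neither specifies a behaviour we
+  * fall back to EXT3_ERRORS_CONTINUE.
+  */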
+
+ /* Deal with the reporting of failure conditions on a filesystem such as
+  * inconsistencies detected or read IO failures.
+  *
+  * On ext2, we can store the error state of the filesystem in the
+  * superblock.  That is not possible on ext3, because we may have other
+  * write ordering constraints on the superblock which prevent us from
+  * writing it out straight away; and given that the journal is about to
+  * be aborted, we can't rely on the current, or future, transactions to
+  * write out the superblock safely.
+  *
+  * We'll just use the journal_abort() error code to record an error in
+  * the journal instead.  On recovery, the journal will complain about
+  * that error until we've noted it down and cleared it.
+  */
+
+ static void ext3_handle_error(struct super_block *sb)
+ {
+       struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+       EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+       es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+
+       if (sb->s_flags & MS_RDONLY)
+               return;
+
+       if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) {
+               EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+               journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+       }
+
+       if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+               panic ("EXT3-fs (device %s): panic forced after error\n",
+                      bdevname(sb->s_dev));
+
+       if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) {
+               printk (KERN_CRIT "Remounting filesystem read-only\n");
+               sb->s_flags |= MS_RDONLY;
+       }
+
+       ext3_commit_super(sb, es, 1);
+ }
+
+ void ext3_error (struct super_block * sb, const char * function,
+                const char * fmt, ...)
+ {
+       va_list args;
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+
+       printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n",
+               bdevname(sb->s_dev), function, error_buf);
+
+       ext3_handle_error(sb);
+ }
+
+ const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16])
+ {
+       char *errstr = NULL;
+
+       switch (errno) {
+       case -EIO:
+               errstr = "IO failure";
+               break;
+       case -ENOMEM:
+               errstr = "Out of memory";
+               break;
+       case -EROFS:
+               if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
+                       errstr = "Journal has aborted";
+               else
+                       errstr = "Readonly filesystem";
+               break;
+       default:
+               /* If the caller passed in an extra buffer for unknown
+                * errors, textualise them now.  Else we just return
+                * NULL. */
+               if (nbuf) {
+                       /* Check for truncated error codes... */
+                       if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+                               errstr = nbuf;
+               }
+
+               break;
+       }
+
+       return errstr;
+ }
+
+ /* __ext3_std_error decodes expected errors from journaling functions
+  * automatically and invokes the appropriate error response.  */
+
+ void __ext3_std_error (struct super_block * sb, const char * function,
+                      int errno)
+ {
+       char nbuf[16];
+       const char *errstr = ext3_decode_error(sb, errno, nbuf);
+
+       printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
+               bdevname(sb->s_dev), function, errstr);
+
+       ext3_handle_error(sb);
+ }
+
+ /*
+  * ext3_abort is a much stronger failure handler than ext3_error.  The
+  * abort function may be used to deal with unrecoverable failures such
+  * as journal IO errors or ENOMEM at a critical moment in log management.
+  *
+  * We unconditionally force the filesystem into an ABORT|READONLY state,
+  * unless the error response on the fs has been set to panic in which
+  * case we take the easy way out and panic immediately.
+  */
+
+ void ext3_abort (struct super_block * sb, const char * function,
+                const char * fmt, ...)
+ {
+       va_list args;
+
+       printk (KERN_CRIT "ext3_abort called.\n");
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+
+       if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+               panic ("EXT3-fs panic (device %s): %s: %s\n",
+                      bdevname(sb->s_dev), function, error_buf);
+
+       printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n",
+               bdevname(sb->s_dev), function, error_buf);
+
+       if (sb->s_flags & MS_RDONLY)
+               return;
+
+       printk (KERN_CRIT "Remounting filesystem read-only\n");
+       sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+       sb->s_flags |= MS_RDONLY;
+       sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
+       journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+ }
+
+ /* Deal with the reporting of failure conditions while running, such as
+  * inconsistencies in operation or invalid system states.
+  *
+  * Use ext3_error() for cases of invalid filesystem states, as that will
+  * record an error on disk and force a filesystem check on the next boot.
+  */
+ NORET_TYPE void ext3_panic (struct super_block * sb, const char * function,
+                           const char * fmt, ...)
+ {
+       va_list args;
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+
+       /* this is to prevent panic from syncing this filesystem */
+       /* AKPM: is this sufficient? */
+       sb->s_flags |= MS_RDONLY;
+       panic ("EXT3-fs panic (device %s): %s: %s\n",
+              bdevname(sb->s_dev), function, error_buf);
+ }
+
+ void ext3_warning (struct super_block * sb, const char * function,
+                  const char * fmt, ...)
+ {
+       va_list args;
+
+       va_start (args, fmt);
+       vsprintf (error_buf, fmt, args);
+       va_end (args);
+       printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n",
+               bdevname(sb->s_dev), function, error_buf);
+ }
+
+ void ext3_update_dynamic_rev(struct super_block *sb)
+ {
+       struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+       if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
+               return;
+
+       ext3_warning(sb, __FUNCTION__,
+                    "updating to rev %d because of new feature flag, "
+                    "running e2fsck is recommended",
+                    EXT3_DYNAMIC_REV);
+
+       es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
+       es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
+       es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
+       /* leave es->s_feature_*compat flags alone */
+       /* es->s_uuid will be set by e2fsck if empty */
+
+       /*
+        * The rest of the superblock fields should be zero, and if not it
+        * means they are likely already in use, so leave them alone.  We
+        * can leave it up to e2fsck to clean up any inconsistencies there.
+        */
+ }
+
+ /*
+  * Open the external journal device
+  */
+ static struct block_device *ext3_blkdev_get(kdev_t dev)
+ {
+       struct block_device *bdev;
+       int err = -ENODEV;
+
+       bdev = bdget(kdev_t_to_nr(dev));
+       if (bdev == NULL)
+               goto fail;
+       err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
+       if (err < 0)
+               goto fail;
+       return bdev;
+
+ fail:
+       printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n",
+                       bdevname(dev), err);
+       return NULL;
+ }
+
+ /*
+  * Release the journal device
+  */
+ static int ext3_blkdev_put(struct block_device *bdev)
+ {
+       return blkdev_put(bdev, BDEV_FS);
+ }
+
+ static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+ {
+       struct block_device *bdev;
+       int ret = -ENODEV;
+
+       bdev = sbi->journal_bdev;
+       if (bdev) {
+               ret = ext3_blkdev_put(bdev);
+               sbi->journal_bdev = NULL;
+       }
+       return ret;
+ }
+
+ #define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan)
+
+ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
+ {
+       struct list_head *l;
+
+       printk(KERN_ERR "sb orphan head is %d\n",
+              le32_to_cpu(sbi->s_es->s_last_orphan));
+
+       printk(KERN_ERR "sb_info orphan list:\n");
+       list_for_each(l, &sbi->s_orphan) {
+               struct inode *inode = orphan_list_entry(l);
+               printk(KERN_ERR "  "
+                      "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n",
+                      inode->i_dev, inode->i_ino, inode,
+                      inode->i_mode, inode->i_nlink,
+                      le32_to_cpu(NEXT_ORPHAN(inode)));
+       }
+ }
+
+ void ext3_put_super (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       struct ext3_super_block *es = sbi->s_es;
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
+
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               es->s_state = cpu_to_le16(sbi->s_mount_state);
+               BUFFER_TRACE(sbi->s_sbh, "marking dirty");
+               mark_buffer_dirty(sbi->s_sbh);
+               ext3_commit_super(sb, es, 1);
+       }
+
+       for (i = 0; i < sbi->s_gdb_count; i++)
+               brelse(sbi->s_group_desc[i]);
+       kfree(sbi->s_group_desc);
+       for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
+               brelse(sbi->s_inode_bitmap[i]);
+       for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
+               brelse(sbi->s_block_bitmap[i]);
+       brelse(sbi->s_sbh);
+
+       /* Debugging code just in case the in-memory inode orphan list
+        * isn't empty.  The on-disk one can be non-empty if we've
+        * detected an error and taken the fs readonly, but the
+        * in-memory list had better be clean by this point. */
+       if (!list_empty(&sbi->s_orphan))
+               dump_orphan_list(sb, sbi);
+       J_ASSERT(list_empty(&sbi->s_orphan));
+
+       invalidate_buffers(sb->s_dev);
+       if (j_dev != sb->s_dev) {
+               /*
+                * Invalidate the journal device's buffers.  We don't want them
+                * floating about in memory - the physical journal device may
+                * be hotswapped, and stale buffers break the `ro-after'
+                * testing code.
+                */
+               fsync_no_super(j_dev);
+               invalidate_buffers(j_dev);
+               ext3_blkdev_remove(sbi);
+       }
+       clear_ro_after(sb);
+
+       return;
+ }
+
+ static struct super_operations ext3_sops = {
+       read_inode:     ext3_read_inode,        /* BKL held */
+       write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
+       dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
+       put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
+       delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
+       put_super:      ext3_put_super,         /* BKL held */
+       write_super:    ext3_write_super,       /* BKL held */
+       write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
+       unlockfs:       ext3_unlockfs,          /* BKL not held.  We take it */
+       statfs:         ext3_statfs,            /* BKL held */
+       remount_fs:     ext3_remount,           /* BKL held */
+ };
+
+ static int want_value(char *value, char *option)
+ {
+       if (!value || !*value) {
+               printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n",
+                      option);
+               return -1;
+       }
+       return 0;
+ }
+
+ static int want_null_value(char *value, char *option)
+ {
+       if (*value) {
+               printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n",
+                      option, value);
+               return -1;
+       }
+       return 0;
+ }
+
+ static int want_numeric(char *value, char *option, unsigned long *number)
+ {
+       if (want_value(value, option))
+               return -1;
+       *number = simple_strtoul(value, &value, 0);
+       if (want_null_value(value, option))
+               return -1;
+       return 0;
+ }
+
+ /*
+  * This function has been shamelessly adapted from the msdos fs
+  */
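+ /*
+  * Example (editorial): a typical option string handed down from
+  * mount(8) might be
+  *
+  *    errors=remount-ro,data=ordered,sb=8193
+  *
+  * i.e. comma-separated tokens, each optionally carrying an "=value"
+  * part, which the strtok()/strchr() calls below pick apart.
+  */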
+ static int parse_options (char * options, unsigned long * sb_block,
+                         struct ext3_sb_info *sbi,
+                         unsigned long * inum,
+                         int is_remount)
+ {
+       unsigned long *mount_options = &sbi->s_mount_opt;
+       uid_t *resuid = &sbi->s_resuid;
+       gid_t *resgid = &sbi->s_resgid;
+       char * this_char;
+       char * value;
+
+       if (!options)
+               return 1;
+       for (this_char = strtok (options, ",");
+            this_char != NULL;
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+                       set_opt (*mount_options, NO_UID32);
+               }
+               else if (!strcmp (this_char, "abort"))
+                       set_opt (*mount_options, ABORT);
+               else if (!strcmp (this_char, "check")) {
+                       if (!value || !*value || !strcmp (value, "none"))
+                               clear_opt (*mount_options, CHECK);
+                       else
+ #ifdef CONFIG_EXT3_CHECK
+                               set_opt (*mount_options, CHECK);
+ #else
+                               printk(KERN_ERR
+                                      "EXT3 Check option not supported\n");
+ #endif
+               }
+               else if (!strcmp (this_char, "debug"))
+                       set_opt (*mount_options, DEBUG);
+               else if (!strcmp (this_char, "errors")) {
+                       if (want_value(value, "errors"))
+                               return 0;
+                       if (!strcmp (value, "continue")) {
+                               clear_opt (*mount_options, ERRORS_RO);
+                               clear_opt (*mount_options, ERRORS_PANIC);
+                               set_opt (*mount_options, ERRORS_CONT);
+                       }
+                       else if (!strcmp (value, "remount-ro")) {
+                               clear_opt (*mount_options, ERRORS_CONT);
+                               clear_opt (*mount_options, ERRORS_PANIC);
+                               set_opt (*mount_options, ERRORS_RO);
+                       }
+                       else if (!strcmp (value, "panic")) {
+                               clear_opt (*mount_options, ERRORS_CONT);
+                               clear_opt (*mount_options, ERRORS_RO);
+                               set_opt (*mount_options, ERRORS_PANIC);
+                       }
+                       else {
+                               printk (KERN_ERR
+                                       "EXT3-fs: Invalid errors option: %s\n",
+                                       value);
+                               return 0;
+                       }
+               }
+               else if (!strcmp (this_char, "grpid") ||
+                        !strcmp (this_char, "bsdgroups"))
+                       set_opt (*mount_options, GRPID);
+               else if (!strcmp (this_char, "minixdf"))
+                       set_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nocheck"))
+                       clear_opt (*mount_options, CHECK);
+               else if (!strcmp (this_char, "nogrpid") ||
+                        !strcmp (this_char, "sysvgroups"))
+                       clear_opt (*mount_options, GRPID);
+               else if (!strcmp (this_char, "resgid")) {
+                       unsigned long v;
+                       if (want_numeric(value, "resgid", &v))
+                               return 0;
+                       *resgid = v;
+               }
+               else if (!strcmp (this_char, "resuid")) {
+                       unsigned long v;
+                       if (want_numeric(value, "resuid", &v))
+                               return 0;
+                       *resuid = v;
+               }
+               else if (!strcmp (this_char, "sb")) {
+                       if (want_numeric(value, "sb", sb_block))
+                               return 0;
+               }
+ #ifdef CONFIG_JBD_DEBUG
+               else if (!strcmp (this_char, "ro-after")) {
+                       unsigned long v;
+                       if (want_numeric(value, "ro-after", &v))
+                               return 0;
+                       ext3_ro_after = v;
+               }
+ #endif
+               /* Silently ignore the quota options */
+               else if (!strcmp (this_char, "grpquota")
+                        || !strcmp (this_char, "noquota")
+                        || !strcmp (this_char, "quota")
+                        || !strcmp (this_char, "usrquota"))
+                       /* Don't do anything ;-) */ ;
+               else if (!strcmp (this_char, "journal")) {
+                       /* @@@ FIXME */
+                       /* Eventually we will want to be able to create
+                            a journal file here.  For now, only allow the
+                            user to specify an existing inode to be the
+                            journal file. */
+                       if (is_remount) {
+                               printk(KERN_ERR "EXT3-fs: cannot specify "
+                                      "journal on remount\n");
+                               return 0;
+                       }
+
+                       if (want_value(value, "journal"))
+                               return 0;
+                       if (!strcmp (value, "update"))
+                               set_opt (*mount_options, UPDATE_JOURNAL);
+                       else if (want_numeric(value, "journal", inum))
+                               return 0;
+               }
+               else if (!strcmp (this_char, "noload"))
+                       set_opt (*mount_options, NOLOAD);
+               else if (!strcmp (this_char, "data")) {
+                       int data_opt = 0;
+
+                       if (want_value(value, "data"))
+                               return 0;
+                       if (!strcmp (value, "journal"))
+                               data_opt = EXT3_MOUNT_JOURNAL_DATA;
+                       else if (!strcmp (value, "ordered"))
+                               data_opt = EXT3_MOUNT_ORDERED_DATA;
+                       else if (!strcmp (value, "writeback"))
+                               data_opt = EXT3_MOUNT_WRITEBACK_DATA;
+                       else {
+                               printk (KERN_ERR
+                                       "EXT3-fs: Invalid data option: %s\n",
+                                       value);
+                               return 0;
+                       }
+                       if (is_remount) {
+                               if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) !=
+                                                       data_opt) {
+                                       printk(KERN_ERR
+                                              "EXT3-fs: cannot change data "
+                                              "mode on remount\n");
+                                       return 0;
+                               }
+                       } else {
+                               *mount_options &= ~EXT3_MOUNT_DATA_FLAGS;
+                               *mount_options |= data_opt;
+                       }
+               } else {
+                       printk (KERN_ERR
+                               "EXT3-fs: Unrecognized mount option %s\n",
+                               this_char);
+                       return 0;
+               }
+       }
+       return 1;
+ }
+
+ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
+                           int read_only)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       int res = 0;
+
+       if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
+               printk (KERN_ERR "EXT3-fs warning: revision level too high, "
+                       "forcing read-only mode\n");
+               res = MS_RDONLY;
+       }
+       if (read_only)
+               return res;
+       if (!(sbi->s_mount_state & EXT3_VALID_FS))
+               printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
+                       "running e2fsck is recommended\n");
+       else if ((sbi->s_mount_state & EXT3_ERROR_FS))
+               printk (KERN_WARNING
+                       "EXT3-fs warning: mounting fs with errors, "
+                       "running e2fsck is recommended\n");
+       else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+                le16_to_cpu(es->s_mnt_count) >=
+                (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+               printk (KERN_WARNING
+                       "EXT3-fs warning: maximal mount count reached, "
+                       "running e2fsck is recommended\n");
+       else if (le32_to_cpu(es->s_checkinterval) &&
+               (le32_to_cpu(es->s_lastcheck) +
+                       le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME))
+               printk (KERN_WARNING
+                       "EXT3-fs warning: checktime reached, "
+                       "running e2fsck is recommended\n");
+ #if 0
+               /* @@@ We _will_ want to clear the valid bit if we find
+                    inconsistencies, to force a fsck at reboot.  But for
+                    a plain journaled filesystem we can keep it set as
+                    valid forever! :) */
+       es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
+ #endif
+       if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+               es->s_max_mnt_count =
+                       (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
+       es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
+       es->s_mtime = cpu_to_le32(CURRENT_TIME);
+       ext3_update_dynamic_rev(sb);
+       EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+       ext3_commit_super (sb, es, 1);
+       if (test_opt (sb, DEBUG))
+               printk (KERN_INFO
+                       "[EXT3 FS %s, %s, bs=%lu, gc=%lu, "
+                       "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+                       EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize,
+                       sbi->s_groups_count,
+                       EXT3_BLOCKS_PER_GROUP(sb),
+                       EXT3_INODES_PER_GROUP(sb),
+                       sbi->s_mount_opt);
+       printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+                               bdevname(sb->s_dev));
+       if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+               printk("external journal on %s\n",
+                               bdevname(EXT3_SB(sb)->s_journal->j_dev));
+       } else {
+               printk("internal journal\n");
+       }
+ #ifdef CONFIG_EXT3_CHECK
+       if (test_opt (sb, CHECK)) {
+               ext3_check_blocks_bitmap (sb);
+               ext3_check_inodes_bitmap (sb);
+       }
+ #endif
+       setup_ro_after(sb);
+       return res;
+ }
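+
+ /*
+  * The mount count and check interval consulted above are tunable from
+  * user space with tune2fs (e2fsprogs), e.g. (illustrative):
+  *
+  *    tune2fs -c 25 /dev/hda1        force a check every 25 mounts
+  *    tune2fs -i 30d /dev/hda1       ... or at most every 30 days
+  */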
+
+ static int ext3_check_descriptors (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+       struct ext3_group_desc * gdp = NULL;
+       int desc_block = 0;
+       int i;
+
+       ext3_debug ("Checking group descriptors");
+
+       for (i = 0; i < sbi->s_groups_count; i++)
+       {
+               if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
+                       gdp = (struct ext3_group_desc *)
+                                       sbi->s_group_desc[desc_block++]->b_data;
+               if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
+                   le32_to_cpu(gdp->bg_block_bitmap) >=
+                               block + EXT3_BLOCKS_PER_GROUP(sb))
+               {
+                       ext3_error (sb, "ext3_check_descriptors",
+                                   "Block bitmap for group %d"
+                                   " not in group (block %lu)!",
+                                   i, (unsigned long)
+                                       le32_to_cpu(gdp->bg_block_bitmap));
+                       return 0;
+               }
+               if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
+                   le32_to_cpu(gdp->bg_inode_bitmap) >=
+                               block + EXT3_BLOCKS_PER_GROUP(sb))
+               {
+                       ext3_error (sb, "ext3_check_descriptors",
+                                   "Inode bitmap for group %d"
+                                   " not in group (block %lu)!",
+                                   i, (unsigned long)
+                                       le32_to_cpu(gdp->bg_inode_bitmap));
+                       return 0;
+               }
+               if (le32_to_cpu(gdp->bg_inode_table) < block ||
+                   le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
+                   block + EXT3_BLOCKS_PER_GROUP(sb))
+               {
+                       ext3_error (sb, "ext3_check_descriptors",
+                                   "Inode table for group %d"
+                                   " not in group (block %lu)!",
+                                   i, (unsigned long)
+                                       le32_to_cpu(gdp->bg_inode_table));
+                       return 0;
+               }
+               block += EXT3_BLOCKS_PER_GROUP(sb);
+               gdp++;
+       }
+       return 1;
+ }
+
+
+ /* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
+  * the superblock) which were deleted from all directories, but held open by
+  * a process at the time of a crash.  We walk the list and try to delete these
+  * inodes at recovery time (only with a read-write filesystem).
+  *
+  * In order to keep the orphan inode chain consistent during traversal (in
+  * case of crash during recovery), we link each inode into the superblock
+  * orphan list_head and handle it the same way as an inode deletion during
+  * normal operation (which journals the operations for us).
+  *
+  * We only do an iget() and an iput() on each inode, which is very safe if we
+  * accidentally point at an in-use or already deleted inode.  The worst that
+  * can happen in this case is that we get a "bit already cleared" message from
+  * ext3_free_inode().  The only reason we would point at a wrong inode is if
+  * e2fsck was run on this filesystem, and it must have already done the orphan
+  * inode cleanup for us, so we can safely abort without any further action.
+  */
+ static void ext3_orphan_cleanup (struct super_block * sb,
+                                struct ext3_super_block * es)
+ {
+       unsigned int s_flags = sb->s_flags;
+       int nr_orphans = 0, nr_truncates = 0;
+       if (!es->s_last_orphan) {
+               jbd_debug(4, "no orphan inodes to clean up\n");
+               return;
+       }
+
+       if (s_flags & MS_RDONLY) {
+               printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
+                      bdevname(sb->s_dev));
+               sb->s_flags &= ~MS_RDONLY;
+       }
+
+       if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) {
+               if (es->s_last_orphan)
+                       jbd_debug(1, "Errors on filesystem, "
+                                 "clearing orphan list.\n");
+               es->s_last_orphan = 0;
+               jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+               return;
+       }
+
+       while (es->s_last_orphan) {
+               struct inode *inode;
+
+               if (!(inode =
+                     ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+                       es->s_last_orphan = 0;
+                       break;
+               }
+
+               list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+               if (inode->i_nlink) {
+                       printk(KERN_DEBUG __FUNCTION__
+                               ": truncating inode %ld to %Ld bytes\n",
+                               inode->i_ino, inode->i_size);
+                       jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
+                                 inode->i_ino, inode->i_size);
+                       ext3_truncate(inode);
+                       nr_truncates++;
+               } else {
+                       printk(KERN_DEBUG __FUNCTION__
+                               ": deleting unreferenced inode %ld\n",
+                               inode->i_ino);
+                       jbd_debug(2, "deleting unreferenced inode %ld\n",
+                                 inode->i_ino);
+                       nr_orphans++;
+               }
+               iput(inode);  /* The delete magic happens here! */
+       }
+
+ #define PLURAL(x) (x), ((x)==1) ? "" : "s"
+
+       if (nr_orphans)
+               printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
+                      bdevname(sb->s_dev), PLURAL(nr_orphans));
+       if (nr_truncates)
+               printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
+                      bdevname(sb->s_dev), PLURAL(nr_truncates));
+       sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ }
+
+ #define log2(n) ffz(~(n))
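+ /* For a power-of-two n, ffz(~(n)) returns the index of the first zero bit
+  * of ~n, i.e. the position of n's (single) set bit: log2(n). */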
+
+ /*
+  * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+  * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
+  * We need to be 1 filesystem block less than the 2^32 sector limit.
+  */
+ static loff_t ext3_max_size(int bits)
+ {
+       loff_t res = EXT3_NDIR_BLOCKS;
+       res += 1LL << (bits-2);
+       res += 1LL << (2*(bits-2));
+       res += 1LL << (3*(bits-2));
+       res <<= bits;
+       if (res > (512LL << 32) - (1 << bits))
+               res = (512LL << 32) - (1 << bits);
+       return res;
+ }
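+
+ /*
+  * For example: with 4kB blocks (bits == 12) the indirect-block limit is
+  * (12 + 2^10 + 2^20 + 2^30) blocks, roughly 4TB, so the i_blocks sector
+  * limit dominates and the result is 2^41 - 4096 bytes, i.e. one block
+  * short of 2TB.
+  */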
+
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+                                     int silent)
+ {
+       struct buffer_head * bh;
+       struct ext3_super_block *es = 0;
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long sb_block = 1;
+       unsigned long logic_sb_block = 1;
+       unsigned long offset = 0;
+       unsigned long journal_inum = 0;
+       kdev_t dev = sb->s_dev;
+       int blocksize;
+       int hblock;
+       int db_count;
+       int i;
+       int needs_recovery;
+
+ #ifdef CONFIG_JBD_DEBUG
+       ext3_ro_after = 0;
+ #endif
+       /*
+        * Use the device's hardware sector size as the blocksize if it
+        * is larger than the default; otherwise use the default.  This
+        * matters for devices whose hardware sector size exceeds
+        * EXT3_MIN_BLOCK_SIZE.
+        */
+       blocksize = EXT3_MIN_BLOCK_SIZE;
+       hblock = get_hardsect_size(dev);
+       if (blocksize < hblock)
+               blocksize = hblock;
+
+       sbi->s_mount_opt = 0;
+       sbi->s_resuid = EXT3_DEF_RESUID;
+       sbi->s_resgid = EXT3_DEF_RESGID;
+       if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+               sb->s_dev = 0;
+               goto out_fail;
+       }
+
+       set_blocksize (dev, blocksize);
+
+       /*
+        * The ext3 superblock will not be buffer aligned for other than 1kB
+        * block sizes.  We need to calculate the offset from buffer start.
+        */
+       if (blocksize != EXT3_MIN_BLOCK_SIZE) {
+               logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+               offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+       }
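+       /*
+        * For instance, with a 4kB blocksize the primary superblock (1kB
+        * at byte offset 1024) lands in logical block 0 at offset 1024;
+        * with a 1kB blocksize it is logical block 1 at offset 0.
+        */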
+
+       if (!(bh = bread (dev, logic_sb_block, blocksize))) {
+               printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
+               goto out_fail;
+       }
+       /*
+        * Note: s_es must be initialized as soon as possible because
+        *       some ext3 macro-instructions depend on its value
+        */
+       es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+       sbi->s_es = es;
+       sb->s_magic = le16_to_cpu(es->s_magic);
+       if (sb->s_magic != EXT3_SUPER_MAGIC) {
+               if (!silent)
+                       printk(KERN_ERR
+                              "VFS: Can't find ext3 filesystem on dev %s.\n",
+                              bdevname(dev));
+               goto failed_mount;
+       }
+       if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
+           (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
+            EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+            EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+               printk(KERN_WARNING
+                      "EXT3-fs warning: feature flags set on rev 0 fs, "
+                      "running e2fsck is recommended\n");
+       /*
+        * Check feature flags regardless of the revision level, since we
+        * previously didn't change the revision level when setting the flags,
+        * so there is a chance incompat flags are set on a rev 0 filesystem.
+        */
+       if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) {
+               printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
+                      "unsupported optional features (%x).\n",
+                      bdevname(dev), i);
+               goto failed_mount;
+       }
+       if (!(sb->s_flags & MS_RDONLY) &&
+           (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){
+               printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
+                      "unsupported optional features (%x).\n",
+                      bdevname(dev), i);
+               goto failed_mount;
+       }
+       sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10;
+       sb->s_blocksize = 1 << sb->s_blocksize_bits;
+
+       if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE ||
+           sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) {
+               printk(KERN_ERR
+                      "EXT3-fs: Unsupported filesystem blocksize %lu on %s.\n",
+                      sb->s_blocksize, bdevname(dev));
+               goto failed_mount;
+       }
+
+       sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
+
+       if (sb->s_blocksize != blocksize) {
+               blocksize = sb->s_blocksize;
+
+               /*
+                * Make sure the blocksize for the filesystem is larger
+                * than the hardware sectorsize for the machine.
+                */
+               if (sb->s_blocksize < hblock) {
+                       printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
+                              "device blocksize %d.\n", blocksize, hblock);
+                       goto failed_mount;
+               }
+
+               brelse (bh);
+               set_blocksize (dev, sb->s_blocksize);
+               logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+               offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+               bh = bread (dev, logic_sb_block, blocksize);
+               if (!bh) {
+                       printk(KERN_ERR
+                              "EXT3-fs: Can't read superblock on 2nd try.\n");
+                       return NULL;
+               }
+               es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+               sbi->s_es = es;
+               if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) {
+                       printk (KERN_ERR
+                               "EXT3-fs: Magic mismatch, very weird !\n");
+                       goto failed_mount;
+               }
+       }
+
+       if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
+               sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
+               sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
+       } else {
+               sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+               sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+               if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) {
+                       printk (KERN_ERR
+                               "EXT3-fs: unsupported inode size: %d\n",
+                               sbi->s_inode_size);
+                       goto failed_mount;
+               }
+       }
+       sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
+                                  le32_to_cpu(es->s_log_frag_size);
+       if (blocksize != sbi->s_frag_size) {
+               printk(KERN_ERR
+                      "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
+                      sbi->s_frag_size, blocksize);
+               goto failed_mount;
+       }
+       sbi->s_frags_per_block = 1;
+       sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+       sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
+       sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+       sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
+       sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block;
+       sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
+       sbi->s_sbh = bh;
+       if (sbi->s_resuid == EXT3_DEF_RESUID)
+               sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
+       if (sbi->s_resgid == EXT3_DEF_RESGID)
+               sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+       sbi->s_mount_state = le16_to_cpu(es->s_state);
+       sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+       sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
+
+       if (sbi->s_blocks_per_group > blocksize * 8) {
+               printk (KERN_ERR
+                       "EXT3-fs: #blocks per group too big: %lu\n",
+                       sbi->s_blocks_per_group);
+               goto failed_mount;
+       }
+       if (sbi->s_frags_per_group > blocksize * 8) {
+               printk (KERN_ERR
+                       "EXT3-fs: #fragments per group too big: %lu\n",
+                       sbi->s_frags_per_group);
+               goto failed_mount;
+       }
+       if (sbi->s_inodes_per_group > blocksize * 8) {
+               printk (KERN_ERR
+                       "EXT3-fs: #inodes per group too big: %lu\n",
+                       sbi->s_inodes_per_group);
+               goto failed_mount;
+       }
+
+       sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+                              le32_to_cpu(es->s_first_data_block) +
+                              EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+                             EXT3_BLOCKS_PER_GROUP(sb);
+       db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
+                  EXT3_DESC_PER_BLOCK(sb);
+       sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+                                   GFP_KERNEL);
+       if (sbi->s_group_desc == NULL) {
+               printk (KERN_ERR "EXT3-fs: not enough memory\n");
+               goto failed_mount;
+       }
+       for (i = 0; i < db_count; i++) {
+               sbi->s_group_desc[i] = bread(dev, logic_sb_block + i + 1,
+                                            blocksize);
+               if (!sbi->s_group_desc[i]) {
+                       printk (KERN_ERR "EXT3-fs: "
+                               "can't read group descriptor %d\n", i);
+                       db_count = i;
+                       goto failed_mount2;
+               }
+       }
+       if (!ext3_check_descriptors (sb)) {
+               printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
+               goto failed_mount2;
+       }
+       for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) {
+               sbi->s_inode_bitmap_number[i] = 0;
+               sbi->s_inode_bitmap[i] = NULL;
+               sbi->s_block_bitmap_number[i] = 0;
+               sbi->s_block_bitmap[i] = NULL;
+       }
+       sbi->s_loaded_inode_bitmaps = 0;
+       sbi->s_loaded_block_bitmaps = 0;
+       sbi->s_gdb_count = db_count;
+       /*
+        * set up enough so that it can read an inode
+        */
+       sb->s_op = &ext3_sops;
+       INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+
+       sb->s_root = 0;
+
+       needs_recovery = (es->s_last_orphan != 0 ||
+                         EXT3_HAS_INCOMPAT_FEATURE(sb,
+                                   EXT3_FEATURE_INCOMPAT_RECOVER));
+
+       /*
+        * The first inode we look at is the journal inode.  Don't try
+        * root first: it may be modified in the journal!
+        */
+       if (!test_opt(sb, NOLOAD) &&
+           EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+               if (ext3_load_journal(sb, es))
+                       goto failed_mount2;
+       } else if (journal_inum) {
+               if (ext3_create_journal(sb, es, journal_inum))
+                       goto failed_mount2;
+       } else {
+               if (!silent)
+                       printk (KERN_ERR
+                               "ext3: No journal on filesystem on %s\n",
+                               bdevname(dev));
+               goto failed_mount2;
+       }
+
+       /* We have now updated the journal if required, so we can
+        * validate the data journaling mode. */
+       switch (test_opt(sb, DATA_FLAGS)) {
+       case 0:
+               /* No mode set, assume a default based on the journal
+                    capabilities: ORDERED_DATA if the journal can
+                    cope, else JOURNAL_DATA */
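+               /*
+                * Ordered and writeback modes let freed, journaled blocks
+                * be reused for unjournaled file data, so they rely on the
+                * journal's revoke records to keep replay from overwriting
+                * that data after a crash.
+                */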
+               if (journal_check_available_features
+                   (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
+                       set_opt(sbi->s_mount_opt, ORDERED_DATA);
+               else
+                       set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+               break;
+
+       case EXT3_MOUNT_ORDERED_DATA:
+       case EXT3_MOUNT_WRITEBACK_DATA:
+               if (!journal_check_available_features
+                   (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
+                       printk(KERN_ERR "EXT3-fs: Journal does not support "
+                              "requested data journaling mode\n");
+                       goto failed_mount3;
+               }
+       default:
+               break;
+       }
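+       /*
+        * The mode corresponds to the data=journal, data=ordered and
+        * data=writeback mount options, e.g. (illustrative):
+        *
+        *      mount -o data=writeback /dev/hda1 /mnt
+        *
+        * As parse_options() enforces above, the data mode cannot be
+        * changed on a remount.
+        */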
+
+       /*
+        * The journal_load will have done any necessary log recovery,
+        * so we can safely mount the rest of the filesystem now.
+        */
+
+       sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO));
+       if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) ||
+           !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) {
+               if (sb->s_root) {
+                       dput(sb->s_root);
+                       sb->s_root = NULL;
+                       printk(KERN_ERR
+                              "EXT3-fs: corrupt root inode, run e2fsck\n");
+               } else
+                       printk(KERN_ERR "EXT3-fs: get root inode failed\n");
+               goto failed_mount3;
+       }
+
+       ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+       /*
+        * akpm: core read_super() calls in here with the superblock locked.
+        * That deadlocks, because orphan cleanup needs to lock the superblock
+        * in numerous places.  Here we just pop the lock - it's relatively
+        * harmless, because we are now ready to accept write_super() requests,
+        * and aviro says that's the only reason for hanging onto the
+        * superblock lock.
+        */
+       EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
+       unlock_super(sb);       /* akpm: sigh */
+       ext3_orphan_cleanup(sb, es);
+       lock_super(sb);
+       EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
+       if (needs_recovery)
+               printk (KERN_INFO "EXT3-fs: recovery complete.\n");
+       ext3_mark_recovery_complete(sb, es);
+       printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
+               test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
+               test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+               "writeback");
+
+       return sb;
+
+ failed_mount3:
+       journal_destroy(sbi->s_journal);
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
+       kfree(sbi->s_group_desc);
+ failed_mount:
+       ext3_blkdev_remove(sbi);
+       brelse(bh);
+ out_fail:
+       return NULL;
+ }
+
+ static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
+ {
+       struct inode *journal_inode;
+       journal_t *journal;
+
+       /* First, test for the existence of a valid inode on disk.  Bad
+        * things happen if we iget() an unused inode, as the subsequent
+        * iput() will try to delete it. */
+
+       journal_inode = iget(sb, journal_inum);
+       if (!journal_inode) {
+               printk(KERN_ERR "EXT3-fs: no journal found.\n");
+               return NULL;
+       }
+       if (!journal_inode->i_nlink) {
+               make_bad_inode(journal_inode);
+               iput(journal_inode);
+               printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
+               return NULL;
+       }
+
+       jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+                 journal_inode, journal_inode->i_size);
+       if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
+               printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
+               iput(journal_inode);
+               return NULL;
+       }
+
+       journal = journal_init_inode(journal_inode);
+       if (!journal)
+               iput(journal_inode);
+       return journal;
+ }
+
+ static journal_t *ext3_get_dev_journal(struct super_block *sb,
+                                      int dev)
+ {
+       struct buffer_head * bh;
+       journal_t *journal;
+       int start;
+       int len;
+       int hblock, blocksize;
+       unsigned long sb_block;
+       unsigned long offset;
+       kdev_t journal_dev = to_kdev_t(dev);
+       struct ext3_super_block * es;
+       struct block_device *bdev;
+
+       bdev = ext3_blkdev_get(journal_dev);
+       if (bdev == NULL)
+               return NULL;
+
+       blocksize = sb->s_blocksize;
+       hblock = get_hardsect_size(journal_dev);
+       if (blocksize < hblock) {
+               printk(KERN_ERR
+                       "EXT3-fs: blocksize too small for journal device.\n");
+               goto out_bdev;
+       }
+
+       sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
+       offset = EXT3_MIN_BLOCK_SIZE % blocksize;
+       set_blocksize(dev, blocksize);
+       if (!(bh = bread(dev, sb_block, blocksize))) {
+               printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
+                      "external journal\n");
+               goto out_bdev;
+       }
+
+       es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+       if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
+           !(le32_to_cpu(es->s_feature_incompat) &
+             EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+               printk(KERN_ERR "EXT3-fs: external journal has "
+                                       "bad superblock\n");
+               brelse(bh);
+               goto out_bdev;
+       }
+
+       if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
+               printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
+               brelse(bh);
+               goto out_bdev;
+       }
+
+       len = le32_to_cpu(es->s_blocks_count);
+       start = sb_block + 1;
+       brelse(bh);     /* we're done with the superblock */
+
+       journal = journal_init_dev(journal_dev, sb->s_dev,
+                                       start, len, blocksize);
+       if (!journal) {
+               printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
+               goto out_bdev;
+       }
+       ll_rw_block(READ, 1, &journal->j_sb_buffer);
+       wait_on_buffer(journal->j_sb_buffer);
+       if (!buffer_uptodate(journal->j_sb_buffer)) {
+               printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
+               goto out_journal;
+       }
+       if (ntohl(journal->j_superblock->s_nr_users) != 1) {
+               printk(KERN_ERR "EXT3-fs: External journal has more than one "
+                                       "user (unsupported) - %d\n",
+                       ntohl(journal->j_superblock->s_nr_users));
+               goto out_journal;
+       }
+       EXT3_SB(sb)->journal_bdev = bdev;
+       return journal;
+ out_journal:
+       journal_destroy(journal);
+ out_bdev:
+       ext3_blkdev_put(bdev);
+       return NULL;
+ }
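+
+ /*
+  * An external journal device of this kind would typically be created with
+  * e2fsprogs 1.20 or later, e.g. (illustrative):
+  *
+  *    mke2fs -O journal_dev /dev/sdb1
+  *    tune2fs -J device=/dev/sdb1 /dev/hda1
+  */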
+
+ static int ext3_load_journal(struct super_block * sb,
+                            struct ext3_super_block * es)
+ {
+       journal_t *journal;
+       int journal_inum = le32_to_cpu(es->s_journal_inum);
+       int journal_dev = le32_to_cpu(es->s_journal_dev);
+       int err;
+       int really_read_only;
+
+       really_read_only = is_read_only(sb->s_dev);
+
+       /*
+        * Are we loading a blank journal or performing recovery after a
+        * crash?  For recovery, we need to check in advance whether we
+        * can get read-write access to the device.
+        */
+
+       if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
+               if (sb->s_flags & MS_RDONLY) {
+                       printk(KERN_INFO "EXT3-fs: INFO: recovery "
+                                       "required on readonly filesystem.\n");
+                       if (really_read_only) {
+                               printk(KERN_ERR "EXT3-fs: write access "
+                                       "unavailable, cannot proceed.\n");
+                               return -EROFS;
+                       }
+                       printk (KERN_INFO "EXT3-fs: write access will "
+                                       "be enabled during recovery.\n");
+               }
+       }
+
+       if (journal_inum && journal_dev) {
+               printk(KERN_ERR "EXT3-fs: filesystem has both a journal "
+                      "inode and a journal device!\n");
+               return -EINVAL;
+       }
+
+       if (journal_inum) {
+               if (!(journal = ext3_get_journal(sb, journal_inum)))
+                       return -EINVAL;
+       } else {
+               if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
+                       return -EINVAL;
+       }
+
+       if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
+               err = journal_update_format(journal);
+               if (err)  {
+                       printk(KERN_ERR "EXT3-fs: error updating journal.\n");
+                       journal_destroy(journal);
+                       return err;
+               }
+       }
+
+       if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
+               journal_wipe(journal, !really_read_only);
+
+       err = journal_load(journal);
+       if (err) {
+               printk(KERN_ERR "EXT3-fs: error loading journal.\n");
+               journal_destroy(journal);
+               return err;
+       }
+
+       EXT3_SB(sb)->s_journal = journal;
+       ext3_clear_journal_err(sb, es);
+       return 0;
+ }
+
+ static int ext3_create_journal(struct super_block * sb,
+                              struct ext3_super_block * es,
+                              int journal_inum)
+ {
+       journal_t *journal;
+
+       if (sb->s_flags & MS_RDONLY) {
+               printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
+                               "create journal.\n");
+               return -EROFS;
+       }
+
+       if (!(journal = ext3_get_journal(sb, journal_inum)))
+               return -EINVAL;
+
+       printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
+              journal_inum);
+
+       if (journal_create(journal)) {
+               printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+               journal_destroy(journal);
+               return -EIO;
+       }
+
+       EXT3_SB(sb)->s_journal = journal;
+
+       ext3_update_dynamic_rev(sb);
+       EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+       EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
+
+       es->s_journal_inum = cpu_to_le32(journal_inum);
+       sb->s_dirt = 1;
+
+       /* Make sure we flush the recovery flag to disk. */
+       ext3_commit_super(sb, es, 1);
+
+       return 0;
+ }
+
+ static void ext3_commit_super (struct super_block * sb,
+                              struct ext3_super_block * es,
+                              int sync)
+ {
+       es->s_wtime = cpu_to_le32(CURRENT_TIME);
+       BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty");
+       mark_buffer_dirty(sb->u.ext3_sb.s_sbh);
+       if (sync) {
+               ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
+               wait_on_buffer(sb->u.ext3_sb.s_sbh);
+       }
+ }
+
+
+ /*
+  * Have we just finished recovery?  If so, and if we are mounting (or
+  * remounting) the filesystem readonly, then we will end up with a
+  * consistent fs on disk.  Record that fact.
+  */
+ static void ext3_mark_recovery_complete(struct super_block * sb,
+                                       struct ext3_super_block * es)
+ {
+       journal_flush(EXT3_SB(sb)->s_journal);
+       if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
+           sb->s_flags & MS_RDONLY) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               sb->s_dirt = 0;
+               ext3_commit_super(sb, es, 1);
+       }
+ }
+
+ /*
+  * If we are mounting (or read-write remounting) a filesystem whose journal
+  * has recorded an error from a previous lifetime, move that error to the
+  * main filesystem now.
+  */
+ static void ext3_clear_journal_err(struct super_block * sb,
+                                  struct ext3_super_block * es)
+ {
+       journal_t *journal;
+       int j_errno;
+       const char *errstr;
+
+       journal = EXT3_SB(sb)->s_journal;
+
+       /*
+        * Now check for any error status which may have been recorded in the
+        * journal by a prior ext3_error() or ext3_abort()
+        */
+
+       j_errno = journal_errno(journal);
+       if (j_errno) {
+               char nbuf[16];
+
+               errstr = ext3_decode_error(sb, j_errno, nbuf);
+               ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
+                            "from previous mount: %s", errstr);
+               ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
+                            "filesystem check.");
+
+               sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+               es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+               ext3_commit_super (sb, es, 1);
+
+               journal_clear_err(journal);
+       }
+ }
+
+ /*
+  * Force the running and committing transactions to commit,
+  * and wait on the commit.
+  */
+ int ext3_force_commit(struct super_block *sb)
+ {
+       journal_t *journal;
+       int ret;
+
+       if (sb->s_flags & MS_RDONLY)
+               return 0;
+
+       journal = EXT3_SB(sb)->s_journal;
+       sb->s_dirt = 0;
+       lock_kernel();  /* important: lock down j_running_transaction */
+       ret = ext3_journal_force_commit(journal);
+       unlock_kernel();
+       return ret;
+ }
+
+ /*
+  * Ext3 always journals updates to the superblock itself, so we don't
+  * have to propagate any other updates to the superblock on disk at this
+  * point.  Just start an async writeback to get the buffers on their way
+  * to the disk.
+  *
+  * This implicitly triggers the writebehind on sync().
+  */
+
+ static int do_sync_supers = 0;
+ MODULE_PARM(do_sync_supers, "i");
+ MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously");
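+ /* e.g. "insmod ext3.o do_sync_supers=1" makes ext3_write_super() below wait
+  * for the commit to complete (illustrative module-parameter usage). */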
+
+ void ext3_write_super (struct super_block * sb)
+ {
+       tid_t target;
+
+       if (down_trylock(&sb->s_lock) == 0)
+               BUG();          /* aviro detector */
+       sb->s_dirt = 0;
+       target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+
+       if (do_sync_supers) {
+               unlock_super(sb);
+               log_wait_commit(EXT3_SB(sb)->s_journal, target);
+               lock_super(sb);
+       }
+ }
+
+ /*
+  * LVM calls this function before a (read-only) snapshot is created.  This
+  * gives us a chance to flush the journal completely and mark the fs clean.
+  */
+ void ext3_write_super_lockfs(struct super_block *sb)
+ {
+       sb->s_dirt = 0;
+
+       lock_kernel();          /* 2.4.5 forgot to do this for us */
+       if (!(sb->s_flags & MS_RDONLY)) {
+               journal_t *journal = EXT3_SB(sb)->s_journal;
+
+               /* Now we set up the journal barrier. */
+               journal_lock_updates(journal);
+               journal_flush(journal);
+
+               /* Journal blocked and flushed, clear needs_recovery flag. */
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+       }
+       unlock_kernel();
+ }
+
+ /*
+  * Called by LVM after the snapshot is done.  We need to set the RECOVER
+  * flag again here, even though the filesystem is not technically dirty yet.
+  */
+ void ext3_unlockfs(struct super_block *sb)
+ {
+       if (!(sb->s_flags & MS_RDONLY)) {
+               lock_kernel();
+               lock_super(sb);
+               /* Restore the needs_recovery flag before the fs is unlocked. */
+               EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+               ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+               unlock_super(sb);
+               journal_unlock_updates(EXT3_SB(sb)->s_journal);
+               unlock_kernel();
+       }
+ }
+
+ int ext3_remount (struct super_block * sb, int * flags, char * data)
+ {
+       struct ext3_super_block * es;
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+       unsigned long tmp;
+
+       clear_ro_after(sb);
+
+       /*
+        * Allow the "check" option to be passed as a remount option.
+        */
+       if (!parse_options(data, &tmp, sbi, &tmp, 1))
+               return -EINVAL;
+
+       if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+               ext3_abort(sb, __FUNCTION__, "Abort forced by user");
+
+       es = sbi->s_es;
+
+       if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+               if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+                       return -EROFS;
+
+               if (*flags & MS_RDONLY) {
+                       /*
+                        * First of all, the unconditional stuff we have to do
+                        * to disable replay of the journal when we next remount
+                        */
+                       sb->s_flags |= MS_RDONLY;
+
+                       /*
+                        * OK, test if we are remounting a valid rw partition
+                        * readonly, and if so set the rdonly flag and then
+                        * mark the partition as valid again.
+                        */
+                       if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
+                           (sbi->s_mount_state & EXT3_VALID_FS))
+                               es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+                       ext3_mark_recovery_complete(sb, es);
+               } else {
+                       int ret;
+                       if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
+                                       ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
+                               printk(KERN_WARNING "EXT3-fs: %s: couldn't "
+                                      "remount RDWR because of unsupported "
+                                      "optional features (%x).\n",
+                                      bdevname(sb->s_dev), ret);
+                               return -EROFS;
+                       }
+                       /*
+                        * Mounting a RDONLY partition read-write, so reread
+                        * and store the current valid flag.  (It may have
+                        * been changed by e2fsck since we originally mounted
+                        * the partition.)
+                        */
+                       ext3_clear_journal_err(sb, es);
+                       sbi->s_mount_state = le16_to_cpu(es->s_state);
+                       if (!ext3_setup_super (sb, es, 0))
+                               sb->s_flags &= ~MS_RDONLY;
+               }
+       }
+       setup_ro_after(sb);
+       return 0;
+ }
+
+ int ext3_statfs (struct super_block * sb, struct statfs * buf)
+ {
+       struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+       unsigned long overhead;
+       int i;
+
+       if (test_opt (sb, MINIX_DF))
+               overhead = 0;
+       else {
+               /*
+                * Compute the overhead (FS structures)
+                */
+
+               /*
+                * All of the blocks before first_data_block are
+                * overhead
+                */
+               overhead = le32_to_cpu(es->s_first_data_block);
+
+               /*
+                * Add the overhead attributed to the superblock and
+                * block group descriptors.  If the sparse superblocks
+                * feature is turned on, then not all groups have this.
+                */
+               for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
+                       overhead += ext3_bg_has_super(sb, i) +
+                               ext3_bg_num_gdb(sb, i);
+
+               /*
+                * Every block group has an inode bitmap, a block
+                * bitmap, and an inode table.
+                */
+               overhead += (EXT3_SB(sb)->s_groups_count *
+                            (2 + EXT3_SB(sb)->s_itb_per_group));
+       }
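+       /*
+        * Hypothetical example: with first_data_block == 1, 8 groups each
+        * carrying a superblock copy plus one group descriptor block, and
+        * s_itb_per_group == 214, overhead = 1 + 8*(1+1) + 8*(2+214)
+        * = 1745 blocks.
+        */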
+
+       buf->f_type = EXT3_SUPER_MAGIC;
+       buf->f_bsize = sb->s_blocksize;
+       buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
+       buf->f_bfree = ext3_count_free_blocks (sb);
+       buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
+       if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
+               buf->f_bavail = 0;
+       buf->f_files = le32_to_cpu(es->s_inodes_count);
+       buf->f_ffree = ext3_count_free_inodes (sb);
+       buf->f_namelen = EXT3_NAME_LEN;
+       return 0;
+ }
+
+ static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super);
+
+ static int __init init_ext3_fs(void)
+ {
+       return register_filesystem(&ext3_fs_type);
+ }
+
+ static void __exit exit_ext3_fs(void)
+ {
+       unregister_filesystem(&ext3_fs_type);
+ }
+
+ EXPORT_NO_SYMBOLS;
+
+ MODULE_LICENSE("GPL");
+ module_init(init_ext3_fs)
+ module_exit(exit_ext3_fs)
diff -rc2P linux/fs/ext3/symlink.c linux-2.4.13/fs/ext3/symlink.c
*** linux/fs/ext3/symlink.c     Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/symlink.c      Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,39 ----
+ /*
+  *  linux/fs/ext3/symlink.c
+  *
+  * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/symlink.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 symlink handling code
+  */
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+       char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
+       return vfs_readlink(dentry, buffer, buflen, s);
+ }
+
+ static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
+ {
+       char *s = (char *)dentry->d_inode->u.ext3_i.i_data;
+       return vfs_follow_link(nd, s);
+ }
+
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+       readlink:       ext3_readlink,          /* BKL not held.  Don't need */
+       follow_link:    ext3_follow_link,       /* BKL not held.  Don't need */
+ };
diff -rc2P linux/fs/inode.c linux-2.4.13/fs/inode.c
*** linux/fs/inode.c    Fri Sep 28 21:03:48 2001
--- linux-2.4.13/fs/inode.c     Fri Nov  9 16:57:59 2001
***************
*** 110,113 ****
--- 110,114 ----
               sema_init(&inode->i_sem, 1);
               sema_init(&inode->i_zombie, 1);
+               init_rwsem(&inode->i_truncate_sem);
               spin_lock_init(&inode->i_data.i_shared_lock);
       }
diff -rc2P linux/fs/jbd/Makefile linux-2.4.13/fs/jbd/Makefile
*** linux/fs/jbd/Makefile       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/Makefile        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,15 ----
+ #
+ # fs/jbd/Makefile
+ #
+ # Makefile for the linux journaling routines.
+ #
+
+ export-objs := journal.o
+ O_TARGET := jbd.o
+
+ obj-y   := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
+
+ obj-m   := $(O_TARGET)
+
+ include $(TOPDIR)/Rules.make
+
diff -rc2P linux/fs/jbd/checkpoint.c linux-2.4.13/fs/jbd/checkpoint.c
*** linux/fs/jbd/checkpoint.c   Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/checkpoint.c    Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,608 ----
+ /*
+  * linux/fs/jbd/checkpoint.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1999
+  *
+  * Copyright 1999 Red Hat Software --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Checkpoint routines for the generic filesystem journaling code.
+  * Part of the ext2fs journaling system.
+  *
+  * Checkpointing is the process of ensuring that a section of the log is
+  * committed fully to disk, so that that portion of the log can be
+  * reused.
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+
+ extern spinlock_t journal_datalist_lock;
+
+ /*
+  * Unlink a buffer from a transaction.
+  *
+  * Called with journal_datalist_lock held.
+  */
+
+ static inline void __buffer_unlink(struct journal_head *jh)
+ {
+       transaction_t *transaction;
+
+       transaction = jh->b_cp_transaction;
+       jh->b_cp_transaction = NULL;
+
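+       /* Splice jh out of the circular checkpoint list: the first test
+        * below advances the list head past jh, the second catches the
+        * case where jh was the only element (head becomes NULL). */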
+       jh->b_cpnext->b_cpprev = jh->b_cpprev;
+       jh->b_cpprev->b_cpnext = jh->b_cpnext;
+       if (transaction->t_checkpoint_list == jh)
+               transaction->t_checkpoint_list = jh->b_cpnext;
+       if (transaction->t_checkpoint_list == jh)
+               transaction->t_checkpoint_list = NULL;
+ }
+
+ /*
+  * Try to release a checkpointed buffer from its transaction.
+  * Returns 1 if we released it.
+  * Requires journal_datalist_lock
+  */
+ static int __try_to_free_cp_buf(struct journal_head *jh)
+ {
+       int ret = 0;
+       struct buffer_head *bh = jh2bh(jh);
+
+       if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+               JBUFFER_TRACE(jh, "remove from checkpoint list");
+               __journal_remove_checkpoint(jh);
+               __journal_remove_journal_head(bh);
+               BUFFER_TRACE(bh, "release");
+               /* BUF_LOCKED -> BUF_CLEAN (fwiw) */
+               refile_buffer(bh);
+               __brelse(bh);
+               ret = 1;
+       }
+       return ret;
+ }
+
+ /*
+  * log_wait_for_space: wait until there is space in the journal.
+  *
+  * Called with the journal already locked, but it will be unlocked if we have
+  * to wait for a checkpoint to free up some space in the log.
+  */
+
+ void log_wait_for_space(journal_t *journal, int nblocks)
+ {
+       while (log_space_left(journal) < nblocks) {
+               if (journal->j_flags & JFS_ABORT)
+                       return;
+               unlock_journal(journal);
+               down(&journal->j_checkpoint_sem);
+               lock_journal(journal);
+
+               /* Test again, another process may have checkpointed
+                * while we were waiting for the checkpoint lock */
+               if (log_space_left(journal) < nblocks) {
+                       log_do_checkpoint(journal, nblocks);
+               }
+               up(&journal->j_checkpoint_sem);
+       }
+ }
+
+ /*
+  * Clean up a transaction's checkpoint list.
+  *
+  * We wait for any pending IO to complete and make sure any clean
+  * buffers are removed from the transaction.
+  *
+  * Return 1 if we performed any actions which might have destroyed the
+  * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
+  * the last checkpoint buffer is cleansed)
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+ static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+ {
+       struct journal_head *jh, *next_jh, *last_jh;
+       struct buffer_head *bh;
+       int ret = 0;
+
+       assert_spin_locked(&journal_datalist_lock);
+       jh = transaction->t_checkpoint_list;
+       if (!jh)
+               return 0;
+
+       last_jh = jh->b_cpprev;
+       next_jh = jh;
+       do {
+               jh = next_jh;
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       atomic_inc(&bh->b_count);
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       /* the journal_head may have gone by now */
+                       BUFFER_TRACE(bh, "brelse");
+                       __brelse(bh);
+                       goto out_return_1;
+               }
+
+               if (jh->b_transaction != NULL) {
+                       transaction_t *transaction = jh->b_transaction;
+                       tid_t tid = transaction->t_tid;
+
+                       spin_unlock(&journal_datalist_lock);
+                       log_start_commit(journal, transaction);
+                       unlock_journal(journal);
+                       log_wait_commit(journal, tid);
+                       goto out_return_1;
+               }
+
+               /*
+                * We used to test for (jh->b_list != BUF_CLEAN) here.
+                * But unmap_underlying_metadata() can place buffer onto
+                * BUF_CLEAN. Since refile_buffer() no longer takes buffers
+                * off checkpoint lists, we cope with it here
+                */
+               /*
+                * AKPM: I think the buffer_jdirty test is redundant - it
+                * shouldn't have NULL b_transaction?
+                */
+               next_jh = jh->b_cpnext;
+               if (!buffer_dirty(bh) && !buffer_jdirty(bh)) {
+                       BUFFER_TRACE(bh, "remove from checkpoint");
+                       __journal_remove_checkpoint(jh);
+                       __journal_remove_journal_head(bh);
+                       refile_buffer(bh);
+                       __brelse(bh);
+                       ret = 1;
+               }
+
+               jh = next_jh;
+       } while (jh != last_jh);
+
+       return ret;
+ out_return_1:
+       lock_journal(journal);
+       spin_lock(&journal_datalist_lock);
+       return 1;
+ }
+
+ #define NR_BATCH      64
+
+ static void __flush_batch(struct buffer_head **bhs, int *batch_count)
+ {
+       int i;
+
+       spin_unlock(&journal_datalist_lock);
+       ll_rw_block(WRITE, *batch_count, bhs);
+       run_task_queue(&tq_disk);
+       spin_lock(&journal_datalist_lock);
+       for (i = 0; i < *batch_count; i++) {
+               struct buffer_head *bh = bhs[i];
+               clear_bit(BH_JWrite, &bh->b_state);
+               BUFFER_TRACE(bh, "brelse");
+               __brelse(bh);
+       }
+       *batch_count = 0;
+ }
+
+ /*
+  * Try to flush one buffer from the checkpoint list to disk.
+  *
+  * Return 1 if something happened which requires us to abort the current
+  * scan of the checkpoint list.
+  *
+  * Called with journal_datalist_lock held.
+  */
+ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
+                       struct buffer_head **bhs, int *batch_count,
+                       int *drop_count)
+ {
+       struct buffer_head *bh = jh2bh(jh);
+       int ret = 0;
+
+       if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
+               J_ASSERT_JH(jh, jh->b_transaction == NULL);
+
+               /*
+                * Important: we are about to write the buffer, and
+                * possibly block, while still holding the journal lock.
+                * We cannot afford to let the transaction logic start
+                * messing around with this buffer before we write it to
+                * disk, as that would break recoverability.
+                */
+               BUFFER_TRACE(bh, "queue");
+               atomic_inc(&bh->b_count);
+               J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
+               set_bit(BH_JWrite, &bh->b_state);
+               bhs[*batch_count] = bh;
+               (*batch_count)++;
+               if (*batch_count == NR_BATCH) {
+                       __flush_batch(bhs, batch_count);
+                       ret = 1;
+               }
+       } else {
+               int last_buffer = 0;
+               if (jh->b_cpnext == jh) {
+                       /* We may be about to drop the transaction.  Tell the
+                        * caller that the lists have changed.
+                        */
+                       last_buffer = 1;
+               }
+               if (__try_to_free_cp_buf(jh)) {
+                       (*drop_count)++;
+                       ret = last_buffer;
+               }
+       }
+       return ret;
+ }
+
+
+ /*
+  * Perform an actual checkpoint.  We don't just write out enough to
+  * satisfy the current blocked requests; rather, we submit a reasonably
+  * sized chunk of the outstanding data to disk at once for
+  * efficiency.  log_wait_for_space() will retry if we didn't free enough.
+  *
+  * However, we _do_ take into account the amount requested so that once
+  * the IO has been queued, we can return as soon as enough of it has
+  * completed to disk.
+  *
+  * The journal should be locked before calling this function.
+  */
+
+ /* @@@ `nblocks' is unused.  Should it be used? */
+ int log_do_checkpoint (journal_t *journal, int nblocks)
+ {
+       transaction_t *transaction, *last_transaction, *next_transaction;
+       int result;
+       int target;
+       int batch_count = 0;
+       struct buffer_head *bhs[NR_BATCH];
+
+       jbd_debug(1, "Start checkpoint\n");
+
+       /*
+        * First thing: if there are any transactions in the log which
+        * don't need checkpointing, just eliminate them from the
+        * journal straight away.
+        */
+       result = cleanup_journal_tail(journal);
+       jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+       if (result <= 0)
+               return result;
+
+       /*
+        * OK, we need to start writing disk blocks.  Try to free up a
+        * quarter of the log in a single checkpoint if we can.
+        */
+       /*
+        * AKPM: check this code.  I had a feeling a while back that it
+        * degenerates into a busy loop at unmount time.
+        */
+       target = (journal->j_last - journal->j_first) / 4;
+
+       spin_lock(&journal_datalist_lock);
+ repeat:
+       transaction = journal->j_checkpoint_transactions;
+       if (transaction == NULL)
+               goto done;
+       last_transaction = transaction->t_cpprev;
+       next_transaction = transaction;
+
+       do {
+               struct journal_head *jh, *last_jh, *next_jh;
+               int drop_count = 0;
+               int cleanup_ret, retry = 0;
+
+               transaction = next_transaction;
+               next_transaction = transaction->t_cpnext;
+               jh = transaction->t_checkpoint_list;
+               last_jh = jh->b_cpprev;
+               next_jh = jh;
+               do {
+                       jh = next_jh;
+                       next_jh = jh->b_cpnext;
+                       retry = __flush_buffer(journal, jh, bhs, &batch_count,
+                                               &drop_count);
+               } while (jh != last_jh && !retry);
+               if (batch_count) {
+                       __flush_batch(bhs, &batch_count);
+                       goto repeat;
+               }
+               if (retry)
+                       goto repeat;
+               /*
+                * We have walked the whole transaction list without
+                * finding anything to write to disk.  We had better be
+                * able to make some progress or we are in trouble.
+                */
+               cleanup_ret = __cleanup_transaction(journal, transaction);
+               J_ASSERT(drop_count != 0 || cleanup_ret != 0);
+               goto repeat;    /* __cleanup may have dropped lock */
+       } while (transaction != last_transaction);
+
+ done:
+       spin_unlock(&journal_datalist_lock);
+       result = cleanup_journal_tail(journal);
+       if (result < 0)
+               return result;
+
+       return 0;
+ }
+
+ /*
+  * Check the list of checkpoint transactions for the journal to see if
+  * we have already got rid of any since the last update of the log tail
+  * in the journal superblock.  If so, we can instantly roll the
+  * superblock forward to remove those transactions from the log.
+  *
+  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
+  *
+  * Called with the journal lock held.
+  *
+  * This is the only part of the journaling code which really needs to be
+  * aware of transaction aborts.  Checkpointing involves writing to the
+  * main filesystem area rather than to the journal, so it can proceed
+  * even in abort state, but we must not update the journal superblock if
+  * we have an abort error outstanding.
+  */
+
+ int cleanup_journal_tail(journal_t *journal)
+ {
+       transaction_t * transaction;
+       tid_t           first_tid;
+       unsigned long   blocknr, freed;
+
+       /* OK, work out the oldest transaction remaining in the log, and
+        * the log block it starts at.
+        *
+        * If the log is now empty, we need to work out which is the
+        * next transaction ID we will write, and where it will
+        * start. */
+
+       /* j_checkpoint_transactions needs locking */
+       spin_lock(&journal_datalist_lock);
+       transaction = journal->j_checkpoint_transactions;
+       if (transaction) {
+               first_tid = transaction->t_tid;
+               blocknr = transaction->t_log_start;
+       } else if ((transaction = journal->j_committing_transaction) != NULL) {
+               first_tid = transaction->t_tid;
+               blocknr = transaction->t_log_start;
+       } else if ((transaction = journal->j_running_transaction) != NULL) {
+               first_tid = transaction->t_tid;
+               blocknr = journal->j_head;
+       } else {
+               first_tid = journal->j_transaction_sequence;
+               blocknr = journal->j_head;
+       }
+       spin_unlock(&journal_datalist_lock);
+       J_ASSERT (blocknr != 0);
+
+       /* If the oldest pinned transaction is at the tail of the log
+            already then there's not much we can do right now. */
+       if (journal->j_tail_sequence == first_tid)
+               return 1;
+
+       /* OK, update the superblock to recover the freed space.
+        * Physical blocks come first: have we wrapped beyond the end of
+        * the log?  */
+       freed = blocknr - journal->j_tail;
+       if (blocknr < journal->j_tail)
+               freed = freed + journal->j_last - journal->j_first;
+
+       jbd_debug(1,
+                 "Cleaning journal tail from %d to %d (offset %lu), "
+                 "freeing %lu\n",
+                 journal->j_tail_sequence, first_tid, blocknr, freed);
+
+       journal->j_free += freed;
+       journal->j_tail_sequence = first_tid;
+       journal->j_tail = blocknr;
+       if (!(journal->j_flags & JFS_ABORT))
+               journal_update_superblock(journal, 1);
+       return 0;
+ }
+
+
+ /* Checkpoint list management */
+
+ /*
+  * journal_clean_checkpoint_list
+  *
+  * Find all the written-back checkpoint buffers in the journal and release them.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  * Returns the number of buffers reaped (for debugging).
+  */
+
+ int __journal_clean_checkpoint_list(journal_t *journal)
+ {
+       transaction_t *transaction, *last_transaction, *next_transaction;
+       int ret = 0;
+
+       transaction = journal->j_checkpoint_transactions;
+       if (transaction == NULL)
+               goto out;
+
+       last_transaction = transaction->t_cpprev;
+       next_transaction = transaction;
+       do {
+               struct journal_head *jh;
+
+               transaction = next_transaction;
+               next_transaction = transaction->t_cpnext;
+               jh = transaction->t_checkpoint_list;
+               if (jh) {
+                       struct journal_head *last_jh = jh->b_cpprev;
+                       struct journal_head *next_jh = jh;
+                       do {
+                               struct buffer_head *bh;
+
+                               jh = next_jh;
+                               next_jh = jh->b_cpnext;
+                               bh = jh2bh(jh);
+                               ret += __try_to_free_cp_buf(jh);
+                       } while (jh != last_jh);
+               }
+       } while (transaction != last_transaction);
+ out:
+       return ret;
+ }
+
+ /*
+  * journal_remove_checkpoint: called after a buffer has been committed
+  * to disk (either by being write-back flushed to disk, or being
+  * committed to the log).
+  *
+  * We cannot safely clean a transaction out of the log until all of the
+  * buffer updates committed in that transaction have safely been stored
+  * elsewhere on disk.  To achieve this, all of the buffers in a
+  * transaction need to be maintained on the transaction's checkpoint
+  * list until they have been rewritten, at which point this function is
+  * called to remove the buffer from the existing transaction's
+  * checkpoint list.
+  *
+  * This function is called with the journal locked.
+  * This function is called with journal_datalist_lock held.
+  */
+
+ void __journal_remove_checkpoint(struct journal_head *jh)
+ {
+       transaction_t *transaction;
+       journal_t *journal;
+
+       JBUFFER_TRACE(jh, "entry");
+
+       if ((transaction = jh->b_cp_transaction) == NULL) {
+               JBUFFER_TRACE(jh, "not on transaction");
+               goto out;
+       }
+
+       journal = transaction->t_journal;
+
+       __buffer_unlink(jh);
+
+       if (transaction->t_checkpoint_list != NULL)
+               goto out;
+       JBUFFER_TRACE(jh, "transaction has no more buffers");
+
+       /* There is one special case to worry about: if we have just
+            pulled the buffer off a committing transaction's forget list,
+            then even if the checkpoint list is empty, the transaction
+            obviously cannot be dropped! */
+
+       if (transaction == journal->j_committing_transaction) {
+               JBUFFER_TRACE(jh, "belongs to committing transaction");
+               goto out;
+       }
+
+       /* OK, that was the last buffer for the transaction: we can now
+          safely remove this transaction from the log */
+
+       __journal_drop_transaction(journal, transaction);
+
+       /* Just in case anybody was waiting for more transactions to be
+            checkpointed... */
+       wake_up(&journal->j_wait_logspace);
+ out:
+       JBUFFER_TRACE(jh, "exit");
+ }
+
+ void journal_remove_checkpoint(struct journal_head *jh)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_remove_checkpoint(jh);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * journal_insert_checkpoint: put a committed buffer onto a checkpoint
+  * list so that we know when it is safe to clean the transaction out of
+  * the log.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+ void __journal_insert_checkpoint(struct journal_head *jh,
+                              transaction_t *transaction)
+ {
+       JBUFFER_TRACE(jh, "entry");
+       J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh)));
+       J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+
+       assert_spin_locked(&journal_datalist_lock);
+       jh->b_cp_transaction = transaction;
+
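+       /* Splice jh into the transaction's circular checkpoint list:
+        * start a singleton list if it is empty, otherwise link it in
+        * just before the current head.  Either way jh becomes the
+        * new head of the list. */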
+       if (!transaction->t_checkpoint_list) {
+               jh->b_cpnext = jh->b_cpprev = jh;
+       } else {
+               jh->b_cpnext = transaction->t_checkpoint_list;
+               jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+               jh->b_cpprev->b_cpnext = jh;
+               jh->b_cpnext->b_cpprev = jh;
+       }
+       transaction->t_checkpoint_list = jh;
+ }
+
+ void journal_insert_checkpoint(struct journal_head *jh,
+                              transaction_t *transaction)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_insert_checkpoint(jh, transaction);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * We've finished with this transaction structure: adios...
+  *
+  * The transaction must have no links except for the checkpoint by this
+  * point.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+
+ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
+ {
+       assert_spin_locked(&journal_datalist_lock);
+       if (transaction->t_cpnext) {
+               transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
+               transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+               if (journal->j_checkpoint_transactions == transaction)
+                       journal->j_checkpoint_transactions =
+                               transaction->t_cpnext;
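+               /* If the list head still points at this transaction
+                * after the fixup above, it was the only element and
+                * the list is now empty. */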
+               if (journal->j_checkpoint_transactions == transaction)
+                       journal->j_checkpoint_transactions = NULL;
+       }
+
+       J_ASSERT (transaction->t_ilist == NULL);
+       J_ASSERT (transaction->t_buffers == NULL);
+       J_ASSERT (transaction->t_sync_datalist == NULL);
+       J_ASSERT (transaction->t_async_datalist == NULL);
+       J_ASSERT (transaction->t_forget == NULL);
+       J_ASSERT (transaction->t_iobuf_list == NULL);
+       J_ASSERT (transaction->t_shadow_list == NULL);
+       J_ASSERT (transaction->t_log_list == NULL);
+       J_ASSERT (transaction->t_checkpoint_list == NULL);
+       J_ASSERT (transaction->t_updates == 0);
+
+       J_ASSERT (transaction->t_journal->j_committing_transaction !=
+                                       transaction);
+
+       jbd_debug (1, "Dropping transaction %d, all done\n",
+                  transaction->t_tid);
+       kfree (transaction);
+ }
+
diff -rc2P linux/fs/jbd/commit.c linux-2.4.13/fs/jbd/commit.c
*** linux/fs/jbd/commit.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/commit.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,701 ----
+ /*
+  * linux/fs/jbd/commit.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal commit routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+
+ extern spinlock_t journal_datalist_lock;
+
+ /*
+  * Default IO end handler for temporary BJ_IO buffer_heads.
+  */
+ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+ {
+       BUFFER_TRACE(bh, "");
+       mark_buffer_uptodate(bh, uptodate);
+       unlock_buffer(bh);
+ }
+
+ /*
+  * journal_commit_transaction
+  *
+  * The primary function for committing a transaction to the log.  This
+  * function is called by the journal thread to begin a complete commit.
+  */
+ void journal_commit_transaction(journal_t *journal)
+ {
+       transaction_t *commit_transaction;
+       struct journal_head *jh, *new_jh, *descriptor;
+       struct journal_head *next_jh, *last_jh;
+       struct buffer_head *wbuf[64];
+       int bufs;
+       int flags;
+       int blocknr;
+       char *tagp = NULL;
+       journal_header_t *header;
+       journal_block_tag_t *tag = NULL;
+       int space_left = 0;
+       int first_tag = 0;
+       int tag_flag;
+       int i;
+
+       /*
+        * First job: lock down the current transaction and wait for
+        * all outstanding updates to complete.
+        */
+
+       lock_journal(journal); /* Protect journal->j_running_transaction */
+
+ #ifdef COMMIT_STATS
+       spin_lock(&journal_datalist_lock);
+       summarise_journal_usage(journal);
+       spin_unlock(&journal_datalist_lock);
+ #endif
+
+       lock_kernel();
+
+       J_ASSERT (journal->j_running_transaction != NULL);
+       J_ASSERT (journal->j_committing_transaction == NULL);
+
+       commit_transaction = journal->j_running_transaction;
+       J_ASSERT (commit_transaction->t_state == T_RUNNING);
+
+       jbd_debug (1, "JBD: starting commit of transaction %d\n",
+                  commit_transaction->t_tid);
+
+       commit_transaction->t_state = T_LOCKED;
+       while (commit_transaction->t_updates != 0) {
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_updates);
+               lock_journal(journal);
+       }
+
+       J_ASSERT (commit_transaction->t_outstanding_credits <=
+                       journal->j_max_transaction_buffers);
+
+       /* Do we need to erase the effects of a prior journal_flush? */
+       if (journal->j_flags & JFS_FLUSHED) {
+               jbd_debug(3, "super block updated\n");
+               journal_update_superblock(journal, 1);
+       } else {
+               jbd_debug(3, "superblock not updated\n");
+       }
+
+       /*
+        * First thing we are allowed to do is to discard any remaining
+        * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
+        * that there are no such buffers: if a large filesystem
+        * operation like a truncate needs to split itself over multiple
+        * transactions, then it may try to do a journal_restart() while
+        * there are still BJ_Reserved buffers outstanding.  These must
+        * be released cleanly from the current transaction.
+        *
+        * In this case, the filesystem must still reserve write access
+        * again before modifying the buffer in the new transaction, but
+        * we do not require it to remember exactly which old buffers it
+        * has reserved.  This is consistent with the existing behaviour
+        * that multiple journal_get_write_access() calls to the same
+        * buffer are perfectly permissible.
+        */
+
+       while (commit_transaction->t_reserved_list) {
+               jh = commit_transaction->t_reserved_list;
+               JBUFFER_TRACE(jh, "reserved, unused: refile");
+               journal_refile_buffer(jh);
+       }
+
+       /*
+        * Now try to drop any written-back buffers from the journal's
+        * checkpoint lists.  We do this *before* commit because it potentially
+        * frees some memory
+        */
+       spin_lock(&journal_datalist_lock);
+       __journal_clean_checkpoint_list(journal);
+       spin_unlock(&journal_datalist_lock);
+
+       /* First part of the commit: force the revoke list out to disk.
+        * The revoke code generates its own metadata blocks on disk for this.
+        *
+        * It is important that we do this while the transaction is
+        * still locked.  Generating the revoke records should not
+        * generate any IO stalls, so this should be quick; and doing
+        * the work while we have the transaction locked means that we
+        * only ever have to maintain the revoke list for one
+        * transaction at a time.
+        */
+
+       jbd_debug (3, "JBD: commit phase 1\n");
+
+       journal_write_revoke_records(journal, commit_transaction);
+
+       /*
+        * Now that we have built the revoke records, we can start
+        * reusing the revoke list for a new running transaction.  We
+        * can now safely start committing the old transaction: time to
+        * get a new running transaction for incoming filesystem updates
+        */
+
+       commit_transaction->t_state = T_FLUSH;
+
+       wake_up(&journal->j_wait_transaction_locked);
+
+       journal->j_committing_transaction = commit_transaction;
+       journal->j_running_transaction = NULL;
+
+       commit_transaction->t_log_start = journal->j_head;
+
+       unlock_kernel();
+
+       jbd_debug (3, "JBD: commit phase 2\n");
+
+       /*
+        * Now start flushing things to disk, in the order they appear
+        * on the transaction lists.  Data blocks go first.
+        */
+
+       /*
+        * Whenever we unlock the journal and sleep, things can get added
+        * onto ->t_datalist, so we have to keep looping back to write_out_data
+        * until we *know* that the list is empty.
+        */
+ write_out_data:
+
+       /*
+        * Cleanup any flushed data buffers from the data list.  Even in
+        * abort mode, we want to flush this out as soon as possible.
+        *
+        * We take journal_datalist_lock to protect the lists from
+        * journal_try_to_free_buffers().
+        */
+       spin_lock(&journal_datalist_lock);
+
+ write_out_data_locked:
+       bufs = 0;
+       next_jh = commit_transaction->t_sync_datalist;
+       if (next_jh == NULL)
+               goto sync_datalist_empty;
+       last_jh = next_jh->b_tprev;
+
+       do {
+               struct buffer_head *bh;
+
+               jh = next_jh;
+               next_jh = jh->b_tnext;
+               bh = jh2bh(jh);
+               if (!buffer_locked(bh)) {
+                       if (buffer_dirty(bh)) {
+                               BUFFER_TRACE(bh, "start journal writeout");
+                               atomic_inc(&bh->b_count);
+                               wbuf[bufs++] = bh;
+                       } else {
+                               BUFFER_TRACE(bh, "writeout complete: unfile");
+                               __journal_unfile_buffer(jh);
+                               jh->b_transaction = NULL;
+                               __journal_remove_journal_head(bh);
+                               refile_buffer(bh);
+                               __brelse(bh);
+                       }
+               }
+               if (bufs == ARRAY_SIZE(wbuf)) {
+                       /*
+                        * Major speedup: start here on the next scan
+                        */
+                       J_ASSERT(commit_transaction->t_sync_datalist != 0);
+                       commit_transaction->t_sync_datalist = jh;
+                       break;
+               }
+       } while (jh != last_jh);
+
+       if (bufs || current->need_resched) {
+               jbd_debug(2, "submit %d writes\n", bufs);
+               spin_unlock(&journal_datalist_lock);
+               unlock_journal(journal);
+               if (bufs)
+                       ll_rw_block(WRITE, bufs, wbuf);
+               if (current->need_resched)
+                       schedule();
+               journal_brelse_array(wbuf, bufs);
+               lock_journal(journal);
+               spin_lock(&journal_datalist_lock);
+               if (bufs)
+                       goto write_out_data_locked;
+       }
+
+       /*
+        * Wait for all previously submitted IO on the data list to complete.
+        */
+       jh = commit_transaction->t_sync_datalist;
+       if (jh == NULL)
+               goto sync_datalist_empty;
+
+       do {
+               struct buffer_head *bh;
+               jh = jh->b_tprev;       /* Wait on the last written */
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       /* the journal_head may have been removed now */
+                       lock_journal(journal);
+                       goto write_out_data;
+               } else if (buffer_dirty(bh)) {
+                       goto write_out_data_locked;
+               }
+       } while (jh != commit_transaction->t_sync_datalist);
+       goto write_out_data_locked;
+
+ sync_datalist_empty:
+       /*
+        * Wait for all the async writepage data.  As they become unlocked
+        * in end_buffer_io_async(), the only place where they can be
+        * reaped is in try_to_free_buffers(), and we're locked against
+        * that.
+        */
+       while ((jh = commit_transaction->t_async_datalist)) {
+               struct buffer_head *bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       lock_journal(journal);
+                       spin_lock(&journal_datalist_lock);
+                       continue;       /* List may have changed */
+               }
+               if (jh->b_next_transaction) {
+                       /*
+                        * For writepage() buffers in journalled data mode: a
+                        * later transaction may want the buffer for "metadata"
+                        */
+                       __journal_refile_buffer(jh);
+               } else {
+                       BUFFER_TRACE(bh, "finished async writeout: unfile");
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       __journal_remove_journal_head(bh);
+                       BUFFER_TRACE(bh, "finished async writeout: refile");
+                       /* It can sometimes be on BUF_LOCKED due to migration
+                        * from syncdata to asyncdata */
+                       if (bh->b_list != BUF_CLEAN)
+                               refile_buffer(bh);
+                       __brelse(bh);
+               }
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       /*
+        * If we found any dirty or locked buffers, then we should have
+        * looped back up to the write_out_data label.  If there weren't
+        * any then journal_clean_data_list should have wiped the list
+        * clean by now, so check that it is in fact empty.
+        */
+       J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+       J_ASSERT (commit_transaction->t_async_datalist == NULL);
+
+       jbd_debug (3, "JBD: commit phase 3\n");
+
+       /*
+        * Way to go: we have now written out all of the data for a
+        * transaction!  Now comes the tricky part: we need to write out
+        * metadata.  Loop over the transaction's entire buffer list:
+        */
+       commit_transaction->t_state = T_COMMIT;
+
+       descriptor = NULL;
+       bufs = 0;
+       while (commit_transaction->t_buffers) {
+
+               /* Find the next buffer to be journaled... */
+
+               jh = commit_transaction->t_buffers;
+
+               /* If we're in abort mode, we just un-journal the buffer and
+                  release it for background writing. */
+
+               if (is_journal_aborted(journal)) {
+                       JBUFFER_TRACE(jh, "journal is aborting: refile");
+                       journal_refile_buffer(jh);
+                       /* If that was the last one, we need to clean up
+                        * any descriptor buffers which may have been
+                        * already allocated, even if we are now
+                        * aborting. */
+                       if (!commit_transaction->t_buffers)
+                               goto start_journal_io;
+                       continue;
+               }
+
+               /* Make sure we have a descriptor block in which to
+                  record the metadata buffer. */
+
+               if (!descriptor) {
+                       struct buffer_head *bh;
+
+                       J_ASSERT (bufs == 0);
+
+                       jbd_debug(4, "JBD: get descriptor\n");
+
+                       descriptor = journal_get_descriptor_buffer(journal);
+                       bh = jh2bh(descriptor);
+                       jbd_debug(4, "JBD: got buffer %ld (%p)\n",
+                               bh->b_blocknr, bh->b_data);
+                       header = (journal_header_t *)&bh->b_data[0];
+                       header->h_magic     = htonl(JFS_MAGIC_NUMBER);
+                       header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK);
+                       header->h_sequence  = htonl(commit_transaction->t_tid);
+
+                       tagp = &bh->b_data[sizeof(journal_header_t)];
+                       space_left = bh->b_size - sizeof(journal_header_t);
+                       first_tag = 1;
+                       set_bit(BH_JWrite, &bh->b_state);
+                       wbuf[bufs++] = bh;
+
+                       /* Record it so that we can wait for IO
+                            completion later */
+                       BUFFER_TRACE(bh, "ph3: file as descriptor");
+                       journal_file_buffer(descriptor, commit_transaction,
+                                               BJ_LogCtl);
+               }
+
+               /* Where is the buffer to be written? */
+
+               blocknr = journal_next_log_block(journal);
+
+               /* Bump b_count to prevent truncate from stumbling over
+                    the shadowed buffer!  @@@ This can go if we ever get
+                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+               atomic_inc(&jh2bh(jh)->b_count);
+
+               /* Make a temporary IO buffer with which to write it out
+                    (this will requeue both the metadata buffer and the
+                    temporary IO buffer).  new_bh goes on BJ_IO. */
+
+               set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+               /*
+                * akpm: journal_write_metadata_buffer() sets
+                * new_bh->b_transaction to commit_transaction.
+                * We need to clean this up before we release new_bh
+                * (which is of type BJ_IO)
+                */
+               JBUFFER_TRACE(jh, "ph3: write metadata");
+               flags = journal_write_metadata_buffer(commit_transaction,
+                                                     jh, &new_jh, blocknr);
+               set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+               wbuf[bufs++] = jh2bh(new_jh);
+
+               /* Record the new block's tag in the current descriptor
+                    buffer */
+
+               tag_flag = 0;
+               if (flags & 1)
+                       tag_flag |= JFS_FLAG_ESCAPE;
+               if (!first_tag)
+                       tag_flag |= JFS_FLAG_SAME_UUID;
+
+               tag = (journal_block_tag_t *) tagp;
+               tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr);
+               tag->t_flags = htonl(tag_flag);
+               tagp += sizeof(journal_block_tag_t);
+               space_left -= sizeof(journal_block_tag_t);
+
+               if (first_tag) {
+                       memcpy (tagp, journal->j_uuid, 16);
+                       tagp += 16;
+                       space_left -= 16;
+                       first_tag = 0;
+               }
+
+               /* If there's no more to do, or if the descriptor is full,
+                  let the IO rip! */
+
+               if (bufs == ARRAY_SIZE(wbuf) ||
+                   commit_transaction->t_buffers == NULL ||
+                   space_left < sizeof(journal_block_tag_t) + 16) {
+
+                       jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+                       /* Write an end-of-descriptor marker before
+                            submitting the IOs.  "tag" still points to
+                            the last tag we set up. */
+
+                       tag->t_flags |= htonl(JFS_FLAG_LAST_TAG);
+
+ start_journal_io:
+                       unlock_journal(journal);
+                       for (i=0; i<bufs; i++) {
+                               struct buffer_head *bh = wbuf[i];
+                               set_bit(BH_Lock, &bh->b_state);
+                               clear_bit(BH_Dirty, &bh->b_state);
+                               bh->b_end_io = journal_end_buffer_io_sync;
+                               submit_bh(WRITE, bh);
+                       }
+                       if (current->need_resched)
+                               schedule();
+                       lock_journal(journal);
+
+                       /* Force a new descriptor to be generated next
+                            time round the loop. */
+                       descriptor = NULL;
+                       bufs = 0;
+               }
+       }
+
+       /* Lo and behold: we have just managed to send a transaction to
+            the log.  Before we can commit it, wait for the IO so far to
+            complete.  Control buffers being written are on the
+            transaction's t_log_list queue, and metadata buffers are on
+            the t_iobuf_list queue.
+
+          Wait for the transactions in reverse order.  That way we are
+          less likely to be woken up until all IOs have completed, and
+          so we incur less scheduling load.
+       */
+
+       jbd_debug(3, "JBD: commit phase 4\n");
+
+       /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */
+  wait_for_iobuf:
+       while (commit_transaction->t_iobuf_list != NULL) {
+               struct buffer_head *bh;
+               jh = commit_transaction->t_iobuf_list->b_tprev;
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       lock_journal(journal);
+                       goto wait_for_iobuf;
+               }
+
+               clear_bit(BH_JWrite, &jh2bh(jh)->b_state);
+
+               JBUFFER_TRACE(jh, "ph4: unfile after journal write");
+               journal_unfile_buffer(jh);
+
+               /*
+                * akpm: don't put back a buffer_head with stale pointers
+                * dangling around.
+                */
+               J_ASSERT_JH(jh, jh->b_transaction != NULL);
+               jh->b_transaction = NULL;
+
+               /*
+                * ->t_iobuf_list should contain only dummy buffer_heads
+                * which were created by journal_write_metadata_buffer().
+                */
+               bh = jh2bh(jh);
+               BUFFER_TRACE(bh, "dumping temporary bh");
+               journal_unlock_journal_head(jh);
+               __brelse(bh);
+               J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
+               put_unused_buffer_head(bh);
+
+               /* We also have to unlock and free the corresponding
+                    shadowed buffer */
+               jh = commit_transaction->t_shadow_list->b_tprev;
+               bh = jh2bh(jh);
+               clear_bit(BH_JWrite, &bh->b_state);
+               J_ASSERT_BH(bh, buffer_jdirty(bh));
+
+               /* The metadata is now released for reuse, but we need
+                    to remember it against this transaction so that when
+                    we finally commit, we can do any checkpointing
+                    required. */
+               JBUFFER_TRACE(jh, "file as BJ_Forget");
+               journal_file_buffer(jh, commit_transaction, BJ_Forget);
+               /* Wake up any transactions which were waiting for this
+                  IO to complete */
+               wake_up(&bh->b_wait);
+               JBUFFER_TRACE(jh, "brelse shadowed buffer");
+               __brelse(bh);
+       }
+
+       J_ASSERT (commit_transaction->t_shadow_list == NULL);
+
+       jbd_debug(3, "JBD: commit phase 5\n");
+
+       /* Here we wait for the revoke record and descriptor record buffers */
+  wait_for_ctlbuf:
+       while (commit_transaction->t_log_list != NULL) {
+               struct buffer_head *bh;
+
+               jh = commit_transaction->t_log_list->b_tprev;
+               bh = jh2bh(jh);
+               if (buffer_locked(bh)) {
+                       unlock_journal(journal);
+                       wait_on_buffer(bh);
+                       lock_journal(journal);
+                       goto wait_for_ctlbuf;
+               }
+
+               BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+               clear_bit(BH_JWrite, &bh->b_state);
+               journal_unfile_buffer(jh);
+               jh->b_transaction = NULL;
+               journal_unlock_journal_head(jh);
+               __brelse(bh);           /* One for getblk */
+               /* AKPM: bforget here */
+       }
+
+       jbd_debug(3, "JBD: commit phase 6\n");
+
+       /* Done it all: now write the commit record.  We should have
+        * cleaned up our previous buffers by now, so if we are in abort
+        * mode we can now just skip the rest of the journal write
+        * entirely. */
+
+       if (is_journal_aborted(journal))
+               goto skip_commit;
+
+       descriptor = journal_get_descriptor_buffer(journal);
+
+       /* Write a commit header into each 512-byte sector of the
+        * descriptor block (AKPM's buglet: `i' was missing here). */
+       for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
+               journal_header_t *tmp = (journal_header_t *)
+                       (jh2bh(descriptor)->b_data + i);
+               tmp->h_magic = htonl(JFS_MAGIC_NUMBER);
+               tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK);
+               tmp->h_sequence = htonl(commit_transaction->t_tid);
+       }
+
+       unlock_journal(journal);
+       JBUFFER_TRACE(descriptor, "write commit block");
+       {
+               struct buffer_head *bh = jh2bh(descriptor);
+               ll_rw_block(WRITE, 1, &bh);
+               wait_on_buffer(bh);
+               __brelse(bh);           /* One for getblk() */
+               journal_unlock_journal_head(descriptor);
+       }
+       lock_journal(journal);
+
+       /* End of a transaction!  Finally, we can do checkpoint
+            processing: any buffers committed as a result of this
+            transaction can be removed from any checkpoint list it was on
+            before. */
+
+ skip_commit:
+
+       jbd_debug(3, "JBD: commit phase 7\n");
+
+       J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+       J_ASSERT(commit_transaction->t_async_datalist == NULL);
+       J_ASSERT(commit_transaction->t_buffers == NULL);
+       J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
+       J_ASSERT(commit_transaction->t_iobuf_list == NULL);
+       J_ASSERT(commit_transaction->t_shadow_list == NULL);
+       J_ASSERT(commit_transaction->t_log_list == NULL);
+
+       while (commit_transaction->t_forget) {
+               transaction_t *cp_transaction;
+               struct buffer_head *bh;
+
+               jh = commit_transaction->t_forget;
+               J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
+                       jh->b_transaction == journal->j_running_transaction);
+
+               /*
+                * If there is undo-protected committed data against
+                * this buffer, then we can remove it now.  If it is a
+                * buffer needing such protection, the old frozen_data
+                * field now points to a committed version of the
+                * buffer, so rotate that field to the new committed
+                * data.
+                *
+                * Otherwise, we can just throw away the frozen data now.
+                */
+               if (jh->b_committed_data) {
+                       kfree(jh->b_committed_data);
+                       jh->b_committed_data = NULL;
+                       if (jh->b_frozen_data) {
+                               jh->b_committed_data = jh->b_frozen_data;
+                               jh->b_frozen_data = NULL;
+                       }
+               } else if (jh->b_frozen_data) {
+                       kfree(jh->b_frozen_data);
+                       jh->b_frozen_data = NULL;
+               }
+
+               spin_lock(&journal_datalist_lock);
+               cp_transaction = jh->b_cp_transaction;
+               if (cp_transaction) {
+                       JBUFFER_TRACE(jh, "remove from old cp transaction");
+                       J_ASSERT_JH(jh, commit_transaction != cp_transaction);
+                       __journal_remove_checkpoint(jh);
+               }
+
+               /* Only re-checkpoint the buffer_head if it is marked
+                * dirty.  If the buffer was added to the BJ_Forget list
+                * by journal_forget, it may no longer be dirty and
+                * there's no point in keeping a checkpoint record for
+                * it. */
+               bh = jh2bh(jh);
+               if (buffer_jdirty(bh)) {
+                       JBUFFER_TRACE(jh, "add to new checkpointing trans");
+                       __journal_insert_checkpoint(jh, commit_transaction);
+                       JBUFFER_TRACE(jh, "refile for checkpoint writeback");
+                       __journal_refile_buffer(jh);
+               } else {
+                       J_ASSERT_BH(bh, !buffer_dirty(bh));
+                       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+               }
+               spin_unlock(&journal_datalist_lock);
+       }
+
+       /* Done with this transaction! */
+
+       jbd_debug(3, "JBD: commit phase 8\n");
+
+       J_ASSERT (commit_transaction->t_state == T_COMMIT);
+       commit_transaction->t_state = T_FINISHED;
+
+       J_ASSERT (commit_transaction == journal->j_committing_transaction);
+       journal->j_commit_sequence = commit_transaction->t_tid;
+       journal->j_committing_transaction = NULL;
+
+       spin_lock(&journal_datalist_lock);
+       if (commit_transaction->t_checkpoint_list == NULL) {
+               __journal_drop_transaction(journal, commit_transaction);
+       } else {
+               if (journal->j_checkpoint_transactions == NULL) {
+                       journal->j_checkpoint_transactions = commit_transaction;
+                       commit_transaction->t_cpnext = commit_transaction;
+                       commit_transaction->t_cpprev = commit_transaction;
+               } else {
+                       commit_transaction->t_cpnext =
+                               journal->j_checkpoint_transactions;
+                       commit_transaction->t_cpprev =
+                               commit_transaction->t_cpnext->t_cpprev;
+                       commit_transaction->t_cpnext->t_cpprev =
+                               commit_transaction;
+                       commit_transaction->t_cpprev->t_cpnext =
+                               commit_transaction;
+               }
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       jbd_debug(1, "JBD: commit %d complete, head %d\n",
+                 journal->j_commit_sequence, journal->j_tail_sequence);
+
+       unlock_journal(journal);
+       wake_up(&journal->j_wait_done_commit);
+ }
diff -rc2P linux/fs/jbd/journal.c linux-2.4.13/fs/jbd/journal.c
*** linux/fs/jbd/journal.c      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/journal.c       Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1716 ----
+ /*
+  * linux/fs/jbd/journal.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Generic filesystem journal-writing code; part of the ext2fs
+  * journaling system.
+  *
+  * This file manages journals: areas of disk reserved for logging
+  * transactional updates.  This includes the kernel journaling thread
+  * which is responsible for scheduling updates to the log.
+  *
+  * We do not actually manage the physical storage of the journal in this
+  * file: that is left to a per-journal policy function, which allows us
+  * to store the journal within a filesystem-specified area for ext2
+  * journaling (ext2 can use a reserved inode for storing the log).
+  */
+
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+ #include <linux/init.h>
+ #include <linux/mm.h>
+
+ EXPORT_SYMBOL(journal_start);
+ EXPORT_SYMBOL(journal_try_start);
+ EXPORT_SYMBOL(journal_restart);
+ EXPORT_SYMBOL(journal_extend);
+ EXPORT_SYMBOL(journal_stop);
+ EXPORT_SYMBOL(journal_lock_updates);
+ EXPORT_SYMBOL(journal_unlock_updates);
+ EXPORT_SYMBOL(journal_get_write_access);
+ EXPORT_SYMBOL(journal_get_create_access);
+ EXPORT_SYMBOL(journal_get_undo_access);
+ EXPORT_SYMBOL(journal_dirty_data);
+ EXPORT_SYMBOL(journal_dirty_metadata);
+ #if 0
+ EXPORT_SYMBOL(journal_release_buffer);
+ #endif
+ EXPORT_SYMBOL(journal_forget);
+ #if 0
+ EXPORT_SYMBOL(journal_sync_buffer);
+ #endif
+ EXPORT_SYMBOL(journal_flush);
+ EXPORT_SYMBOL(journal_revoke);
+
+ EXPORT_SYMBOL(journal_init_dev);
+ EXPORT_SYMBOL(journal_init_inode);
+ EXPORT_SYMBOL(journal_update_format);
+ EXPORT_SYMBOL(journal_check_used_features);
+ EXPORT_SYMBOL(journal_check_available_features);
+ EXPORT_SYMBOL(journal_set_features);
+ EXPORT_SYMBOL(journal_create);
+ EXPORT_SYMBOL(journal_load);
+ EXPORT_SYMBOL(journal_destroy);
+ EXPORT_SYMBOL(journal_recover);
+ EXPORT_SYMBOL(journal_update_superblock);
+ EXPORT_SYMBOL(__journal_abort);
+ EXPORT_SYMBOL(journal_abort);
+ EXPORT_SYMBOL(journal_errno);
+ EXPORT_SYMBOL(journal_ack_err);
+ EXPORT_SYMBOL(journal_clear_err);
+ EXPORT_SYMBOL(log_wait_commit);
+ EXPORT_SYMBOL(log_start_commit);
+ EXPORT_SYMBOL(journal_wipe);
+ EXPORT_SYMBOL(journal_blocks_per_page);
+ EXPORT_SYMBOL(journal_flushpage);
+ EXPORT_SYMBOL(journal_try_to_free_buffers);
+ EXPORT_SYMBOL(journal_bmap);
+ EXPORT_SYMBOL(journal_force_commit);
+
+ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+
+ /*
+  * journal_datalist_lock is used to protect data buffers:
+  *
+  *    bh->b_transaction
+  *    bh->b_tprev
+  *    bh->b_tnext
+  *
+  * journal_free_buffer() is called from journal_try_to_free_buffer(), and is
+  * async wrt everything else.
+  *
+  * It is also used for checkpoint data, also to protect against
+  * journal_try_to_free_buffer():
+  *
+  *    bh->b_cp_transaction
+  *    bh->b_cpnext
+  *    bh->b_cpprev
+  *    transaction->t_checkpoint_list
+  *    transaction->t_cpnext
+  *    transaction->t_cpprev
+  *    journal->j_checkpoint_transactions
+  *
+  * It is global at this time rather than per-journal because it's
+  * impossible for __journal_free_buffer to go from a buffer_head
+  * back to a journal_t unracily (well, not true.  Fix later)
+  *
+  *
+  * The `datalist' and `checkpoint list' functions are quite
+  * separate and we could use two spinlocks here.
+  *
+  * lru_list_lock nests inside journal_datalist_lock.
+  */
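+ /*
+  * Illustrative sketch of the documented lock nesting (not a code
+  * path taken from this file):
+  *
+  *    spin_lock(&journal_datalist_lock);
+  *    spin_lock(&lru_list_lock);
+  *    ... touch both the checkpoint lists and the buffer LRU ...
+  *    spin_unlock(&lru_list_lock);
+  *    spin_unlock(&journal_datalist_lock);
+  */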
+ spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED;
+
+ /*
+  * List of all journals in the system.  Protected by the BKL.
+  */
+ static LIST_HEAD(all_journals);
+
+ /*
+  * Helper function used to manage commit timeouts
+  */
+
+ static void commit_timeout(unsigned long __data)
+ {
+       struct task_struct * p = (struct task_struct *) __data;
+
+       wake_up_process(p);
+ }
+
+ /* Static check for data structure consistency.  There's no code
+  * invoked --- we'll just get a linker failure if things aren't right.
+  */
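+ /* (When the size is right, the condition below is compile-time false:
+  * the call is optimised away and the undefined symbol is never
+  * referenced.  When it is wrong, the unresolved reference breaks the
+  * link.) */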
+ void __journal_internal_check(void)
+ {
+       extern void journal_bad_superblock_size(void);
+       if (sizeof(struct journal_superblock_s) != 1024)
+               journal_bad_superblock_size();
+ }
+
+ /*
+  * kjournald: The main thread function used to manage a logging device
+  * journal.
+  *
+  * This kernel thread is responsible for two things:
+  *
+  * 1) COMMIT:  Every so often we need to commit the current state of the
+  *    filesystem to disk.  The journal thread is responsible for writing
+  *    all of the metadata buffers to disk.
+  *
+  * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
+  *    of the data in that part of the log has been rewritten elsewhere on
+  *    the disk.  Flushing these old buffers to reclaim space in the log is
+  *    known as checkpointing, and this thread is responsible for that job.
+  */
+
+ journal_t *current_journal;   /* AKPM: debug */
+
+ int kjournald(void *arg)
+ {
+       journal_t *journal = (journal_t *) arg;
+       transaction_t *transaction;
+       struct timer_list timer;
+
+       current_journal = journal;
+
+       lock_kernel();
+       daemonize();
+       spin_lock_irq(&current->sigmask_lock);
+       sigfillset(&current->blocked);
+       recalc_sigpending(current);
+       spin_unlock_irq(&current->sigmask_lock);
+
+       sprintf(current->comm, "kjournald");
+
+       /* Set up an interval timer which can be used to trigger a
+            commit wakeup after the commit interval expires */
+       init_timer(&timer);
+       timer.data = (unsigned long) current;
+       timer.function = commit_timeout;
+       journal->j_commit_timer = &timer;
+
+       /* Record that the journal thread is running */
+       journal->j_task = current;
+       wake_up(&journal->j_wait_done_commit);
+
+       printk(KERN_INFO "kjournald starting.  Commit interval %ld seconds\n",
+                       journal->j_commit_interval / HZ);
+       list_add(&journal->j_all_journals, &all_journals);
+
+       /* And now, wait forever for commit wakeup events. */
+       while (1) {
+               if (journal->j_flags & JFS_UNMOUNT)
+                       break;
+
+               jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+                       journal->j_commit_sequence, journal->j_commit_request);
+
+               if (journal->j_commit_sequence != journal->j_commit_request) {
+                       jbd_debug(1, "OK, requests differ\n");
+                       if (journal->j_commit_timer_active) {
+                               journal->j_commit_timer_active = 0;
+                               del_timer(journal->j_commit_timer);
+                       }
+
+                       journal_commit_transaction(journal);
+                       continue;
+               }
+
+               wake_up(&journal->j_wait_done_commit);
+               interruptible_sleep_on(&journal->j_wait_commit);
+
+               jbd_debug(1, "kjournald wakes\n");
+
+               /* Were we woken up by a commit wakeup event? */
+               if ((transaction = journal->j_running_transaction) != NULL &&
+                   time_after_eq(jiffies, transaction->t_expires)) {
+                       journal->j_commit_request = transaction->t_tid;
+                       jbd_debug(1, "woke because of timeout\n");
+               }
+       }
+
+       if (journal->j_commit_timer_active) {
+               journal->j_commit_timer_active = 0;
+               del_timer_sync(journal->j_commit_timer);
+       }
+
+       list_del(&journal->j_all_journals);
+
+       journal->j_task = NULL;
+       wake_up(&journal->j_wait_done_commit);
+       jbd_debug(1, "Journal thread exiting.\n");
+       return 0;
+ }
+
+ static void journal_start_thread(journal_t *journal)
+ {
+       kernel_thread(kjournald, (void *) journal,
+                     CLONE_VM | CLONE_FS | CLONE_FILES);
+       while (!journal->j_task)
+               sleep_on(&journal->j_wait_done_commit);
+ }
+
+ static void journal_kill_thread(journal_t *journal)
+ {
+       journal->j_flags |= JFS_UNMOUNT;
+
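+       /* Handshake with kjournald: keep waking it until it notices
+        * JFS_UNMOUNT, clears j_task and signals j_wait_done_commit. */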
+       while (journal->j_task) {
+               wake_up(&journal->j_wait_commit);
+               sleep_on(&journal->j_wait_done_commit);
+       }
+ }
+
+ #if 0
+
+ This is no longer needed - we do it in commit quite efficiently.
+ Note that if this function is resurrected, the loop needs to
+ be reorganised into the next_jh/last_jh algorithm.
+
+ /*
+  * journal_clean_data_list: cleanup after data IO.
+  *
+  * Once the IO system has finished writing the buffers on the transaction's
+  * data list, we can remove those buffers from the list.  This function
+  * scans the list for such buffers and removes them cleanly.
+  *
+  * We assume that the journal is already locked.
+  * We are called with journal_datalist_lock held.
+  *
+  * AKPM: This function looks inefficient.  Approximately O(n^2)
+  * for potentially thousands of buffers.  It no longer shows on profiles
+  * because these buffers are mainly dropped in journal_commit_transaction().
+  */
+
+ void __journal_clean_data_list(transaction_t *transaction)
+ {
+       struct journal_head *jh, *next;
+
+       assert_spin_locked(&journal_datalist_lock);
+
+ restart:
+       jh = transaction->t_sync_datalist;
+       if (!jh)
+               goto out;
+       do {
+               next = jh->b_tnext;
+               if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
+                       struct buffer_head *bh = jh2bh(jh);
+                       BUFFER_TRACE(bh, "data writeout complete: unfile");
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       __journal_remove_journal_head(bh);
+                       refile_buffer(bh);
+                       __brelse(bh);
+                       goto restart;
+               }
+               jh = next;
+       } while (transaction->t_sync_datalist &&
+                       jh != transaction->t_sync_datalist);
+ out:
+       return;
+ }
+ #endif
+
+ /*
+  * journal_write_metadata_buffer: write a metadata buffer to the journal.
+  *
+  * Writes a metadata buffer to a given disk block.  The actual IO is not
+  * performed but a new buffer_head is constructed which labels the data
+  * to be written with the correct destination disk block.
+  *
+  * Any magic-number escaping which needs to be done will cause a
+  * copy-out here.  If the buffer happens to start with the
+  * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
+  * magic number is only written to the log for descriptor blocks.  In
+  * this case, we copy the data and replace the first word with 0, and we
+  * return a result code which indicates that this buffer needs to be
+  * marked as an escaped buffer in the corresponding log descriptor
+  * block.  The missing word can then be restored when the block is read
+  * during recovery.
+  *
+  * If the source buffer has already been modified by a new transaction
+  * since we took the last commit snapshot, we use the frozen copy of
+  * that data for IO.  If we end up using the existing buffer_head's data
+  * for the write, then we *have* to lock the buffer to prevent anyone
+  * else from using and possibly modifying it while the IO is in
+  * progress.
+  *
+  * The function returns a pointer to the buffer_heads to be used for IO.
+  *
+  * We assume that the journal has already been locked in this function.
+  *
+  * Return value:
+  *  <0: Error
+  * >=0: Finished OK
+  *
+  * On success:
+  * Bit 0 set == escape performed on the data
+  * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+  */
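+ /*
+  * Caller-side sketch of decoding the return value (illustrative
+  * variable names; the real caller is journal_commit_transaction()
+  * in commit.c):
+  *
+  *    flags = journal_write_metadata_buffer(txn, jh, &new_jh, blocknr);
+  *    if (flags & 1)
+  *            tag_flag |= JFS_FLAG_ESCAPE;    (escape was performed)
+  *    if (flags & 2)
+  *            the frozen copy must be freed once the IO completes
+  */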
+
+ static inline unsigned long virt_to_offset(void *p)
+ {
+       return ((unsigned long) p) & ~PAGE_MASK;
+ }
+
+ int journal_write_metadata_buffer(transaction_t *transaction,
+                                 struct journal_head  *jh_in,
+                                 struct journal_head **jh_out,
+                                 int blocknr)
+ {
+       int need_copy_out = 0;
+       int done_copy_out = 0;
+       int do_escape = 0;
+       char *mapped_data;
+       struct buffer_head *new_bh;
+       struct journal_head * new_jh;
+       struct page *new_page;
+       unsigned int new_offset;
+
+       /*
+        * The buffer really shouldn't be locked: only the current committing
+        * transaction is allowed to write it, so nobody else is allowed
+        * to do any IO.
+        *
+        * akpm: except if we're journalling data, and write() output is
+        * also part of a shared mapping, and another thread has
+        * decided to launch a writepage() against this buffer.
+        */
+       J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in)));
+
+       /*
+        * If a new transaction has already done a buffer copy-out, then
+        * we use that version of the data for the commit.
+        */
+
+       if (jh_in->b_frozen_data) {
+               done_copy_out = 1;
+               new_page = virt_to_page(jh_in->b_frozen_data);
+               new_offset = virt_to_offset(jh_in->b_frozen_data);
+       } else {
+               new_page = jh2bh(jh_in)->b_page;
+               new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
+       }
+
+       mapped_data = ((char *) kmap(new_page)) + new_offset;
+
+       /*
+        * Check for escaping
+        */
+       if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
+               need_copy_out = 1;
+               do_escape = 1;
+       }
+
+       /*
+        * Do we need to do a data copy?
+        */
+
+       if (need_copy_out && !done_copy_out) {
+               char *tmp;
+               tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
+
+               jh_in->b_frozen_data = tmp;
+               memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
+
+               /* If we get to this path, we'll always need the new
+                  address kmapped so that we can clear the escaped
+                  magic number below. */
+               kunmap(new_page);
+               new_page = virt_to_page(tmp);
+               new_offset = virt_to_offset(tmp);
+               mapped_data = ((char *) kmap(new_page)) + new_offset;
+
+               done_copy_out = 1;
+       }
+
+       /*
+        * Right, time to make up the new buffer_head.
+        */
+       do {
+               new_bh = get_unused_buffer_head(0);
+               if (!new_bh) {
+                       printk (KERN_NOTICE __FUNCTION__
+                               ": ENOMEM at get_unused_buffer_head, "
+                               "trying again.\n");
+                       current->policy |= SCHED_YIELD;
+                       schedule();
+               }
+       } while (!new_bh);
+       /* keep subsequent assertions sane */
+       new_bh->b_prev_free = 0;
+       new_bh->b_next_free = 0;
+       new_bh->b_state = 0;
+       init_buffer(new_bh, NULL, NULL);
+       atomic_set(&new_bh->b_count, 1);
+       new_jh = journal_add_journal_head(new_bh);
+
+       set_bh_page(new_bh, new_page, new_offset);
+
+       new_jh->b_transaction = NULL;
+       new_bh->b_size = jh2bh(jh_in)->b_size;
+       new_bh->b_dev = transaction->t_journal->j_dev;
+       new_bh->b_blocknr = blocknr;
+       new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty);
+
+       *jh_out = new_jh;
+
+       /*
+        * Did we need to do an escaping?  Now we've done all the
+        * copying, we can finally do so.
+        */
+
+       if (do_escape)
+               * ((unsigned int *) mapped_data) = 0;
+       kunmap(new_page);
+
+       /*
+        * The to-be-written buffer needs to get moved to the io queue,
+        * and the original buffer whose contents we are shadowing or
+        * copying is moved to the transaction's shadow queue.
+        */
+       JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+       journal_file_buffer(jh_in, transaction, BJ_Shadow);
+       JBUFFER_TRACE(new_jh, "file as BJ_IO");
+       journal_file_buffer(new_jh, transaction, BJ_IO);
+
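+       /* Encode the result: bit 0 = escape performed, bit 1 = copy-out
+        * performed (see the comment block above). */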
+       return do_escape | (done_copy_out << 1);
+ }
+
+ /*
+  * Allocation code for the journal file.  Manage the space left in the
+  * journal, so that we can begin checkpointing when appropriate.
+  */
+
+ /*
+  * log_space_left: Return the number of free blocks left in the journal.
+  *
+  * Called with the journal already locked.
+  */
+
+ int log_space_left (journal_t *journal)
+ {
+       int left = journal->j_free;
+
+       /* Be pessimistic here about the number of those free blocks
+        * which might be required for log descriptor control blocks. */
+
+ #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
+
+       left -= MIN_LOG_RESERVED_BLOCKS;
+
+       if (left <= 0)
+               return 0;
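+       /* Of what is left, report only 7/8ths as usable, as a further
+        * safety margin against descriptor-block overhead. */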
+       left -= (left >> 3);
+       return left;
+ }
+
+ /*
+  * This function must be non-allocating for PF_MEMALLOC tasks
+  */
+ tid_t log_start_commit (journal_t *journal, transaction_t *transaction)
+ {
+       tid_t target = journal->j_commit_request;
+
+       lock_kernel(); /* Protect journal->j_running_transaction */
+
+       /*
+        * A NULL transaction asks us to commit the currently running
+        * transaction, if there is one.
+        */
+       if (transaction)
+               target = transaction->t_tid;
+       else {
+               transaction = journal->j_running_transaction;
+               if (!transaction)
+                       goto out;
+               target = transaction->t_tid;
+       }
+
+       /*
+        * Are we already doing a recent enough commit?
+        */
+       if (tid_geq(journal->j_commit_request, target))
+               goto out;
+
+       /*
+        * We want a new commit: OK, mark the request and wake up the
+        * commit thread.  We do _not_ do the commit ourselves.
+        */
+
+       journal->j_commit_request = target;
+       jbd_debug(1, "JBD: requesting commit %d/%d\n",
+                 journal->j_commit_request,
+                 journal->j_commit_sequence);
+       wake_up(&journal->j_wait_commit);
+
+ out:
+       unlock_kernel();
+       return target;
+ }
+
+ /*
+  * Wait for a specified commit to complete.
+  * The caller may not hold the journal lock.
+  */
+ void log_wait_commit (journal_t *journal, tid_t tid)
+ {
+       lock_kernel();
+ #ifdef CONFIG_JBD_DEBUG
+       lock_journal(journal);
+       if (!tid_geq(journal->j_commit_request, tid)) {
+               printk(KERN_EMERG __FUNCTION__
+                       ": error: j_commit_request=%d, tid=%d\n",
+                       journal->j_commit_request, tid);
+       }
+       unlock_journal(journal);
+ #endif
+       while (tid_gt(tid, journal->j_commit_sequence)) {
+               jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+                                 tid, journal->j_commit_sequence);
+               wake_up(&journal->j_wait_commit);
+               sleep_on(&journal->j_wait_done_commit);
+       }
+       unlock_kernel();
+ }
+
+ /*
+  * Log buffer allocation routines:
+  */
+
+ unsigned long journal_next_log_block(journal_t *journal)
+ {
+       unsigned long blocknr;
+
+       J_ASSERT(journal->j_free > 1);
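+       /* Hand out the block at the current head and advance it,
+        * wrapping back to j_first at the end of the log area. */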
+
+       blocknr = journal->j_head;
+       journal->j_head++;
+       journal->j_free--;
+       if (journal->j_head == journal->j_last)
+               journal->j_head = journal->j_first;
+       return journal_bmap(journal, blocknr);
+ }
+
+ /*
+  * Conversion of logical to physical block numbers for the journal
+  *
+  * On external journals the journal blocks are identity-mapped, so
+  * this is a no-op.  If needed, we can use j_blk_offset - everything is
+  * ready.
+  */
+ unsigned long journal_bmap(journal_t *journal, unsigned long blocknr)
+ {
+       unsigned long ret;
+
+       if (journal->j_inode) {
+               ret = bmap(journal->j_inode, blocknr);
+               J_ASSERT(ret != 0);
+       } else {
+               ret = blocknr;   /* +journal->j_blk_offset */
+       }
+       return ret;
+ }
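+
+ /*
+  * Example (hypothetical numbers): for an inode-based journal whose
+  * first data block lives at physical block 5000, journal_bmap(journal, 0)
+  * returns 5000 via bmap(); for an external journal device the log
+  * block number is returned unchanged.
+  */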
+
+ /*
+  * We play buffer_head aliasing tricks to write data/metadata blocks to
+  * the journal without copying their contents, but for journal
+  * descriptor blocks we do need to generate bona fide buffers.
+  */
+
+ struct journal_head * journal_get_descriptor_buffer(journal_t *journal)
+ {
+       struct buffer_head *bh;
+       unsigned long blocknr = journal_next_log_block(journal);
+
+       bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+       bh->b_state |= (1 << BH_Dirty);
+       BUFFER_TRACE(bh, "return this buffer");
+       return journal_add_journal_head(bh);
+ }
+
+ /*
+  * Management for journal control blocks: functions to create and
+  * destroy journal_t structures, and to initialise and read existing
+  * journal blocks from disk.  */
+
+ /* First: create and setup a journal_t object in memory.  We initialise
+  * very few fields yet: that has to wait until we have created the
+  * journal structures from scratch, or loaded them from disk. */
+
+ static journal_t * journal_init_common (void)
+ {
+       journal_t *journal;
+       int err;
+
+       MOD_INC_USE_COUNT;
+
+       journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
+       if (!journal)
+               goto fail;
+       memset(journal, 0, sizeof(*journal));
+
+       init_waitqueue_head(&journal->j_wait_transaction_locked);
+       init_waitqueue_head(&journal->j_wait_logspace);
+       init_waitqueue_head(&journal->j_wait_done_commit);
+       init_waitqueue_head(&journal->j_wait_checkpoint);
+       init_waitqueue_head(&journal->j_wait_commit);
+       init_waitqueue_head(&journal->j_wait_updates);
+       init_MUTEX(&journal->j_barrier);
+       init_MUTEX(&journal->j_checkpoint_sem);
+       init_MUTEX(&journal->j_sem);
+
+       journal->j_commit_interval = (HZ * 5);
+
+       /* The journal is marked for error until we succeed with recovery! */
+       journal->j_flags = JFS_ABORT;
+
+       /* Set up a default-sized revoke table for the new mount. */
+       err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+       if (err) {
+               kfree(journal);
+               goto fail;
+       }
+       return journal;
+ fail:
+       MOD_DEC_USE_COUNT;
+       return NULL;
+ }
+
+ /* journal_init_dev and journal_init_inode:
+  *
+  * Create a journal structure assigned some fixed set of disk blocks to
+  * the journal.  We don't actually touch those disk blocks yet, but we
+  * need to set up all of the mapping information to tell the journaling
+  * system where the journal blocks are.
+  *
+  * journal_init_dev creates a journal which maps a fixed contiguous
+  * range of blocks on an arbitrary block device.
+  *
+  * journal_init_inode creates a journal which maps an on-disk inode as
+  * the journal.  The inode must exist already, must support bmap() and
+  * must have all data blocks preallocated.
+  */
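+
+ /*
+  * For example (sketch of the expected caller): ext3 uses the inode
+  * flavour for its reserved journal inode,
+  *
+  *    journal = journal_init_inode(journal_inode);
+  *
+  * while a separate journal device would use journal_init_dev() with
+  * an explicit start block, length and blocksize.
+  */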
+
+ journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
+                       int start, int len, int blocksize)
+ {
+       journal_t *journal = journal_init_common();
+       struct buffer_head *bh;
+
+       if (!journal)
+               return NULL;
+
+       journal->j_dev = dev;
+       journal->j_fs_dev = fs_dev;
+       journal->j_blk_offset = start;
+       journal->j_maxlen = len;
+       journal->j_blocksize = blocksize;
+
+       bh = getblk(journal->j_dev, start, journal->j_blocksize);
+       J_ASSERT(bh != NULL);
+       journal->j_sb_buffer = bh;
+       journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+       return journal;
+ }
+
+ journal_t * journal_init_inode (struct inode *inode)
+ {
+       struct buffer_head *bh;
+       journal_t *journal = journal_init_common();
+       int blocknr;
+
+       if (!journal)
+               return NULL;
+
+       journal->j_dev = inode->i_dev;
+       journal->j_fs_dev = inode->i_dev;
+       journal->j_inode = inode;
+       jbd_debug(1,
+                 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+                 journal, bdevname(inode->i_dev), inode->i_ino, inode->i_size,
+                 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+       journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+       journal->j_blocksize = inode->i_sb->s_blocksize;
+
+       blocknr = journal_bmap(journal, 0);
+       bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+       J_ASSERT(bh != NULL);
+       journal->j_sb_buffer = bh;
+       journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+       return journal;
+ }
+
+ /*
+  * Given a journal_t structure, initialise the various fields for
+  * startup of a new journaling session.  We use this both when creating
+  * a journal, and after recovering an old journal to reset it for
+  * subsequent use.
+  */
+
+ static int journal_reset (journal_t *journal)
+ {
+       journal_superblock_t *sb = journal->j_superblock;
+       unsigned int first, last;
+
+       first = ntohl(sb->s_first);
+       last = ntohl(sb->s_maxlen);
+
+       journal->j_first = first;
+       journal->j_last = last;
+
+       journal->j_head = first;
+       journal->j_tail = first;
+       journal->j_free = last - first;
+
+       journal->j_tail_sequence = journal->j_transaction_sequence;
+       journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+       journal->j_commit_request = journal->j_commit_sequence;
+
+       journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+       /* Add the dynamic fields and write it to disk. */
+       journal_update_superblock(journal, 1);
+
+       lock_journal(journal);
+       journal_start_thread(journal);
+       unlock_journal(journal);
+
+       return 0;
+ }
+
+ /*
+  * Given a journal_t structure which tells us which disk blocks we can
+  * use, create a new journal superblock and initialise all of the
+  * journal fields from scratch.  */
+
+ int journal_create (journal_t *journal)
+ {
+       int blocknr;
+       struct buffer_head *bh;
+       journal_superblock_t *sb;
+       int i;
+
+       if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
+               printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+                       journal->j_maxlen);
+               return -EINVAL;
+       }
+
+       if (journal->j_inode == NULL) {
+               /*
+                * We don't know what block to start at!
+                */
+               printk(KERN_EMERG __FUNCTION__
+                       ": creation of journal on external device!\n");
+               BUG();
+       }
+
+       /* Zero out the entire journal on disk.  We cannot afford to
+          have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
+       jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+       for (i = 0; i < journal->j_maxlen; i++) {
+               blocknr = journal_bmap(journal, i);
+               bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+               wait_on_buffer(bh);
+               memset (bh->b_data, 0, journal->j_blocksize);
+               BUFFER_TRACE(bh, "marking dirty");
+               mark_buffer_dirty(bh);
+               BUFFER_TRACE(bh, "marking uptodate");
+               mark_buffer_uptodate(bh, 1);
+               __brelse(bh);
+       }
+       sync_dev(journal->j_dev);
+       jbd_debug(1, "JBD: journal cleared.\n");
+
+       /* OK, fill in the initial static fields in the new superblock */
+       sb = journal->j_superblock;
+
+       sb->s_header.h_magic     = htonl(JFS_MAGIC_NUMBER);
+       sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2);
+
+       sb->s_blocksize = htonl(journal->j_blocksize);
+       sb->s_maxlen    = htonl(journal->j_maxlen);
+       sb->s_first     = htonl(1);
+
+       journal->j_transaction_sequence = 1;
+
+       journal->j_flags &= ~JFS_ABORT;
+       journal->j_format_version = 2;
+
+       return journal_reset(journal);
+ }
+
+ /*
+  * Update a journal's dynamic superblock fields and write it to disk,
+  * optionally waiting for the IO to complete.
+ */
+
+ void journal_update_superblock(journal_t *journal, int wait)
+ {
+       journal_superblock_t *sb = journal->j_superblock;
+       struct buffer_head *bh = journal->j_sb_buffer;
+
+       jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+                 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+
+       sb->s_sequence = htonl(journal->j_tail_sequence);
+       sb->s_start    = htonl(journal->j_tail);
+       sb->s_errno    = htonl(journal->j_errno);
+
+       BUFFER_TRACE(bh, "marking dirty");
+       mark_buffer_dirty(bh);
+       ll_rw_block(WRITE, 1, &bh);
+       if (wait)
+               wait_on_buffer(bh);
+
+       /* If we have just flushed the log (by marking s_start==0), then
+        * any future commit will have to be careful to update the
+        * superblock again to re-record the true start of the log. */
+
+       if (sb->s_start)
+               journal->j_flags &= ~JFS_FLUSHED;
+       else
+               journal->j_flags |= JFS_FLUSHED;
+ }
+
+
+ /*
+  * Read the superblock for a given journal, performing initial
+  * validation of the format.
+  */
+
+ static int journal_get_superblock(journal_t *journal)
+ {
+       struct buffer_head *bh;
+       journal_superblock_t *sb;
+
+       bh = journal->j_sb_buffer;
+
+       J_ASSERT(bh != NULL);
+       if (!buffer_uptodate(bh)) {
+               ll_rw_block(READ, 1, &bh);
+               wait_on_buffer(bh);
+               if (!buffer_uptodate(bh)) {
+                       printk (KERN_ERR
+                               "JBD: IO error reading journal superblock\n");
+                       return -EIO;
+               }
+       }
+
+       sb = journal->j_superblock;
+
+       if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) ||
+           sb->s_blocksize != htonl(journal->j_blocksize)) {
+               printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+               return -EINVAL;
+       }
+
+       switch(ntohl(sb->s_header.h_blocktype)) {
+       case JFS_SUPERBLOCK_V1:
+               journal->j_format_version = 1;
+               break;
+       case JFS_SUPERBLOCK_V2:
+               journal->j_format_version = 2;
+               break;
+       default:
+               printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+               return -EINVAL;
+       }
+
+       if (ntohl(sb->s_maxlen) < journal->j_maxlen)
+               journal->j_maxlen = ntohl(sb->s_maxlen);
+       else if (ntohl(sb->s_maxlen) > journal->j_maxlen) {
+               printk (KERN_WARNING "JBD: journal file too short\n");
+               return -EINVAL;
+       }
+
+       return 0;
+ }
+
+ /*
+  * Load the on-disk journal superblock and read the key fields into the
+  * journal_t.
+  */
+
+ static int load_superblock(journal_t *journal)
+ {
+       int err;
+       journal_superblock_t *sb;
+
+       err = journal_get_superblock(journal);
+       if (err)
+               return err;
+
+       sb = journal->j_superblock;
+
+       journal->j_tail_sequence = ntohl(sb->s_sequence);
+       journal->j_tail = ntohl(sb->s_start);
+       journal->j_first = ntohl(sb->s_first);
+       journal->j_last = ntohl(sb->s_maxlen);
+       journal->j_errno = ntohl(sb->s_errno);
+
+       return 0;
+ }
+
+
+ /*
+  * Given a journal_t structure which tells us which disk blocks contain
+  * a journal, read the journal from disk to initialise the in-memory
+  * structures.
+  */
+
+ int journal_load(journal_t *journal)
+ {
+       int err;
+
+       err = load_superblock(journal);
+       if (err)
+               return err;
+
+       /* If this is a V2 superblock, then we have to check the
+        * features flags on it. */
+
+       if (journal->j_format_version >= 2) {
+               journal_superblock_t *sb = journal->j_superblock;
+
+               if ((sb->s_feature_ro_compat &
+                    ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
+                   (sb->s_feature_incompat &
+                    ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
+                       printk (KERN_WARNING
+                               "JBD: Unrecognised features on journal\n");
+                       return -EINVAL;
+               }
+       }
+
+       /* Let the recovery code check whether it needs to recover any
+        * data from the journal. */
+       if (journal_recover(journal))
+               goto recovery_error;
+
+       /* OK, we've finished with the dynamic journal bits:
+        * reinitialise the dynamic contents of the superblock in memory
+        * and reset them on disk. */
+       if (journal_reset(journal))
+               goto recovery_error;
+
+       journal->j_flags &= ~JFS_ABORT;
+       journal->j_flags |= JFS_LOADED;
+       return 0;
+
+ recovery_error:
+       printk (KERN_WARNING "JBD: recovery failed\n");
+       return -EIO;
+ }
+
+ /*
+  * Release a journal_t structure once it is no longer in use by the
+  * journaled object.
+  */
+
+ void journal_destroy (journal_t *journal)
+ {
+       /* Wait for the commit thread to wake up and die. */
+       journal_kill_thread(journal);
+
+       /* Force a final log commit */
+       if (journal->j_running_transaction)
+               journal_commit_transaction(journal);
+
+       /* Force any old transactions to disk */
+       lock_journal(journal);
+       while (journal->j_checkpoint_transactions != NULL)
+               log_do_checkpoint(journal, 1);
+
+       J_ASSERT(journal->j_running_transaction == NULL);
+       J_ASSERT(journal->j_committing_transaction == NULL);
+       J_ASSERT(journal->j_checkpoint_transactions == NULL);
+
+       /* We can now mark the journal as empty. */
+       journal->j_tail = 0;
+       journal->j_tail_sequence = ++journal->j_transaction_sequence;
+       journal_update_superblock(journal, 1);
+
+       if (journal->j_inode)
+               iput(journal->j_inode);
+       if (journal->j_revoke)
+               journal_destroy_revoke(journal);
+
+       unlock_journal(journal);
+       brelse(journal->j_sb_buffer);
+       kfree(journal);
+       MOD_DEC_USE_COUNT;
+ }
+
+
+ /* Published API: Check whether the journal uses all of a given set of
+  * features.  Return true (non-zero) if it does. */
+
+ int journal_check_used_features (journal_t *journal, unsigned long compat,
+                                unsigned long ro, unsigned long incompat)
+ {
+       journal_superblock_t *sb;
+
+       if (!compat && !ro && !incompat)
+               return 1;
+       if (journal->j_format_version == 1)
+               return 0;
+
+       sb = journal->j_superblock;
+
+       if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
+           ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
+           ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
+               return 1;
+
+       return 0;
+ }
+
+ /* Published API: Check whether the journaling code supports the use of
+  * all of a given set of features on this journal.  Return true
+  * (non-zero) if it can. */
+
+ int journal_check_available_features (journal_t *journal, unsigned long compat,
+                                     unsigned long ro, unsigned long incompat)
+ {
+       journal_superblock_t *sb;
+
+       if (!compat && !ro && !incompat)
+               return 1;
+
+       sb = journal->j_superblock;
+
+       /* We can support any known requested features iff the
+        * superblock is in version 2.  Otherwise we fail to support any
+        * extended sb features. */
+
+       if (journal->j_format_version != 2)
+               return 0;
+
+       if ((compat   & JFS_KNOWN_COMPAT_FEATURES) == compat &&
+           (ro       & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
+           (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
+               return 1;
+
+       return 0;
+ }
+
+ /* Published API: Mark a given journal feature as present on the
+  * superblock.  Returns true if the requested features could be set. */
+
+ int journal_set_features (journal_t *journal, unsigned long compat,
+                         unsigned long ro, unsigned long incompat)
+ {
+       journal_superblock_t *sb;
+
+       if (journal_check_used_features(journal, compat, ro, incompat))
+               return 1;
+
+       if (!journal_check_available_features(journal, compat, ro, incompat))
+               return 0;
+
+       jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+                 compat, ro, incompat);
+
+       sb = journal->j_superblock;
+
+       sb->s_feature_compat    |= cpu_to_be32(compat);
+       sb->s_feature_ro_compat |= cpu_to_be32(ro);
+       sb->s_feature_incompat  |= cpu_to_be32(incompat);
+
+       return 1;
+ }
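+
+ /*
+  * Usage sketch (hedged): a client that starts writing revoke records
+  * would first flag the feature, e.g.
+  *
+  *    journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE);
+  *
+  * so that older code which does not understand revoke blocks refuses
+  * to recover the journal.
+  */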
+
+
+ /*
+  * Published API:
+  * Given an initialised but unloaded journal struct, poke about in the
+  * on-disk structure to update it to the most recent supported version.
+  */
+
+ int journal_update_format (journal_t *journal)
+ {
+       journal_superblock_t *sb;
+       int err;
+
+       err = journal_get_superblock(journal);
+       if (err)
+               return err;
+
+       sb = journal->j_superblock;
+
+       switch (ntohl(sb->s_header.h_blocktype)) {
+       case JFS_SUPERBLOCK_V2:
+               return 0;
+       case JFS_SUPERBLOCK_V1:
+               return journal_convert_superblock_v1(journal, sb);
+       default:
+               break;
+       }
+       return -EINVAL;
+ }
+
+ static int journal_convert_superblock_v1(journal_t *journal,
+                                        journal_superblock_t *sb)
+ {
+       int offset, blocksize;
+       struct buffer_head *bh;
+
+       printk(KERN_WARNING
+               "JBD: Converting superblock from version 1 to 2.\n");
+
+       /* Pre-initialise new fields to zero */
+       offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
+       blocksize = ntohl(sb->s_blocksize);
+       memset(&sb->s_feature_compat, 0, blocksize-offset);
+
+       sb->s_nr_users = cpu_to_be32(1);
+       sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
+       journal->j_format_version = 2;
+
+       bh = journal->j_sb_buffer;
+       BUFFER_TRACE(bh, "marking dirty");
+       mark_buffer_dirty(bh);
+       ll_rw_block(WRITE, 1, &bh);
+       wait_on_buffer(bh);
+       return 0;
+ }
+
+
+ /*
+  * Flush all data for a given journal to disk and empty the journal.
+  * Filesystems can use this when remounting readonly to ensure that
+  * recovery does not need to happen on remount.
+  */
+
+ int journal_flush (journal_t *journal)
+ {
+       int err = 0;
+       transaction_t *transaction = NULL;
+       unsigned long old_tail;
+
+       lock_kernel();
+
+       /* Force everything buffered to the log... */
+       if (journal->j_running_transaction) {
+               transaction = journal->j_running_transaction;
+               log_start_commit(journal, transaction);
+       } else if (journal->j_committing_transaction)
+               transaction = journal->j_committing_transaction;
+
+       /* Wait for the log commit to complete... */
+       if (transaction)
+               log_wait_commit(journal, transaction->t_tid);
+
+       /* ...and flush everything in the log out to disk. */
+       lock_journal(journal);
+       while (!err && journal->j_checkpoint_transactions != NULL)
+               err = log_do_checkpoint(journal, journal->j_maxlen);
+       cleanup_journal_tail(journal);
+
+       /* Finally, mark the journal as really needing no recovery.
+        * This sets s_start==0 in the underlying superblock, which is
+        * the magic code for a fully-recovered superblock.  Any future
+        * commits of data to the journal will restore the current
+        * s_start value. */
+       old_tail = journal->j_tail;
+       journal->j_tail = 0;
+       journal_update_superblock(journal, 1);
+       journal->j_tail = old_tail;
+
+       unlock_journal(journal);
+
+       J_ASSERT(!journal->j_running_transaction);
+       J_ASSERT(!journal->j_committing_transaction);
+       J_ASSERT(!journal->j_checkpoint_transactions);
+       J_ASSERT(journal->j_head == journal->j_tail);
+       J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
+
+       unlock_kernel();
+
+       return err;
+ }
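+
+ /*
+  * Usage note: ext3 calls journal_flush() when marking recovery
+  * complete (for example on remounting read-only), so that the
+  * on-disk superblock records an empty log and a subsequent mount
+  * needs no recovery.
+  */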
+
+ /*
+  * Wipe out all of the contents of a journal, safely.  This will produce
+  * a warning if the journal contains any valid recovery information.
+  * Must be called between journal_init_*() and journal_load().
+  *
+  * If (write) is non-zero, then we wipe out the journal on disk; otherwise
+  * we merely suppress recovery.
+  */
+
+ int journal_wipe (journal_t *journal, int write)
+ {
+       journal_superblock_t *sb;
+       int err = 0;
+
+       J_ASSERT (!(journal->j_flags & JFS_LOADED));
+
+       err = load_superblock(journal);
+       if (err)
+               return err;
+
+       sb = journal->j_superblock;
+
+       if (!journal->j_tail)
+               goto no_recovery;
+
+       printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+               write ? "Clearing" : "Ignoring");
+
+       err = journal_skip_recovery(journal);
+       if (write)
+               journal_update_superblock(journal, 1);
+
+  no_recovery:
+       return err;
+ }
+
+ /*
+  * journal_dev_name: format a character string to describe on what
+  * device this journal is present.
+  */
+
+ const char * journal_dev_name(journal_t *journal)
+ {
+       kdev_t dev;
+
+       if (journal->j_inode)
+               dev = journal->j_inode->i_dev;
+       else
+               dev = journal->j_dev;
+
+       return bdevname(dev);
+ }
+
+ /*
+  * journal_abort: perform a complete, immediate shutdown of the ENTIRE
+  * journal (not of a single transaction).  This operation cannot be
+  * undone without closing and reopening the journal.
+  *
+  * The journal_abort function is intended to support higher level error
+  * recovery mechanisms such as the ext2/ext3 remount-readonly error
+  * mode.
+  *
+  * Journal abort has very specific semantics.  Any existing dirty,
+  * unjournaled buffers in the main filesystem will still be written to
+  * disk by bdflush, but the journaling mechanism will be suspended
+  * immediately and no further transaction commits will be honoured.
+  *
+  * Any dirty, journaled buffers will be written back to disk without
+  * hitting the journal.  Atomicity cannot be guaranteed on an aborted
+  * filesystem, but we _do_ attempt to leave as much data as possible
+  * behind for fsck to use for cleanup.
+  *
+  * Any attempt to get a new transaction handle on a journal which is in
+  * ABORT state will just result in an -EROFS error return.  A
+  * journal_stop on an existing handle will return -EIO if we have
+  * entered abort state during the update.
+  *
+  * Recursive transactions are not disturbed by journal abort until the
+  * final journal_stop, which will receive the -EIO error.
+  *
+  * Finally, the journal_abort call allows the caller to supply an errno
+  * which will be recorded (if possible) in the journal superblock.  This
+  * allows a client to record failure conditions in the middle of a
+  * transaction without having to complete the transaction to record the
+  * failure to disk.  ext3_error, for example, now uses this
+  * functionality.
+  *
+  * Errors which originate from within the journaling layer will NOT
+  * supply an errno; a null errno implies that absolutely no further
+  * writes are done to the journal (unless there are any already in
+  * progress).
+  */
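+
+ /*
+  * Sketch of the intended caller (ext3_error, per the note above): on
+  * detecting corruption the filesystem can do
+  *
+  *    journal_abort(journal, -EIO);
+  *
+  * and the errno is kept in the superblock for a later mount to read
+  * back via journal_errno().
+  */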
+
+ /* Quick version for internal journal use (doesn't lock the journal) */
+ void __journal_abort (journal_t *journal)
+ {
+       transaction_t *transaction;
+
+       printk (KERN_ERR "Aborting journal on device %s.\n",
+               journal_dev_name(journal));
+
+       journal->j_flags |= JFS_ABORT;
+       transaction = journal->j_running_transaction;
+       if (transaction)
+               log_start_commit(journal, transaction);
+ }
+
+ /* Full version for external use */
+ void journal_abort (journal_t *journal, int errno)
+ {
+       lock_journal(journal);
+
+       if (journal->j_flags & JFS_ABORT)
+               goto out;
+
+       if (!journal->j_errno)
+               journal->j_errno = errno;
+
+       __journal_abort(journal);
+
+       if (errno)
+               journal_update_superblock(journal, 1);
+
+  out:
+       unlock_journal(journal);
+ }
+
+ int journal_errno (journal_t *journal)
+ {
+       int err;
+
+       lock_journal(journal);
+       if (journal->j_flags & JFS_ABORT)
+               err = -EROFS;
+       else
+               err = journal->j_errno;
+       unlock_journal(journal);
+       return err;
+ }
+
+ int journal_clear_err (journal_t *journal)
+ {
+       int err = 0;
+
+       lock_journal(journal);
+       if (journal->j_flags & JFS_ABORT)
+               err = -EROFS;
+       else
+               journal->j_errno = 0;
+       unlock_journal(journal);
+       return err;
+ }
+
+ void journal_ack_err (journal_t *journal)
+ {
+       lock_journal(journal);
+       if (journal->j_errno)
+               journal->j_flags |= JFS_ACK_ERR;
+       unlock_journal(journal);
+ }
+
+ int journal_blocks_per_page(struct inode *inode)
+ {
+       return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ }
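+
+ /*
+  * Worked example: with 4K pages (PAGE_CACHE_SHIFT == 12) and a 1K
+  * blocksize filesystem (s_blocksize_bits == 10), this returns
+  * 1 << (12 - 10) == 4 buffers per page.
+  */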
+
+ /*
+  * shrink_journal_memory().
+  * Called when we're under memory pressure.  Free up all the written-back
+  * checkpointed metadata buffers.
+  */
+ void shrink_journal_memory(void)
+ {
+       struct list_head *list;
+
+       lock_kernel();
+       list_for_each(list, &all_journals) {
+               journal_t *journal =
+                       list_entry(list, journal_t, j_all_journals);
+               spin_lock(&journal_datalist_lock);
+               __journal_clean_checkpoint_list(journal);
+               spin_unlock(&journal_datalist_lock);
+       }
+       unlock_kernel();
+ }
+
+ /*
+  * Simple support for retrying memory allocations.  Introduced to help
+  * debug different VM deadlock avoidance strategies.
+  */
+ void * __jbd_kmalloc (char *where, size_t size, int flags, int retry)
+ {
+       void *p;
+       static unsigned long last_warning;
+
+       while (1) {
+               p = kmalloc(size, flags);
+               if (p)
+                       return p;
+               if (!retry)
+                       return NULL;
+               /* Log every retry for debugging.  Also log them to the
+                * syslog, but do rate-limiting on the non-debugging
+                * messages. */
+               jbd_debug(1, "ENOMEM in %s, retrying.\n", where);
+
+               if (time_after(jiffies, last_warning + 5*HZ)) {
+                       printk(KERN_NOTICE
+                              "ENOMEM in %s, retrying.\n", where);
+                       last_warning = jiffies;
+               }
+
+               current->policy |= SCHED_YIELD;
+               schedule();
+       }
+ }
+
+ /*
+  * Journal_head storage management
+  */
+ static kmem_cache_t *journal_head_cache;
+ #ifdef CONFIG_JBD_DEBUG
+ static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+ #endif
+
+ static int journal_init_journal_head_cache(void)
+ {
+       int retval;
+
+       J_ASSERT(journal_head_cache == 0);
+       journal_head_cache = kmem_cache_create("journal_head",
+                               sizeof(struct journal_head),
+                               0,              /* offset */
+                               0,              /* flags */
+                               NULL,           /* ctor */
+                               NULL);          /* dtor */
+       retval = 0;
+       if (journal_head_cache == 0) {
+               retval = -ENOMEM;
+               printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+       }
+       return retval;
+ }
+
+ static void journal_destroy_journal_head_cache(void)
+ {
+       J_ASSERT(journal_head_cache != NULL);
+       kmem_cache_destroy(journal_head_cache);
+       journal_head_cache = 0;
+ }
+
+ /*
+  * journal_head splicing and dicing
+  */
+ static struct journal_head *journal_alloc_journal_head(void)
+ {
+       struct journal_head *ret;
+       static unsigned long last_warning;
+
+ #ifdef CONFIG_JBD_DEBUG
+       atomic_inc(&nr_journal_heads);
+ #endif
+       ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+       if (ret == 0) {
+               jbd_debug(1, "out of memory for journal_head\n");
+               if (time_after(jiffies, last_warning + 5*HZ)) {
+                       printk(KERN_NOTICE "ENOMEM in " __FUNCTION__
+                              ", retrying.\n");
+                       last_warning = jiffies;
+               }
+               while (ret == 0) {
+                       current->policy |= SCHED_YIELD;
+                       schedule();
+                       ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+               }
+       }
+       return ret;
+ }
+
+ static void journal_free_journal_head(struct journal_head *jh)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+       atomic_dec(&nr_journal_heads);
+       memset(jh, 0x5b, sizeof(*jh));
+ #endif
+       kmem_cache_free(journal_head_cache, jh);
+ }
+
+ /*
+  * A journal_head is attached to a buffer_head whenever JBD has an
+  * interest in the buffer.
+  *
+  * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+  * is set.  This bit is tested in core kernel code where we need to take
+  * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
+  * there.
+  *
+  * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+  *
+  * When a buffer has its BH_JBD bit set it is immune from being released by
+  * core kernel code, mainly via ->b_count.
+  *
+  * A journal_head may be detached from its buffer_head when the journal_head's
+  * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
+  * Various places in JBD call journal_remove_journal_head() to indicate that the
+  * journal_head can be dropped if needed.
+  *
+  * Various places in the kernel want to attach a journal_head to a buffer_head
+  * _before_ attaching the journal_head to a transaction.  To protect the
+  * journal_head in this situation, journal_add_journal_head elevates the
+  * journal_head's b_jcount refcount by one.  The caller must call
+  * journal_unlock_journal_head() to undo this.
+  *
+  * So the typical usage would be:
+  *
+  *    (Attach a journal_head if needed.  Increments b_jcount)
+  *    struct journal_head *jh = journal_add_journal_head(bh);
+  *    ...
+  *    jh->b_transaction = xxx;
+  *    journal_unlock_journal_head(jh);
+  *
+  * Now, the journal_head's b_jcount is zero, but it is safe from being released
+  * because it has a non-zero b_transaction.
+  */
+
+ /*
+  * Give a buffer_head a journal_head.
+  *
+  * Doesn't need the journal lock.
+  * May sleep.
+  * Cannot be called with journal_datalist_lock held.
+  */
+ struct journal_head *journal_add_journal_head(struct buffer_head *bh)
+ {
+       struct journal_head *jh;
+
+       spin_lock(&journal_datalist_lock);
+       if (buffer_jbd(bh)) {
+               jh = bh2jh(bh);
+       } else {
+               J_ASSERT_BH(bh,
+                       (atomic_read(&bh->b_count) > 0) ||
+                       (bh->b_page && bh->b_page->mapping));
+               spin_unlock(&journal_datalist_lock);
+               jh = journal_alloc_journal_head();
+               memset(jh, 0, sizeof(*jh));
+               spin_lock(&journal_datalist_lock);
+
+               if (buffer_jbd(bh)) {
+                       /* Someone did it for us! */
+                       J_ASSERT_BH(bh, bh->b_private != NULL);
+                       journal_free_journal_head(jh);
+                       jh = bh->b_private;
+               } else {
+                       /*
+                        * We actually don't need jh_splice_lock when
+                        * adding a journal_head - only on removal.
+                        */
+                       spin_lock(&jh_splice_lock);
+                       set_bit(BH_JBD, &bh->b_state);
+                       bh->b_private = jh;
+                       jh->b_bh = bh;
+                       atomic_inc(&bh->b_count);
+                       spin_unlock(&jh_splice_lock);
+                       BUFFER_TRACE(bh, "added journal_head");
+               }
+       }
+       jh->b_jcount++;
+       spin_unlock(&journal_datalist_lock);
+       return bh->b_private;
+ }
+
+ /*
+  * journal_remove_journal_head(): if the buffer isn't attached to a transaction
+  * and has a zero b_jcount then remove and release its journal_head.  If we
+  * see that the buffer is not used by any transaction we also "logically"
+  * decrement ->b_count.
+  *
+  * We in fact take an additional increment on ->b_count as a convenience,
+  * because the caller usually wants to do additional things with the bh
+  * after calling here.
+  * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
+  * time.  Once the caller has run __brelse(), the buffer is eligible for
+  * reaping by try_to_free_buffers().
+  *
+  * Requires journal_datalist_lock.
+  */
+ void __journal_remove_journal_head(struct buffer_head *bh)
+ {
+       struct journal_head *jh = bh2jh(bh);
+
+       assert_spin_locked(&journal_datalist_lock);
+       J_ASSERT_JH(jh, jh->b_jcount >= 0);
+       atomic_inc(&bh->b_count);
+       if (jh->b_jcount == 0) {
+               if (jh->b_transaction == NULL &&
+                               jh->b_next_transaction == NULL &&
+                               jh->b_cp_transaction == NULL) {
+                       J_ASSERT_BH(bh, buffer_jbd(bh));
+                       J_ASSERT_BH(bh, jh2bh(jh) == bh);
+                       BUFFER_TRACE(bh, "remove journal_head");
+                       spin_lock(&jh_splice_lock);
+                       bh->b_private = NULL;
+                       jh->b_bh = NULL;        /* debug, really */
+                       clear_bit(BH_JBD, &bh->b_state);
+                       __brelse(bh);
+                       spin_unlock(&jh_splice_lock);
+                       journal_free_journal_head(jh);
+               } else {
+                       BUFFER_TRACE(bh, "journal_head was locked");
+               }
+       }
+ }
+
+ void journal_unlock_journal_head(struct journal_head *jh)
+ {
+       spin_lock(&journal_datalist_lock);
+       J_ASSERT_JH(jh, jh->b_jcount > 0);
+       --jh->b_jcount;
+       if (!jh->b_jcount && !jh->b_transaction) {
+               struct buffer_head *bh;
+               bh = jh2bh(jh);
+               __journal_remove_journal_head(bh);
+               __brelse(bh);
+       }
+
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ void journal_remove_journal_head(struct buffer_head *bh)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_remove_journal_head(bh);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * Module startup and shutdown
+  */
+
+ static int __init journal_init_caches(void)
+ {
+       int ret;
+
+       ret = journal_init_revoke_caches();
+       if (ret == 0)
+               ret = journal_init_journal_head_cache();
+       return ret;
+ }
+
+ static void journal_destroy_caches(void)
+ {
+       journal_destroy_revoke_caches();
+       journal_destroy_journal_head_cache();
+ }
+
+ static int __init journal_init(void)
+ {
+       int ret;
+
+       printk(KERN_INFO "Journalled Block Device driver loaded\n");
+       ret = journal_init_caches();
+       if (ret != 0)
+               journal_destroy_caches();
+       return ret;
+ }
+
+ static void __exit journal_exit(void)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+       int n = atomic_read(&nr_journal_heads);
+       if (n)
+               printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+ #endif
+       journal_destroy_caches();
+ }
+
+ MODULE_LICENSE("GPL");
+ module_init(journal_init);
+ module_exit(journal_exit);
+
diff -rc2P linux/fs/jbd/recovery.c linux-2.4.13/fs/jbd/recovery.c
*** linux/fs/jbd/recovery.c     Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/recovery.c      Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,586 ----
+ /*
+  * linux/fs/jbd/recovery.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1999
+  *
+  * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal recovery routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  */
+
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #endif
+
+ /*
+  * Maintain information about the progress of the recovery job, so that
+  * the different passes can carry information between them.
+  */
+ struct recovery_info
+ {
+       tid_t           start_transaction;
+       tid_t           end_transaction;
+
+       int             nr_replays;
+       int             nr_revokes;
+       int             nr_revoke_hits;
+ };
+
+ enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+ static int do_one_pass(journal_t *journal,
+                               struct recovery_info *info, enum passtype pass);
+ static int scan_revoke_records(journal_t *, struct buffer_head *,
+                               tid_t, struct recovery_info *);
+
+ #ifdef __KERNEL__
+
+ /* Release readahead buffers after use */
+ void journal_brelse_array(struct buffer_head *b[], int n)
+ {
+       while (--n >= 0)
+               brelse (b[n]);
+ }
+
+
+ /*
+  * When reading from the journal, we are going through the block device
+  * layer directly and so there is no readahead being done for us.  We
+  * need to implement any readahead ourselves if we want it to happen at
+  * all.  Recovery is basically one long sequential read, so make sure we
+  * do the IO in reasonably large chunks.
+  *
+  * This is not so critical that we need to be enormously clever about
+  * the readahead size, though.  128K is a purely arbitrary, good-enough
+  * fixed value.
+  */
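+
+ /*
+  * Worked example: with a 1K journal blocksize the window is
+  * 128 * 1024 / 1024 == 128 blocks, submitted to ll_rw_block() in
+  * batches of MAXBUF buffer_heads.
+  */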
+
+ #define MAXBUF 8
+ static int do_readahead(journal_t *journal, unsigned int start)
+ {
+       int err;
+       unsigned int max, nbufs, next, blocknr;
+       struct buffer_head *bh;
+
+       struct buffer_head * bufs[MAXBUF];
+
+       /* Do up to 128K of readahead */
+       max = start + (128 * 1024 / journal->j_blocksize);
+       if (max > journal->j_maxlen)
+               max = journal->j_maxlen;
+
+       /* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
+        * a time to the block device IO layer. */
+
+       nbufs = 0;
+
+       for (next = start; next < max; next++) {
+               blocknr = journal_bmap(journal, next);
+
+               if (!blocknr) {
+                       printk (KERN_ERR "JBD: bad block at offset %u\n",
+                               next);
+                       err = -EIO;
+                       goto failed;
+               }
+
+               bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+               if (!bh) {
+                       err = -ENOMEM;
+                       goto failed;
+               }
+
+               if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+                       bufs[nbufs++] = bh;
+                       if (nbufs == MAXBUF) {
+                               ll_rw_block(READ, nbufs, bufs);
+                               journal_brelse_array(bufs, nbufs);
+                               nbufs = 0;
+                       }
+               } else
+                       brelse(bh);
+       }
+
+       if (nbufs)
+               ll_rw_block(READ, nbufs, bufs);
+       err = 0;
+
+ failed:
+       if (nbufs)
+               journal_brelse_array(bufs, nbufs);
+       return err;
+ }
+
+ #endif /* __KERNEL__ */
+
+
+ /*
+  * Read a block from the journal
+  */
+
+ static int jread(struct buffer_head **bhp, journal_t *journal,
+                unsigned int offset)
+ {
+       unsigned int blocknr;
+       struct buffer_head *bh;
+
+       *bhp = NULL;
+
+       J_ASSERT (offset < journal->j_maxlen);
+
+       blocknr = journal_bmap(journal, offset);
+
+       if (!blocknr) {
+               printk (KERN_ERR "JBD: bad block at offset %u\n",
+                       offset);
+               return -EIO;
+       }
+
+       bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+       if (!bh)
+               return -ENOMEM;
+
+       if (!buffer_uptodate(bh)) {
+               /* If this is a brand new buffer, start readahead.
+                * Otherwise, we assume we are already reading it. */
+               if (!buffer_req(bh))
+                       do_readahead(journal, offset);
+               wait_on_buffer(bh);
+       }
+
+       if (!buffer_uptodate(bh)) {
+               printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+                       offset);
+               brelse(bh);
+               return -EIO;
+       }
+
+       *bhp = bh;
+       return 0;
+ }
+
+
+ /*
+  * Count the number of in-use tags in a journal descriptor block.
+  */
+
+ static int count_tags(struct buffer_head *bh, int size)
+ {
+       char *                  tagp;
+       journal_block_tag_t *   tag;
+       int                     nr = 0;
+
+       tagp = &bh->b_data[sizeof(journal_header_t)];
+
+       while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
+               tag = (journal_block_tag_t *) tagp;
+
+               nr++;
+               tagp += sizeof(journal_block_tag_t);
+               if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID)))
+                       tagp += 16;
+
+               if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG))
+                       break;
+       }
+
+       return nr;
+ }
+
+
+ /* Make sure we wrap around the log correctly! */
+ #define wrap(journal, var)                                            \
+ do {                                                                  \
+       if (var >= (journal)->j_last)                                   \
+               var -= ((journal)->j_last - (journal)->j_first);        \
+ } while (0)
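+
+ /*
+  * Example: with j_first == 1 and j_last == 1024, a block pointer that
+  * reaches 1024 wraps back to 1 (1024 - (1024 - 1)), i.e. the log is a
+  * circular buffer over blocks [j_first, j_last).
+  */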
+
+ /*
+  * journal_recover
+  *
+  * The primary function for recovering the log contents when mounting a
+  * journaled device.
+  *
+  * Recovery is done in three passes.  In the first pass, we look for the
+  * end of the log.  In the second, we assemble the list of revoke
+  * blocks.  In the third and final pass, we replay any un-revoked blocks
+  * in the log.
+  */
+
+ int journal_recover(journal_t *journal)
+ {
+       int                     err;
+       journal_superblock_t *  sb;
+
+       struct recovery_info    info;
+
+       memset(&info, 0, sizeof(info));
+       sb = journal->j_superblock;
+
+       /*
+        * The journal superblock's s_start field (the current log head)
+        * is always zero if, and only if, the journal was cleanly
+        * unmounted.
+        */
+
+       if (!sb->s_start) {
+               jbd_debug(1, "No recovery required, last transaction %d\n",
+                         ntohl(sb->s_sequence));
+               journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1;
+               return 0;
+       }
+
+
+       err = do_one_pass(journal, &info, PASS_SCAN);
+       if (!err)
+               err = do_one_pass(journal, &info, PASS_REVOKE);
+       if (!err)
+               err = do_one_pass(journal, &info, PASS_REPLAY);
+
+       jbd_debug(0, "JBD: recovery, exit status %d, "
+                 "recovered transactions %u to %u\n",
+                 err, info.start_transaction, info.end_transaction);
+       jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
+                 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+
+       /* Restart the log at the next transaction ID, thus invalidating
+        * any existing commit records in the log. */
+       journal->j_transaction_sequence = ++info.end_transaction;
+
+       journal_clear_revoke(journal);
+       fsync_no_super(journal->j_fs_dev);
+       return err;
+ }
+
+ /*
+  * journal_skip_recovery
+  *
+  * Locate any valid recovery information from the journal and set up the
+  * journal structures in memory to ignore it (presumably because the
+  * caller has evidence that it is out of date).
+  *
+  * We perform one pass over the journal to allow us to tell the user how
+  * much recovery information is being erased, and to let us initialise
+  * the journal transaction sequence numbers to the next unused ID.
+  */
+
+ int journal_skip_recovery(journal_t *journal)
+ {
+       int                     err;
+       journal_superblock_t *  sb;
+
+       struct recovery_info    info;
+
+       memset (&info, 0, sizeof(info));
+       sb = journal->j_superblock;
+
+       err = do_one_pass(journal, &info, PASS_SCAN);
+
+       if (err) {
+               printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+               ++journal->j_transaction_sequence;
+       } else {
+ #ifdef CONFIG_JBD_DEBUG
+               int dropped = info.end_transaction - ntohl(sb->s_sequence);
+ #endif
+
+               jbd_debug(0,
+                         "JBD: ignoring %d transaction%s from the journal.\n",
+                         dropped, (dropped == 1) ? "" : "s");
+               journal->j_transaction_sequence = ++info.end_transaction;
+       }
+
+       journal->j_tail = 0;
+
+       return err;
+ }
+
+ static int do_one_pass(journal_t *journal,
+                       struct recovery_info *info, enum passtype pass)
+ {
+
+       unsigned int            first_commit_ID, next_commit_ID;
+       unsigned long           next_log_block;
+       int                     err, success = 0;
+       journal_superblock_t *  sb;
+       journal_header_t *      tmp;
+       struct buffer_head *    bh;
+       unsigned int            sequence;
+       int                     blocktype;
+
+       /* Precompute the maximum number of metadata descriptors per descriptor block */
+       int                     MAX_BLOCKS_PER_DESC;
+       MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
+                              / sizeof(journal_block_tag_t));
+
+       /*
+        * First thing is to establish what we expect to find in the log
+        * (in terms of transaction IDs), and where (in terms of log
+        * block offsets): query the superblock.
+        */
+
+       sb = journal->j_superblock;
+       next_commit_ID = ntohl(sb->s_sequence);
+       next_log_block = ntohl(sb->s_start);
+
+       first_commit_ID = next_commit_ID;
+       if (pass == PASS_SCAN)
+               info->start_transaction = first_commit_ID;
+
+       jbd_debug(1, "Starting recovery pass %d\n", pass);
+
+       /*
+        * Now we walk through the log, transaction by transaction,
+        * making sure that each transaction has a commit block in the
+        * expected place.  Each complete transaction gets replayed back
+        * into the main filesystem.
+        */
+
+       while (1) {
+               int                     flags;
+               char *                  tagp;
+               journal_block_tag_t *   tag;
+               struct buffer_head *    obh;
+               struct buffer_head *    nbh;
+
+               /* If we already know where to stop the log traversal,
+                * check right now that we haven't gone past the end of
+                * the log. */
+
+               if (pass != PASS_SCAN)
+                       if (tid_geq(next_commit_ID, info->end_transaction))
+                               break;
+
+               jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+                         next_commit_ID, next_log_block, journal->j_last);
+
+               /* Skip over each chunk of the transaction looking for
+                * either the next descriptor block or the final commit
+                * record. */
+
+               jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+               err = jread(&bh, journal, next_log_block);
+               if (err)
+                       goto failed;
+
+               next_log_block++;
+               wrap(journal, next_log_block);
+
+               /* What kind of buffer is it?
+                *
+                * If it is a descriptor block, check that it has the
+                * expected sequence number.  Otherwise, we're all done
+                * here. */
+
+               tmp = (journal_header_t *)bh->b_data;
+
+               if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) {
+                       brelse(bh);
+                       break;
+               }
+
+               blocktype = ntohl(tmp->h_blocktype);
+               sequence = ntohl(tmp->h_sequence);
+               jbd_debug(3, "Found magic %d, sequence %d\n",
+                         blocktype, sequence);
+
+               if (sequence != next_commit_ID) {
+                       brelse(bh);
+                       break;
+               }
+
+               /* OK, we have a valid descriptor block which matches
+                * all of the sequence number checks.  What are we going
+                * to do with it?  That depends on the pass... */
+
+               switch(blocktype) {
+               case JFS_DESCRIPTOR_BLOCK:
+                       /* If it is a valid descriptor block, replay it
+                        * in pass REPLAY; otherwise, just skip over the
+                        * blocks it describes. */
+                       if (pass != PASS_REPLAY) {
+                               next_log_block +=
+                                       count_tags(bh, journal->j_blocksize);
+                               wrap(journal, next_log_block);
+                               brelse(bh);
+                               continue;
+                       }
+
+                       /* A descriptor block: we can now write all of
+                        * the data blocks.  Yay, useful work is finally
+                        * getting done here! */
+
+                       tagp = &bh->b_data[sizeof(journal_header_t)];
+                       while ((tagp - bh->b_data + sizeof(journal_block_tag_t))
+                              <= journal->j_blocksize) {
+                               unsigned long io_block;
+
+                               tag = (journal_block_tag_t *) tagp;
+                               flags = ntohl(tag->t_flags);
+
+                               io_block = next_log_block++;
+                               wrap(journal, next_log_block);
+                               err = jread(&obh, journal, io_block);
+                               if (err) {
+                                       /* Recover what we can, but
+                                        * report failure at the end. */
+                                       success = err;
+                                       printk (KERN_ERR
+                                               "JBD: IO error %d recovering "
+                                               "block %ld in log\n",
+                                               err, io_block);
+                               } else {
+                                       unsigned long blocknr;
+
+                                       J_ASSERT(obh != NULL);
+                                       blocknr = ntohl(tag->t_blocknr);
+
+                                       /* If the block has been
+                                        * revoked, then we're all done
+                                        * here. */
+                                       if (journal_test_revoke
+                                           (journal, blocknr,
+                                            next_commit_ID)) {
+                                               brelse(obh);
+                                               ++info->nr_revoke_hits;
+                                               goto skip_write;
+                                       }
+
+                                       /* Find a buffer for the new
+                                        * data being restored */
+                                       nbh = getblk(journal->j_fs_dev, blocknr,
+                                                    journal->j_blocksize);
+                                       if (nbh == NULL) {
+                                               printk(KERN_ERR
+                                                      "JBD: Out of memory "
+                                                      "during recovery.\n");
+                                               err = -ENOMEM;
+                                               brelse(bh);
+                                               brelse(obh);
+                                               goto failed;
+                                       }
+
+                                       memcpy(nbh->b_data, obh->b_data,
+                                                       journal->j_blocksize);
+                                       if (flags & JFS_FLAG_ESCAPE) {
+                                               /* Restore the escaped
+                                                * magic in the copy we
+                                                * are writing back, not
+                                                * in the log buffer. */
+                                               *((unsigned int *)nbh->b_data) =
+                                                       htonl(JFS_MAGIC_NUMBER);
+                                       }
+
+                                       BUFFER_TRACE(nbh, "marking dirty");
+                                       mark_buffer_dirty(nbh);
+                                       BUFFER_TRACE(nbh, "marking uptodate");
+                                       mark_buffer_uptodate(nbh, 1);
+                                       ++info->nr_replays;
+                                       /* ll_rw_block(WRITE, 1, &nbh); */
+                                       brelse(obh);
+                                       brelse(nbh);
+                               }
+
+                       skip_write:
+                               tagp += sizeof(journal_block_tag_t);
+                               if (!(flags & JFS_FLAG_SAME_UUID))
+                                       tagp += 16;
+
+                               if (flags & JFS_FLAG_LAST_TAG)
+                                       break;
+                       }
+
+                       brelse(bh);
+                       continue;
+
+               case JFS_COMMIT_BLOCK:
+                       /* Found an expected commit block: not much to
+                        * do other than move on to the next sequence
+                        * number. */
+                       brelse(bh);
+                       next_commit_ID++;
+                       continue;
+
+               case JFS_REVOKE_BLOCK:
+                       /* If we aren't in the REVOKE pass, then we can
+                        * just skip over this block. */
+                       if (pass != PASS_REVOKE) {
+                               brelse(bh);
+                               continue;
+                       }
+
+                       err = scan_revoke_records(journal, bh,
+                                                 next_commit_ID, info);
+                       brelse(bh);
+                       if (err)
+                               goto failed;
+                       continue;
+
+               default:
+                       jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+                                 blocktype);
+                       goto done;
+               }
+       }
+
+  done:
+       /*
+        * We broke out of the log scan loop: either we came to the
+        * known end of the log or we found an unexpected block in the
+        * log.  If the latter happened, then we know that the "current"
+        * transaction marks the end of the valid log.
+        */
+
+       if (pass == PASS_SCAN)
+               info->end_transaction = next_commit_ID;
+       else {
+               /* It's really bad news if different passes end up at
+                * different places (but possible due to IO errors). */
+               if (info->end_transaction != next_commit_ID) {
+                       printk (KERN_ERR "JBD: recovery pass %d ended at "
+                               "transaction %u, expected %u\n",
+                               pass, next_commit_ID, info->end_transaction);
+                       if (!success)
+                               success = -EIO;
+               }
+       }
+
+       return success;
+
+  failed:
+       return err;
+ }
+
+
+ /* Scan a revoke record, marking all blocks mentioned as revoked. */
+
+ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
+                              tid_t sequence, struct recovery_info *info)
+ {
+       journal_revoke_header_t *header;
+       int offset, max;
+
+       header = (journal_revoke_header_t *) bh->b_data;
+       offset = sizeof(journal_revoke_header_t);
+       max = ntohl(header->r_count);
+
+       while (offset < max) {
+               unsigned long blocknr;
+               int err;
+
+               blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset)));
+               offset += 4;
+               err = journal_set_revoke(journal, blocknr, sequence);
+               if (err)
+                       return err;
+               ++info->nr_revokes;
+       }
+       return 0;
+ }
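+
+ /* A quick worked example of the format consumed above (a sketch,
+  * assuming the usual 16-byte journal_revoke_header_t, i.e. a 12-byte
+  * journal_header_t plus r_count): a revoke block is the header
+  * followed by packed 32-bit big-endian block numbers, and r_count is
+  * the total byte count including the header.  So with r_count == 24,
+  * the loop reads records at offsets 16 and 20: exactly two revoked
+  * block numbers. */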
diff -rc2P linux/fs/jbd/revoke.c linux-2.4.13/fs/jbd/revoke.c
*** linux/fs/jbd/revoke.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/revoke.c        Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,631 ----
+ /*
+  * linux/fs/jbd/revoke.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 2000
+  *
+  * Copyright 2000 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal revoke routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  *
+  * Revoke is the mechanism used to prevent old log records for deleted
+  * metadata from being replayed on top of newer data using the same
+  * blocks.  The revoke mechanism is used in two separate places:
+  *
+  * + Commit: during commit we write the entire list of the current
+  *   transaction's revoked blocks to the journal
+  *
+  * + Recovery: during recovery we record the transaction ID of all
+  *   revoked blocks.  If there are multiple revoke records in the log
+  *   for a single block, only the last one counts, and if there is a log
+  *   entry for a block beyond the last revoke, then that log entry still
+  *   gets replayed.
+  *
+  * We can get interactions between revokes and new log data within a
+  * single transaction:
+  *
+  * Block is revoked and then journaled:
+  *   The desired end result is the journaling of the new block, so we
+  *   cancel the revoke before the transaction commits.
+  *
+  * Block is journaled and then revoked:
+  *   The revoke must take precedence over the write of the block, so we
+  *   need either to cancel the journal entry or to write the revoke
+  *   later in the log than the log block.  In this case, we choose the
+  *   latter: journaling a block cancels any revoke record for that block
+  *   in the current transaction, so any revoke for that block in the
+  *   transaction must have happened after the block was journaled and so
+  *   the revoke must take precedence.
+  *
+  * Block is revoked and then written as data:
+  *   The data write is allowed to succeed, but the revoke is _not_
+  *   cancelled.  We still need to prevent old log records from
+  *   overwriting the new data.  We don't even need to clear the revoke
+  *   bit here.
+  *
+  * Revoke information on buffers is a tri-state value:
+  *
+  * RevokeValid clear: no cached revoke status, need to look it up
+  * RevokeValid set, Revoked clear:
+  *                    buffer has not been revoked, and cancel_revoke
+  *                    need do nothing.
+  * RevokeValid set, Revoked set:
+  *                    buffer has been revoked.
+  */
+
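+ /* A minimal sketch of how the tri-state is read back (illustrative
+  * only: the helper below is hypothetical, and the real logic lives in
+  * journal_cancel_revoke further down this file): */
+ #if 0
+ static int revoke_status_cached(struct buffer_head *bh)
+ {
+       if (!test_bit(BH_RevokeValid, &bh->b_state))
+               return -1;      /* unknown: must search the revoke table */
+       return test_bit(BH_Revoked, &bh->b_state) ? 1 : 0;
+ }
+ #endif
+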
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/list.h>
+ #include <linux/smp_lock.h>
+ #include <linux/init.h>
+ #endif
+
+ static kmem_cache_t *revoke_record_cache;
+ static kmem_cache_t *revoke_table_cache;
+
+ /* Each revoke record represents one single revoked block.  During
+    journal replay, this involves recording the transaction ID of the
+    last transaction to revoke this block. */
+
+ struct jbd_revoke_record_s
+ {
+       struct list_head  hash;
+       tid_t             sequence;     /* Used for recovery only */
+       unsigned long     blocknr;
+ };
+
+
+ /* The revoke table is just a simple hash table of revoke records. */
+ struct jbd_revoke_table_s
+ {
+       /* It is conceivable that we might want a larger hash table
+        * for recovery.  Must be a power of two. */
+       int               hash_size;
+       int               hash_shift;
+       struct list_head *hash_table;
+ };
+
+
+ #ifdef __KERNEL__
+ static void write_one_revoke_record(journal_t *, transaction_t *,
+                                   struct journal_head **, int *,
+                                   struct jbd_revoke_record_s *);
+ static void flush_descriptor(journal_t *, struct journal_head *, int);
+ #endif
+
+ /* Utility functions to maintain the revoke table */
+
+ /* Borrowed from buffer.c: this is a tried and tested block hash function */
+ static inline int hash(journal_t *journal, unsigned long block)
+ {
+       struct jbd_revoke_table_s *table = journal->j_revoke;
+       int hash_shift = table->hash_shift;
+
+       return ((block << (hash_shift - 6)) ^
+               (block >> 13) ^
+               (block << (hash_shift - 12))) & (table->hash_size - 1);
+ }
+
+ int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
+ {
+       struct list_head *hash_list;
+       struct jbd_revoke_record_s *record;
+
+ repeat:
+       record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
+       if (!record)
+               goto oom;
+
+       record->sequence = seq;
+       record->blocknr = blocknr;
+       hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+       list_add(&record->hash, hash_list);
+       return 0;
+
+ oom:
+       if (!journal_oom_retry)
+               return -ENOMEM;
+       jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
+       current->policy |= SCHED_YIELD;
+       schedule();
+       goto repeat;
+ }
+
+ /* Find a revoke record in the journal's hash table. */
+
+ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
+                                                     unsigned long blocknr)
+ {
+       struct list_head *hash_list;
+       struct jbd_revoke_record_s *record;
+
+       hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+
+       record = (struct jbd_revoke_record_s *) hash_list->next;
+       while (&(record->hash) != hash_list) {
+               if (record->blocknr == blocknr)
+                       return record;
+               record = (struct jbd_revoke_record_s *) record->hash.next;
+       }
+       return NULL;
+ }
+
+ int __init journal_init_revoke_caches(void)
+ {
+       revoke_record_cache = kmem_cache_create("revoke_record",
+                                          sizeof(struct jbd_revoke_record_s),
+                                          0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+       if (revoke_record_cache == 0)
+               return -ENOMEM;
+
+       revoke_table_cache = kmem_cache_create("revoke_table",
+                                          sizeof(struct jbd_revoke_table_s),
+                                          0, 0, NULL, NULL);
+       if (revoke_table_cache == 0) {
+               kmem_cache_destroy(revoke_record_cache);
+               revoke_record_cache = NULL;
+               return -ENOMEM;
+       }
+       return 0;
+ }
+
+ void journal_destroy_revoke_caches(void)
+ {
+       kmem_cache_destroy(revoke_record_cache);
+       revoke_record_cache = 0;
+       kmem_cache_destroy(revoke_table_cache);
+       revoke_table_cache = 0;
+ }
+
+ /* Initialise the revoke table for a given journal to a given size. */
+
+ int journal_init_revoke(journal_t *journal, int hash_size)
+ {
+       int shift, tmp;
+
+       J_ASSERT (journal->j_revoke == NULL);
+
+       journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+       if (!journal->j_revoke)
+               return -ENOMEM;
+
+       /* Check that the hash_size is a power of two */
+       J_ASSERT ((hash_size & (hash_size-1)) == 0);
+
+       journal->j_revoke->hash_size = hash_size;
+
+       shift = 0;
+       tmp = hash_size;
+       while((tmp >>= 1UL) != 0UL)
+               shift++;
+       journal->j_revoke->hash_shift = shift;
+
+       journal->j_revoke->hash_table =
+               kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+       if (!journal->j_revoke->hash_table) {
+               kmem_cache_free(revoke_table_cache, journal->j_revoke);
+               journal->j_revoke = NULL;
+               return -ENOMEM;
+       }
+
+       for (tmp = 0; tmp < hash_size; tmp++)
+               INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+
+       return 0;
+ }
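+
+ /* Typical setup, as a hedged sketch: the journal core is expected to
+  * call this once per journal, e.g. with the default power-of-two
+  * table size from the jbd headers (JOURNAL_REVOKE_DEFAULT_HASH, at
+  * the time of writing):
+  *
+  *    err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+  */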
+
+ /* Destroy a journal's revoke table.  The table must already be empty! */
+
+ void journal_destroy_revoke(journal_t *journal)
+ {
+       struct jbd_revoke_table_s *table;
+       struct list_head *hash_list;
+       int i;
+
+       table = journal->j_revoke;
+       if (!table)
+               return;
+
+       for (i=0; i<table->hash_size; i++) {
+               hash_list = &table->hash_table[i];
+               J_ASSERT (list_empty(hash_list));
+       }
+
+       kfree(table->hash_table);
+       kmem_cache_free(revoke_table_cache, table);
+       journal->j_revoke = NULL;
+ }
+
+
+ #ifdef __KERNEL__
+
+ /*
+  * journal_revoke: revoke a given buffer_head from the journal.  This
+  * prevents the block from being replayed during recovery if we take a
+  * crash after this current transaction commits.  Any subsequent
+  * metadata writes of the buffer in this transaction cancel the
+  * revoke.
+  *
+  * Note that this call may block --- it is up to the caller to make
+  * sure that there are no further calls to journal_write_metadata
+  * before the revoke is complete.  In ext3, this implies calling the
+  * revoke before clearing the block bitmap when we are deleting
+  * metadata.
+  *
+  * Revoke performs a journal_forget on any buffer_head passed in as a
+  * parameter, but does _not_ forget the buffer_head if the bh was only
+  * found implicitly.
+  *
+  * bh_in may not be a journalled buffer - it may have come off
+  * the hash tables without an attached journal_head.
+  *
+  * If bh_in is non-zero, journal_revoke() will decrement its b_count
+  * by one.
+  */
+
+ int journal_revoke(handle_t *handle, unsigned long blocknr,
+                  struct buffer_head *bh_in)
+ {
+       struct buffer_head *bh = NULL;
+       journal_t *journal;
+       kdev_t dev;
+       int err;
+
+       if (bh_in)
+               BUFFER_TRACE(bh_in, "enter");
+
+       journal = handle->h_transaction->t_journal;
+       if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
+               J_ASSERT (!"Cannot set revoke feature!");
+               return -EINVAL;
+       }
+
+       dev = journal->j_fs_dev;
+       bh = bh_in;
+
+       if (!bh) {
+               bh = get_hash_table(dev, blocknr, journal->j_blocksize);
+               if (bh)
+                       BUFFER_TRACE(bh, "found on hash");
+       }
+ #ifdef JBD_EXPENSIVE_CHECKING
+       else {
+               struct buffer_head *bh2;
+
+               /* If there is a different buffer_head lying around in
+                * memory anywhere... */
+               bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
+               if (bh2) {
+                       /* ... and it has RevokeValid status... */
+                       if ((bh2 != bh) &&
+                           test_bit(BH_RevokeValid, &bh2->b_state))
+                               /* ...then it better be revoked too,
+                                * since it's illegal to create a revoke
+                                * record against a buffer_head which is
+                                * not marked revoked --- that would
+                                * risk missing a subsequent revoke
+                                * cancel. */
+                               J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
+                                                         bh2->b_state));
+                       __brelse(bh2);
+               }
+       }
+ #endif
+
+       /* We really ought not ever to revoke twice in a row without
+            first having the revoke cancelled: it's illegal to free a
+            block twice without allocating it in between! */
+       if (bh) {
+               J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
+               set_bit(BH_Revoked, &bh->b_state);
+               set_bit(BH_RevokeValid, &bh->b_state);
+               if (bh_in) {
+                       BUFFER_TRACE(bh_in, "call journal_forget");
+                       journal_forget(handle, bh_in);
+               } else {
+                       BUFFER_TRACE(bh, "call brelse");
+                       __brelse(bh);
+               }
+       }
+
+       lock_journal(journal);
+       jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+       err = insert_revoke_hash(journal, blocknr,
+                               handle->h_transaction->t_tid);
+       unlock_journal(journal);
+       BUFFER_TRACE(bh_in, "exit");
+       return err;
+ }
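+
+ /* An illustrative caller, as a sketch only (the function and flow
+  * below are hypothetical, loosely modelled on how a filesystem frees
+  * metadata): revoke the block *before* clearing its bitmap bit, per
+  * the ordering requirement documented above. */
+ #if 0
+ static int example_free_metadata(handle_t *handle, struct buffer_head *bh,
+                                unsigned long blocknr)
+ {
+       int err = journal_revoke(handle, blocknr, bh);
+       if (err)
+               return err;
+       /* ... only now is it safe to clear the block bitmap bit ... */
+       return 0;
+ }
+ #endif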
+
+ /*
+  * Cancel an outstanding revoke.  For use only internally by the
+  * journaling code (called from journal_get_write_access).
+  *
+  * We trust the BH_Revoked bit on the buffer if the buffer is already
+  * being journaled: if there is no revoke pending on the buffer, then we
+  * don't do anything here.
+  *
+  * This would break if it were possible for a buffer to be revoked and
+  * discarded, and then reallocated within the same transaction.  In such
+  * a case we would have lost the revoked bit, but when we arrived here
+  * the second time we would still have a pending revoke to cancel.  So,
+  * do not trust the Revoked bit on buffers unless RevokeValid is also
+  * set.
+  *
+  * The caller must have the journal locked.
+  */
+ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+ {
+       struct jbd_revoke_record_s *record;
+       journal_t *journal = handle->h_transaction->t_journal;
+       int need_cancel;
+       int did_revoke = 0;     /* akpm: debug */
+       struct buffer_head *bh = jh2bh(jh);
+
+       jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+
+       /* Is the existing Revoke bit valid?  If so, we trust it, and
+        * only perform the full cancel if the revoke bit is set.  If
+        * not, we can't trust the revoke bit, and we need to do the
+        * full search for a revoke record. */
+       if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
+               need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
+       else {
+               need_cancel = 1;
+               clear_bit(BH_Revoked, &bh->b_state);
+       }
+
+       if (need_cancel) {
+               record = find_revoke_record(journal, bh->b_blocknr);
+               if (record) {
+                       jbd_debug(4, "cancelled existing revoke on "
+                                 "blocknr %lu\n", bh->b_blocknr);
+                       list_del(&record->hash);
+                       kmem_cache_free(revoke_record_cache, record);
+                       did_revoke = 1;
+               }
+       }
+
+ #ifdef JBD_EXPENSIVE_CHECKING
+       /* There better not be one left behind by now! */
+       record = find_revoke_record(journal, bh->b_blocknr);
+       J_ASSERT_JH(jh, record == NULL);
+ #endif
+
+       /* Finally, have we just cleared revoke on an unhashed
+        * buffer_head?  If so, we'd better make sure we clear the
+        * revoked status on any hashed alias too, otherwise the revoke
+        * state machine will get very upset later on. */
+       if (need_cancel && !bh->b_pprev) {
+               struct buffer_head *bh2;
+               bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+               if (bh2) {
+                       clear_bit(BH_Revoked, &bh2->b_state);
+                       __brelse(bh2);
+               }
+       }
+
+       return did_revoke;
+ }
+
+
+ /*
+  * Write revoke records to the journal for all entries in the current
+  * revoke hash, deleting the entries as we go.
+  *
+  * Called with the journal lock held.
+  */
+
+ void journal_write_revoke_records(journal_t *journal,
+                                 transaction_t *transaction)
+ {
+       struct journal_head *descriptor;
+       struct jbd_revoke_record_s *record;
+       struct jbd_revoke_table_s *revoke;
+       struct list_head *hash_list;
+       int i, offset, count;
+
+       descriptor = NULL;
+       offset = 0;
+       count = 0;
+       revoke = journal->j_revoke;
+
+       for (i = 0; i < revoke->hash_size; i++) {
+               hash_list = &revoke->hash_table[i];
+
+               while (!list_empty(hash_list)) {
+                       record = (struct jbd_revoke_record_s *)
+                               hash_list->next;
+                       write_one_revoke_record(journal, transaction,
+                                               &descriptor, &offset,
+                                               record);
+                       count++;
+                       list_del(&record->hash);
+                       kmem_cache_free(revoke_record_cache, record);
+               }
+       }
+       if (descriptor)
+               flush_descriptor(journal, descriptor, offset);
+       jbd_debug(1, "Wrote %d revoke records\n", count);
+ }
+
+ /*
+  * Write out one revoke record.  We need to create a new descriptor
+  * block if the old one is full or if we have not already created one.
+  */
+
+ static void write_one_revoke_record(journal_t *journal,
+                                   transaction_t *transaction,
+                                   struct journal_head **descriptorp,
+                                   int *offsetp,
+                                   struct jbd_revoke_record_s *record)
+ {
+       struct journal_head *descriptor;
+       int offset;
+       journal_header_t *header;
+
+       /* If we are already aborting, this all becomes a noop.  We
+            still need to go round the loop in
+            journal_write_revoke_records in order to free all of the
+            revoke records: only the IO to the journal is omitted. */
+       if (is_journal_aborted(journal))
+               return;
+
+       descriptor = *descriptorp;
+       offset = *offsetp;
+
+       /* Make sure we have a descriptor with space left for the record */
+       if (descriptor) {
+               if (offset == journal->j_blocksize) {
+                       flush_descriptor(journal, descriptor, offset);
+                       descriptor = NULL;
+               }
+       }
+
+       if (!descriptor) {
+               descriptor = journal_get_descriptor_buffer(journal);
+               header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+               header->h_magic     = htonl(JFS_MAGIC_NUMBER);
+               header->h_blocktype = htonl(JFS_REVOKE_BLOCK);
+               header->h_sequence  = htonl(transaction->t_tid);
+
+               /* Record it so that we can wait for IO completion later */
+               JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
+               journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+
+               offset = sizeof(journal_revoke_header_t);
+               *descriptorp = descriptor;
+       }
+
+       * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) =
+               htonl(record->blocknr);
+       offset += 4;
+       *offsetp = offset;
+ }
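+
+ /* Capacity, as a worked example (assuming a 4096-byte journal block
+  * and the usual 16-byte journal_revoke_header_t): one descriptor
+  * holds (4096 - 16) / 4 == 1020 revoke records before the offset test
+  * above forces a flush and a fresh descriptor. */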
+
+ /*
+  * Flush a revoke descriptor out to the journal.  If we are aborting,
+  * this is a noop; otherwise we are generating a buffer which needs to
+  * be waited for during commit, so it has to go onto the appropriate
+  * journal buffer list.
+  */
+
+ static void flush_descriptor(journal_t *journal,
+                            struct journal_head *descriptor,
+                            int offset)
+ {
+       journal_revoke_header_t *header;
+
+       if (is_journal_aborted(journal)) {
+               JBUFFER_TRACE(descriptor, "brelse");
+               __brelse(jh2bh(descriptor));
+               return;
+       }
+
+       header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+       header->r_count = htonl(offset);
+       set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
+       {
+               struct buffer_head *bh = jh2bh(descriptor);
+               BUFFER_TRACE(bh, "write");
+               ll_rw_block (WRITE, 1, &bh);
+       }
+ }
+
+ #endif
+
+ /*
+  * Revoke support for recovery.
+  *
+  * Recovery needs to be able to:
+  *
+  *  record all revoke records, including the tid of the latest instance
+  *  of each revoke in the journal
+  *
+  *  check whether a given block in a given transaction should be replayed
+  *  (ie. has not been revoked by a revoke record in that or a subsequent
+  *  transaction)
+  *
+  *  empty the revoke table after recovery.
+  */
+
+ /*
+  * First, setting revoke records.  We create a new revoke record for
+  * every block ever revoked in the log as we scan it for recovery, and
+  * we update the existing records if we find multiple revokes for a
+  * single block.
+  */
+
+ int journal_set_revoke(journal_t *journal,
+                      unsigned long blocknr,
+                      tid_t sequence)
+ {
+       struct jbd_revoke_record_s *record;
+
+       record = find_revoke_record(journal, blocknr);
+       if (record) {
+               /* If we have multiple occurrences, only record the
+                * latest sequence number in the hashed record */
+               if (tid_gt(sequence, record->sequence))
+                       record->sequence = sequence;
+               return 0;
+       }
+       return insert_revoke_hash(journal, blocknr, sequence);
+ }
+
+ /*
+  * Test revoke records.  For a given block referenced in the log, has
+  * that block been revoked?  A revoke record with a given transaction
+  * sequence number revokes all blocks in that transaction and earlier
+  * ones, but later transactions still need to be replayed.
+  */
+
+ int journal_test_revoke(journal_t *journal,
+                       unsigned long blocknr,
+                       tid_t sequence)
+ {
+       struct jbd_revoke_record_s *record;
+
+       record = find_revoke_record(journal, blocknr);
+       if (!record)
+               return 0;
+       if (tid_gt(sequence, record->sequence))
+               return 0;
+       return 1;
+ }
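+
+ /* A sketch of the replay-time check (illustrative; the real caller is
+  * the replay loop in recovery.c, which jumps to skip_write for blocks
+  * that this test reports as revoked): */
+ #if 0
+       if (journal_test_revoke(journal, blocknr, next_commit_ID))
+               goto skip_write;        /* do not replay the logged copy */
+ #endif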
+
+ /*
+  * Finally, once recovery is over, we need to clear the revoke table so
+  * that it can be reused by the running filesystem.
+  */
+
+ void journal_clear_revoke(journal_t *journal)
+ {
+       int i;
+       struct list_head *hash_list;
+       struct jbd_revoke_record_s *record;
+       struct jbd_revoke_table_s *revoke;
+
+       revoke = journal->j_revoke;
+
+       for (i = 0; i < revoke->hash_size; i++) {
+               hash_list = &revoke->hash_table[i];
+               while (!list_empty(hash_list)) {
+                       record = (struct jbd_revoke_record_s*) hash_list->next;
+                       list_del(&record->hash);
+                       kmem_cache_free(revoke_record_cache, record);
+               }
+       }
+ }
+
diff -rc2P linux/fs/jbd/transaction.c linux-2.4.13/fs/jbd/transaction.c
*** linux/fs/jbd/transaction.c  Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/transaction.c   Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,2078 ----
+ /*
+  * linux/fs/jbd/transaction.c
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Generic filesystem transaction handling code; part of the ext2fs
+  * journaling system.
+  *
+  * This file manages transactions (compound commits managed by the
+  * journaling code) and handles (individual atomic operations by the
+  * filesystem).
+  */
+
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/timer.h>
+ #include <linux/smp_lock.h>
+ #include <linux/mm.h>
+ #include <linux/swap.h>       /* Uggh... needed for buffermem_pages */
+
+
+ extern spinlock_t journal_datalist_lock;
+
+ /*
+  * get_transaction: obtain a new transaction_t object.
+  *
+  * Simply allocate and initialise a new transaction.  Create it in
+  * RUNNING state and add it to the current journal (which should not
+  * have an existing running transaction: we only make a new transaction
+  * once we have started to commit the old one).
+  *
+  * Preconditions:
+  *    The journal MUST be locked.  We don't perform atomic mallocs on the
+  *    new transaction and we can't block without protecting against other
+  *    processes trying to touch the journal while it is in transition.
+  */
+
+ static transaction_t * get_transaction (journal_t * journal, int is_try)
+ {
+       transaction_t * transaction;
+
+       transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS);
+       if (!transaction)
+               return NULL;
+
+       memset (transaction, 0, sizeof (transaction_t));
+
+       transaction->t_journal = journal;
+       transaction->t_state = T_RUNNING;
+       transaction->t_tid = journal->j_transaction_sequence++;
+       transaction->t_expires = jiffies + journal->j_commit_interval;
+
+       /* Set up the commit timer for the new transaction. */
+       J_ASSERT (!journal->j_commit_timer_active);
+       journal->j_commit_timer_active = 1;
+       journal->j_commit_timer->expires = transaction->t_expires;
+       add_timer(journal->j_commit_timer);
+
+       J_ASSERT (journal->j_running_transaction == NULL);
+       journal->j_running_transaction = transaction;
+
+       return transaction;
+ }
+
+ /*
+  * Handle management.
+  *
+  * A handle_t is an object which represents a single atomic update to a
+  * filesystem, and which tracks all of the modifications which form part
+  * of that one update.
+  */
+
+ /*
+  * start_this_handle: Given a handle, deal with any locking or stalling
+  * needed to make sure that there is enough journal space for the handle
+  * to begin.  Attach the handle to a transaction and set up the
+  * transaction's buffer credits.
+  */
+
+ static int start_this_handle(journal_t *journal, handle_t *handle)
+ {
+       transaction_t *transaction;
+       int needed;
+       int nblocks = handle->h_buffer_credits;
+
+       jbd_debug(3, "New handle %p going live.\n", handle);
+
+ repeat:
+
+       lock_journal(journal);
+
+       if (is_journal_aborted(journal) ||
+           (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+               unlock_journal(journal);
+               return -EROFS;
+       }
+
+       /* Wait on the journal's transaction barrier if necessary */
+       if (journal->j_barrier_count) {
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_transaction_locked);
+               goto repeat;
+       }
+
+ repeat_locked:
+       if (!journal->j_running_transaction)
+               get_transaction(journal, 0);
+       /* @@@ Error? */
+       J_ASSERT(journal->j_running_transaction);
+
+       transaction = journal->j_running_transaction;
+
+       /* If the current transaction is locked down for commit, wait
+        * for the lock to be released. */
+
+       if (transaction->t_state == T_LOCKED) {
+               unlock_journal(journal);
+               jbd_debug(3, "Handle %p stalling...\n", handle);
+               sleep_on(&journal->j_wait_transaction_locked);
+               goto repeat;
+       }
+
+       /* If there is not enough space left in the log to write all
+        * potential buffers requested by this operation, we need to
+        * stall pending a log checkpoint to free some more log
+        * space. */
+
+       needed = transaction->t_outstanding_credits + nblocks;
+
+       if (needed > journal->j_max_transaction_buffers) {
+               /* If the current transaction is already too large, then
+                * start to commit it: we can then go back and attach
+                * this handle to a new transaction. */
+
+               jbd_debug(2, "Handle %p starting new commit...\n", handle);
+               log_start_commit(journal, transaction);
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_transaction_locked);
+               lock_journal(journal);
+               goto repeat_locked;
+       }
+
+       /*
+        * The commit code assumes that it can get enough log space
+        * without forcing a checkpoint.  This is *critical* for
+        * correctness: a checkpoint of a buffer which is also
+        * associated with a committing transaction creates a deadlock,
+        * so commit simply cannot force through checkpoints.
+        *
+        * We must therefore ensure the necessary space in the journal
+        * *before* starting to dirty potentially checkpointed buffers
+        * in the new transaction.
+        *
+        * The worst part is, any transaction currently committing can
+        * reduce the free space arbitrarily.  Be careful to account for
+        * those buffers when checkpointing.
+        */
+
+       /*
+        * @@@ AKPM: This seems rather over-defensive.  We're giving commit
+        * a _lot_ of headroom: 1/4 of the journal plus the size of
+        * the committing transaction.  Really, we only need to give it
+        * committing_transaction->t_outstanding_credits plus "enough" for
+        * the log control blocks.
+        * Also, this test is inconsistent with the matching one in
+        * journal_extend().
+        */
+       needed = journal->j_max_transaction_buffers;
+       if (journal->j_committing_transaction)
+               needed += journal->j_committing_transaction->
+                                       t_outstanding_credits;
+
+       if (log_space_left(journal) < needed) {
+               jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+               log_wait_for_space(journal, needed);
+               goto repeat_locked;
+       }
+
+       /* OK, account for the buffers that this operation expects to
+        * use and add the handle to the running transaction. */
+
+       handle->h_transaction = transaction;
+       transaction->t_outstanding_credits += nblocks;
+       transaction->t_updates++;
+       transaction->t_handle_count++;
+       jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+                 handle, nblocks, transaction->t_outstanding_credits,
+                 log_space_left(journal));
+
+       unlock_journal(journal);
+
+       return 0;
+ }
+
+ /*
+  * Obtain a new handle.
+  *
+  * We make sure that the transaction can guarantee at least nblocks of
+  * modified buffers in the log.  We block until the log can guarantee
+  * that much space.
+  *
+  * This function is visible to journal users (like ext2fs), so is not
+  * called with the journal already locked.
+  *
+  * Return a pointer to a newly allocated handle, or an ERR_PTR value
+  * on failure.
+  */
+
+ handle_t *journal_start(journal_t *journal, int nblocks)
+ {
+       handle_t *handle = journal_current_handle();
+       int err;
+
+       if (!journal)
+               return ERR_PTR(-EROFS);
+
+       if (handle) {
+               J_ASSERT(handle->h_transaction->t_journal == journal);
+               handle->h_ref++;
+               return handle;
+       }
+
+       handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+       if (!handle)
+               return ERR_PTR(-ENOMEM);
+       memset (handle, 0, sizeof (handle_t));
+
+       handle->h_buffer_credits = nblocks;
+       handle->h_ref = 1;
+       current->journal_info = handle;
+
+       err = start_this_handle(journal, handle);
+       if (err < 0) {
+               kfree(handle);
+               current->journal_info = NULL;
+               return ERR_PTR(err);
+       }
+
+       return handle;
+ }
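+
+ /* Typical use by a filesystem, as a hedged sketch (journal_stop() is
+  * the usual counterpart, defined later in this file): */
+ #if 0
+       handle_t *handle = journal_start(journal, 3);   /* 3 buffer credits */
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+       /* ... journal_get_write_access() / journal_dirty_metadata() ... */
+       journal_stop(handle);
+ #endif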
+
+ /*
+  * Return zero on success
+  */
+ static int try_start_this_handle(journal_t *journal, handle_t *handle)
+ {
+       transaction_t *transaction;
+       int needed;
+       int nblocks = handle->h_buffer_credits;
+       int ret = 0;
+
+       jbd_debug(3, "New handle %p maybe going live.\n", handle);
+
+       lock_journal(journal);
+
+       if (is_journal_aborted(journal) ||
+           (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+               ret = -EROFS;
+               goto fail_unlock;
+       }
+
+       if (journal->j_barrier_count)
+               goto fail_unlock;
+
+       if (!journal->j_running_transaction && get_transaction(journal, 1) == 0)
+               goto fail_unlock;
+
+       transaction = journal->j_running_transaction;
+       if (transaction->t_state == T_LOCKED)
+               goto fail_unlock;
+
+       needed = transaction->t_outstanding_credits + nblocks;
+       /* We could run log_start_commit here */
+       if (needed > journal->j_max_transaction_buffers)
+               goto fail_unlock;
+
+       needed = journal->j_max_transaction_buffers;
+       if (journal->j_committing_transaction)
+               needed += journal->j_committing_transaction->
+                                               t_outstanding_credits;
+
+       if (log_space_left(journal) < needed)
+               goto fail_unlock;
+
+       handle->h_transaction = transaction;
+       transaction->t_outstanding_credits += nblocks;
+       transaction->t_updates++;
+       jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+                 handle, nblocks, transaction->t_outstanding_credits,
+                 log_space_left(journal));
+       unlock_journal(journal);
+       return 0;
+
+ fail_unlock:
+       unlock_journal(journal);
+       if (ret >= 0)
+               ret = -1;
+       return ret;
+ }
+
+ /*
+  * Try to start a handle without blocking.  If we weren't able to,
+  * return an ERR_PTR value.
+  */
+ handle_t *journal_try_start(journal_t *journal, int nblocks)
+ {
+       handle_t *handle = journal_current_handle();
+       int err;
+
+       if (!journal)
+               return ERR_PTR(-EROFS);
+
+       if (handle) {
+               jbd_debug(4, "h_ref %d -> %d\n",
+                               handle->h_ref,
+                               handle->h_ref + 1);
+               J_ASSERT(handle->h_transaction->t_journal == journal);
+               if (is_handle_aborted(handle))
+                       return ERR_PTR(-EIO);
+               handle->h_ref++;
+               return handle;
+       } else {
+               jbd_debug(4, "no current transaction\n");
+       }
+
+       if (is_journal_aborted(journal))
+               return ERR_PTR(-EIO);
+
+       handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+       if (!handle)
+               return ERR_PTR(-ENOMEM);
+       memset (handle, 0, sizeof (handle_t));
+
+       handle->h_buffer_credits = nblocks;
+       handle->h_ref = 1;
+       current->journal_info = handle;
+
+       err = try_start_this_handle(journal, handle);
+       if (err < 0) {
+               kfree(handle);
+               current->journal_info = NULL;
+               return ERR_PTR(err);
+       }
+
+       return handle;
+ }
+
+ /*
+  * journal_extend: extend buffer credits.
+  *
+  * Some transactions, such as large extends and truncates, can be done
+  * atomically all at once or in several stages.  The operation requests
+  * a credit for a number of buffer modifications in advance, but can
+  * extend its credit if it needs more.
+  *
+  * journal_extend tries to give the running handle more buffer credits.
+  * It does not guarantee the allocation: this is best-effort only.
+  * The calling process MUST be able to deal cleanly with a failure to
+  * extend here.
+  *
+  * Return 0 on success, non-zero on failure.
+  *
+  * return code < 0 implies an error
+  * return code > 0 implies normal transaction-full status.
+  */
+
+ int journal_extend (handle_t *handle, int nblocks)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int result;
+       int wanted;
+
+       lock_journal (journal);
+
+       result = -EIO;
+       if (is_handle_aborted(handle))
+               goto error_out;
+
+       result = 1;
+
+       /* Don't extend a locked-down transaction! */
+       if (handle->h_transaction->t_state != T_RUNNING) {
+               jbd_debug(3, "denied handle %p %d blocks: "
+                         "transaction not running\n", handle, nblocks);
+               goto error_out;
+       }
+
+       wanted = transaction->t_outstanding_credits + nblocks;
+
+       if (wanted > journal->j_max_transaction_buffers) {
+               jbd_debug(3, "denied handle %p %d blocks: "
+                         "transaction too large\n", handle, nblocks);
+               goto error_out;
+       }
+
+       if (wanted > log_space_left(journal)) {
+               jbd_debug(3, "denied handle %p %d blocks: "
+                         "insufficient log space\n", handle, nblocks);
+               goto error_out;
+       }
+
+       handle->h_buffer_credits += nblocks;
+       transaction->t_outstanding_credits += nblocks;
+       result = 0;
+
+       jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+
+ error_out:
+       unlock_journal (journal);
+       return result;
+ }
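+
+ /* The expected fallback pattern, sketched (see journal_restart just
+  * below): if extend fails with "transaction full", commit what we
+  * have and reattach the handle to a fresh transaction. */
+ #if 0
+       err = journal_extend(handle, nblocks);
+       if (err > 0)
+               err = journal_restart(handle, nblocks);
+ #endif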
+
+
+ /*
+  * journal_restart: restart a handle for a multi-transaction filesystem
+  * operation.
+  *
+  * If the journal_extend() call above fails to grant new buffer credits
+  * to a running handle, a call to journal_restart will commit the
+  * handle's transaction so far and reattach the handle to a new
+  * transaction capable of guaranteeing the requested number of
+  * credits.
+  */
+
+ int journal_restart(handle_t *handle, int nblocks)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int ret;
+
+       /* If we've had an abort of any type, don't even think about
+        * actually doing the restart! */
+       if (is_handle_aborted(handle))
+               return 0;
+
+       /* First unlink the handle from its current transaction, and
+        * start the commit on that. */
+
+       J_ASSERT (transaction->t_updates > 0);
+       J_ASSERT (journal_current_handle() == handle);
+
+       transaction->t_outstanding_credits -= handle->h_buffer_credits;
+       transaction->t_updates--;
+
+       if (!transaction->t_updates)
+               wake_up(&journal->j_wait_updates);
+
+       jbd_debug(2, "restarting handle %p\n", handle);
+       log_start_commit(journal, transaction);
+
+       handle->h_buffer_credits = nblocks;
+       ret = start_this_handle(journal, handle);
+       return ret;
+ }
+
+
+ /*
+  * Barrier operation: establish a transaction barrier.
+  *
+  * This locks out any further updates from being started, and blocks
+  * until all existing updates have completed, returning only once the
+  * journal is in a quiescent state with no updates running.
+  *
+  * The journal lock should not be held on entry.
+  */
+
+ void journal_lock_updates (journal_t *journal)
+ {
+       lock_journal(journal);
+       ++journal->j_barrier_count;
+
+       /* Wait until there are no running updates */
+       while (1) {
+               transaction_t *transaction = journal->j_running_transaction;
+               if (!transaction)
+                       break;
+               if (!transaction->t_updates)
+                       break;
+
+               unlock_journal(journal);
+               sleep_on(&journal->j_wait_updates);
+               lock_journal(journal);
+       }
+
+       unlock_journal(journal);
+
+       /* We have now established a barrier against other normal
+        * updates, but we also need to barrier against other
+        * journal_lock_updates() calls to make sure that we serialise
+        * special journal-locked operations too. */
+       down(&journal->j_barrier);
+ }
+
+ /*
+  * Release a transaction barrier obtained with journal_lock_updates().
+  *
+  * Should be called without the journal lock held.
+  */
+
+ void journal_unlock_updates (journal_t *journal)
+ {
+       lock_journal(journal);
+
+       J_ASSERT (journal->j_barrier_count != 0);
+
+       up(&journal->j_barrier);
+       --journal->j_barrier_count;
+       wake_up(&journal->j_wait_transaction_locked);
+       unlock_journal(journal);
+ }
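+
+ /* Barrier usage, sketched (illustrative only): quiesce all updates
+  * around a journal-wide operation, then let them resume. */
+ #if 0
+       journal_lock_updates(journal);
+       /* ... perform the special journal-locked operation ... */
+       journal_unlock_updates(journal);
+ #endif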
+
+ /*
+  * journal_get_write_access: notify intent to modify a buffer for metadata
+  * (not data) update.
+  *
+  * If the buffer is already part of the current transaction, then there
+  * is nothing we need to do.  If it is already part of a prior
+  * transaction which we are still committing to disk, then we need to
+  * make sure that we do not overwrite the old copy: we do copy-out to
+  * preserve the copy going to disk.  We also account the buffer against
+  * the handle's metadata buffer credits (unless the buffer is already
+  * part of the transaction, that is).
+  *
+  * Returns an error code or 0 on success.
+  *
+  * In full data journalling mode the buffer may be of type BJ_AsyncData,
+  * because we're write()ing a buffer which is also part of a shared mapping.
+  */
+
+ static int
+ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int error;
+       char *frozen_buffer = NULL;
+       int need_copy = 0;
+
+       jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+
+       JBUFFER_TRACE(jh, "entry");
+ repeat:
+       /* @@@ Need to check for errors here at some point. */
+
+       /*
+        * AKPM: neither bdflush nor kupdate run with the BKL.   There's
+        * nothing we can do to prevent them from starting writeout of a
+        * BUF_DIRTY buffer at any time.  And checkpointing buffers are on
+        * BUF_DIRTY.  So.  We no longer assert that the buffer is unlocked.
+        *
+        * However.  It is very wrong for us to allow ext3 to start directly
+        * altering the ->b_data of buffers which may at that very time be
+        * undergoing writeout to the client filesystem.  This can leave
+        * the filesystem in an inconsistent, transient state if we crash.
+        * So what we do is to steal the buffer if it is in checkpoint
+        * mode and dirty.  The journal lock will keep out checkpoint-mode
+        * state transitions within journal_remove_checkpoint() and the buffer
+        * is locked to keep bdflush/kupdate/whoever away from it as well.
+        *
+        * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
+        * simple lock_journal().  The code here takes care of locked buffers.
+        */
+       /*
+        * The buffer_locked() || buffer_dirty() tests here are simply an
+        * optimisation tweak.  If anyone else in the system decides to
+        * lock this buffer later on, we'll blow up.  There doesn't seem
+        * to be a good reason why they should do this.
+        */
+       if (jh->b_cp_transaction &&
+           (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
+               unlock_journal(journal);
+               lock_buffer(jh2bh(jh));
+               spin_lock(&journal_datalist_lock);
+               if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
+                       /* OK, we need to steal it */
+                       JBUFFER_TRACE(jh, "stealing from checkpoint mode");
+                       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+                       J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
+
+                       J_ASSERT(handle->h_buffer_credits > 0);
+                       handle->h_buffer_credits--;
+
+                       /* This will clear BH_Dirty and set BH_JBDDirty. */
+                       JBUFFER_TRACE(jh, "file as BJ_Reserved");
+                       __journal_file_buffer(jh, transaction, BJ_Reserved);
+
+                       /* And pull it off BUF_DIRTY, onto BUF_CLEAN */
+                       refile_buffer(jh2bh(jh));
+
+                       /*
+                        * The buffer is now hidden from bdflush.   It is
+                        * metadata against the current transaction.
+                        */
+                       JBUFFER_TRACE(jh, "steal from cp mode is complete");
+               }
+               spin_unlock(&journal_datalist_lock);
+               unlock_buffer(jh2bh(jh));
+               lock_journal(journal);
+       }
+
+       J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
+
+       error = -EROFS;
+       if (is_handle_aborted(handle))
+               goto out_unlocked;
+       error = 0;
+
+       spin_lock(&journal_datalist_lock);
+
+       /* The buffer is already part of this transaction if
+        * b_transaction or b_next_transaction points to it. */
+
+       if (jh->b_transaction == transaction ||
+           jh->b_next_transaction == transaction)
+               goto done_locked;
+
+       /* If there is already a copy-out version of this buffer, then
+        * we don't need to make another one. */
+
+       if (jh->b_frozen_data) {
+               JBUFFER_TRACE(jh, "has frozen data");
+               J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+               jh->b_next_transaction = transaction;
+
+               J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+               handle->h_buffer_credits--;
+               goto done_locked;
+       }
+
+       /* Is there data here we need to preserve? */
+
+       if (jh->b_transaction && jh->b_transaction != transaction) {
+               JBUFFER_TRACE(jh, "owned by older transaction");
+               J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+               J_ASSERT_JH(jh, jh->b_transaction ==
+                                       journal->j_committing_transaction);
+
+               /* There is one case we have to be very careful about.
+                * If the committing transaction is currently writing
+                * this buffer out to disk and has NOT made a copy-out,
+                * then we cannot modify the buffer contents at all
+                * right now.  The essence of copy-out is that it is the
+                * extra copy, not the primary copy, which gets
+                * journaled.  If the primary copy is already going to
+                * disk then we cannot do copy-out here. */
+
+               if (jh->b_jlist == BJ_Shadow) {
+                       JBUFFER_TRACE(jh, "on shadow: sleep");
+                       spin_unlock(&journal_datalist_lock);
+                       unlock_journal(journal);
+                       /* commit wakes up all shadow buffers after IO */
+                       sleep_on(&jh2bh(jh)->b_wait);
+                       lock_journal(journal);
+                       goto repeat;
+               }
+
+               /* Only do the copy if the currently-owning transaction
+                * still needs it.  If it is on the Forget list, the
+                * committing transaction is past that stage.  The
+                * buffer had better remain locked during the kmalloc,
+                * but that should be true --- we hold the journal lock
+                * still and the buffer is already on the BUF_JOURNAL
+                * list so won't be flushed.
+                *
+                * Subtle point, though: if this is a get_undo_access,
+                * then we will be relying on the frozen_data to contain
+                * the new value of the committed_data record after the
+                * transaction, so we HAVE to force the frozen_data copy
+                * in that case. */
+
+               if (jh->b_jlist != BJ_Forget || force_copy) {
+                       JBUFFER_TRACE(jh, "generate frozen data");
+                       if (!frozen_buffer) {
+                               JBUFFER_TRACE(jh, "allocate memory for buffer");
+                               spin_unlock(&journal_datalist_lock);
+                               unlock_journal(journal);
+                               frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
+                                                           GFP_NOFS);
+                               lock_journal(journal);
+                               if (!frozen_buffer) {
+                                       printk(KERN_EMERG __FUNCTION__
+                                               ": OOM for frozen_buffer\n");
+                                       JBUFFER_TRACE(jh, "oom!");
+                                       error = -ENOMEM;
+                                       spin_lock(&journal_datalist_lock);
+                                       goto done_locked;
+                               }
+                               goto repeat;
+                       }
+
+                       jh->b_frozen_data = frozen_buffer;
+                       frozen_buffer = NULL;
+                       need_copy = 1;
+               }
+               jh->b_next_transaction = transaction;
+       }
+
+       J_ASSERT(handle->h_buffer_credits > 0);
+       handle->h_buffer_credits--;
+
+       /* Finally, if the buffer is not journaled right now, we need to
+        * make sure it doesn't get written to disk before the caller
+        * actually commits the new data. */
+
+       if (!jh->b_transaction) {
+               JBUFFER_TRACE(jh, "no transaction");
+               J_ASSERT_JH(jh, !jh->b_next_transaction);
+               jh->b_transaction = transaction;
+               JBUFFER_TRACE(jh, "file as BJ_Reserved");
+               __journal_file_buffer(jh, transaction, BJ_Reserved);
+       }
+
+ done_locked:
+       spin_unlock(&journal_datalist_lock);
+       if (need_copy) {
+               struct page *page;
+               int offset;
+               char *source;
+
+               J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh)));
+               page = jh2bh(jh)->b_page;
+               offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+               source = kmap(page);
+               memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
+               kunmap(page);
+       }
+
+
+       /* If we are about to journal a buffer, then any revoke pending
+            on it is no longer valid. */
+       journal_cancel_revoke(handle, jh);
+
+ out_unlocked:
+       if (frozen_buffer)
+               kfree(frozen_buffer);
+
+       JBUFFER_TRACE(jh, "exit");
+       return error;
+ }
+
+ int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = journal_add_journal_head(bh);
+       int rc;
+
+       /* We do not want to get caught playing with fields which the
+        * log thread also manipulates.  Make sure that the buffer
+        * completes any outstanding IO before proceeding. */
+       lock_journal(journal);
+       rc = do_get_write_access(handle, jh, 0);
+       journal_unlock_journal_head(jh);
+       unlock_journal(journal);
+       return rc;
+ }
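+
+ /* The canonical update sequence, as a sketch (journal_dirty_metadata()
+  * is defined later in this file): */
+ #if 0
+       err = journal_get_write_access(handle, bh);
+       if (!err) {
+               /* ... modify bh->b_data ... */
+               err = journal_dirty_metadata(handle, bh);
+       }
+ #endif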
+
+
+ /*
+  * When the user wants to journal a newly created buffer_head
+  * (ie. getblk() returned a new buffer and we are going to populate it
+  * manually rather than reading off disk), then we need to keep the
+  * buffer_head locked until it has been completely filled with new
+  * data.  In this case, we should be able to make the assertion that
+  * the bh is not already part of an existing transaction.
+  *
+  * The buffer should already be locked by the caller by this point.
+  * There is no lock ranking violation: it was a newly created,
+  * unlocked buffer beforehand. */
+
+ int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = journal_add_journal_head(bh);
+       int err;
+
+       jbd_debug(5, "journal_head %p\n", jh);
+       lock_journal(journal);
+       err = -EROFS;
+       if (is_handle_aborted(handle))
+               goto out;
+       err = 0;
+
+       JBUFFER_TRACE(jh, "entry");
+       /* The buffer may already belong to this transaction due to
+        * pre-zeroing in the filesystem's new_block code.  It may also
+        * be on the previous, committing transaction's lists, but it
+        * HAS to be in Forget state in that case: the transaction must
+        * have deleted the buffer for it to be reused here. */
+       J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+                        jh->b_transaction == NULL ||
+                        (jh->b_transaction == journal->j_committing_transaction &&
+                         jh->b_jlist == BJ_Forget)));
+
+       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+       J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+       J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+       handle->h_buffer_credits--;
+
+       spin_lock(&journal_datalist_lock);
+       if (jh->b_transaction == NULL) {
+               jh->b_transaction = transaction;
+               JBUFFER_TRACE(jh, "file as BJ_Reserved");
+               __journal_file_buffer(jh, transaction, BJ_Reserved);
+               JBUFFER_TRACE(jh, "refile");
+               refile_buffer(jh2bh(jh));
+       } else if (jh->b_transaction == journal->j_committing_transaction) {
+               JBUFFER_TRACE(jh, "set next transaction");
+               jh->b_next_transaction = transaction;
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       /*
+        * akpm: I added this.  ext3_alloc_branch can pick up new indirect
+        * blocks which contain freed but then revoked metadata.  We need
+        * to cancel the revoke in case we end up freeing it yet again
+        * and then reallocating it as data - this would cause a second revoke,
+        * which hits an assertion error.
+        */
+       JBUFFER_TRACE(jh, "cancelling revoke");
+       journal_cancel_revoke(handle, jh);
+       journal_unlock_journal_head(jh);
+ out:
+       unlock_journal(journal);
+       return err;
+ }
+
+
+
+ /*
+  * journal_get_undo_access: Notify intent to modify metadata with non-
+  * rewindable consequences
+  *
+  * Sometimes there is a need to distinguish between metadata which has
+  * been committed to disk and that which has not.  The ext3fs code uses
+  * this for freeing and allocating space: we have to make sure that we
+  * do not reuse freed space until the deallocation has been committed,
+  * since if we overwrote that space we would make the delete
+  * un-rewindable in case of a crash.
+  *
+  * To deal with that, journal_get_undo_access requests write access to a
+  * buffer for parts of non-rewindable operations such as delete
+  * operations on the bitmaps.  The journaling code must keep a copy of
+  * the buffer's contents prior to the undo_access call until such time
+  * as we know that the buffer has definitely been committed to disk.
+  *
+  * We never need to know which transaction the committed data is part
+  * of: buffers touched here are guaranteed to be dirtied later and so
+  * will be committed to a new transaction in due course, at which point
+  * we can discard the old committed data pointer.
+  *
+  * Returns error number or 0 on success.
+  */
+
+ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
+ {
+       journal_t *journal = handle->h_transaction->t_journal;
+       int err;
+       struct journal_head *jh = journal_add_journal_head(bh);
+
+       JBUFFER_TRACE(jh, "entry");
+       lock_journal(journal);
+
+       /* Do this first --- it can drop the journal lock, so we want to
+        * make sure that obtaining the committed_data is done
+        * atomically wrt. completion of any outstanding commits. */
+       err = do_get_write_access (handle, jh, 1);
+       if (err)
+               goto out;
+
+       if (!jh->b_committed_data) {
+               /* Copy out the current buffer contents into the
+                * preserved, committed copy. */
+               JBUFFER_TRACE(jh, "generate b_committed data");
+               jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size,
+                                                  GFP_NOFS);
+               if (!jh->b_committed_data) {
+                       printk(KERN_EMERG __FUNCTION__
+                               ": No memory for committed data!\n");
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
+                               jh2bh(jh)->b_size);
+       }
+
+ out:
+       if (!err)
+               J_ASSERT_JH(jh, jh->b_committed_data);
+       journal_unlock_journal_head(jh);
+       unlock_journal(journal);
+       return err;
+ }
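+
+ /*
+  * Illustrative sketch, not part of this patch: a hypothetical caller
+  * in an ext3-style bitmap-freeing path (bitmap_bh and bit are assumed
+  * names) would pair this call with journal_dirty_metadata(), roughly:
+  *
+  *    err = journal_get_undo_access(handle, bitmap_bh);
+  *    if (!err) {
+  *            clear_bit(bit, bitmap_bh->b_data);   (* free the block *)
+  *            err = journal_dirty_metadata(handle, bitmap_bh);
+  *    }
+  *
+  * The preserved committed copy is what lets the allocator refuse to
+  * reuse the block until the deallocation has committed.
+  */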
+
+ /*
+  * journal_dirty_data: mark a buffer as containing dirty data which
+  * needs to be flushed before we can commit the current transaction.
+  *
+  * The buffer is placed on the transaction's data list and is marked as
+  * belonging to the transaction.
+  *
+  * If `async' is set then the writeback will be initiated by the caller
+  * using submit_bh -> end_buffer_io_async.  We put the buffer onto
+  * t_async_datalist.
+  *
+  * Returns error number or 0 on success.
+  *
+  * journal_dirty_data() can be called via page_launder->ext3_writepage
+  * by kswapd.  So it cannot block.  Happily, there's nothing here
+  * which needs lock_journal if `async' is set.
+  *
+  * When the buffer is on the current transaction we freely move it
+  * between BJ_AsyncData and BJ_SyncData according to who tried to
+  * change its state last.
+  */
+
+ int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
+ {
+       journal_t *journal = handle->h_transaction->t_journal;
+       int need_brelse = 0;
+       int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
+       struct journal_head *jh;
+
+       if (is_handle_aborted(handle))
+               return 0;
+
+       jh = journal_add_journal_head(bh);
+       JBUFFER_TRACE(jh, "entry");
+
+       /*
+        * The buffer could *already* be dirty.  Writeout can start
+        * at any time.
+        */
+       jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
+
+       /*
+        * What if the buffer is already part of a running transaction?
+        *
+        * There are two cases:
+        * 1) It is part of the current running transaction.  Refile it,
+        *    just in case we have allocated it as metadata, deallocated
+        *    it, then reallocated it as data.
+        * 2) It is part of the previous, still-committing transaction.
+        *    If all we want to do is to guarantee that the buffer will be
+        *    written to disk before this new transaction commits, then
+        *    being sure that the *previous* transaction has this same
+        *    property is sufficient for us!  Just leave it on its old
+        *    transaction.
+        *
+        * In case (2), the buffer must not already exist as metadata
+        * --- that would violate write ordering (a transaction is free
+        * to write its data at any point, even before the previous
+        * committing transaction has committed).  The caller must
+        * never, ever allow this to happen: there's nothing we can do
+        * about it in this layer.
+        */
+       spin_lock(&journal_datalist_lock);
+       if (jh->b_transaction) {
+               JBUFFER_TRACE(jh, "has transaction");
+               if (jh->b_transaction != handle->h_transaction) {
+                       JBUFFER_TRACE(jh, "belongs to older transaction");
+                       J_ASSERT_JH(jh, jh->b_transaction ==
+                                       journal->j_committing_transaction);
+
+                       /* @@@ IS THIS TRUE  ? */
+                       /*
+                        * Not any more.  Scenario: someone does a write()
+                        * in data=journal mode.  The buffer's transaction has
+                        * moved into commit.  Then someone does another
+                        * write() to the file.  We do the frozen data copyout
+                        * and set b_next_transaction to point to j_running_t.
+                        * And while we're in that state, someone does a
+                        * writepage() in an attempt to pageout the same area
+                        * of the file via a shared mapping.  At present that
+                        * calls journal_dirty_data(), and we get right here.
+                        * It may be too late to journal the data.  Simply
+                        * falling through to the next test will suffice: the
+                        * data will be dirty and will be checkpointed.  The
+                        * ordering comments in the next comment block still
+                        * apply.
+                        */
+                       //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
+                       /*
+                        * If we're journalling data, and this buffer was
+                        * subject to a write(), it could be metadata, forget
+                        * or shadow against the committing transaction.  Now,
+                        * someone has dirtied the same darn page via a mapping
+                        * and it is being writepage()'d.
+                        * We *could* just steal the page from commit, with some
+                        * fancy locking there.  Instead, we just skip it -
+                        * don't tie the page's buffers to the new transaction
+                        * at all.
+                        * Implication: if we crash before the writepage() data
+                        * is written into the filesystem, recovery will replay
+                        * the write() data.
+                        */
+                       if (jh->b_jlist != BJ_None &&
+                                       jh->b_jlist != BJ_SyncData &&
+                                       jh->b_jlist != BJ_AsyncData) {
+                               JBUFFER_TRACE(jh, "Not stealing");
+                               goto no_journal;
+                       }
+
+                       /*
+                        * This buffer may be undergoing writeout in commit.  We
+                        * can't return from here and let the caller dirty it
+                        * again because that can cause the write-out loop in
+                        * commit to never terminate.
+                        */
+                       if (!async && buffer_dirty(bh)) {
+                               atomic_inc(&bh->b_count);
+                               spin_unlock(&journal_datalist_lock);
+                               need_brelse = 1;
+                               ll_rw_block(WRITE, 1, &bh);
+                               wait_on_buffer(bh);
+                               spin_lock(&journal_datalist_lock);
+                               /* The buffer may become locked again at any
+                                  time if it is redirtied */
+                       }
+
+                       /* journal_clean_data_list() may have got there first */
+                       if (jh->b_transaction != NULL) {
+                               JBUFFER_TRACE(jh, "unfile from commit");
+                               __journal_unfile_buffer(jh);
+                               jh->b_transaction = NULL;
+                       }
+                       /* The buffer will be refiled below */
+
+               }
+               /*
+                * Special case --- the buffer might actually have been
+                * allocated and then immediately deallocated in the previous,
+                * committing transaction, so might still be left on that
+                * transaction's metadata lists.
+                */
+               if (jh->b_jlist != wanted_jlist) {
+                       JBUFFER_TRACE(jh, "not on correct data list: unfile");
+                       J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = NULL;
+                       JBUFFER_TRACE(jh, "file as data");
+                       __journal_file_buffer(jh, handle->h_transaction,
+                                               wanted_jlist);
+               }
+       } else {
+               JBUFFER_TRACE(jh, "not on a transaction");
+               __journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
+       }
+       /*
+        * We need to mark the buffer dirty and refile it inside the lock to
+        * protect it from release by journal_try_to_free_buffer()
+        *
+        * We set ->b_flushtime to something small enough to typically keep
+        * kupdate away from the buffer.
+        *
+        * We don't need to do a balance_dirty() - __block_commit_write()
+        * does that.
+        */
+       if (!async && !atomic_set_buffer_dirty(jh2bh(jh))) {
+               jh2bh(jh)->b_flushtime =
+                       jiffies + journal->j_commit_interval + 1 * HZ;
+               refile_buffer(jh2bh(jh));
+       }
+ no_journal:
+       spin_unlock(&journal_datalist_lock);
+       if (need_brelse) {
+               BUFFER_TRACE(bh, "brelse");
+               __brelse(bh);
+       }
+       JBUFFER_TRACE(jh, "exit");
+       journal_unlock_journal_head(jh);
+       return 0;
+ }
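+
+ /*
+  * Illustrative sketch, not part of this patch: a writepage-style
+  * caller might walk the page's buffer ring and file each buffer for
+  * writeout ahead of commit, roughly:
+  *
+  *    struct buffer_head *b = page->buffers;
+  *    do {
+  *            journal_dirty_data(handle, b, 1);   (* async writeback *)
+  *            b = b->b_this_page;
+  *    } while (b != page->buffers);
+  *
+  * With `async' set the buffers are filed as BJ_AsyncData and the
+  * actual submit_bh() is left to the caller, as described above.
+  */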
+
+ /*
+  * journal_dirty_metadata: mark a buffer as containing dirty metadata
+  * which needs to be journaled as part of the current transaction.
+  *
+  * The buffer is placed on the transaction's metadata list and is marked
+  * as belonging to the transaction.
+  *
+  * Special care needs to be taken if the buffer already belongs to the
+  * current committing transaction (in which case we should have frozen
+  * data present for that commit).  In that case, we don't relink the
+  * buffer: that only gets done when the old transaction finally
+  * completes its commit.
+  *
+  * Returns error number or 0 on success.
+  */
+
+ int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = bh2jh(bh);
+
+       jbd_debug(5, "journal_head %p\n", jh);
+       JBUFFER_TRACE(jh, "entry");
+       lock_journal(journal);
+       if (is_handle_aborted(handle))
+               goto out_unlock;
+
+       spin_lock(&journal_datalist_lock);
+       set_bit(BH_JBDDirty, &bh->b_state);
+       set_buffer_flushtime(bh);
+
+       J_ASSERT_JH(jh, jh->b_transaction != NULL);
+
+       /*
+        * Metadata already on the current transaction list doesn't
+        * need to be filed.  Metadata on another transaction's list must
+        * be committing, and will be refiled once the commit completes:
+        * leave it alone for now.
+        */
+
+       if (jh->b_transaction != transaction) {
+               JBUFFER_TRACE(jh, "already on other transaction");
+               J_ASSERT_JH(jh, jh->b_transaction ==
+                                       journal->j_committing_transaction);
+               J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+               /* And this case is illegal: we can't reuse another
+                * transaction's data buffer, ever. */
+               /* FIXME: writepage() should be journalled */
+               J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData);
+               goto done_locked;
+       }
+
+       /* That test should have eliminated the following case: */
+       J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+
+       JBUFFER_TRACE(jh, "file as BJ_Metadata");
+       __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+
+ done_locked:
+       spin_unlock(&journal_datalist_lock);
+       JBUFFER_TRACE(jh, "exit");
+ out_unlock:
+       unlock_journal(journal);
+       return 0;
+ }
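+
+ /*
+  * Illustrative sketch, not part of this patch: the usual metadata
+  * update sequence brackets the modification between a write-access
+  * request and this call, roughly:
+  *
+  *    err = journal_get_write_access(handle, bh);
+  *    if (!err) {
+  *            (* ... modify bh->b_data ... *)
+  *            err = journal_dirty_metadata(handle, bh);
+  *    }
+  */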
+
+ #if 0
+ /*
+  * journal_release_buffer: undo a get_write_access without any buffer
+  * updates, if the update decided in the end that it didn't need access.
+  *
+  * journal_get_write_access() can block, so it is quite possible for a
+  * journaling component to decide after the write access is returned
+  * that global state has changed and the update is no longer required.  */
+
+ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh = bh2jh(bh);
+
+       lock_journal(journal);
+       JBUFFER_TRACE(jh, "entry");
+
+       /* If the buffer is reserved but not modified by this
+        * transaction, then it is safe to release it.  In all other
+        * cases, just leave the buffer as it is. */
+
+       spin_lock(&journal_datalist_lock);
+       if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction &&
+           !buffer_jdirty(jh2bh(jh))) {
+               JBUFFER_TRACE(jh, "unused: refiling it");
+               handle->h_buffer_credits++;
+               __journal_refile_buffer(jh);
+       }
+       spin_unlock(&journal_datalist_lock);
+
+       JBUFFER_TRACE(jh, "exit");
+       unlock_journal(journal);
+ }
+ #endif
+
+ /*
+  * journal_forget: bforget() for potentially-journaled buffers.  We can
+  * only do the bforget if there are no commits pending against the
+  * buffer.  If the buffer is dirty in the current running transaction we
+  * can safely unlink it.
+  *
+  * bh may not be a journalled buffer at all - it may be a non-JBD
+  * buffer which came off the hashtable.  Check for this.
+  *
+  * Decrements bh->b_count by one.
+  *
+  * Allow this call even if the handle has aborted --- it may be part of
+  * the caller's cleanup after an abort.
+  */
+
+ void journal_forget (handle_t *handle, struct buffer_head *bh)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       struct journal_head *jh;
+
+       BUFFER_TRACE(bh, "entry");
+
+       lock_journal(journal);
+       spin_lock(&journal_datalist_lock);
+
+       if (!buffer_jbd(bh))
+               goto not_jbd;
+       jh = bh2jh(bh);
+
+       if (jh->b_transaction == handle->h_transaction) {
+               J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+               /* If we are forgetting a buffer which is already part
+                * of this transaction, then we can just drop it from
+                * the transaction immediately. */
+               clear_bit(BH_Dirty, &bh->b_state);
+               clear_bit(BH_JBDDirty, &bh->b_state);
+
+               JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+               J_ASSERT_JH(jh, !jh->b_committed_data);
+
+               __journal_unfile_buffer(jh);
+               jh->b_transaction = 0;
+
+               /*
+                * We are no longer going to journal this buffer.
+                * However, the commit of this transaction is still
+                * important to the buffer: the delete that we are now
+                * processing might obsolete an old log entry, so by
+                * committing, we can satisfy the buffer's checkpoint.
+                *
+                * So, if we have a checkpoint on the buffer, we should
+                * now refile the buffer on our BJ_Forget list so that
+                * we know to remove the checkpoint after we commit.
+                */
+
+               if (jh->b_cp_transaction) {
+                       __journal_file_buffer(jh, transaction, BJ_Forget);
+               } else {
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+                       if (!buffer_jbd(bh)) {
+                               spin_unlock(&journal_datalist_lock);
+                               unlock_journal(journal);
+                               __bforget(bh);
+                               return;
+                       }
+               }
+
+       } else if (jh->b_transaction) {
+               J_ASSERT_JH(jh, (jh->b_transaction ==
+                                journal->j_committing_transaction));
+               /* However, if the buffer is still owned by a prior
+                * (committing) transaction, we can't drop it yet... */
+               JBUFFER_TRACE(jh, "belongs to older transaction");
+               /* ... but we CAN drop it from the new transaction if we
+                * have also modified it since the original commit. */
+
+               if (jh->b_next_transaction) {
+                       J_ASSERT(jh->b_next_transaction == transaction);
+                       jh->b_next_transaction = NULL;
+               }
+       }
+
+ not_jbd:
+       spin_unlock(&journal_datalist_lock);
+       unlock_journal(journal);
+       __brelse(bh);
+       return;
+ }
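+
+ /*
+  * Illustrative sketch, not part of this patch: a delete path which has
+  * just freed a metadata block, and which holds a b_count reference on
+  * its buffer, would discard it with
+  *
+  *    journal_forget(handle, bh);   (* consumes one b_count ref *)
+  *
+  * where a non-journaling filesystem would have called bforget(bh).
+  */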
+
+ #if 0 /* Unused */
+ /*
+  * journal_sync_buffer: flush a potentially-journaled buffer to disk.
+  *
+  * Used for O_SYNC filesystem operations.  If the buffer is journaled,
+  * we need to complete the O_SYNC by waiting for the transaction to
+  * complete.  It is an error to call journal_sync_buffer before
+  * journal_stop!
+  */
+
+ void journal_sync_buffer(struct buffer_head *bh)
+ {
+       transaction_t *transaction;
+       journal_t *journal;
+       long sequence;
+       struct journal_head *jh;
+
+       /* If the buffer isn't journaled, this is easy: just sync it to
+        * disk.  */
+       BUFFER_TRACE(bh, "entry");
+
+       spin_lock(&journal_datalist_lock);
+       if (!buffer_jbd(bh)) {
+               spin_unlock(&journal_datalist_lock);
+               return;
+       }
+       jh = bh2jh(bh);
+       if (jh->b_transaction == NULL) {
+               /* If the buffer has already been journaled, then this
+                * is a noop. */
+               if (jh->b_cp_transaction == NULL) {
+                       spin_unlock(&journal_datalist_lock);
+                       return;
+               }
+               atomic_inc(&bh->b_count);
+               spin_unlock(&journal_datalist_lock);
+               ll_rw_block (WRITE, 1, &bh);
+               wait_on_buffer(bh);
+               __brelse(bh);
+               goto out;
+       }
+
+       /* Otherwise, just wait until the transaction is synced to disk. */
+       transaction = jh->b_transaction;
+       journal = transaction->t_journal;
+       sequence = transaction->t_tid;
+       spin_unlock(&journal_datalist_lock);
+
+       jbd_debug(2, "requesting commit for jh %p\n", jh);
+       log_start_commit (journal, transaction);
+
+       while (tid_gt(sequence, journal->j_commit_sequence)) {
+               wake_up(&journal->j_wait_done_commit);
+               sleep_on(&journal->j_wait_done_commit);
+       }
+       JBUFFER_TRACE(jh, "exit");
+ out:
+       return;
+ }
+ #endif
+
+ /*
+  * All done for a particular handle.
+  *
+  * There is not much action needed here.  We just return any remaining
+  * buffer credits to the transaction and remove the handle.  The only
+  * complication is that we need to start a commit operation if the
+  * filesystem is marked for synchronous update.
+  *
+  * journal_stop itself will not usually return an error, but it may
+  * do so in unusual circumstances.  In particular, expect it to
+  * return -EIO if a journal_abort has been executed since the
+  * transaction began.
+  */
+
+ int journal_stop(handle_t *handle)
+ {
+       transaction_t *transaction;
+       journal_t *journal;
+       int old_handle_count, err;
+
+       if (!handle)
+               return 0;
+
+       /* handle may legitimately be NULL: check before dereferencing */
+       transaction = handle->h_transaction;
+       journal = transaction->t_journal;
+
+       J_ASSERT (transaction->t_updates > 0);
+       J_ASSERT (journal_current_handle() == handle);
+
+       if (is_handle_aborted(handle))
+               err = -EIO;
+       else
+               err = 0;
+
+       if (--handle->h_ref > 0) {
+               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+                         handle->h_ref);
+               return err;
+       }
+
+       jbd_debug(4, "Handle %p going down\n", handle);
+
+       /*
+        * Implement synchronous transaction batching.  If the handle
+        * was synchronous, don't force a commit immediately.  Let's
+        * yield and let another thread piggyback onto this transaction.
+        * Keep doing that while new threads continue to arrive.
+        * It doesn't cost much - we're about to run a commit and sleep
+        * on IO anyway.  Speeds up many-threaded, many-dir operations
+        * by 30x or more...
+        */
+       if (handle->h_sync) {
+               do {
+                       old_handle_count = transaction->t_handle_count;
+                       set_current_state(TASK_RUNNING);
+                       current->policy |= SCHED_YIELD;
+                       schedule();
+               } while (old_handle_count != transaction->t_handle_count);
+       }
+
+       current->journal_info = NULL;
+       transaction->t_outstanding_credits -= handle->h_buffer_credits;
+       transaction->t_updates--;
+       if (!transaction->t_updates) {
+               wake_up(&journal->j_wait_updates);
+               if (journal->j_barrier_count)
+                       wake_up(&journal->j_wait_transaction_locked);
+       }
+
+       /*
+        * If the handle is marked SYNC, we need to set another commit
+        * going!  We also want to force a commit if the current
+        * transaction is occupying too much of the log, or if the
+        * transaction is too old now.
+        */
+       if (handle->h_sync ||
+                       transaction->t_outstanding_credits >
+                               journal->j_max_transaction_buffers ||
+                       time_after_eq(jiffies, transaction->t_expires)) {
+               /* Do this even for aborted journals: an abort still
+                * completes the commit thread, it just doesn't write
+                * anything to disk. */
+               tid_t tid = transaction->t_tid;
+
+               jbd_debug(2, "transaction too old, requesting commit for "
+                                       "handle %p\n", handle);
+               /* This is non-blocking */
+               log_start_commit(journal, transaction);
+
+               /*
+                * Special case: JFS_SYNC synchronous updates require us
+                * to wait for the commit to complete.
+                */
+               if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+                       log_wait_commit(journal, tid);
+       }
+       kfree(handle);
+       return err;
+ }
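+
+ /*
+  * Illustrative sketch, not part of this patch: a complete handle
+  * lifecycle, assuming a journal_t *journal and a credit estimate of
+  * NBLOCKS modified buffers (both assumed names):
+  *
+  *    handle_t *handle = journal_start(journal, NBLOCKS);
+  *    if (IS_ERR(handle))
+  *            return PTR_ERR(handle);
+  *    (* ... journal_get_write_access() / journal_dirty_metadata() ... *)
+  *    if (do_sync)
+  *            handle->h_sync = 1;   (* commit, batched as above *)
+  *    err = journal_stop(handle);
+  */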
+
+ /*
+  * For synchronous operations: force any uncommitted transactions
+  * to disk.  May seem kludgy, but it reuses all the handle batching
+  * code in a very simple manner.
+  */
+ int journal_force_commit(journal_t *journal)
+ {
+       handle_t *handle;
+       int ret = 0;
+
+       lock_kernel();
+       handle = journal_start(journal, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+       handle->h_sync = 1;
+       journal_stop(handle);
+ out:
+       unlock_kernel();
+       return ret;
+ }
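+
+ /*
+  * Illustrative sketch, not part of this patch: an O_SYNC or
+  * fsync-style path with no metadata updates of its own could simply do
+  *
+  *    err = journal_force_commit(journal);
+  *
+  * to push the running transaction, and anything batched into it, out
+  * to disk.
+  */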
+
+ /*
+  *
+  * List management code snippets: various functions for manipulating the
+  * transaction buffer lists.
+  *
+  */
+
+ /*
+  * Append a buffer to a transaction list, given the transaction's list head
+  * pointer.
+  * journal_datalist_lock is held.
+  */
+
+ static inline void
+ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+       if (!*list) {
+               jh->b_tnext = jh->b_tprev = jh;
+               *list = jh;
+       } else {
+               /* Insert at the tail of the list to preserve order */
+               struct journal_head *first = *list, *last = first->b_tprev;
+               jh->b_tprev = last;
+               jh->b_tnext = first;
+               last->b_tnext = first->b_tprev = jh;
+       }
+ }
+
+ /*
+  * Remove a buffer from a transaction list, given the transaction's list
+  * head pointer.
+  *
+  * Called with journal_datalist_lock held, and the journal may not
+  * be locked.
+  */
+
+ static inline void
+ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+       if (*list == jh) {
+               *list = jh->b_tnext;
+               if (*list == jh)
+                       *list = 0;
+       }
+       jh->b_tprev->b_tnext = jh->b_tnext;
+       jh->b_tnext->b_tprev = jh->b_tprev;
+ }
+
+ /*
+  * Remove a buffer from the appropriate transaction list.
+  *
+  * Note that this function can *change* the value of
+  * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
+  * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+  * is holding onto a copy of one of these pointers, it could go bad.
+  * Generally the caller needs to re-read the pointer from the transaction_t.
+  *
+  * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
+  * via journal_try_to_free_buffer() or journal_clean_data_list().  In that
+  * case, journal_datalist_lock will be held, and the journal may not be locked.
+  */
+ void __journal_unfile_buffer(struct journal_head *jh)
+ {
+       struct journal_head **list = 0;
+       transaction_t * transaction;
+
+       assert_spin_locked(&journal_datalist_lock);
+       transaction = jh->b_transaction;
+
+ #ifdef __SMP__
+       J_ASSERT (current->lock_depth >= 0);
+ #endif
+       J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+
+       if (jh->b_jlist != BJ_None)
+               J_ASSERT_JH(jh, transaction != 0);
+
+       switch (jh->b_jlist) {
+       case BJ_None:
+               return;
+       case BJ_SyncData:
+               list = &transaction->t_sync_datalist;
+               break;
+       case BJ_AsyncData:
+               list = &transaction->t_async_datalist;
+               break;
+       case BJ_Metadata:
+               transaction->t_nr_buffers--;
+               J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+               list = &transaction->t_buffers;
+               break;
+       case BJ_Forget:
+               list = &transaction->t_forget;
+               break;
+       case BJ_IO:
+               list = &transaction->t_iobuf_list;
+               break;
+       case BJ_Shadow:
+               list = &transaction->t_shadow_list;
+               break;
+       case BJ_LogCtl:
+               list = &transaction->t_log_list;
+               break;
+       case BJ_Reserved:
+               list = &transaction->t_reserved_list;
+               break;
+       }
+
+       __blist_del_buffer(list, jh);
+       jh->b_jlist = BJ_None;
+       if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) {
+               set_bit(BH_Dirty, &jh2bh(jh)->b_state);
+       }
+ }
+
+ void journal_unfile_buffer(struct journal_head *jh)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_unfile_buffer(jh);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * Called from journal_try_to_free_buffers().  The journal is not
+  * locked. lru_list_lock is not held.
+  *
+  * Here we see why journal_datalist_lock is global and not per-journal.
+  * We cannot get back to this buffer's journal pointer without locking
+  * out journal_clean_data_list() in some manner.
+  *
+  * One could use journal_datalist_lock to get race-free access to a
+  * per-journal lock.
+  *
+  * Called with journal_datalist_lock held.
+  *
+  * Returns non-zero iff we were able to free the journal_head.
+  */
+ static int __journal_try_to_free_buffer(struct buffer_head *bh,
+                                       int *locked_or_dirty)
+ {
+       struct journal_head *jh;
+
+       assert_spin_locked(&journal_datalist_lock);
+
+       if (!buffer_jbd(bh))
+               return 1;
+       jh = bh2jh(bh);
+
+       if (buffer_locked(bh) || buffer_dirty(bh)) {
+               *locked_or_dirty = 1;
+               goto out;
+       }
+
+       if (!buffer_uptodate(bh))
+               goto out;
+
+       if (jh->b_next_transaction != 0)
+               goto out;
+
+       if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+               if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
+                       /* A written-back ordered data buffer */
+                       JBUFFER_TRACE(jh, "release data");
+                       __journal_unfile_buffer(jh);
+                       jh->b_transaction = 0;
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+               }
+       }
+       else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+               /* written-back checkpointed metadata buffer */
+               if (jh->b_jlist == BJ_None) {
+                       JBUFFER_TRACE(jh, "remove from checkpoint list");
+                       __journal_remove_checkpoint(jh);
+                       __journal_remove_journal_head(bh);
+                       __brelse(bh);
+               }
+       }
+       return !buffer_jbd(bh);
+
+ out:
+       return 0;
+ }
+
+ /*
+  * journal_try_to_free_buffers().  For all the buffers on this page,
+  * if they are fully written out ordered data, move them onto BUF_CLEAN
+  * so try_to_free_buffers() can reap them.  Called with lru_list_lock
+  * not held.  Does its own locking.
+  *
+  * This complicates JBD locking somewhat.  We aren't protected by the
+  * BKL here.  We wish to remove the buffer from its committing or
+  * running transaction's ->t_datalist via __journal_unfile_buffer.
+  *
+  * This may *change* the value of transaction_t->t_datalist, so anyone
+  * who looks at t_datalist needs to lock against this function.
+  *
+  * Even worse, someone may be doing a journal_dirty_data on this
+  * buffer.  So we need to lock against that.  journal_dirty_data()
+  * will come out of the lock with the buffer dirty, which makes it
+  * ineligible for release here.
+  *
+  * Who else is affected by this?  hmm...  Really the only contender
+  * is do_get_write_access() - it could be looking at the buffer while
+  * journal_try_to_free_buffer() is changing its state.  But that
+  * cannot happen because we never reallocate freed data as metadata
+  * while the data is part of a transaction.  Yes?
+  *
+  * This function returns non-zero if we wish try_to_free_buffers()
+  * to be called.  We do this if the page is releasable by try_to_free_buffers().
+  * We also do it if the page has locked or dirty buffers and the caller wants
+  * us to perform sync or async writeout.
+  */
+ int journal_try_to_free_buffers(journal_t *journal,
+                               struct page *page, int gfp_mask)
+ {
+       struct buffer_head *bh;
+       struct buffer_head *tmp;
+       int locked_or_dirty = 0;
+       int call_ttfb = 1;
+
+       J_ASSERT(PageLocked(page));
+
+       bh = page->buffers;
+       tmp = bh;
+       spin_lock(&journal_datalist_lock);
+       do {
+               struct buffer_head *p = tmp;
+
+               tmp = tmp->b_this_page;
+               if (buffer_jbd(p))
+                       if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
+                               call_ttfb = 0;
+       } while (tmp != bh);
+       spin_unlock(&journal_datalist_lock);
+
+       if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
+               goto out;
+       if (!locked_or_dirty)
+               goto out;
+       /*
+        * The VM wants us to do writeout, or to block on IO, or both.
+        * So we allow try_to_free_buffers to be called even if the page
+        * still has journalled buffers.
+        */
+       call_ttfb = 1;
+ out:
+       return call_ttfb;
+ }
+
+ /*
+  * This buffer is no longer needed.  If it is on an older transaction's
+  * checkpoint list we need to record it on this transaction's forget list
+  * to pin this buffer (and hence its checkpointing transaction) down until
+  * this transaction commits.  If the buffer isn't on a checkpoint list, we
+  * release it.
+  * Returns non-zero if JBD no longer has an interest in the buffer.
+  */
+ static int dispose_buffer(struct journal_head *jh,
+               transaction_t *transaction)
+ {
+       int may_free = 1;
+       struct buffer_head *bh = jh2bh(jh);
+
+       spin_lock(&journal_datalist_lock);
+       __journal_unfile_buffer(jh);
+       jh->b_transaction = 0;
+
+       if (jh->b_cp_transaction) {
+               JBUFFER_TRACE(jh, "on running+cp transaction");
+               __journal_file_buffer(jh, transaction, BJ_Forget);
+               clear_bit(BH_JBDDirty, &bh->b_state);
+               may_free = 0;
+       } else {
+               JBUFFER_TRACE(jh, "on running transaction");
+               __journal_remove_journal_head(bh);
+               __brelse(bh);
+       }
+       spin_unlock(&journal_datalist_lock);
+       return may_free;
+ }
+
+ /*
+  * journal_flushpage
+  *
+  * This code is tricky.  It has a number of cases to deal with.
+  *
+  * There are two invariants which this code relies on:
+  *
+  * i_size must be updated on disk before we start calling flushpage on the
+  * data.
+  *
+  *  This is done in ext3 by defining an ext3_setattr method which
+  *  updates i_size before truncate gets going.  By maintaining this
+  *  invariant, we can be sure that it is safe to throw away any buffers
+  *  attached to the current transaction: once the transaction commits,
+  *  we know that the data will not be needed.
+  *
+  *  Note however that we can *not* throw away data belonging to the
+  *  previous, committing transaction!
+  *
+  * Any disk blocks which *are* part of the previous, committing
+  * transaction (and which therefore cannot be discarded immediately) are
+  * not going to be reused in the new running transaction.
+  *
+  *  The bitmap committed_data images guarantee this: any block which is
+  *  allocated in one transaction and removed in the next will be marked
+  *  as in-use in the committed_data bitmap, so cannot be reused until
+  *  the next transaction to delete the block commits.  This means that
+  *  leaving committing buffers dirty is quite safe: the disk blocks
+  *  cannot be reallocated to a different file and so buffer aliasing is
+  *  not possible.
+  *
+  * The above applies mainly to ordered data mode.  In writeback mode we
+  * don't make guarantees about the order in which data hits disk --- in
+  * particular we don't guarantee that new dirty data is flushed before
+  * transaction commit --- so it is always safe just to discard data
+  * immediately in that mode.  --sct
+  */
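+
+ /*
+  * Illustrative sketch, not part of this patch: the first invariant
+  * above is what an ext3_setattr-style method provides by journaling
+  * the new i_size before the truncate proper (credits and the inode
+  * journaling step are assumed details):
+  *
+  *    handle = journal_start(journal, credits);
+  *    inode->i_size = attr->ia_size;
+  *    (* ... journal the updated inode ... *)
+  *    journal_stop(handle);
+  *    vmtruncate(inode, attr->ia_size);   (* flushpage runs in here *)
+  */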
+
+ /*
+  * The journal_unmap_buffer helper function returns zero if the buffer
+  * concerned remains pinned as an anonymous buffer belonging to an older
+  * transaction.
+  *
+  * We're outside-transaction here.  Either or both of j_running_transaction
+  * and j_committing_transaction may be NULL.
+  */
+ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+ {
+       transaction_t *transaction;
+       struct journal_head *jh;
+       int may_free = 1;
+
+       BUFFER_TRACE(bh, "entry");
+
+       if (!buffer_mapped(bh))
+               return 1;
+
+       /* It is safe to proceed here without the
+        * journal_datalist_spinlock because the buffers cannot be
+        * stolen by try_to_free_buffers as long as we are holding the
+        * page lock. --sct */
+
+       if (!buffer_jbd(bh))
+               goto zap_buffer;
+
+       jh = bh2jh(bh);
+       transaction = jh->b_transaction;
+       if (transaction == NULL) {
+               /* First case: not on any transaction.  If it
+                * has no checkpoint link, then we can zap it:
+                * it's a writeback-mode buffer so we don't care
+                * if it hits disk safely. */
+               if (!jh->b_cp_transaction) {
+                       JBUFFER_TRACE(jh, "not on any transaction: zap");
+                       goto zap_buffer;
+               }
+
+               if (!buffer_dirty(bh)) {
+                       /* bdflush has written it.  We can drop it now */
+                       goto zap_buffer;
+               }
+
+               /* OK, it must be in the journal but still not
+                * written fully to disk: it's metadata or
+                * journaled data... */
+
+               if (journal->j_running_transaction) {
+                       /* ... and once the current transaction has
+                        * committed, the buffer won't be needed any
+                        * longer. */
+                       JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+                       return dispose_buffer(jh,
+                                       journal->j_running_transaction);
+               } else {
+                       /* There is no currently-running transaction. So the
+                        * orphan record which we wrote for this file must have
+                        * passed into commit.  We must attach this buffer to
+                        * the committing transaction, if it exists. */
+                       if (journal->j_committing_transaction) {
+                               JBUFFER_TRACE(jh, "give to committing trans");
+                               return dispose_buffer(jh,
+                                       journal->j_committing_transaction);
+                       } else {
+                               /* The orphan record's transaction has
+                                * committed.  We can cleanse this buffer */
+                               clear_bit(BH_JBDDirty, &bh->b_state);
+                               goto zap_buffer;
+                       }
+               }
+       } else if (transaction == journal->j_committing_transaction) {
+               /* If it is committing, we simply cannot touch it.  We
+                * can remove its next_transaction pointer from the
+                * running transaction if that is set, but nothing
+                * else. */
+               JBUFFER_TRACE(jh, "on committing transaction");
+               if (jh->b_next_transaction) {
+                       J_ASSERT(jh->b_next_transaction ==
+                                       journal->j_running_transaction);
+                       jh->b_next_transaction = NULL;
+               }
+               return 0;
+       } else {
+               /* Good, the buffer belongs to the running transaction.
+                * We are writing our own transaction's data, not any
+                * previous one's, so it is safe to throw it away
+                * (remember that we expect the filesystem to have set
+                * i_size already for this truncate so recovery will not
+                * expose the disk blocks we are discarding here.) */
+               J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+               may_free = dispose_buffer(jh, transaction);
+       }
+
+ zap_buffer:
+       if (buffer_dirty(bh))
+               mark_buffer_clean(bh);
+       J_ASSERT_BH(bh, !buffer_jdirty(bh));
+       clear_bit(BH_Uptodate, &bh->b_state);
+       clear_bit(BH_Mapped, &bh->b_state);
+       clear_bit(BH_Req, &bh->b_state);
+       clear_bit(BH_New, &bh->b_state);
+       return may_free;
+ }
+
+ /*
+  * Return non-zero if the page's buffers were successfully reaped
+  */
+ int journal_flushpage(journal_t *journal,
+                     struct page *page,
+                     unsigned long offset)
+ {
+       struct buffer_head *head, *bh, *next;
+       unsigned int curr_off = 0;
+       int may_free = 1;
+
+       if (!PageLocked(page))
+               BUG();
+       if (!page->buffers)
+               return 1;
+
+       /* We will potentially be playing with lists other than just the
+        * data lists (especially for journaled data mode), so be
+        * cautious in our locking. */
+       lock_journal(journal);
+
+       head = bh = page->buffers;
+       do {
+               unsigned int next_off = curr_off + bh->b_size;
+               next = bh->b_this_page;
+
+               /* AKPM: doing lock_buffer here may be overly paranoid */
+               if (offset <= curr_off) {
+                       /* This block is wholly outside the truncation point */
+                       lock_buffer(bh);
+                       may_free &= journal_unmap_buffer(journal, bh);
+                       unlock_buffer(bh);
+               }
+               curr_off = next_off;
+               bh = next;
+
+       } while (bh != head);
+
+       unlock_journal(journal);
+
+       if (!offset) {
+               if (!may_free || !try_to_free_buffers(page, 0)) {
+                       atomic_inc(&buffermem_pages);
+                       return 0;
+               }
+               J_ASSERT(page->buffers == NULL);
+       }
+
+       return 1;
+ }
+
+
+
+ /*
+  * File a buffer on the given transaction list.
+  */
+
+ void __journal_file_buffer(struct journal_head *jh,
+                       transaction_t *transaction, int jlist)
+ {
+       struct journal_head **list = 0;
+
+       assert_spin_locked(&journal_datalist_lock);
+
+ #ifdef __SMP__
+       J_ASSERT (current->lock_depth >= 0);
+ #endif
+       J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+       J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+                               jh->b_transaction == 0);
+
+       if (jh->b_transaction) {
+               if (jh->b_jlist == jlist)
+                       return;
+               __journal_unfile_buffer(jh);
+       } else {
+               jh->b_transaction = transaction;
+       }
+
+       switch (jlist) {
+       case BJ_None:
+               J_ASSERT_JH(jh, !jh->b_committed_data);
+               J_ASSERT_JH(jh, !jh->b_frozen_data);
+               return;
+       case BJ_SyncData:
+               list = &transaction->t_sync_datalist;
+               break;
+       case BJ_AsyncData:
+               list = &transaction->t_async_datalist;
+               break;
+       case BJ_Metadata:
+               transaction->t_nr_buffers++;
+               list = &transaction->t_buffers;
+               break;
+       case BJ_Forget:
+               list = &transaction->t_forget;
+               break;
+       case BJ_IO:
+               list = &transaction->t_iobuf_list;
+               break;
+       case BJ_Shadow:
+               list = &transaction->t_shadow_list;
+               break;
+       case BJ_LogCtl:
+               list = &transaction->t_log_list;
+               break;
+       case BJ_Reserved:
+               list = &transaction->t_reserved_list;
+               break;
+       }
+
+       __blist_add_buffer(list, jh);
+       jh->b_jlist = jlist;
+
+       if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+           jlist == BJ_Shadow || jlist == BJ_Forget) {
+               if (atomic_set_buffer_clean(jh2bh(jh))) {
+                       set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
+               }
+       }
+ }
+
+ void journal_file_buffer(struct journal_head *jh,
+                               transaction_t *transaction, int jlist)
+ {
+       spin_lock(&journal_datalist_lock);
+       __journal_file_buffer(jh, transaction, jlist);
+       spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * Remove a buffer from its current buffer list in preparation for
+  * dropping it from its current transaction entirely.  If the buffer has
+  * already started to be used by a subsequent transaction, refile the
+  * buffer on that transaction's metadata list.
+  */
+
+ void __journal_refile_buffer(struct journal_head *jh)
+ {
+       assert_spin_locked(&journal_datalist_lock);
+ #ifdef __SMP__
+       J_ASSERT_JH(jh, current->lock_depth >= 0);
+ #endif
+       __journal_unfile_buffer(jh);
+
+       /* If the buffer is now unused, just drop it.  If it has been
+          modified by a later transaction, add it to the new
+          transaction's metadata list. */
+
+       jh->b_transaction = jh->b_next_transaction;
+       jh->b_next_transaction = NULL;
+
+       if (jh->b_transaction != NULL) {
+               __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+               J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+       } else {
+               /* Onto BUF_DIRTY for writeback */
+               refile_buffer(jh2bh(jh));
+       }
+ }
+
+ /*
+  * For the unlocked version of this call, also make sure that any
+  * hanging journal_head is cleaned up if necessary.
+  *
+  * __journal_refile_buffer is usually called as part of a single locked
+  * operation on a buffer_head, in which the caller is probably going to
+  * be hooking the journal_head onto other lists.  In that case it is up
+  * to the caller to remove the journal_head if necessary.  For the
+  * unlocked journal_refile_buffer call, the caller isn't going to be
+  * doing anything else to the buffer so we need to do the cleanup
+  * ourselves to avoid a jh leak.
+  *
+  * *** The journal_head may be freed by this call! ***
+  */
+ void journal_refile_buffer(struct journal_head *jh)
+ {
+       struct buffer_head *bh;
+
+       spin_lock(&journal_datalist_lock);
+       bh = jh2bh(jh);
+
+       __journal_refile_buffer(jh);
+       __journal_remove_journal_head(bh);
+
+       spin_unlock(&journal_datalist_lock);
+       __brelse(bh);
+ }
diff -rc2P linux/fs/jbd-kernel.c linux-2.4.13/fs/jbd-kernel.c
*** linux/fs/jbd-kernel.c       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd-kernel.c        Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,336 ----
+ /*
+  * fs/jbd-kernel.c
+  *
+  * Support code for the Journalling Block Device layer.
+  * This file contains things which have to be in-kernel when
+  * JBD is a module.
+  *
+  * 15 May 2001        Andrew Morton <[email protected]>
+  *    Created
+  */
+
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/module.h>
+ #include <linux/sched.h>
+
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+
+ /*
+  * jh_splice_lock needs explanation.
+  *
+  * In a number of places we want to do things like:
+  *
+  *    if (buffer_jbd(bh) && bh2jh(bh)->foo)
+  *
+  * This is racy on SMP, because another CPU could remove the journal_head
+  * in the middle of this expression.  We need locking.
+  *
+  * But we can greatly optimise the locking cost by testing BH_JBD
+  * outside the lock.  So, effectively:
+  *
+  *    ret = 0;
+  *    if (buffer_jbd(bh)) {
+  *            spin_lock(&jh_splice_lock);
+  *            if (buffer_jbd(bh)) {    (* Still there? *)
+  *                    ret = bh2jh(bh)->foo;
+  *            }
+  *            spin_unlock(&jh_splice_lock);
+  *    }
+  *    return ret;
+  *
+  * Now, that protects us from races where another CPU can remove the
+  * journal_head.  But it doesn't defend us from the situation where another
+  * CPU can *add* a journal_head.  This is a correctness issue.  But it's not
+  * a problem because a) the calling code was *already* racy and b) it often
+  * can't happen at the call site and c) the places where we add journal_heads
+  * tend to be under external locking.
+  */
+ spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED;
+ EXPORT_SYMBOL(jh_splice_lock);
+
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Some sanity testing which is called from mark_buffer_clean(),
+  * and must be present in the main kernel.
+  */
+
+ void jbd_preclean_buffer_check(struct buffer_head *bh)
+ {
+       if (buffer_jbd(bh)) {
+               struct journal_head *jh = bh2jh(bh);
+
+               transaction_t *transaction = jh->b_transaction;
+               journal_t *journal;
+
+               if (jh->b_jlist == 0 && transaction == NULL)
+                       return;
+
+               J_ASSERT_JH(jh, (jh->b_jlist == 0 ||
+                                jh->b_jlist == BJ_LogCtl ||
+                                jh->b_jlist == BJ_IO ||
+                                jh->b_jlist == BJ_Forget ||
+                                buffer_jbd_data(bh)));
+               J_ASSERT_JH(jh, transaction != NULL);
+               /* The kernel may be unmapping old data.  We expect it
+                * to be dirty in that case, unless the buffer has
+                * already been forgotten by a transaction. */
+               if (jh->b_jlist != BJ_Forget) {
+ #if 1
+                       if (!buffer_dirty(bh)) {
+                               printk(__FUNCTION__": clean of clean buffer\n");
+                               print_buffer_trace(bh);
+                               return;
+                       }
+ #endif
+                       J_ASSERT_BH(bh, buffer_dirty(bh));
+                       if (!buffer_jbd_data(bh)) {
+                               J_ASSERT_JH(jh,
+                                           test_bit(BH_JWrite,
+                                                    &jh2bh(jh)->b_state));
+                       }
+               }
+
+               journal = transaction->t_journal;
+               J_ASSERT_JH(jh,
+                           transaction == journal->j_running_transaction ||
+                           transaction == journal->j_committing_transaction);
+       }
+ }
+ EXPORT_SYMBOL(jbd_preclean_buffer_check);
+ #endif                /* CONFIG_JBD_DEBUG */
+
+ /*
+  * Entries in /proc/sys/fs
+  */
+
+ int journal_oom_retry = 1;
+ EXPORT_SYMBOL(journal_oom_retry);
+ #if defined(CONFIG_JBD_DEBUG)
+ int journal_enable_debug;
+ int journal_no_write[2];
+ EXPORT_SYMBOL(journal_enable_debug);
+ EXPORT_SYMBOL(journal_no_write);
+ #endif
+
+ #endif        /* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */
+
+ /*
+  * Support functions for BUFFER_TRACE()
+  */
+ #ifdef CONFIG_BUFFER_DEBUG
+
+ static spinlock_t trace_lock = SPIN_LOCK_UNLOCKED;
+
+ void buffer_trace(struct buffer_head *dest,
+               struct buffer_head *src, char *info)
+ {
+       struct buffer_history_item *bhist_i;
+       unsigned long flags;
+
+       if (dest == 0 || src == 0)
+               return;
+
+       spin_lock_irqsave(&trace_lock, flags);
+
+       /*
+        * Sometimes we don't initialise the ring pointers (locally declared
+        * temporary buffer_heads).  Feebly attempt to detect and correct
+        * that here.
+        */
+       if ((dest->b_history.b_history_head - dest->b_history.b_history_tail >
+                               BUFFER_HISTORY_SIZE)) {
+               dest->b_history.b_history_head = 0;
+               dest->b_history.b_history_tail = 0;
+       }
+       bhist_i = dest->b_history.b +
+               (dest->b_history.b_history_head & (BUFFER_HISTORY_SIZE - 1));
+       bhist_i->info = info;
+       bhist_i->b_state = src->b_state;
+       bhist_i->b_list = src->b_list;
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       bhist_i->b_trans_is_running = 0;
+       bhist_i->b_trans_is_committing = 0;
+       bhist_i->b_blocknr = src->b_blocknr;
+       if (buffer_jbd(src)) {
+               struct journal_head *jh;
+               journal_t *journal;
+               transaction_t *transaction;
+
+               /* Footwork to avoid racing with journal_remove_journal_head */
+               jh = src->b_private;
+               if (jh == 0)
+                       goto raced;
+               transaction = jh->b_transaction;
+               if (src->b_private == 0)
+                       goto raced;
+               bhist_i->b_jcount = jh->b_jcount;
+               bhist_i->b_jbd = 1;
+               bhist_i->b_jlist = jh->b_jlist;
+               bhist_i->b_frozen_data = jh->b_frozen_data;
+               bhist_i->b_committed_data = jh->b_committed_data;
+               bhist_i->b_transaction = !!jh->b_transaction;
+               bhist_i->b_next_transaction = !!jh->b_next_transaction;
+               bhist_i->b_cp_transaction = !!jh->b_cp_transaction;
+
+               if (transaction) {
+                       journal = transaction->t_journal;
+                       bhist_i->b_trans_is_running = transaction ==
+                                       journal->j_running_transaction;
+                       bhist_i->b_trans_is_committing = transaction ==
+                                       journal->j_committing_transaction;
+               }
+       } else {
+ raced:
+               bhist_i->b_jcount = 0;
+               bhist_i->b_jbd = 0;
+               bhist_i->b_jlist = 0;
+               bhist_i->b_frozen_data = 0;
+               bhist_i->b_committed_data = 0;
+               bhist_i->b_transaction = 0;
+               bhist_i->b_next_transaction = 0;
+               bhist_i->b_cp_transaction = 0;
+       }
+ #endif        /* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */
+
+       bhist_i->on_lru = (src->b_prev_free != 0 && src->b_next_free != 0);
+       bhist_i->on_hash = (src->b_pprev != 0);
+       bhist_i->cpu = smp_processor_id();
+       bhist_i->b_count = atomic_read(&src->b_count);
+
+       dest->b_history.b_history_head++;
+       if (dest->b_history.b_history_head - dest->b_history.b_history_tail >
+                               BUFFER_HISTORY_SIZE)
+               dest->b_history.b_history_tail =
+                       dest->b_history.b_history_head - BUFFER_HISTORY_SIZE;
+
+       spin_unlock_irqrestore(&trace_lock, flags);
+ }
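+
+ /*
+  * Note, not from this patch: because BUFFER_HISTORY_SIZE is a power of
+  * two, the monotonically increasing head counter maps onto a ring slot
+  * with a cheap mask rather than a division, e.g. for a size of 32:
+  *
+  *    slot = head & (BUFFER_HISTORY_SIZE - 1);   (* head 33 -> slot 1 *)
+  */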
+
+ static const char *b_list_to_string(unsigned int b_list)
+ {
+       switch (b_list) {
+       case BUF_CLEAN:         return "BUF_CLEAN";
+       case BUF_LOCKED:        return "BUF_LOCKED";
+       case BUF_DIRTY:         return "BUF_DIRTY";
+       default:                return "Bad b_list";
+       }
+ }
+
+ static const char *b_jlist_to_string(unsigned int b_list)
+ {
+       switch (b_list) {
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       case BJ_None:           return "BJ_None";
+       case BJ_SyncData:       return "BJ_SyncData";
+       case BJ_AsyncData:      return "BJ_AsyncData";
+       case BJ_Metadata:       return "BJ_Metadata";
+       case BJ_Forget:         return "BJ_Forget";
+       case BJ_IO:             return "BJ_IO";
+       case BJ_Shadow:         return "BJ_Shadow";
+       case BJ_LogCtl:         return "BJ_LogCtl";
+       case BJ_Reserved:       return "BJ_Reserved";
+ #endif
+       default:                return "Bad b_jlist";
+       }
+ }
+
+ static void print_one_hist(struct buffer_history_item *bhist_i)
+ {
+       printk(" %s\n", bhist_i->info);
+       printk("     b_state:0x%lx b_list:%s b_jlist:%s on_lru:%d\n",
+                       bhist_i->b_state,
+                       b_list_to_string(bhist_i->b_list),
+                       b_jlist_to_string(bhist_i->b_jlist),
+                       bhist_i->on_lru);
+       printk("     cpu:%d on_hash:%d b_count:%d b_blocknr:%lu\n",
+                       bhist_i->cpu,
+                       bhist_i->on_hash,
+                       bhist_i->b_count,
+                       bhist_i->b_blocknr);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       printk("     b_jbd:%u b_frozen_data:%p b_committed_data:%p\n",
+                       bhist_i->b_jbd,
+                       bhist_i->b_frozen_data,
+                       bhist_i->b_committed_data);
+       printk("     b_transaction:%u b_next_transaction:%u "
+                       "b_cp_transaction:%u b_trans_is_running:%u\n",
+                       bhist_i->b_transaction,
+                       bhist_i->b_next_transaction,
+                       bhist_i->b_cp_transaction,
+                       bhist_i->b_trans_is_running);
+       printk("     b_trans_is_committing:%u b_jcount:%u ",
+                       bhist_i->b_trans_is_committing,
+                       bhist_i->b_jcount);
+ #endif
+       printk("\n");
+ }
+
+ void print_buffer_fields(struct buffer_head *bh)
+ {
+       printk("b_next:%p, b_blocknr:%lu b_count:%d b_flushtime:%lu\n",
+               bh->b_next, bh->b_blocknr, atomic_read(&bh->b_count),
+                       bh->b_flushtime);
+       printk("b_next_free:%p b_prev_free:%p b_this_page:%p b_reqnext:%p\n",
+               bh->b_next_free, bh->b_prev_free, bh->b_this_page,
+                       bh->b_reqnext);
+       printk("b_pprev:%p b_data:%p b_page:%p b_inode:%p b_list:%d\n",
+               bh->b_pprev, bh->b_data, bh->b_page, bh->b_inode, bh->b_list);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       if (buffer_jbd(bh)) {
+               struct journal_head *jh = bh2jh(bh);
+
+               printk("b_jlist:%u b_frozen_data:%p b_committed_data:%p\n",
+                       jh->b_jlist, jh->b_frozen_data, jh->b_committed_data);
+               printk(" b_transaction:%p b_next_transaction:%p "
+                               "b_cp_transaction:%p\n",
+                       jh->b_transaction, jh->b_next_transaction,
+                       jh->b_cp_transaction);
+               printk("b_cpnext:%p b_cpprev:%p\n",
+                       jh->b_cpnext, jh->b_cpprev);
+       }
+ #endif
+ }
+
+ void print_buffer_trace(struct buffer_head *bh)
+ {
+ #ifdef CONFIG_X86
+       extern void show_stack(unsigned long * esp);
+ #endif
+
+       unsigned long idx, count;
+       unsigned long flags;
+
+       printk("buffer trace for buffer at 0x%p (I am CPU %d)\n",
+                       bh, smp_processor_id());
+       BUFFER_TRACE(bh, "");           /* Record state now */
+
+       spin_lock_irqsave(&trace_lock, flags);
+       for (   idx = bh->b_history.b_history_tail, count = 0;
+               idx < bh->b_history.b_history_head &&
+                       count < BUFFER_HISTORY_SIZE;
+               idx++, count++)
+               print_one_hist(bh->b_history.b +
+                       (idx & (BUFFER_HISTORY_SIZE - 1)));
+
+       print_buffer_fields(bh);
+       spin_unlock_irqrestore(&trace_lock, flags);
+ #ifdef CONFIG_X86
+       show_stack(NULL);
+ #endif
+       printk("\n");
+ }
+
+ static struct buffer_head *failed_buffer_head;        /* For access with debuggers */
+
+ void buffer_assertion_failure(struct buffer_head *bh)
+ {
+       failed_buffer_head = bh;
+       print_buffer_trace(bh);
+ }
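+
+ /* A debugging assertion would typically reach buffer_assertion_failure()
+  * via a check of this shape (sketch only; the real assertion macros are
+  * defined elsewhere, e.g. in the JBD headers):
+  *
+  *     if (!buffer_jbd(bh))
+  *             buffer_assertion_failure(bh);
+  */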
+ EXPORT_SYMBOL(buffer_trace);
+ EXPORT_SYMBOL(print_buffer_trace);
+ EXPORT_SYMBOL(buffer_assertion_failure);
+ EXPORT_SYMBOL(print_buffer_fields);
+ #endif        /* CONFIG_BUFFER_DEBUG */
+
diff -rc2P linux/fs/open.c linux-2.4.13/fs/open.c
*** linux/fs/open.c     Fri Nov  9 16:15:08 2001
--- linux-2.4.13/fs/open.c      Fri Nov  9 16:57:59 2001
***************
*** 72,75 ****
--- 72,81 ----
 }

+ /*
+  * i_sem is taken outside i_truncate_sem because that is the
+  * order in which these locks are taken on the path
+  * generic_file_write->copy_from_user->handle_mm_fault->do_no_page
+  */
+
 int do_truncate(struct dentry *dentry, loff_t length)
 {
***************
*** 83,89 ****
--- 89,97 ----

       down(&inode->i_sem);
+       down_write(&inode->i_truncate_sem);
       newattrs.ia_size = length;
       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
       error = notify_change(dentry, &newattrs);
+       up_write(&inode->i_truncate_sem);
       up(&inode->i_sem);
       return error;
diff -rc2P linux/include/linux/buffer-trace.h linux-2.4.13/include/linux/buffer-trace.h
*** linux/include/linux/buffer-trace.h  Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/buffer-trace.h   Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,84 ----
+ /*
+  * include/linux/buffer-trace.h
+  *
+  * Debugging support for recording buffer_head state transitions
+  *
+  * May 2001, akpm
+  *    Created
+  */
+
+ #ifndef BUFFER_TRACE_H_INCLUDED
+ #define BUFFER_TRACE_H_INCLUDED
+
+ #include <linux/config.h>
+
+ #ifdef CONFIG_BUFFER_DEBUG
+
+ /* The number of records per buffer_head.  Must be a power of two */
+ #define BUFFER_HISTORY_SIZE   32
+
+ struct buffer_head;
+
+ /* This gets embedded in struct buffer_head */
+ struct buffer_history {
+       struct buffer_history_item {
+               char *info;
+               unsigned long b_state;
+               unsigned b_list:3;
+               unsigned b_jlist:4;
+               unsigned on_lru:1;
+               unsigned on_hash:1;
+               unsigned cpu:3;
+               unsigned b_count:8;
+               unsigned long b_blocknr;        /* For src != dest */
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+               unsigned b_jcount:4;
+               unsigned b_jbd:1;
+               unsigned b_transaction:1;
+               unsigned b_next_transaction:1;
+               unsigned b_cp_transaction:1;
+               unsigned b_trans_is_running:1;
+               unsigned b_trans_is_committing:1;
+               void *b_frozen_data;
+               void *b_committed_data;
+ #endif
+       } b[BUFFER_HISTORY_SIZE];
+       unsigned long b_history_head;   /* Next place to write */
+       unsigned long b_history_tail;   /* Oldest valid entry */
+ };
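+
+ /* Because BUFFER_HISTORY_SIZE is a power of two, b_history_head and
+  * b_history_tail can be free-running counters and a slot is found by
+  * masking, which is how print_buffer_trace() replays the log:
+  *
+  *     item = bhist->b + (idx & (BUFFER_HISTORY_SIZE - 1));
+  */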
+
+ static inline void buffer_trace_init(struct buffer_history *bhist)
+ {
+       bhist->b_history_head = 0;
+       bhist->b_history_tail = 0;
+ }
+ extern void buffer_trace(struct buffer_head *dest,
+                       struct buffer_head *src, char *info);
+ extern void print_buffer_fields(struct buffer_head *bh);
+ extern void print_buffer_trace(struct buffer_head *bh);
+
+ #define BUFFER_STRINGIFY2(X)          #X
+ #define BUFFER_STRINGIFY(X)           BUFFER_STRINGIFY2(X)
+
+ #define BUFFER_TRACE2(dest, src, info)                                \
+       do {                                                    \
+               buffer_trace((dest), (src),                     \
+                       __FUNCTION__"() ["__FILE__":"           \
+                       BUFFER_STRINGIFY(__LINE__)"] " info);   \
+       } while (0)
+
+ #define BUFFER_TRACE(bh, info) BUFFER_TRACE2(bh, bh, info)
+ #define JBUFFER_TRACE(jh, info)       BUFFER_TRACE(jh2bh(jh), info)
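+
+ /* With GCC's string-literal __FUNCTION__, a call such as
+  * BUFFER_TRACE(bh, "enter") builds its info string entirely at
+  * compile time, giving something of the form (file and line here are
+  * illustrative):
+  *
+  *     "journal_dirty_data() [fs/jbd/transaction.c:1024] enter"
+  */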
+
+ #else         /* CONFIG_BUFFER_DEBUG */
+
+ #define buffer_trace_init(bh) do {} while (0)
+ #define print_buffer_fields(bh)       do {} while (0)
+ #define print_buffer_trace(bh)        do {} while (0)
+ #define BUFFER_TRACE(bh, info)        do {} while (0)
+ #define BUFFER_TRACE2(bh, bh2, info)  do {} while (0)
+ #define JBUFFER_TRACE(jh, info)       do {} while (0)
+
+ #endif                /* CONFIG_BUFFER_DEBUG */
+
+ #endif                /* BUFFER_TRACE_H_INCLUDED */
diff -rc2P linux/include/linux/capability.h linux-2.4.13/include/linux/capability.h
*** linux/include/linux/capability.h    Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/capability.h     Fri Nov  9 16:58:00 2001
***************
*** 251,254 ****
--- 251,256 ----
 /* Override quota limits. */
 /* Override reserved space on ext2 filesystem */
+ /* Modify data journaling mode on ext3 filesystem (uses journaling
+    resources) */
 /* NOTE: ext2 honors fsuid when checking for resource overrides, so
    you can override using fsuid too */
diff -rc2P linux/include/linux/ext3_fs.h linux-2.4.13/include/linux/ext3_fs.h
*** linux/include/linux/ext3_fs.h       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs.h        Fri Nov  9 17:05:34 2001
***************
*** 0 ****
--- 1,716 ----
+ /*
+  *  linux/include/linux/ext3_fs.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+
+ #ifndef _LINUX_EXT3_FS_H
+ #define _LINUX_EXT3_FS_H
+
+ #include <linux/types.h>
+
+ /*
+  * The second extended filesystem constants/structures
+  */
+
+ /*
+  * Define EXT3FS_DEBUG to produce debug messages
+  */
+ #undef EXT3FS_DEBUG
+
+ /*
+  * Define EXT3_PREALLOCATE to preallocate data blocks for expanding files
+  */
+ #undef  EXT3_PREALLOCATE /* @@@ Fix this! */
+ #define EXT3_DEFAULT_PREALLOC_BLOCKS  8
+
+ /*
+  * The second extended file system version
+  */
+ #define EXT3FS_DATE           "21 Oct 2001"
+ #define EXT3FS_VERSION                "2.4-0.9.13"
+
+ /*
+  * Debug code
+  */
+ #ifdef EXT3FS_DEBUG
+ #define ext3_debug(f, a...)                                           \
+       do {                                                            \
+               printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:",       \
+                       __FILE__, __LINE__, __FUNCTION__);              \
+               printk (KERN_DEBUG f, ## a);                            \
+       } while (0)
+ #else
+ #define ext3_debug(f, a...)   do {} while (0)
+ #endif
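+
+ /* A typical call site would look like, for example:
+  *
+  *     ext3_debug("allocating block %lu for inode %lu\n",
+  *                block, inode->i_ino);
+  */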
+
+ /*
+  * Special inodes numbers
+  */
+ #define       EXT3_BAD_INO             1      /* Bad blocks inode */
+ #define EXT3_ROOT_INO          2      /* Root inode */
+ #define EXT3_ACL_IDX_INO       3      /* ACL inode */
+ #define EXT3_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO     6      /* Undelete directory inode */
+ #define EXT3_RESIZE_INO                7      /* Reserved group descriptors inode */
+ #define EXT3_JOURNAL_INO       8      /* Journal inode */
+
+ /* First non-reserved inode for old ext3 filesystems */
+ #define EXT3_GOOD_OLD_FIRST_INO       11
+
+ /*
+  * The second extended file system magic number
+  */
+ #define EXT3_SUPER_MAGIC      0xEF53
+
+ /*
+  * Maximal count of links to a file
+  */
+ #define EXT3_LINK_MAX         32000
+
+ /*
+  * Macro-instructions used to manage several block sizes
+  */
+ #define EXT3_MIN_BLOCK_SIZE           1024
+ #define       EXT3_MAX_BLOCK_SIZE             4096
+ #define EXT3_MIN_BLOCK_LOG_SIZE                 10
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE(s)           ((s)->s_blocksize)
+ #else
+ # define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+ #define EXT3_ACLE_PER_BLOCK(s)                (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+ #else
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_log_block_size + 10)
+ #endif
+ #ifdef __KERNEL__
+ #define       EXT3_ADDR_PER_BLOCK_BITS(s)     ((s)->u.ext3_sb.s_addr_per_block_bits)
+ #define EXT3_INODE_SIZE(s)            ((s)->u.ext3_sb.s_inode_size)
+ #define EXT3_FIRST_INO(s)             ((s)->u.ext3_sb.s_first_ino)
+ #else
+ #define EXT3_INODE_SIZE(s)    (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
+                                EXT3_GOOD_OLD_INODE_SIZE : \
+                                (s)->s_inode_size)
+ #define EXT3_FIRST_INO(s)     (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
+                                EXT3_GOOD_OLD_FIRST_INO : \
+                                (s)->s_first_ino)
+ #endif
+
+ /*
+  * Macro-instructions used to manage fragments
+  */
+ #define EXT3_MIN_FRAG_SIZE            1024
+ #define       EXT3_MAX_FRAG_SIZE              4096
+ #define EXT3_MIN_FRAG_LOG_SIZE                  10
+ #ifdef __KERNEL__
+ # define EXT3_FRAG_SIZE(s)            ((s)->u.ext3_sb.s_frag_size)
+ # define EXT3_FRAGS_PER_BLOCK(s)      ((s)->u.ext3_sb.s_frags_per_block)
+ #else
+ # define EXT3_FRAG_SIZE(s)            (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size)
+ # define EXT3_FRAGS_PER_BLOCK(s)      (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s))
+ #endif
+
+ /*
+  * ACL structures
+  */
+ struct ext3_acl_header        /* Header of Access Control Lists */
+ {
+       __u32   aclh_size;
+       __u32   aclh_file_count;
+       __u32   aclh_acle_count;
+       __u32   aclh_first_acle;
+ };
+
+ struct ext3_acl_entry /* Access Control List Entry */
+ {
+       __u32   acle_size;
+       __u16   acle_perms;     /* Access permissions */
+       __u16   acle_type;      /* Type of entry */
+       __u16   acle_tag;       /* User or group identity */
+       __u16   acle_pad1;
+       __u32   acle_next;      /* Pointer on next entry for the */
+                                       /* same inode or on next free entry */
+ };
+
+ /*
+  * Structure of a blocks group descriptor
+  */
+ struct ext3_group_desc
+ {
+       __u32   bg_block_bitmap;                /* Blocks bitmap block */
+       __u32   bg_inode_bitmap;                /* Inodes bitmap block */
+       __u32   bg_inode_table;         /* Inodes table block */
+       __u16   bg_free_blocks_count;   /* Free blocks count */
+       __u16   bg_free_inodes_count;   /* Free inodes count */
+       __u16   bg_used_dirs_count;     /* Directories count */
+       __u16   bg_pad;
+       __u32   bg_reserved[3];
+ };
+
+ /*
+  * Macro-instructions used to manage group descriptors
+  */
+ #ifdef __KERNEL__
+ # define EXT3_BLOCKS_PER_GROUP(s)     ((s)->u.ext3_sb.s_blocks_per_group)
+ # define EXT3_DESC_PER_BLOCK(s)               ((s)->u.ext3_sb.s_desc_per_block)
+ # define EXT3_INODES_PER_GROUP(s)     ((s)->u.ext3_sb.s_inodes_per_group)
+ # define EXT3_DESC_PER_BLOCK_BITS(s)  ((s)->u.ext3_sb.s_desc_per_block_bits)
+ #else
+ # define EXT3_BLOCKS_PER_GROUP(s)     ((s)->s_blocks_per_group)
+ # define EXT3_DESC_PER_BLOCK(s)               (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc))
+ # define EXT3_INODES_PER_GROUP(s)     ((s)->s_inodes_per_group)
+ #endif
+
+ /*
+  * Constants relative to the data blocks
+  */
+ #define       EXT3_NDIR_BLOCKS                12
+ #define       EXT3_IND_BLOCK                  EXT3_NDIR_BLOCKS
+ #define       EXT3_DIND_BLOCK                 (EXT3_IND_BLOCK + 1)
+ #define       EXT3_TIND_BLOCK                 (EXT3_DIND_BLOCK + 1)
+ #define       EXT3_N_BLOCKS                   (EXT3_TIND_BLOCK + 1)
+
+ /*
+  * Inode flags
+  */
+ #define       EXT3_SECRM_FL                   0x00000001 /* Secure deletion */
+ #define       EXT3_UNRM_FL                    0x00000002 /* Undelete */
+ #define       EXT3_COMPR_FL                   0x00000004 /* Compress file */
+ #define EXT3_SYNC_FL                  0x00000008 /* Synchronous updates */
+ #define EXT3_IMMUTABLE_FILE_FL                0x00000010 /* Immutable file */
+ #define EXT3_APPEND_FL                        0x00000020 /* writes to file may only append */
+ #define EXT3_NODUMP_FL                        0x00000040 /* do not dump file */
+ #define EXT3_NOATIME_FL                       0x00000080 /* do not update atime */
+ /* Reserved for compression usage... */
+ #define EXT3_DIRTY_FL                 0x00000100
+ #define EXT3_COMPRBLK_FL              0x00000200 /* One or more compressed clusters */
+ #define EXT3_NOCOMPR_FL                       0x00000400 /* Don't compress */
+ #define EXT3_ECOMPR_FL                        0x00000800 /* Compression error */
+ /* End compression flags --- maybe not all used */
+ #define EXT3_INDEX_FL                 0x00001000 /* hash-indexed directory */
+ #define EXT3_IMAGIC_FL                        0x00002000 /* AFS directory */
+ #define EXT3_JOURNAL_DATA_FL          0x00004000 /* file data should be journaled */
+ #define EXT3_IMMUTABLE_LINK_FL          0x00008000 /* Immutable link */
+ #define EXT3_RESERVED_FL              0x80000000 /* reserved for ext3 lib */
+
+ #define EXT3_FL_USER_VISIBLE          0x00009FFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE               0x000080FF /* User modifiable flags */
+
+ /*
+  * Inode dynamic state flags
+  */
+ #define EXT3_STATE_JDATA              0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW                        0x00000002 /* inode is newly created */
+
+ /*
+  * ioctl commands
+  */
+ #define       EXT3_IOC_GETFLAGS               _IOR('f', 1, long)
+ #define       EXT3_IOC_SETFLAGS               _IOW('f', 2, long)
+ #define       EXT3_IOC_GETVERSION             _IOR('f', 3, long)
+ #define       EXT3_IOC_SETVERSION             _IOW('f', 4, long)
+ #define       EXT3_IOC_GETVERSION_OLD         _IOR('v', 1, long)
+ #define       EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+ #define EXT3_IOC_WAIT_FOR_READONLY    _IOR('f', 99, long)
+ #endif
+
+ /*
+  * Structure of an inode on the disk
+  */
+ struct ext3_inode {
+       __u16   i_mode;         /* File mode */
+       __u16   i_uid;          /* Low 16 bits of Owner Uid */
+       __u32   i_size;         /* Size in bytes */
+       __u32   i_atime;        /* Access time */
+       __u32   i_ctime;        /* Inode change time */
+       __u32   i_mtime;        /* Modification time */
+       __u32   i_dtime;        /* Deletion Time */
+       __u16   i_gid;          /* Low 16 bits of Group Id */
+       __u16   i_links_count;  /* Links count */
+       __u32   i_blocks;       /* Blocks count */
+       __u32   i_flags;        /* File flags */
+       union {
+               struct {
+                       __u32  l_i_reserved1;
+               } linux1;
+               struct {
+                       __u32  h_i_translator;
+               } hurd1;
+               struct {
+                       __u32  m_i_reserved1;
+               } masix1;
+       } osd1;                         /* OS dependent 1 */
+       __u32   i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
+       __u32   i_generation;   /* File version (for NFS) */
+       __u32   i_file_acl;     /* File ACL */
+       __u32   i_dir_acl;      /* Directory ACL */
+       __u32   i_faddr;        /* Fragment address */
+       union {
+               struct {
+                       __u8    l_i_frag;       /* Fragment number */
+                       __u8    l_i_fsize;      /* Fragment size */
+                       __u16   i_pad1;
+                       __u16   l_i_uid_high;   /* these 2 fields    */
+                       __u16   l_i_gid_high;   /* were reserved2[0] */
+                       __u32   l_i_reserved2;
+               } linux2;
+               struct {
+                       __u8    h_i_frag;       /* Fragment number */
+                       __u8    h_i_fsize;      /* Fragment size */
+                       __u16   h_i_mode_high;
+                       __u16   h_i_uid_high;
+                       __u16   h_i_gid_high;
+                       __u32   h_i_author;
+               } hurd2;
+               struct {
+                       __u8    m_i_frag;       /* Fragment number */
+                       __u8    m_i_fsize;      /* Fragment size */
+                       __u16   m_pad1;
+                       __u32   m_i_reserved2[2];
+               } masix2;
+       } osd2;                         /* OS dependent 2 */
+ };
+
+ #define i_size_high   i_dir_acl
+
+ #if defined(__KERNEL__) || defined(__linux__)
+ #define i_reserved1   osd1.linux1.l_i_reserved1
+ #define i_frag                osd2.linux2.l_i_frag
+ #define i_fsize               osd2.linux2.l_i_fsize
+ #define i_uid_low     i_uid
+ #define i_gid_low     i_gid
+ #define i_uid_high    osd2.linux2.l_i_uid_high
+ #define i_gid_high    osd2.linux2.l_i_gid_high
+ #define i_reserved2   osd2.linux2.l_i_reserved2
+
+ #elif defined(__GNU__)
+
+ #define i_translator  osd1.hurd1.h_i_translator
+ #define i_frag                osd2.hurd2.h_i_frag
+ #define i_fsize               osd2.hurd2.h_i_fsize
+ #define i_uid_high    osd2.hurd2.h_i_uid_high
+ #define i_gid_high    osd2.hurd2.h_i_gid_high
+ #define i_author      osd2.hurd2.h_i_author
+
+ #elif defined(__masix__)
+
+ #define i_reserved1   osd1.masix1.m_i_reserved1
+ #define i_frag                osd2.masix2.m_i_frag
+ #define i_fsize               osd2.masix2.m_i_fsize
+ #define i_reserved2   osd2.masix2.m_i_reserved2
+
+ #endif /* defined(__KERNEL__) || defined(__linux__) */
+
+ /*
+  * File system states
+  */
+ #define       EXT3_VALID_FS                   0x0001  /* Unmounted cleanly */
+ #define       EXT3_ERROR_FS                   0x0002  /* Errors detected */
+ #define       EXT3_ORPHAN_FS                  0x0004  /* Orphans being recovered */
+
+ /*
+  * Mount flags
+  */
+ #define EXT3_MOUNT_CHECK              0x0001  /* Do mount-time checks */
+ #define EXT3_MOUNT_GRPID              0x0004  /* Create files with directory's group */
+ #define EXT3_MOUNT_DEBUG              0x0008  /* Some debugging messages */
+ #define EXT3_MOUNT_ERRORS_CONT                0x0010  /* Continue on errors */
+ #define EXT3_MOUNT_ERRORS_RO          0x0020  /* Remount fs ro on errors */
+ #define EXT3_MOUNT_ERRORS_PANIC               0x0040  /* Panic on errors */
+ #define EXT3_MOUNT_MINIX_DF           0x0080  /* Mimics the Minix statfs */
+ #define EXT3_MOUNT_NOLOAD             0x0100  /* Don't use existing journal*/
+ #define EXT3_MOUNT_ABORT              0x0200  /* Fatal error detected */
+ #define EXT3_MOUNT_DATA_FLAGS         0x0C00  /* Mode for data writes: */
+   #define EXT3_MOUNT_JOURNAL_DATA     0x0400  /* Write data to journal */
+   #define EXT3_MOUNT_ORDERED_DATA     0x0800  /* Flush data before commit */
+   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+
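+ /* The data mode is a two-bit field, so the active mode is read by
+  * masking with EXT3_MOUNT_DATA_FLAGS rather than by a single-bit
+  * test.  A sketch (the helper name is hypothetical):
+  *
+  *     static inline int ext3_is_journaled_data(struct super_block *sb)
+  *     {
+  *             return (sb->u.ext3_sb.s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
+  *                     == EXT3_MOUNT_JOURNAL_DATA;
+  *     }
+  */
+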
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+ #define clear_opt(o, opt)             o &= ~EXT3_MOUNT_##opt
+ #define set_opt(o, opt)                       o |= EXT3_MOUNT_##opt
+ #define test_opt(sb, opt)             ((sb)->u.ext3_sb.s_mount_opt & \
+                                        EXT3_MOUNT_##opt)
+ #else
+ #define EXT2_MOUNT_NOLOAD             EXT3_MOUNT_NOLOAD
+ #define EXT2_MOUNT_ABORT              EXT3_MOUNT_ABORT
+ #endif
+
+ #define ext3_set_bit                  ext2_set_bit
+ #define ext3_clear_bit                        ext2_clear_bit
+ #define ext3_test_bit                 ext2_test_bit
+ #define ext3_find_first_zero_bit      ext2_find_first_zero_bit
+ #define ext3_find_next_zero_bit               ext2_find_next_zero_bit
+
+ /*
+  * Maximal mount counts between two filesystem checks
+  */
+ #define EXT3_DFL_MAX_MNT_COUNT                20      /* Allow 20 mounts */
+ #define EXT3_DFL_CHECKINTERVAL                0       /* Don't use interval check */
+
+ /*
+  * Behaviour when detecting errors
+  */
+ #define EXT3_ERRORS_CONTINUE          1       /* Continue execution */
+ #define EXT3_ERRORS_RO                        2       /* Remount fs read-only */
+ #define EXT3_ERRORS_PANIC             3       /* Panic */
+ #define EXT3_ERRORS_DEFAULT           EXT3_ERRORS_CONTINUE
+
+ /*
+  * Structure of the super block
+  */
+ struct ext3_super_block {
+ /*00*/        __u32   s_inodes_count;         /* Inodes count */
+       __u32   s_blocks_count;         /* Blocks count */
+       __u32   s_r_blocks_count;       /* Reserved blocks count */
+       __u32   s_free_blocks_count;    /* Free blocks count */
+ /*10*/        __u32   s_free_inodes_count;    /* Free inodes count */
+       __u32   s_first_data_block;     /* First Data Block */
+       __u32   s_log_block_size;       /* Block size */
+       __s32   s_log_frag_size;        /* Fragment size */
+ /*20*/        __u32   s_blocks_per_group;     /* # Blocks per group */
+       __u32   s_frags_per_group;      /* # Fragments per group */
+       __u32   s_inodes_per_group;     /* # Inodes per group */
+       __u32   s_mtime;                /* Mount time */
+ /*30*/        __u32   s_wtime;                /* Write time */
+       __u16   s_mnt_count;            /* Mount count */
+       __s16   s_max_mnt_count;        /* Maximal mount count */
+       __u16   s_magic;                /* Magic signature */
+       __u16   s_state;                /* File system state */
+       __u16   s_errors;               /* Behaviour when detecting errors */
+       __u16   s_minor_rev_level;      /* minor revision level */
+ /*40*/        __u32   s_lastcheck;            /* time of last check */
+       __u32   s_checkinterval;        /* max. time between checks */
+       __u32   s_creator_os;           /* OS */
+       __u32   s_rev_level;            /* Revision level */
+ /*50*/        __u16   s_def_resuid;           /* Default uid for reserved blocks */
+       __u16   s_def_resgid;           /* Default gid for reserved blocks */
+       /*
+        * These fields are for EXT3_DYNAMIC_REV superblocks only.
+        *
+        * Note: the difference between the compatible feature set and
+        * the incompatible feature set is that if there is a bit set
+        * in the incompatible feature set that the kernel doesn't
+        * know about, it should refuse to mount the filesystem.
+        *
+        * e2fsck's requirements are more strict; if it doesn't know
+        * about a feature in either the compatible or incompatible
+        * feature set, it must abort and not try to meddle with
+        * things it doesn't understand...
+        */
+       __u32   s_first_ino;            /* First non-reserved inode */
+       __u16   s_inode_size;           /* size of inode structure */
+       __u16   s_block_group_nr;       /* block group # of this superblock */
+       __u32   s_feature_compat;       /* compatible feature set */
+ /*60*/        __u32   s_feature_incompat;     /* incompatible feature set */
+       __u32   s_feature_ro_compat;    /* readonly-compatible feature set */
+ /*68*/        __u8    s_uuid[16];             /* 128-bit uuid for volume */
+ /*78*/        char    s_volume_name[16];      /* volume name */
+ /*88*/        char    s_last_mounted[64];     /* directory where last mounted */
+ /*C8*/        __u32   s_algorithm_usage_bitmap; /* For compression */
+       /*
+        * Performance hints.  Directory preallocation should only
+        * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+        */
+       __u8    s_prealloc_blocks;      /* Nr of blocks to try to preallocate*/
+       __u8    s_prealloc_dir_blocks;  /* Nr to preallocate for dirs */
+       __u16   s_padding1;
+       /*
+        * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+        */
+ /*D0*/        __u8    s_journal_uuid[16];     /* uuid of journal superblock */
+ /*E0*/        __u32   s_journal_inum;         /* inode number of journal file */
+       __u32   s_journal_dev;          /* device number of journal file */
+       __u32   s_last_orphan;          /* start of list of inodes to delete */
+
+ /*EC*/        __u32   s_reserved[197];        /* Padding to the end of the block */
+ };
+
+ #ifdef __KERNEL__
+ #define EXT3_SB(sb)   (&((sb)->u.ext3_sb))
+ #define EXT3_I(inode) (&((inode)->u.ext3_i))
+ #else
+ /* Assume that user mode programs are passing in an ext3fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+  * macros from user land. */
+ #define EXT3_SB(sb)   (sb)
+ #endif
+
+ #define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime
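+
+ /* The on-disk orphan list is threaded through i_dtime: s_last_orphan
+  * in the super block names the first orphan inode, and each orphan's
+  * i_dtime holds the inode number of the next one, with zero ending
+  * the list.  i_dtime is free for this because an orphan is, by
+  * definition, not yet deleted. */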
+
+ /*
+  * Codes for operating systems
+  */
+ #define EXT3_OS_LINUX         0
+ #define EXT3_OS_HURD          1
+ #define EXT3_OS_MASIX         2
+ #define EXT3_OS_FREEBSD               3
+ #define EXT3_OS_LITES         4
+
+ /*
+  * Revision levels
+  */
+ #define EXT3_GOOD_OLD_REV     0       /* The good old (original) format */
+ #define EXT3_DYNAMIC_REV      1       /* V2 format w/ dynamic inode sizes */
+
+ #define EXT3_CURRENT_REV      EXT3_GOOD_OLD_REV
+ #define EXT3_MAX_SUPP_REV     EXT3_DYNAMIC_REV
+
+ #define EXT3_GOOD_OLD_INODE_SIZE 128
+
+ /*
+  * Feature set definitions
+  */
+
+ #define EXT3_HAS_COMPAT_FEATURE(sb,mask)                      \
+       ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+ #define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask)                   \
+       ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+ #define EXT3_HAS_INCOMPAT_FEATURE(sb,mask)                    \
+       ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+ #define EXT3_SET_COMPAT_FEATURE(sb,mask)                      \
+       EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+ #define EXT3_SET_RO_COMPAT_FEATURE(sb,mask)                   \
+       EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+ #define EXT3_SET_INCOMPAT_FEATURE(sb,mask)                    \
+       EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+ #define EXT3_CLEAR_COMPAT_FEATURE(sb,mask)                    \
+       EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+ #define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask)                 \
+       EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+ #define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask)                  \
+       EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+
+ #define EXT3_FEATURE_COMPAT_DIR_PREALLOC      0x0001
+ #define EXT3_FEATURE_COMPAT_IMAGIC_INODES     0x0002
+ #define EXT3_FEATURE_COMPAT_HAS_JOURNAL               0x0004
+ #define EXT3_FEATURE_COMPAT_EXT_ATTR          0x0008
+ #define EXT3_FEATURE_COMPAT_RESIZE_INODE      0x0010
+ #define EXT3_FEATURE_COMPAT_DIR_INDEX         0x0020
+
+ #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER   0x0001
+ #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE     0x0002
+ #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR      0x0004
+
+ #define EXT3_FEATURE_INCOMPAT_COMPRESSION     0x0001
+ #define EXT3_FEATURE_INCOMPAT_FILETYPE                0x0002
+ #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
+
+ #define EXT3_FEATURE_COMPAT_SUPP      0
+ #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+                                        EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
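+
+ /* The mount-time policy implied by the superblock comments above, as
+  * a sketch (the checks themselves belong in super.c; the code below
+  * is illustrative only):
+  *
+  *     if (EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+  *             goto refuse_mount;
+  *     if (!(sb->s_flags & MS_RDONLY) &&
+  *         EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+  *             goto refuse_rw_mount;
+  */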
+
+ /*
+  * Default values for user and/or group using reserved blocks
+  */
+ #define       EXT3_DEF_RESUID         0
+ #define       EXT3_DEF_RESGID         0
+
+ /*
+  * Structure of a directory entry
+  */
+ #define EXT3_NAME_LEN 255
+
+ struct ext3_dir_entry {
+       __u32   inode;                  /* Inode number */
+       __u16   rec_len;                /* Directory entry length */
+       __u16   name_len;               /* Name length */
+       char    name[EXT3_NAME_LEN];    /* File name */
+ };
+
+ /*
+  * The new version of the directory entry.  Since EXT3 structures are
+  * stored in Intel (little-endian) byte order, and name_len could never be
+  * bigger than 255 chars, it's safe to reclaim the extra byte for the
+  * file_type field.
+  */
+ struct ext3_dir_entry_2 {
+       __u32   inode;                  /* Inode number */
+       __u16   rec_len;                /* Directory entry length */
+       __u8    name_len;               /* Name length */
+       __u8    file_type;
+       char    name[EXT3_NAME_LEN];    /* File name */
+ };
+
+ /*
+  * Ext3 directory file types.  Only the low 3 bits are used.  The
+  * other bits are reserved for now.
+  */
+ #define EXT3_FT_UNKNOWN               0
+ #define EXT3_FT_REG_FILE      1
+ #define EXT3_FT_DIR           2
+ #define EXT3_FT_CHRDEV                3
+ #define EXT3_FT_BLKDEV                4
+ #define EXT3_FT_FIFO          5
+ #define EXT3_FT_SOCK          6
+ #define EXT3_FT_SYMLINK               7
+
+ #define EXT3_FT_MAX           8
+
+ /*
+  * EXT3_DIR_PAD defines the directory entries boundaries
+  *
+  * NOTE: It must be a multiple of 4
+  */
+ #define EXT3_DIR_PAD                  4
+ #define EXT3_DIR_ROUND                        (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len)    (((name_len) + 8 + EXT3_DIR_ROUND) & \
+                                        ~EXT3_DIR_ROUND)
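+
+ /* For example, EXT3_DIR_REC_LEN(1) = (1 + 8 + 3) & ~3 = 12, and
+  * EXT3_DIR_REC_LEN(5) = (5 + 8 + 3) & ~3 = 16: the 8-byte fixed
+  * header (inode, rec_len, name_len, file_type) plus the name,
+  * rounded up to the next 4-byte boundary. */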
+
+ #ifdef __KERNEL__
+
+ /* Filesize hard limits for 64-bit file offsets */
+ extern long long ext3_max_sizes[];
+
+ /*
+  * Describe an inode's exact location on disk and in memory
+  */
+ struct ext3_iloc
+ {
+       struct buffer_head *bh;
+       struct ext3_inode *raw_inode;
+       unsigned long block_group;
+ };
+
+ /*
+  * Function prototypes
+  */
+
+ /*
+  * Ok, these declarations are also in <linux/kernel.h> but none of the
+  * ext3 source files needs to include it, so they are duplicated here.
+  */
+ # define NORET_TYPE    /**/
+ # define ATTRIB_NORET  __attribute__((noreturn))
+ # define NORET_AND     noreturn,
+
+ /* acl.c */
+ extern int ext3_permission (struct inode *, int);
+
+ /* balloc.c */
+ extern int ext3_bg_has_super(struct super_block *sb, int group);
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long,
+                                           __u32 *, __u32 *, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+                             unsigned long);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+ extern void ext3_check_blocks_bitmap (struct super_block *);
+ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+                                                   unsigned int block_group,
+                                                   struct buffer_head ** bh);
+
+ /* bitmap.c */
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+                               struct ext3_dir_entry_2 *, struct buffer_head *,
+                               unsigned long);
+
+ /* file.c */
+
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
+
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, ino_t);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+ extern void ext3_check_inodes_bitmap (struct super_block *);
+
+ /* inode.c */
+
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+
+ extern int  ext3_get_inode_loc (struct inode *, struct ext3_iloc *);
+ extern void ext3_read_inode (struct inode *);
+ extern void ext3_write_inode (struct inode *, int);
+ extern int  ext3_setattr (struct dentry *, struct iattr *);
+ extern void ext3_put_inode (struct inode *);
+ extern void ext3_delete_inode (struct inode *);
+ extern int  ext3_sync_inode (handle_t *, struct inode *);
+ extern void ext3_discard_prealloc (struct inode *);
+ extern void ext3_dirty_inode(struct inode *);
+ extern int ext3_change_inode_journal_flag(struct inode *, int);
+
+ /* ioctl.c */
+ extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
+                      unsigned long);
+
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
+
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+ extern void __ext3_std_error (struct super_block *, const char *, int);
+ extern void ext3_abort (struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+ extern NORET_TYPE void ext3_panic (struct super_block *, const char *,
+                                  const char *, ...)
+       __attribute__ ((NORET_AND format (printf, 3, 4)));
+ extern void ext3_warning (struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+ extern void ext3_update_dynamic_rev (struct super_block *sb);
+ extern void ext3_put_super (struct super_block *);
+ extern void ext3_write_super (struct super_block *);
+ extern void ext3_write_super_lockfs (struct super_block *);
+ extern void ext3_unlockfs (struct super_block *);
+ extern int ext3_remount (struct super_block *, int *, char *);
+ extern struct super_block * ext3_read_super (struct super_block *,void *,int);
+ extern int ext3_statfs (struct super_block *, struct statfs *);
+
+ /* truncate.c */
+ extern void ext3_truncate (struct inode *);
+
+ #define ext3_std_error(sb, errno)                             \
+ do {                                                          \
+       if ((errno))                                            \
+               __ext3_std_error((sb), __FUNCTION__, (errno));  \
+ } while (0)
+ extern const char *ext3_decode_error(struct super_block *sb, int errno, char nbuf[16]);
+
+ /*
+  * Inodes and files operations
+  */
+
+ /* dir.c */
+ extern struct file_operations ext3_dir_operations;
+
+ /* file.c */
+ extern struct inode_operations ext3_file_inode_operations;
+ extern struct file_operations ext3_file_operations;
+
+ /* symlink.c */
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
+ extern struct address_space_operations ext3_aops;
+
+ #endif        /* __KERNEL__ */
+
+ #endif        /* _LINUX_EXT3_FS_H */
diff -rc2P linux/include/linux/ext3_fs_i.h linux-2.4.13/include/linux/ext3_fs_i.h
*** linux/include/linux/ext3_fs_i.h     Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs_i.h      Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,78 ----
+ /*
+  *  linux/include/linux/ext3_fs_i.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs_i.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+
+ #ifndef _LINUX_EXT3_FS_I
+ #define _LINUX_EXT3_FS_I
+
+ #include <linux/rwsem.h>
+
+ /*
+  * second extended file system inode data in memory
+  */
+ struct ext3_inode_info {
+       __u32   i_data[15];
+       __u32   i_flags;
+ #ifdef EXT3_FRAGMENTS
+       __u32   i_faddr;
+       __u8    i_frag_no;
+       __u8    i_frag_size;
+       __u16   unused;                 /* formerly i_osync */
+ #endif
+       __u32   i_file_acl;
+       __u32   i_dir_acl;
+       __u32   i_dtime;
+       __u32   i_block_group;
+       __u32   i_state;                /* Dynamic state flags for ext3 */
+       __u32   i_next_alloc_block;
+       __u32   i_next_alloc_goal;
+ #ifdef EXT3_PREALLOCATE
+       __u32   i_prealloc_block;
+       __u32   i_prealloc_count;
+ #endif
+       __u32   i_dir_start_lookup;
+
+       struct list_head i_orphan;      /* unlinked but open inodes */
+
+       /*
+        * i_disksize keeps track of what the inode size is ON DISK, not
+        * in memory.  During truncate, i_size is set to the new size by
+        * the VFS prior to calling ext3_truncate(), but the filesystem won't
+        * set i_disksize to 0 until the truncate is actually under way.
+        *
+        * The intent is that i_disksize always represents the blocks which
+        * are used by this file.  This allows recovery to restart truncate
+        * on orphans if we crash during truncate.  We actually write i_disksize
+        * into the on-disk inode when writing inodes out, instead of i_size.
+        *
+        * The only time when i_disksize and i_size may be different is when
+        * a truncate is in progress.  The only things which change i_disksize
+        * are ext3_get_block (growth) and ext3_truncate (shrinkth).
+        */
+       loff_t  i_disksize;
+
+       /*
+        * truncate_sem is for serialising ext3_truncate() against
+  * ext3_get_block().  In the 2.4 ext2 design, great chunks of an inode's
+        * data tree are chopped off during truncate. We can't do that in
+        * ext3 because whenever we perform intermediate commits during
+        * truncate, the inode and all the metadata blocks *must* be in a
+        * consistent state which allows truncation of the orphans to restart
+        * during recovery.  Hence we must fix the get_block-vs-truncate race
+        * by other means, so we have truncate_sem.
+        */
+       struct rw_semaphore truncate_sem;
+ };
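+
+ /* Sketch of the intended locking discipline (assumed; only the
+  * exclusive side is visible in this patch): the truncate path takes
+  * the semaphore exclusively while the block-mapping path takes it
+  * shared, e.g.:
+  *
+  *     down_write(&EXT3_I(inode)->truncate_sem);       truncate path
+  *     down_read(&EXT3_I(inode)->truncate_sem);        get_block path
+  */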
+
+ #endif        /* _LINUX_EXT3_FS_I */
diff -rc2P linux/include/linux/ext3_fs_sb.h linux-2.4.13/include/linux/ext3_fs_sb.h
*** linux/include/linux/ext3_fs_sb.h    Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs_sb.h     Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,77 ----
+ /*
+  *  linux/include/linux/ext3_fs_sb.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card ([email protected])
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs_sb.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+
+ #ifndef _LINUX_EXT3_FS_SB
+ #define _LINUX_EXT3_FS_SB
+
+ #ifdef __KERNEL__
+ #include <linux/timer.h>
+ #include <linux/wait.h>
+ #endif
+
+ /*
+  * The following is not needed anymore since the descriptors buffer
+  * heads are now dynamically allocated
+  */
+ /* #define EXT3_MAX_GROUP_DESC        8 */
+
+ #define EXT3_MAX_GROUP_LOADED 8
+
+ /*
+  * third extended-fs super-block data in memory
+  */
+ struct ext3_sb_info {
+       unsigned long s_frag_size;      /* Size of a fragment in bytes */
+       unsigned long s_frags_per_block;/* Number of fragments per block */
+       unsigned long s_inodes_per_block;/* Number of inodes per block */
+       unsigned long s_frags_per_group;/* Number of fragments in a group */
+       unsigned long s_blocks_per_group;/* Number of blocks in a group */
+       unsigned long s_inodes_per_group;/* Number of inodes in a group */
+       unsigned long s_itb_per_group;  /* Number of inode table blocks per group */
+       unsigned long s_gdb_count;      /* Number of group descriptor blocks */
+       unsigned long s_desc_per_block; /* Number of group descriptors per block */
+       unsigned long s_groups_count;   /* Number of groups in the fs */
+       struct buffer_head * s_sbh;     /* Buffer containing the super block */
+       struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
+       struct buffer_head ** s_group_desc;
+       unsigned short s_loaded_inode_bitmaps;
+       unsigned short s_loaded_block_bitmaps;
+       unsigned long s_inode_bitmap_number[EXT3_MAX_GROUP_LOADED];
+       struct buffer_head * s_inode_bitmap[EXT3_MAX_GROUP_LOADED];
+       unsigned long s_block_bitmap_number[EXT3_MAX_GROUP_LOADED];
+       struct buffer_head * s_block_bitmap[EXT3_MAX_GROUP_LOADED];
+       unsigned long  s_mount_opt;
+       uid_t s_resuid;
+       gid_t s_resgid;
+       unsigned short s_mount_state;
+       unsigned short s_pad;
+       int s_addr_per_block_bits;
+       int s_desc_per_block_bits;
+       int s_inode_size;
+       int s_first_ino;
+
+       /* Journaling */
+       struct inode * s_journal_inode;
+       struct journal_s * s_journal;
+       struct list_head s_orphan;
+       unsigned long s_commit_interval;
+       struct block_device *journal_bdev;
+ #ifdef CONFIG_JBD_DEBUG
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+ #endif
+ };
+
+ #endif        /* _LINUX_EXT3_FS_SB */
diff -rc2P linux/include/linux/ext3_jbd.h linux-2.4.13/include/linux/ext3_jbd.h
*** linux/include/linux/ext3_jbd.h      Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_jbd.h       Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,290 ----
+ /*
+  * linux/include/linux/ext3_jbd.h
+  *
+  * Written by Stephen C. Tweedie <[email protected]>, 1999
+  *
+  * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Ext3-specific journaling extensions.
+  */
+
+ #ifndef _LINUX_EXT3_JBD_H
+ #define _LINUX_EXT3_JBD_H
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+
+ #define EXT3_JOURNAL(inode)   (EXT3_SB((inode)->i_sb)->s_journal)
+
+ /* Define the number of blocks we need to account to a transaction to
+  * modify one block of data.
+  *
+  * We may have to touch one inode, one bitmap buffer, up to three
+  * indirection blocks, the group and superblock summaries, and the data
+  * block to complete the transaction.  */
+
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS  8
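+ /* (That is: 1 inode + 1 bitmap + 3 indirection blocks + 1 group
+  *  descriptor + 1 superblock + 1 data block = 8.) */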
+
+ /* Define the minimum size for a transaction which modifies data.  This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota).  The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+
+ #define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
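+ /* (One data update plus two quota-file updates at
+  *  EXT3_SINGLEDATA_TRANS_BLOCKS each, minus the two redundant
+  *  superblock credits: 3 * 8 - 2 = 22.) */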
+
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+
+ /* Delete operations potentially hit one directory's namespace plus an
+  * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
+  * generous.  We can grow the delete transaction later if necessary. */
+
+ #define EXT3_DELETE_TRANS_BLOCKS      (2 * EXT3_DATA_TRANS_BLOCKS + 64)
+
+ /* Define an arbitrary limit for the amount of data we will anticipate
+  * writing to any given transaction.  For unbounded transactions such as
+  * write(2) and truncate(2) we can write more than this, but we always
+  * start off at the maximum transaction size and grow the transaction
+  * optimistically as we go. */
+
+ #define EXT3_MAX_TRANS_DATA           64
+
+ /* We break up a large truncate or write transaction once the handle's
+  * buffer credits get this low; we then need either to extend the
+  * transaction or to start a new one.  Reserve enough space here for
+  * inode, bitmap, superblock, group and indirection updates for at least
+  * one block, plus two quota updates.  Quota allocations are not
+  * needed. */
+
+ #define EXT3_RESERVE_TRANS_BLOCKS     12
+
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+                    struct inode *inode,
+                    struct ext3_iloc *iloc);
+
+ /*
+  * On success, we end up with an outstanding reference count against
+  * iloc->bh.  This _must_ be cleaned up later.
+  */
+
+ int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+                       struct ext3_iloc *iloc);
+
+ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+ /*
+  * Wrapper functions with which ext3 calls into JBD.  The intent here is
+  * to allow these to be turned into appropriate stubs so ext3 can control
+  * ext2 filesystems, so ext2+ext3 systems need only one fs driver.  This
+  * work hasn't
+  * been done yet.
+  */
+
+ static inline void ext3_journal_abort_handle(const char *caller,
+                                            const char *err_fn,
+                                            struct buffer_head *bh,
+                                            handle_t *handle,
+                                            int err)
+ {
+       char nbuf[16];
+       const char *errstr = ext3_decode_error(NULL, err, nbuf);
+
+       printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
+              caller, errstr, err_fn);
+
+       if (bh)
+               BUFFER_TRACE(bh, "abort");
+       journal_abort_handle(handle);
+       if (!handle->h_err)
+               handle->h_err = err;
+ }
+
+ static inline int
+ __ext3_journal_get_undo_access(const char *where,
+                              handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_get_undo_access(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_get_write_access(const char *where,
+                               handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_get_write_access(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_dirty_data(const char *where,
+                         handle_t *handle, struct buffer_head *bh, int async)
+ {
+       int err = journal_dirty_data(handle, bh, async);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline void
+ ext3_journal_forget(handle_t *handle, struct buffer_head *bh)
+ {
+       journal_forget(handle, bh);
+ }
+
+ static inline int
+ __ext3_journal_revoke(const char *where, handle_t *handle,
+                     unsigned long blocknr, struct buffer_head *bh)
+ {
+       int err = journal_revoke(handle, blocknr, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_get_create_access(const char *where,
+                                handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_get_create_access(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+ static inline int
+ __ext3_journal_dirty_metadata(const char *where,
+                             handle_t *handle, struct buffer_head *bh)
+ {
+       int err = journal_dirty_metadata(handle, bh);
+       if (err)
+               ext3_journal_abort_handle(where, __FUNCTION__, bh, handle, err);
+       return err;
+ }
+
+
+ #define ext3_journal_get_undo_access(handle, bh) \
+       __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_get_write_access(handle, bh) \
+       __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_dirty_data(handle, bh, async) \
+       __ext3_journal_dirty_data(__FUNCTION__, (handle), (bh), (async))
+ #define ext3_journal_revoke(handle, blocknr, bh) \
+       __ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ #define ext3_journal_get_create_access(handle, bh) \
+       __ext3_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_dirty_metadata(handle, bh) \
+       __ext3_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
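+
+ /*
+  * Illustrative call pattern (a sketch, not code from this patch):
+  *
+  *     err = ext3_journal_get_write_access(handle, bh);
+  *     if (err)
+  *             return err;
+  *     ... modify bh->b_data under the handle ...
+  *     err = ext3_journal_dirty_metadata(handle, bh);
+  *
+  * On failure the wrappers have already called
+  * ext3_journal_abort_handle() to abort the handle, so callers need
+  * only propagate err.
+  */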
+
+
+
+ /*
+  * Wrappers for journal_start/stop.
+  *
+  * The start wrappers simply refuse to open a transaction on a
+  * read-only filesystem.  The special handling needed by the stop
+  * wrapper is described before __ext3_journal_stop() below.
+  */
+ static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
+ {
+       if (inode->i_sb->s_flags & MS_RDONLY)
+               return ERR_PTR(-EROFS);
+       return journal_start(EXT3_JOURNAL(inode), nblocks);
+ }
+
+ static inline handle_t *
+ ext3_journal_try_start(struct inode *inode, int nblocks)
+ {
+       if (inode->i_sb->s_flags & MS_RDONLY)
+               return ERR_PTR(-EROFS);
+       return journal_try_start(EXT3_JOURNAL(inode), nblocks);
+ }
+
+ /*
+  * The only special thing we need to do here is to make sure that all
+  * journal_stop calls result in the superblock being marked dirty, so
+  * that sync() will call the filesystem's write_super callback if
+  * appropriate.
+  */
+ static inline int __ext3_journal_stop(const char *where,
+                                     handle_t *handle, struct inode *inode)
+ {
+       int err = handle->h_err;
+       int rc = journal_stop(handle);
+
+       inode->i_sb->s_dirt = 1;
+       if (!err)
+               err = rc;
+       if (err)
+               __ext3_std_error(inode->i_sb, where, err);
+       return err;
+ }
+ #define ext3_journal_stop(handle, inode) \
+       __ext3_journal_stop(__FUNCTION__, (handle), (inode))
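+
+ /*
+  * Illustrative transaction bracket (a sketch, not code from this patch):
+  *
+  *     handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
+  *     if (IS_ERR(handle))
+  *             return PTR_ERR(handle);
+  *     ... journalled updates ...
+  *     return ext3_journal_stop(handle, inode);
+  */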
+
+ static inline handle_t *ext3_journal_current_handle(void)
+ {
+       return journal_current_handle();
+ }
+
+ static inline void
+ ext3_log_start_commit(journal_t *journal, transaction_t *transaction)
+ {
+       log_start_commit(journal, transaction);
+ }
+
+ static inline void ext3_log_wait_commit(journal_t *journal, tid_t tid)
+ {
+       log_wait_commit(journal, tid);
+ }
+
+ static inline int ext3_journal_extend(handle_t *handle, int nblocks)
+ {
+       return journal_extend(handle, nblocks);
+ }
+
+ static inline int ext3_journal_restart(handle_t *handle, int nblocks)
+ {
+       return journal_restart(handle, nblocks);
+ }
+
+ static inline int ext3_journal_blocks_per_page(struct inode *inode)
+ {
+       return journal_blocks_per_page(inode);
+ }
+
+ static inline int ext3_journal_force_commit(journal_t *journal)
+ {
+       return journal_force_commit(journal);
+ }
+
+ /* super.c */
+ int ext3_force_commit(struct super_block *sb);
+
+ static inline int ext3_should_journal_data(struct inode *inode)
+ {
+       if (!S_ISREG(inode->i_mode))
+               return 1;
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+               return 1;
+       if (inode->u.ext3_i.i_flags & EXT3_JOURNAL_DATA_FL)
+               return 1;
+       return 0;
+ }
+
+ static inline int ext3_should_order_data(struct inode *inode)
+ {
+       return (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA);
+ }
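+
+ /*
+  * (Note: an inode for which neither helper above returns true is
+  * handled in writeback mode -- only its metadata goes through the
+  * journal.)
+  */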
+
+
+ #endif        /* _LINUX_EXT3_JBD_H */
diff -rc2P linux/include/linux/fs.h linux-2.4.13/include/linux/fs.h
*** linux/include/linux/fs.h    Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/fs.h     Fri Nov  9 16:58:00 2001
***************
*** 22,25 ****
--- 22,26 ----
 #include <linux/stddef.h>
 #include <linux/string.h>
+ #include <linux/buffer-trace.h>

 #include <asm/atomic.h>
***************
*** 219,222 ****
--- 220,224 ----
       BH_Wait_IO,     /* 1 if we should write out this buffer */
       BH_launder,     /* 1 if we should throttle on this buffer */
+       BH_JBD,         /* 1 if it has an attached journal_head */

       BH_PrivateStart,/* not a state bit, but the first bit available
***************
*** 265,268 ****
--- 267,274 ----
       struct inode *       b_inode;
       struct list_head     b_inode_buffers;   /* doubly linked list of inode dirty buffers */
+
+ #ifdef CONFIG_BUFFER_DEBUG
+       struct buffer_history b_history;
+ #endif
 };

***************
*** 290,293 ****
--- 296,300 ----
 #include <linux/minix_fs_i.h>
 #include <linux/ext2_fs_i.h>
+ #include <linux/ext3_fs_i.h>
 #include <linux/hpfs_fs_i.h>
 #include <linux/ntfs_fs_i.h>
***************
*** 380,387 ****
--- 387,400 ----
       int (*readpage)(struct file *, struct page *);
       int (*sync_page)(struct page *);
+       /*
+        * ext3 requires that a successful prepare_write() call be followed
+        * by a commit_write() call - they must be balanced
+        */
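+       /* (For illustration: generic_file_write() pairs each successful
+        * prepare_write() with exactly one commit_write() call per page.) */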
       int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
       int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
       /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
       int (*bmap)(struct address_space *, long);
+       int (*flushpage) (struct page *, unsigned long);
+       int (*releasepage) (struct page *, int);
 #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
       int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
***************
*** 445,448 ****
--- 458,462 ----
       unsigned long           i_version;
       struct semaphore        i_sem;
+       struct rw_semaphore     i_truncate_sem; /* Nests inside i_sem */
       struct semaphore        i_zombie;
       struct inode_operations *i_op;
***************
*** 474,477 ****
--- 488,492 ----
               struct minix_inode_info         minix_i;
               struct ext2_inode_info          ext2_i;
+               struct ext3_inode_info          ext3_i;
               struct hpfs_inode_info          hpfs_i;
               struct ntfs_inode_info          ntfs_i;
***************
*** 662,665 ****
--- 677,681 ----
 #include <linux/minix_fs_sb.h>
 #include <linux/ext2_fs_sb.h>
+ #include <linux/ext3_fs_sb.h>
 #include <linux/hpfs_fs_sb.h>
 #include <linux/ntfs_fs_sb.h>
***************
*** 718,721 ****
--- 734,738 ----
               struct minix_sb_info    minix_sb;
               struct ext2_sb_info     ext2_sb;
+               struct ext3_sb_info     ext3_sb;
               struct hpfs_sb_info     hpfs_sb;
               struct ntfs_sb_info     ntfs_sb;
***************
*** 1091,1094 ****
--- 1108,1112 ----
 extern int try_to_free_buffers(struct page *, unsigned int);
 extern void refile_buffer(struct buffer_head * buf);
+ extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
 extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);

***************
*** 1132,1135 ****
--- 1150,1157 ----
 static inline void mark_buffer_clean(struct buffer_head * bh)
 {
+ #if defined(CONFIG_JBD_DEBUG)
+       extern void jbd_preclean_buffer_check(struct buffer_head *);
+       jbd_preclean_buffer_check(bh); /* @@@ Expensive debugging */
+ #endif
       if (atomic_set_buffer_clean(bh))
               __mark_buffer_clean(bh);
***************
*** 1173,1176 ****
--- 1195,1199 ----
 }

+ extern void set_buffer_flushtime(struct buffer_head *);
 extern void balance_dirty(void);
 extern int check_disk_change(kdev_t);
***************
*** 1352,1355 ****
--- 1375,1380 ----
 extern struct buffer_head * bread(kdev_t, int, int);
 extern void wakeup_bdflush(void);
+ extern void put_unused_buffer_head(struct buffer_head * bh);
+ extern struct buffer_head * get_unused_buffer_head(int async);

 extern int brw_page(int, struct page *, kdev_t, int [], int);
***************
*** 1358,1361 ****
--- 1383,1387 ----

 /* Generic buffer handling for block filesystems.. */
+ extern int try_to_release_page(struct page * page, int gfp_mask);
 extern int discard_bh_page(struct page *, unsigned long, int);
 #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
diff -rc2P linux/include/linux/fs.h.orig linux-2.4.13/include/linux/fs.h.orig
*** linux/include/linux/fs.h.orig       Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/fs.h.orig        Fri Nov  9 16:15:08 2001
***************
*** 0 ****
--- 1,1569 ----
+ #ifndef _LINUX_FS_H
+ #define _LINUX_FS_H
+
+ /*
+  * This file has definitions for some important file table
+  * structures etc.
+  */
+
+ #include <linux/config.h>
+ #include <linux/linkage.h>
+ #include <linux/limits.h>
+ #include <linux/wait.h>
+ #include <linux/types.h>
+ #include <linux/vfs.h>
+ #include <linux/net.h>
+ #include <linux/kdev_t.h>
+ #include <linux/ioctl.h>
+ #include <linux/list.h>
+ #include <linux/dcache.h>
+ #include <linux/stat.h>
+ #include <linux/cache.h>
+ #include <linux/stddef.h>
+ #include <linux/string.h>
+
+ #include <asm/atomic.h>
+ #include <asm/bitops.h>
+
+ struct poll_table_struct;
+
+
+ /*
+  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
+  * the file limit at runtime and only root can increase the per-process
+  * nr_file rlimit, so it's safe to set up a ridiculously high absolute
+  * upper limit on files-per-process.
+  *
+  * Some programs (notably those using select()) may have to be
+  * recompiled to take full advantage of the new limits..
+  */
+
+ /* Fixed constants first: */
+ #undef NR_OPEN
+ #define NR_OPEN (1024*1024)   /* Absolute upper limit on fd num */
+ #define INR_OPEN 1024         /* Initial setting for nfile rlimits */
+
+ #define BLOCK_SIZE_BITS 10
+ #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
+
+ /* And dynamically-tunable limits and defaults: */
+ struct files_stat_struct {
+       int nr_files;           /* read only */
+       int nr_free_files;      /* read only */
+       int max_files;          /* tunable */
+ };
+ extern struct files_stat_struct files_stat;
+
+ struct inodes_stat_t {
+       int nr_inodes;
+       int nr_unused;
+       int dummy[5];
+ };
+ extern struct inodes_stat_t inodes_stat;
+
+ extern int leases_enable, dir_notify_enable, lease_break_time;
+
+ #define NR_FILE  8192 /* this can well be larger on a larger system */
+ #define NR_RESERVED_FILES 10 /* reserved for root */
+ #define NR_SUPER 256
+
+ #define MAY_EXEC 1
+ #define MAY_WRITE 2
+ #define MAY_READ 4
+
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
+
+ #define READ 0
+ #define WRITE 1
+ #define READA 2               /* read-ahead  - don't block if no resources */
+ #define SPECIAL 4     /* For non-blockdevice requests in request queue */
+
+ #define SEL_IN                1
+ #define SEL_OUT               2
+ #define SEL_EX                4
+
+ /* public flags for file_system_type */
+ #define FS_REQUIRES_DEV 1
+ #define FS_NO_DCACHE  2 /* Only dcache the necessary things. */
+ #define FS_NO_PRELIM  4 /* prevent preloading of dentries, even if
+                          * FS_NO_DCACHE is not set.
+                          */
+ #define FS_SINGLE     8 /* Filesystem that can have only one superblock */
+ #define FS_NOMOUNT    16 /* Never mount from userland */
+ #define FS_LITTER     32 /* Keeps the tree in dcache */
+ #define FS_ODD_RENAME 32768   /* Temporary stuff; will go away as soon
+                                 * as nfs_rename() will be cleaned up
+                                 */
+ /*
+  * These are the fs-independent mount-flags: up to 32 flags are supported
+  */
+ #define MS_RDONLY      1      /* Mount read-only */
+ #define MS_NOSUID      2      /* Ignore suid and sgid bits */
+ #define MS_NODEV       4      /* Disallow access to device special files */
+ #define MS_NOEXEC      8      /* Disallow program execution */
+ #define MS_SYNCHRONOUS        16      /* Writes are synced at once */
+ #define MS_REMOUNT    32      /* Alter flags of a mounted FS */
+ #define MS_MANDLOCK   64      /* Allow mandatory locks on an FS */
+ #define MS_NOATIME    1024    /* Do not update access times. */
+ #define MS_NODIRATIME 2048    /* Do not update directory access times */
+ #define MS_BIND               4096
+ #define MS_REC                16384
+ #define MS_VERBOSE    32768
+ #define MS_NOUSER     (1<<31)
+
+ /*
+  * Superblock flags that can be altered by MS_REMOUNT
+  */
+ #define MS_RMT_MASK   (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\
+                        MS_NODIRATIME)
+
+ /*
+  * Old magic mount flag and mask
+  */
+ #define MS_MGC_VAL 0xC0ED0000
+ #define MS_MGC_MSK 0xffff0000
+
+ /* Inode flags - they have nothing to superblock flags now */
+
+ #define S_SYNC                        1       /* Writes are synced at once */
+ #define S_NOATIME             2       /* Do not update access times */
+ #define S_QUOTA                       4       /* Quota initialized for file */
+ #define S_APPEND              8       /* Append-only file */
+ #define S_IMMUTABLE_FILE      16      /* Immutable file */
+ #define S_DEAD                        32      /* removed, but still open directory */
+ #define S_NOQUOTA             64      /* Inode is not counted to quota */
+ #define S_IMMUTABLE_LINK      128     /* Immutable links */
+
+ /*
+  * Note that nosuid etc flags are inode-specific: setting some file-system
+  * flags just means all the inodes inherit those flags by default. It might be
+  * possible to override it selectively if you really wanted to with some
+  * ioctl() that is not currently implemented.
+  *
+  * Exception: MS_RDONLY is always applied to the entire file system.
+  *
+  * Unfortunately, it is possible to change a filesystems flags with it mounted
+  * with files in use.  This means that all of the inodes will not have their
+  * i_flags updated.  Hence, i_flags no longer inherit the superblock mount
+  * flags, so these have to be checked separately. -- [email protected]
+  */
+ #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))
+
+ #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
+ #define IS_SYNC(inode)                (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
+ #define IS_MANDLOCK(inode)    __IS_FLG(inode, MS_MANDLOCK)
+
+ #define IS_QUOTAINIT(inode)   ((inode)->i_flags & S_QUOTA)
+ #define IS_NOQUOTA(inode)     ((inode)->i_flags & S_NOQUOTA)
+ #define IS_APPEND(inode)      ((inode)->i_flags & S_APPEND)
+ #define IS_IMMUTABLE_FILE(inode)      ((inode)->i_flags & S_IMMUTABLE_FILE)
+ #define IS_IMMUTABLE_LINK(inode) ((((inode)->i_flags & S_IMMUTABLE_FILE) << 3) ^ ((inode)->i_flags & S_IMMUTABLE_LINK) )
+ #define IS_NOATIME(inode)     (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME))
+ #define IS_NODIRATIME(inode)  __IS_FLG(inode, MS_NODIRATIME)
+
+ #define IS_DEADDIR(inode)     ((inode)->i_flags & S_DEAD)
+
+ /* the read-only stuff doesn't really belong here, but any other place is
+    probably as bad and I don't want to create yet another include file. */
+
+ #define BLKROSET   _IO(0x12,93)       /* set device read-only (0 = read-write) */
+ #define BLKROGET   _IO(0x12,94)       /* get read-only status (0 = read_write) */
+ #define BLKRRPART  _IO(0x12,95)       /* re-read partition table */
+ #define BLKGETSIZE _IO(0x12,96)       /* return device size /512 (long *arg) */
+ #define BLKFLSBUF  _IO(0x12,97)       /* flush buffer cache */
+ #define BLKRASET   _IO(0x12,98)       /* Set read ahead for block device */
+ #define BLKRAGET   _IO(0x12,99)       /* get current read ahead setting */
+ #define BLKFRASET  _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
+ #define BLKFRAGET  _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
+ #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
+ #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
+ #define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
+ #if 0
+ #define BLKPG      _IO(0x12,105)/* See blkpg.h */
+ #define BLKELVGET  _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))/* elevator get */
+ #define BLKELVSET  _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))/* elevator set */
+ /* This was here just to show that the number is taken -
+    probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
+ #endif
+ /* A jump here: 108-111 have been used for various private purposes. */
+ #define BLKBSZGET  _IOR(0x12,112,sizeof(int))
+ #define BLKBSZSET  _IOW(0x12,113,sizeof(int))
+ #define BLKGETSIZE64 _IOR(0x12,114,sizeof(u64))       /* return device size in bytes (u64 *arg) */
+
+ #define BMAP_IOCTL 1          /* obsolete - kept for compatibility */
+ #define FIBMAP           _IO(0x00,1)  /* bmap access */
+ #define FIGETBSZ   _IO(0x00,2)        /* get the block size used for bmap */
+
+ #ifdef __KERNEL__
+
+ #include <asm/semaphore.h>
+ #include <asm/byteorder.h>
+
+ extern void update_atime (struct inode *);
+ #define UPDATE_ATIME(inode) update_atime (inode)
+
+ extern void buffer_init(unsigned long);
+ extern void inode_init(unsigned long);
+ extern void mnt_init(unsigned long);
+
+ /* bh state bits */
+ enum bh_state_bits {
+       BH_Uptodate,    /* 1 if the buffer contains valid data */
+       BH_Dirty,       /* 1 if the buffer is dirty */
+       BH_Lock,        /* 1 if the buffer is locked */
+       BH_Req,         /* 0 if the buffer has been invalidated */
+       BH_Mapped,      /* 1 if the buffer has a disk mapping */
+       BH_New,         /* 1 if the buffer is new and not yet written out */
+       BH_Async,       /* 1 if the buffer is under end_buffer_io_async I/O */
+       BH_Wait_IO,     /* 1 if we should write out this buffer */
+       BH_launder,     /* 1 if we should throttle on this buffer */
+
+       BH_PrivateStart,/* not a state bit, but the first bit available
+                        * for private allocation by other entities
+                        */
+ };
+
+ /*
+  * Try to keep the most commonly used fields in single cache lines (16
+  * bytes) to improve performance.  This ordering should be
+  * particularly beneficial on 32-bit processors.
+  *
+  * We use the first 16 bytes for the data which is used in searches
+  * over the block hash lists (ie. getblk() and friends).
+  *
+  * The second 16 bytes we use for lru buffer scans, as used by
+  * sync_buffers() and refill_freelist().  -- sct
+  */
+ struct buffer_head {
+       /* First cache line: */
+       struct buffer_head *b_next;     /* Hash queue list */
+       unsigned long b_blocknr;        /* block number */
+       unsigned short b_size;          /* block size */
+       unsigned short b_list;          /* List that this buffer appears */
+       kdev_t b_dev;                   /* device (B_FREE = free) */
+
+       atomic_t b_count;               /* users using this block */
+       kdev_t b_rdev;                  /* Real device */
+       unsigned long b_state;          /* buffer state bitmap (see above) */
+       unsigned long b_flushtime;      /* Time when (dirty) buffer should be written */
+
+       struct buffer_head *b_next_free;/* lru/free list linkage */
+       struct buffer_head *b_prev_free;/* doubly linked list of buffers */
+       struct buffer_head *b_this_page;/* circular list of buffers in one page */
+       struct buffer_head *b_reqnext;  /* request queue */
+
+       struct buffer_head **b_pprev;   /* doubly linked list of hash-queue */
+       char * b_data;                  /* pointer to data block */
+       struct page *b_page;            /* the page this bh is mapped to */
+       void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
+       void *b_private;                /* reserved for b_end_io */
+
+       unsigned long b_rsector;        /* Real buffer location on disk */
+       wait_queue_head_t b_wait;
+
+       struct inode *       b_inode;
+       struct list_head     b_inode_buffers;   /* doubly linked list of inode dirty buffers */
+ };
+
+ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
+ void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
+
+ #define __buffer_state(bh, state)     (((bh)->b_state & (1UL << BH_##state)) != 0)
+
+ #define buffer_uptodate(bh)   __buffer_state(bh,Uptodate)
+ #define buffer_dirty(bh)      __buffer_state(bh,Dirty)
+ #define buffer_locked(bh)     __buffer_state(bh,Lock)
+ #define buffer_req(bh)                __buffer_state(bh,Req)
+ #define buffer_mapped(bh)     __buffer_state(bh,Mapped)
+ #define buffer_new(bh)                __buffer_state(bh,New)
+ #define buffer_async(bh)      __buffer_state(bh,Async)
+
+ #define bh_offset(bh)         ((unsigned long)(bh)->b_data & ~PAGE_MASK)
+
+ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset);
+
+ #define touch_buffer(bh)      mark_page_accessed(bh->b_page)
+
+
+ #include <linux/pipe_fs_i.h>
+ #include <linux/minix_fs_i.h>
+ #include <linux/ext2_fs_i.h>
+ #include <linux/hpfs_fs_i.h>
+ #include <linux/ntfs_fs_i.h>
+ #include <linux/msdos_fs_i.h>
+ #include <linux/umsdos_fs_i.h>
+ #include <linux/iso_fs_i.h>
+ #include <linux/nfs_fs_i.h>
+ #include <linux/sysv_fs_i.h>
+ #include <linux/affs_fs_i.h>
+ #include <linux/ufs_fs_i.h>
+ #include <linux/efs_fs_i.h>
+ #include <linux/coda_fs_i.h>
+ #include <linux/romfs_fs_i.h>
+ #include <linux/shmem_fs.h>
+ #include <linux/smb_fs_i.h>
+ #include <linux/hfs_fs_i.h>
+ #include <linux/adfs_fs_i.h>
+ #include <linux/qnx4_fs_i.h>
+ #include <linux/reiserfs_fs_i.h>
+ #include <linux/bfs_fs_i.h>
+ #include <linux/udf_fs_i.h>
+ #include <linux/ncp_fs_i.h>
+ #include <linux/proc_fs_i.h>
+ #include <linux/usbdev_fs_i.h>
+ #include <linux/jffs2_fs_i.h>
+ #include <linux/cramfs_fs_sb.h>
+
+ /*
+  * Attribute flags.  These should be or-ed together to figure out what
+  * has been changed!
+  */
+ #define ATTR_MODE     1
+ #define ATTR_UID      2
+ #define ATTR_GID      4
+ #define ATTR_SIZE     8
+ #define ATTR_ATIME    16
+ #define ATTR_MTIME    32
+ #define ATTR_CTIME    64
+ #define ATTR_ATIME_SET        128
+ #define ATTR_MTIME_SET        256
+ #define ATTR_FORCE    512     /* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG        1024
+
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+  * uses the above definitions as flags, to know which values have changed.
+  * Also, in this manner, a Filesystem can look at only the values it cares
+  * about.  Basically, these are the attributes that the VFS layer can
+  * request to change from the FS layer.
+  *
+  * Derek Atkins <[email protected]> 94-10-20
+  */
+ struct iattr {
+       unsigned int    ia_valid;
+       umode_t         ia_mode;
+       uid_t           ia_uid;
+       gid_t           ia_gid;
+       loff_t          ia_size;
+       time_t          ia_atime;
+       time_t          ia_mtime;
+       time_t          ia_ctime;
+       unsigned int    ia_attr_flags;
+ };
+
+ /*
+  * This is the inode attributes flag definitions
+  */
+ #define ATTR_FLAG_SYNCRONOUS          1       /* Syncronous write */
+ #define ATTR_FLAG_NOATIME             2       /* Don't update atime */
+ #define ATTR_FLAG_APPEND              4       /* Append-only file */
+ #define ATTR_FLAG_IMMUTABLE_FILE      8       /* Immutable file */
+ #define ATTR_FLAG_NODIRATIME          16      /* Don't update atime for directory */
+ #define ATTR_FLAG_IMMUTABLE_LINK      32      /* Immutable file */
+
+ /*
+  * Includes for diskquotas and mount structures.
+  */
+ #include <linux/quota.h>
+ #include <linux/mount.h>
+
+ /*
+  * oh the beauties of C type declarations.
+  */
+ struct page;
+ struct address_space;
+ struct kiobuf;
+
+ struct address_space_operations {
+       int (*writepage)(struct page *);
+       int (*readpage)(struct file *, struct page *);
+       int (*sync_page)(struct page *);
+       int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
+       int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+       /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
+       int (*bmap)(struct address_space *, long);
+ #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
+       int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+ };
+
+ struct address_space {
+       struct list_head        clean_pages;    /* list of clean pages */
+       struct list_head        dirty_pages;    /* list of dirty pages */
+       struct list_head        locked_pages;   /* list of locked pages */
+       unsigned long           nrpages;        /* number of total pages */
+       struct address_space_operations *a_ops; /* methods */
+       struct inode            *host;          /* owner: inode, block_device */
+       struct vm_area_struct   *i_mmap;        /* list of private mappings */
+       struct vm_area_struct   *i_mmap_shared; /* list of shared mappings */
+       spinlock_t              i_shared_lock;  /* and spinlock protecting it */
+       int                     gfp_mask;       /* how to allocate the pages */
+ };
+
+ struct char_device {
+       struct list_head        hash;
+       atomic_t                count;
+       dev_t                   dev;
+       atomic_t                openers;
+       struct semaphore        sem;
+ };
+
+ struct block_device {
+       struct list_head        bd_hash;
+       atomic_t                bd_count;
+       struct inode *          bd_inode;
+       dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
+       int                     bd_openers;
+       const struct block_device_operations *bd_op;
+       struct semaphore        bd_sem; /* open/close mutex */
+       struct list_head        bd_inodes;
+ };
+
+ struct inode {
+       struct list_head        i_hash;
+       struct list_head        i_list;
+       struct list_head        i_dentry;
+
+       struct list_head        i_dirty_buffers;
+       struct list_head        i_dirty_data_buffers;
+
+       unsigned long           i_ino;
+       atomic_t                i_count;
+       kdev_t                  i_dev;
+       umode_t                 i_mode;
+       nlink_t                 i_nlink;
+       uid_t                   i_uid;
+       gid_t                   i_gid;
+       kdev_t                  i_rdev;
+       loff_t                  i_size;
+       time_t                  i_atime;
+       time_t                  i_mtime;
+       time_t                  i_ctime;
+       unsigned int            i_blkbits;
+       unsigned long           i_blksize;
+       unsigned long           i_blocks;
+       unsigned long           i_version;
+       struct semaphore        i_sem;
+       struct semaphore        i_zombie;
+       struct inode_operations *i_op;
+       struct file_operations  *i_fop; /* former ->i_op->default_file_ops */
+       struct super_block      *i_sb;
+       wait_queue_head_t       i_wait;
+       struct file_lock        *i_flock;
+       struct address_space    *i_mapping;
+       struct address_space    i_data;
+       struct dquot            *i_dquot[MAXQUOTAS];
+       /* These three should probably be a union */
+       struct list_head        i_devices;
+       struct pipe_inode_info  *i_pipe;
+       struct block_device     *i_bdev;
+       struct char_device      *i_cdev;
+
+       unsigned long           i_dnotify_mask; /* Directory notify events */
+       struct dnotify_struct   *i_dnotify; /* for directory notifications */
+
+       unsigned long           i_state;
+
+       unsigned int            i_flags;
+       unsigned char           i_sock;
+
+       atomic_t                i_writecount;
+       unsigned int            i_attr_flags;
+       __u32                   i_generation;
+       union {
+               struct minix_inode_info         minix_i;
+               struct ext2_inode_info          ext2_i;
+               struct hpfs_inode_info          hpfs_i;
+               struct ntfs_inode_info          ntfs_i;
+               struct msdos_inode_info         msdos_i;
+               struct umsdos_inode_info        umsdos_i;
+               struct iso_inode_info           isofs_i;
+               struct nfs_inode_info           nfs_i;
+               struct sysv_inode_info          sysv_i;
+               struct affs_inode_info          affs_i;
+               struct ufs_inode_info           ufs_i;
+               struct efs_inode_info           efs_i;
+               struct romfs_inode_info         romfs_i;
+               struct shmem_inode_info         shmem_i;
+               struct coda_inode_info          coda_i;
+               struct smb_inode_info           smbfs_i;
+               struct hfs_inode_info           hfs_i;
+               struct adfs_inode_info          adfs_i;
+               struct qnx4_inode_info          qnx4_i;
+               struct reiserfs_inode_info      reiserfs_i;
+               struct bfs_inode_info           bfs_i;
+               struct udf_inode_info           udf_i;
+               struct ncp_inode_info           ncpfs_i;
+               struct proc_inode_info          proc_i;
+               struct socket                   socket_i;
+               struct usbdev_inode_info        usbdev_i;
+               struct jffs2_inode_info         jffs2_i;
+               void                            *generic_ip;
+       } u;
+ };
+
+ struct fown_struct {
+       int pid;                /* pid or -pgrp where SIGIO should be sent */
+       uid_t uid, euid;        /* uid/euid of process setting the owner */
+       int signum;             /* posix.1b rt signal to be delivered on IO */
+ };
+
+ struct file {
+       struct list_head        f_list;
+       struct dentry           *f_dentry;
+       struct vfsmount         *f_vfsmnt;
+       struct file_operations  *f_op;
+       atomic_t                f_count;
+       unsigned int            f_flags;
+       mode_t                  f_mode;
+       loff_t                  f_pos;
+       unsigned long           f_reada, f_ramax, f_raend, f_ralen, f_rawin;
+       struct fown_struct      f_owner;
+       unsigned int            f_uid, f_gid;
+       int                     f_error;
+
+       unsigned long           f_version;
+
+       /* needed for tty driver, and maybe others */
+       void                    *private_data;
+
+       /* preallocated helper kiobuf to speedup O_DIRECT */
+       struct kiobuf           *f_iobuf;
+       long                    f_iobuf_lock;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+ #define file_list_unlock() spin_unlock(&files_lock);
+
+ #define get_file(x)   atomic_inc(&(x)->f_count)
+ #define file_count(x) atomic_read(&(x)->f_count)
+
+ extern int init_private_file(struct file *, struct dentry *, int);
+
+ #define       MAX_NON_LFS     ((1UL<<31) - 1)
+
+ #define FL_POSIX      1
+ #define FL_FLOCK      2
+ #define FL_BROKEN     4       /* broken flock() emulation */
+ #define FL_ACCESS     8       /* for processes suspended by mandatory locking */
+ #define FL_LOCKD      16      /* lock held by rpc.lockd */
+ #define FL_LEASE      32      /* lease held on this file */
+
+ /*
+  * The POSIX file lock owner is determined by
+  * the "struct files_struct" in the thread group
+  * (or NULL for no owner - BSD locks).
+  *
+  * Lockd stuffs a "host" pointer into this.
+  */
+ typedef struct files_struct *fl_owner_t;
+
+ struct file_lock {
+       struct file_lock *fl_next;      /* singly linked list for this inode  */
+       struct list_head fl_link;       /* doubly linked list of all locks */
+       struct list_head fl_block;      /* circular list of blocked processes */
+       fl_owner_t fl_owner;
+       unsigned int fl_pid;
+       wait_queue_head_t fl_wait;
+       struct file *fl_file;
+       unsigned char fl_flags;
+       unsigned char fl_type;
+       loff_t fl_start;
+       loff_t fl_end;
+
+       void (*fl_notify)(struct file_lock *);  /* unblock callback */
+       void (*fl_insert)(struct file_lock *);  /* lock insertion callback */
+       void (*fl_remove)(struct file_lock *);  /* lock removal callback */
+
+       struct fasync_struct *  fl_fasync; /* for lease break notifications */
+
+       union {
+               struct nfs_lock_info    nfs_fl;
+       } fl_u;
+ };
+
+ /* The following constant reflects the upper bound of the file/locking space */
+ #ifndef OFFSET_MAX
+ #define INT_LIMIT(x)  (~((x)1 << (sizeof(x)*8 - 1)))
+ #define OFFSET_MAX    INT_LIMIT(loff_t)
+ #define OFFT_OFFSET_MAX       INT_LIMIT(off_t)
+ #endif
+
+ extern struct list_head file_lock_list;
+
+ #include <linux/fcntl.h>
+
+ extern int fcntl_getlk(unsigned int, struct flock *);
+ extern int fcntl_setlk(unsigned int, unsigned int, struct flock *);
+
+ extern int fcntl_getlk64(unsigned int, struct flock64 *);
+ extern int fcntl_setlk64(unsigned int, unsigned int, struct flock64 *);
+
+ /* fs/locks.c */
+ extern void locks_init_lock(struct file_lock *);
+ extern void locks_copy_lock(struct file_lock *, struct file_lock *);
+ extern void locks_remove_posix(struct file *, fl_owner_t);
+ extern void locks_remove_flock(struct file *);
+ extern struct file_lock *posix_test_lock(struct file *, struct file_lock *);
+ extern int posix_lock_file(struct file *, struct file_lock *, unsigned int);
+ extern void posix_block_lock(struct file_lock *, struct file_lock *);
+ extern void posix_unblock_lock(struct file_lock *);
+ extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
+ extern int __get_lease(struct inode *inode, unsigned int flags);
+ extern time_t lease_get_mtime(struct inode *);
+ extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
+ extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+
+ struct fasync_struct {
+       int     magic;
+       int     fa_fd;
+       struct  fasync_struct   *fa_next; /* singly linked list */
+       struct  file            *fa_file;
+ };
+
+ #define FASYNC_MAGIC 0x4601
+
+ /* SMP safe fasync helpers: */
+ extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
+ /* can be called from interrupts */
+ extern void kill_fasync(struct fasync_struct **, int, int);
+ /* only for net: no internal synchronization */
+ extern void __kill_fasync(struct fasync_struct *, int, int);
+
+ struct nameidata {
+       struct dentry *dentry;
+       struct vfsmount *mnt;
+       struct qstr last;
+       unsigned int flags;
+       int last_type;
+ };
+
+ #define DQUOT_USR_ENABLED     0x01            /* User diskquotas enabled */
+ #define DQUOT_GRP_ENABLED     0x02            /* Group diskquotas enabled */
+
+ struct quota_mount_options
+ {
+       unsigned int flags;                     /* Flags for diskquotas on this device */
+       struct semaphore dqio_sem;              /* lock device while I/O in progress */
+       struct semaphore dqoff_sem;             /* serialize quota_off() and quota_on() on device */
+       struct file *files[MAXQUOTAS];          /* fp's to quotafiles */
+       time_t inode_expire[MAXQUOTAS];         /* expiretime for inode-quota */
+       time_t block_expire[MAXQUOTAS];         /* expiretime for block-quota */
+       char rsquash[MAXQUOTAS];                /* for quotas threat root as any other user */
+ };
+
+ /*
+  *    Umount options
+  */
+
+ #define MNT_FORCE     0x00000001      /* Attempt to forcibily umount */
+ #define MNT_DETACH    0x00000002      /* Just detach from the tree */
+
+ #include <linux/minix_fs_sb.h>
+ #include <linux/ext2_fs_sb.h>
+ #include <linux/hpfs_fs_sb.h>
+ #include <linux/ntfs_fs_sb.h>
+ #include <linux/msdos_fs_sb.h>
+ #include <linux/iso_fs_sb.h>
+ #include <linux/nfs_fs_sb.h>
+ #include <linux/sysv_fs_sb.h>
+ #include <linux/affs_fs_sb.h>
+ #include <linux/ufs_fs_sb.h>
+ #include <linux/efs_fs_sb.h>
+ #include <linux/romfs_fs_sb.h>
+ #include <linux/smb_fs_sb.h>
+ #include <linux/hfs_fs_sb.h>
+ #include <linux/adfs_fs_sb.h>
+ #include <linux/qnx4_fs_sb.h>
+ #include <linux/reiserfs_fs_sb.h>
+ #include <linux/bfs_fs_sb.h>
+ #include <linux/udf_fs_sb.h>
+ #include <linux/ncp_fs_sb.h>
+ #include <linux/usbdev_fs_sb.h>
+ #include <linux/cramfs_fs_sb.h>
+ #include <linux/jffs2_fs_sb.h>
+
+ extern struct list_head super_blocks;
+ extern spinlock_t sb_lock;
+
+ #define sb_entry(list)        list_entry((list), struct super_block, s_list)
+ #define S_BIAS (1<<30)
+ struct super_block {
+       struct list_head        s_list;         /* Keep this first */
+       kdev_t                  s_dev;
+       unsigned long           s_blocksize;
+       unsigned char           s_blocksize_bits;
+       unsigned char           s_dirt;
+       unsigned long long      s_maxbytes;     /* Max file size */
+       struct file_system_type *s_type;
+       struct super_operations *s_op;
+       struct dquot_operations *dq_op;
+       unsigned long           s_flags;
+       unsigned long           s_magic;
+       struct dentry           *s_root;
+       struct rw_semaphore     s_umount;
+       struct semaphore        s_lock;
+       int                     s_count;
+       atomic_t                s_active;
+
+       struct list_head        s_dirty;        /* dirty inodes */
+       struct list_head        s_locked_inodes;/* inodes being synced */
+       struct list_head        s_files;
+
+       struct block_device     *s_bdev;
+       struct list_head        s_instances;
+       struct quota_mount_options s_dquot;     /* Diskquota specific options */
+
+       union {
+               struct minix_sb_info    minix_sb;
+               struct ext2_sb_info     ext2_sb;
+               struct hpfs_sb_info     hpfs_sb;
+               struct ntfs_sb_info     ntfs_sb;
+               struct msdos_sb_info    msdos_sb;
+               struct isofs_sb_info    isofs_sb;
+               struct nfs_sb_info      nfs_sb;
+               struct sysv_sb_info     sysv_sb;
+               struct affs_sb_info     affs_sb;
+               struct ufs_sb_info      ufs_sb;
+               struct efs_sb_info      efs_sb;
+               struct shmem_sb_info    shmem_sb;
+               struct romfs_sb_info    romfs_sb;
+               struct smb_sb_info      smbfs_sb;
+               struct hfs_sb_info      hfs_sb;
+               struct adfs_sb_info     adfs_sb;
+               struct qnx4_sb_info     qnx4_sb;
+               struct reiserfs_sb_info reiserfs_sb;
+               struct bfs_sb_info      bfs_sb;
+               struct udf_sb_info      udf_sb;
+               struct ncp_sb_info      ncpfs_sb;
+               struct usbdev_sb_info   usbdevfs_sb;
+               struct jffs2_sb_info    jffs2_sb;
+               struct cramfs_sb_info   cramfs_sb;
+               void                    *generic_sbp;
+       } u;
+       /*
+        * The next field is for VFS *only*. No filesystems have any business
+        * even looking at it. You had been warned.
+        */
+       struct semaphore s_vfs_rename_sem;      /* Kludge */
+
+       /* The next field is used by knfsd when converting a (inode number based)
+        * file handle into a dentry. As it builds a path in the dcache tree from
+        * the bottom up, there may for a time be a subpath of dentrys which is not
+        * connected to the main tree.  This semaphore ensure that there is only ever
+        * one such free path per filesystem.  Note that unconnected files (or other
+        * non-directories) are allowed, but not unconnected diretories.
+        */
+       struct semaphore s_nfsd_free_path_sem;
+ };
+
+ /*
+  * VFS helper functions..
+  */
+ extern int vfs_create(struct inode *, struct dentry *, int);
+ extern int vfs_mkdir(struct inode *, struct dentry *, int);
+ extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
+ extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+ extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
+ extern int vfs_rmdir(struct inode *, struct dentry *);
+ extern int vfs_unlink(struct inode *, struct dentry *);
+ extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
+
+ /*
+  * File types
+  */
+ #define DT_UNKNOWN    0
+ #define DT_FIFO               1
+ #define DT_CHR                2
+ #define DT_DIR                4
+ #define DT_BLK                6
+ #define DT_REG                8
+ #define DT_LNK                10
+ #define DT_SOCK               12
+ #define DT_WHT                14
+
+ /*
+  * This is the "filldir" function type, used by readdir() to let
+  * the kernel specify what kind of dirent layout it wants to have.
+  * This allows the kernel to read directories into kernel space or
+  * to have different dirent layouts depending on the binary type.
+  */
+ typedef int (*filldir_t)(void *, const char *, int, loff_t, ino_t, unsigned);
+
+ struct block_device_operations {
+       int (*open) (struct inode *, struct file *);
+       int (*release) (struct inode *, struct file *);
+       int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+       int (*check_media_change) (kdev_t);
+       int (*revalidate) (kdev_t);
+ };
+
+ /*
+  * NOTE:
+  * read, write, poll, fsync, readv, writev can be called
+  *   without the big kernel lock held in all filesystems.
+  */
+ struct file_operations {
+       struct module *owner;
+       loff_t (*llseek) (struct file *, loff_t, int);
+       ssize_t (*read) (struct file *, char *, size_t, loff_t *);
+       ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
+       int (*readdir) (struct file *, void *, filldir_t);
+       unsigned int (*poll) (struct file *, struct poll_table_struct *);
+       int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+       int (*mmap) (struct file *, struct vm_area_struct *);
+       int (*open) (struct inode *, struct file *);
+       int (*flush) (struct file *);
+       int (*release) (struct inode *, struct file *);
+       int (*fsync) (struct file *, struct dentry *, int datasync);
+       int (*fasync) (int, struct file *, int);
+       int (*lock) (struct file *, int, struct file_lock *);
+       ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
+       ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+       ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
+       unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ };
+
+ struct inode_operations {
+       int (*create) (struct inode *,struct dentry *,int);
+       struct dentry * (*lookup) (struct inode *,struct dentry *);
+       int (*link) (struct dentry *,struct inode *,struct dentry *);
+       int (*unlink) (struct inode *,struct dentry *);
+       int (*symlink) (struct inode *,struct dentry *,const char *);
+       int (*mkdir) (struct inode *,struct dentry *,int);
+       int (*rmdir) (struct inode *,struct dentry *);
+       int (*mknod) (struct inode *,struct dentry *,int,int);
+       int (*rename) (struct inode *, struct dentry *,
+                       struct inode *, struct dentry *);
+       int (*readlink) (struct dentry *, char *,int);
+       int (*follow_link) (struct dentry *, struct nameidata *);
+       void (*truncate) (struct inode *);
+       int (*permission) (struct inode *, int);
+       int (*revalidate) (struct dentry *);
+       int (*setattr) (struct dentry *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+ };
+
+ /*
+  * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
+  * without the big kernel lock held in all filesystems.
+  */
+ struct super_operations {
+       void (*read_inode) (struct inode *);
+
+       /* reiserfs kludge.  reiserfs needs 64 bits of information to
+       ** find an inode.  We are using the read_inode2 call to get
+       ** that information.  We don't like this, and are waiting on some
+       ** VFS changes for the real solution.
+       ** iget4 calls read_inode2, iff it is defined
+       */
+       void (*read_inode2) (struct inode *, void *) ;
+       void (*dirty_inode) (struct inode *);
+       void (*write_inode) (struct inode *, int);
+       void (*put_inode) (struct inode *);
+       void (*delete_inode) (struct inode *);
+       void (*put_super) (struct super_block *);
+       void (*write_super) (struct super_block *);
+       void (*write_super_lockfs) (struct super_block *);
+       void (*unlockfs) (struct super_block *);
+       int (*statfs) (struct super_block *, struct statfs *);
+       int (*remount_fs) (struct super_block *, int *, char *);
+       void (*clear_inode) (struct inode *);
+       void (*umount_begin) (struct super_block *);
+
+       /* Following are for knfsd to interact with "interesting" filesystems
+        * Currently just reiserfs, but possibly FAT and others later
+        *
+        * fh_to_dentry is given a filehandle fragement with length, and a type flag
+        *   and must return a dentry for the referenced object or, if "parent" is
+        *   set, a dentry for the parent of the object.
+        *   If a dentry cannot be found, a "root" dentry should be created and
+        *   flaged as DCACHE_NFSD_DISCONNECTED. nfsd_iget is an example implementation.
+        *
+        * dentry_to_fh is given a dentry and must generate the filesys specific
+        *   part of the file handle.  Available length is passed in *lenp and used
+        *   length should be returned therein.
+        *   If need_parent is set, then dentry_to_fh should encode sufficient information
+        *   to find the (current) parent.
+        *   dentry_to_fh should return a 1byte "type" which will be passed back in
+        *   the fhtype arguement to fh_to_dentry.  Type of 0 is reserved.
+        *   If filesystem was exportable before the introduction of fh_to_dentry,
+        *   types 1 and 2 should be used is that same way as the generic code.
+        *   Type 255 means error.
+        *
+        * Lengths are in units of 4bytes, not bytes.
+        */
+       struct dentry * (*fh_to_dentry)(struct super_block *sb, __u32 *fh, int len, int fhtype, int parent);
+       int (*dentry_to_fh)(struct dentry *, __u32 *fh, int *lenp, int need_parent);
+ };
+
+ /* Inode state bits.. */
+ #define I_DIRTY_SYNC          1 /* Not dirty enough for O_DATASYNC */
+ #define I_DIRTY_DATASYNC      2 /* Data-related inode changes pending */
+ #define I_DIRTY_PAGES         4 /* Data-related inode changes pending */
+ #define I_LOCK                        8
+ #define I_FREEING             16
+ #define I_CLEAR                       32
+
+ #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+
+ extern void __mark_inode_dirty(struct inode *, int);
+ static inline void mark_inode_dirty(struct inode *inode)
+ {
+       __mark_inode_dirty(inode, I_DIRTY);
+ }
+
+ static inline void mark_inode_dirty_sync(struct inode *inode)
+ {
+       __mark_inode_dirty(inode, I_DIRTY_SYNC);
+ }
+
+ static inline void mark_inode_dirty_pages(struct inode *inode)
+ {
+       __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ }
+
+ struct dquot_operations {
+       void (*initialize) (struct inode *, short);
+       void (*drop) (struct inode *);
+       int (*alloc_block) (struct inode *, unsigned long, char);
+       int (*alloc_inode) (const struct inode *, unsigned long);
+       void (*free_block) (struct inode *, unsigned long);
+       void (*free_inode) (const struct inode *, unsigned long);
+       int (*transfer) (struct inode *, struct iattr *);
+ };
+
+ struct file_system_type {
+       const char *name;
+       int fs_flags;
+       struct super_block *(*read_super) (struct super_block *, void *, int);
+       struct module *owner;
+       struct file_system_type * next;
+       struct list_head fs_supers;
+ };
+
+ #define DECLARE_FSTYPE(var,type,read,flags) \
+ struct file_system_type var = { \
+       name:           type, \
+       read_super:     read, \
+       fs_flags:       flags, \
+       owner:          THIS_MODULE, \
+ }
+
+ #define DECLARE_FSTYPE_DEV(var,type,read) \
+       DECLARE_FSTYPE(var,type,read,FS_REQUIRES_DEV)
+
+ /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
+ #define fops_get(fops) \
+       (((fops) && (fops)->owner)      \
+               ? ( try_inc_mod_count((fops)->owner) ? (fops) : NULL ) \
+               : (fops))
+
+ #define fops_put(fops) \
+ do {  \
+       if ((fops) && (fops)->owner) \
+               __MOD_DEC_USE_COUNT((fops)->owner);     \
+ } while(0)
+
+ extern int register_filesystem(struct file_system_type *);
+ extern int unregister_filesystem(struct file_system_type *);
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount(struct vfsmount *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+
+ #define kern_umount mntput
+
+ extern int vfs_statfs(struct super_block *, struct statfs *);
+
+ /* Return value for VFS lock functions - tells locks.c to lock conventionally
+  * REALLY kosha for root NFS and nfs_lock
+  */
+ #define LOCK_USE_CLNT 1
+
+ #define FLOCK_VERIFY_READ  1
+ #define FLOCK_VERIFY_WRITE 2
+
+ extern int locks_mandatory_locked(struct inode *);
+ extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
+
+ /*
+  * Candidates for mandatory locking have the setgid bit set
+  * but no group execute bit -  an otherwise meaningless combination.
+  */
+ #define MANDATORY_LOCK(inode) \
+       (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+
+ static inline int locks_verify_locked(struct inode *inode)
+ {
+       if (MANDATORY_LOCK(inode))
+               return locks_mandatory_locked(inode);
+       return 0;
+ }
+
+ static inline int locks_verify_area(int read_write, struct inode *inode,
+                                   struct file *filp, loff_t offset,
+                                   size_t count)
+ {
+       if (inode->i_flock && MANDATORY_LOCK(inode))
+               return locks_mandatory_area(read_write, inode, filp, offset, count);
+       return 0;
+ }
+
+ static inline int locks_verify_truncate(struct inode *inode,
+                                   struct file *filp,
+                                   loff_t size)
+ {
+       if (inode->i_flock && MANDATORY_LOCK(inode))
+               return locks_mandatory_area(
+                       FLOCK_VERIFY_WRITE, inode, filp,
+                       size < inode->i_size ? size : inode->i_size,
+                       (size < inode->i_size ? inode->i_size - size
+                        : size - inode->i_size)
+               );
+       return 0;
+ }
+
+ static inline int get_lease(struct inode *inode, unsigned int mode)
+ {
+       if (inode->i_flock && (inode->i_flock->fl_flags & FL_LEASE))
+               return __get_lease(inode, mode);
+       return 0;
+ }
+
+ /* fs/open.c */
+
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int);      /* yes, it's really unsigned */
+ extern int do_truncate(struct dentry *, loff_t start);
+
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+
+ /* fs/dcache.c */
+ extern void vfs_caches_init(unsigned long);
+
+ #define __getname()   kmem_cache_alloc(names_cachep, SLAB_KERNEL)
+ #define putname(name) kmem_cache_free(names_cachep, (void *)(name))
+
+ enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW};
+ extern int register_blkdev(unsigned int, const char *, struct block_device_operations *);
+ extern int unregister_blkdev(unsigned int, const char *);
+ extern struct block_device *bdget(dev_t);
+ extern int bd_acquire(struct inode *inode);
+ extern void bd_forget(struct inode *inode);
+ extern void bdput(struct block_device *);
+ extern struct char_device *cdget(dev_t);
+ extern void cdput(struct char_device *);
+ extern int blkdev_open(struct inode *, struct file *);
+ extern int blkdev_close(struct inode *, struct file *);
+ extern struct file_operations def_blk_fops;
+ extern struct address_space_operations def_blk_aops;
+ extern struct file_operations def_fifo_fops;
+ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
+ extern int blkdev_get(struct block_device *, mode_t, unsigned, int);
+ extern int blkdev_put(struct block_device *, int);
+
+ /* fs/devices.c */
+ extern const struct block_device_operations *get_blkfops(unsigned int);
+ extern int register_chrdev(unsigned int, const char *, struct file_operations *);
+ extern int unregister_chrdev(unsigned int, const char *);
+ extern int chrdev_open(struct inode *, struct file *);
+ extern const char * bdevname(kdev_t);
+ extern const char * cdevname(kdev_t);
+ extern const char * kdevname(kdev_t);
+ extern void init_special_inode(struct inode *, umode_t, int);
+
+ /* Invalid inode operations -- fs/bad_inode.c */
+ extern void make_bad_inode(struct inode *);
+ extern int is_bad_inode(struct inode *);
+
+ extern struct file_operations read_fifo_fops;
+ extern struct file_operations write_fifo_fops;
+ extern struct file_operations rdwr_fifo_fops;
+ extern struct file_operations read_pipe_fops;
+ extern struct file_operations write_pipe_fops;
+ extern struct file_operations rdwr_pipe_fops;
+
+ extern int fs_may_remount_ro(struct super_block *);
+
+ extern int try_to_free_buffers(struct page *, unsigned int);
+ extern void refile_buffer(struct buffer_head * buf);
+ extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+
+ /* reiserfs_writepage needs this */
+ extern void set_buffer_async_io(struct buffer_head *bh);
+
+ #define BUF_CLEAN     0
+ #define BUF_LOCKED    1       /* Buffers scheduled for write */
+ #define BUF_DIRTY     2       /* Dirty buffers, not yet scheduled for write */
+ #define NR_LIST               3
+
+ static inline void get_bh(struct buffer_head * bh)
+ {
+         atomic_inc(&(bh)->b_count);
+ }
+
+ static inline void put_bh(struct buffer_head *bh)
+ {
+         smp_mb__before_atomic_dec();
+         atomic_dec(&bh->b_count);
+ }
+
+ /*
+  * This is called by bh->b_end_io() handlers when I/O has completed.
+  */
+ static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
+ {
+       if (on)
+               set_bit(BH_Uptodate, &bh->b_state);
+       else
+               clear_bit(BH_Uptodate, &bh->b_state);
+ }
+
+ #define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)
+
+ static inline void __mark_buffer_clean(struct buffer_head *bh)
+ {
+       refile_buffer(bh);
+ }
+
+ static inline void mark_buffer_clean(struct buffer_head * bh)
+ {
+       if (atomic_set_buffer_clean(bh))
+               __mark_buffer_clean(bh);
+ }
+
+ extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
+ extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
+ extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
+ extern void FASTCALL(buffer_insert_inode_data_queue(struct buffer_head *, struct inode *));
+
+ #define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
+
+ static inline void mark_buffer_async(struct buffer_head * bh, int on)
+ {
+       if (on)
+               set_bit(BH_Async, &bh->b_state);
+       else
+               clear_bit(BH_Async, &bh->b_state);
+ }
+
+ /*
+  * If an error happens during make_request, this function
+  * has to be called. It marks the buffer as clean and not
+  * uptodate, and it notifies the upper layer about the end
+  * of the I/O.
+  */
+ static inline void buffer_IO_error(struct buffer_head * bh)
+ {
+       mark_buffer_clean(bh);
+       /*
+        * b_end_io has to clear the BH_Uptodate bitflag in the error case!
+        */
+       bh->b_end_io(bh, 0);
+ }
+
+ extern void buffer_insert_inode_queue(struct buffer_head *, struct inode *);
+ static inline void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
+ {
+       mark_buffer_dirty(bh);
+       buffer_insert_inode_queue(bh, inode);
+ }
+
+ extern void balance_dirty(void);
+ extern int check_disk_change(kdev_t);
+ extern int invalidate_inodes(struct super_block *);
+ extern int invalidate_device(kdev_t, int);
+ extern void invalidate_inode_pages(struct inode *);
+ extern void invalidate_inode_pages2(struct address_space *);
+ extern void invalidate_inode_buffers(struct inode *);
+ #define invalidate_buffers(dev)       __invalidate_buffers((dev), 0)
+ #define destroy_buffers(dev)  __invalidate_buffers((dev), 1)
+ extern void invalidate_bdev(struct block_device *, int);
+ extern void __invalidate_buffers(kdev_t dev, int);
+ extern void sync_inodes(kdev_t);
+ extern void sync_unlocked_inodes(void);
+ extern void write_inode_now(struct inode *, int);
+ extern int sync_buffers(kdev_t, int);
+ extern void sync_dev(kdev_t);
+ extern int fsync_dev(kdev_t);
+ extern int fsync_super(struct super_block *);
+ extern int fsync_no_super(kdev_t);
+ extern void sync_inodes_sb(struct super_block *);
+ extern int osync_inode_buffers(struct inode *);
+ extern int osync_inode_data_buffers(struct inode *);
+ extern int fsync_inode_buffers(struct inode *);
+ extern int fsync_inode_data_buffers(struct inode *);
+ extern int inode_has_buffers(struct inode *);
+ extern void filemap_fdatasync(struct address_space *);
+ extern void filemap_fdatawait(struct address_space *);
+ extern void sync_supers(kdev_t);
+ extern int bmap(struct inode *, int);
+ extern int notify_change(struct dentry *, struct iattr *);
+ extern int permission(struct inode *, int);
+ extern int vfs_permission(struct inode *, int);
+ extern int get_write_access(struct inode *);
+ extern int deny_write_access(struct file *);
+ static inline void put_write_access(struct inode * inode)
+ {
+       atomic_dec(&inode->i_writecount);
+ }
+ static inline void allow_write_access(struct file *file)
+ {
+       if (file)
+               atomic_inc(&file->f_dentry->d_inode->i_writecount);
+ }
+ extern int do_pipe(int *);
+
+ extern int open_namei(const char *, int, int, struct nameidata *);
+
+ extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
+ extern struct file * open_exec(const char *);
+
+ /* fs/dcache.c -- generic fs support functions */
+ extern int is_subdir(struct dentry *, struct dentry *);
+ extern ino_t find_inode_number(struct dentry *, struct qstr *);
+
+ /*
+  * Kernel pointers have redundant information, so we can use a
+  * scheme where we can return either an error code or a dentry
+  * pointer with the same return value.
+  *
+  * This should be a per-architecture thing, to allow different
+  * error and pointer decisions.
+  */
+ static inline void *ERR_PTR(long error)
+ {
+       return (void *) error;
+ }
+
+ static inline long PTR_ERR(const void *ptr)
+ {
+       return (long) ptr;
+ }
+
+ static inline long IS_ERR(const void *ptr)
+ {
+       return (unsigned long)ptr > (unsigned long)-1000L;
+ }
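+
+ /*
+  * Example (sketch): a lookup routine can return ERR_PTR(-ENOENT) on
+  * failure; the caller tests with IS_ERR() and decodes the error with
+  * PTR_ERR():
+  *
+  *	struct dentry *de = lookup_one_len(name, base, len);
+  *	if (IS_ERR(de))
+  *		return PTR_ERR(de);
+  */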
+
+ /*
+  * The bitmask for a lookup event:
+  *  - follow links at the end
+  *  - require a directory
+  *  - ending slashes ok even for nonexistent files
+  *  - internal "there are more path components" flag
+  */
+ #define LOOKUP_FOLLOW         (1)
+ #define LOOKUP_DIRECTORY      (2)
+ #define LOOKUP_CONTINUE               (4)
+ #define LOOKUP_POSITIVE               (8)
+ #define LOOKUP_PARENT         (16)
+ #define LOOKUP_NOALT          (32)
+ /*
+  * Type of the last component on LOOKUP_PARENT
+  */
+ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
+
+ /*
+  * "descriptor" for what we're up to with a read for sendfile().
+  * This allows us to use the same read code yet
+  * have multiple different users of the data that
+  * we read from a file.
+  *
+  * The simplest case just copies the data to user
+  * mode.
+  */
+ typedef struct {
+       size_t written;
+       size_t count;
+       char * buf;
+       int error;
+ } read_descriptor_t;
+
+ typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long);
+
+ /* needed for stackable file system support */
+ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
+
+ extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
+ extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
+ extern void path_release(struct nameidata *);
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+ #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
+ #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
+
+ extern void iput(struct inode *);
+ extern void force_delete(struct inode *);
+ extern struct inode * igrab(struct inode *);
+ extern ino_t iunique(struct super_block *, ino_t);
+
+ typedef int (*find_inode_t)(struct inode *, unsigned long, void *);
+ extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *);
+ static inline struct inode *iget(struct super_block *sb, unsigned long ino)
+ {
+       return iget4(sb, ino, NULL, NULL);
+ }
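+
+ /*
+  * Example (sketch; ROOT_INO is illustrative): a filesystem's
+  * read_super() typically obtains its root inode this way:
+  *
+  *	struct inode *root = iget(sb, ROOT_INO);
+  *	sb->s_root = d_alloc_root(root);
+  */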
+
+ extern void clear_inode(struct inode *);
+ extern struct inode * get_empty_inode(void);
+
+ static inline struct inode * new_inode(struct super_block *sb)
+ {
+       struct inode *inode = get_empty_inode();
+       if (inode) {
+               inode->i_sb = sb;
+               inode->i_dev = sb->s_dev;
+               inode->i_blkbits = sb->s_blocksize_bits;
+       }
+       return inode;
+ }
+ extern void remove_suid(struct inode *inode);
+
+ extern void insert_inode_hash(struct inode *);
+ extern void remove_inode_hash(struct inode *);
+ extern struct file * get_empty_filp(void);
+ extern void file_move(struct file *f, struct list_head *list);
+ extern struct buffer_head * get_hash_table(kdev_t, int, int);
+ extern struct buffer_head * getblk(kdev_t, int, int);
+ extern void ll_rw_block(int, int, struct buffer_head * bh[]);
+ extern void submit_bh(int, struct buffer_head *);
+ extern int is_read_only(kdev_t);
+ extern void __brelse(struct buffer_head *);
+ static inline void brelse(struct buffer_head *buf)
+ {
+       if (buf)
+               __brelse(buf);
+ }
+ extern void __bforget(struct buffer_head *);
+ static inline void bforget(struct buffer_head *buf)
+ {
+       if (buf)
+               __bforget(buf);
+ }
+ extern int set_blocksize(kdev_t, int);
+ extern struct buffer_head * bread(kdev_t, int, int);
+ extern void wakeup_bdflush(void);
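+
+ /*
+  * Typical usage (sketch): read a block, examine the data, then drop
+  * the buffer reference:
+  *
+  *	struct buffer_head *bh = bread(dev, block, size);
+  *	if (bh) {
+  *		... look at bh->b_data ...
+  *		brelse(bh);
+  *	}
+  */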
+
+ extern int brw_page(int, struct page *, kdev_t, int [], int);
+
+ typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
+
+ /* Generic buffer handling for block filesystems.. */
+ extern int discard_bh_page(struct page *, unsigned long, int);
+ #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
+ #define block_invalidate_page(page) discard_bh_page(page, 0, 0)
+ extern int block_symlink(struct inode *, const char *, int);
+ extern int block_write_full_page(struct page*, get_block_t*);
+ extern int block_read_full_page(struct page*, get_block_t*);
+ extern int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
+ extern int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
+                               unsigned long *);
+ extern int block_commit_write(struct page *page, unsigned from, unsigned to);
+ extern int block_sync_page(struct page *);
+
+ int generic_block_bmap(struct address_space *, long, get_block_t *);
+ int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
+ int block_truncate_page(struct address_space *, loff_t, get_block_t *);
+ extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
+
+ extern int waitfor_one_page(struct page*);
+ extern int generic_file_mmap(struct file *, struct vm_area_struct *);
+ extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
+ extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
+ extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
+ extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+ extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
+ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
+ extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *);
+ extern int generic_file_open(struct inode * inode, struct file * filp);
+
+ extern struct file_operations generic_ro_fops;
+
+ extern int vfs_readlink(struct dentry *, char *, int, const char *);
+ extern int vfs_follow_link(struct nameidata *, const char *);
+ extern int page_readlink(struct dentry *, char *, int);
+ extern int page_follow_link(struct dentry *, struct nameidata *);
+ extern struct inode_operations page_symlink_inode_operations;
+
+ extern int vfs_readdir(struct file *, filldir_t, void *);
+ extern int dcache_readdir(struct file *, void *, filldir_t);
+
+ extern struct file_system_type *get_fs_type(const char *name);
+ extern struct super_block *get_super(kdev_t);
+ extern void drop_super(struct super_block *sb);
+ static inline int is_mounted(kdev_t dev)
+ {
+       struct super_block *sb = get_super(dev);
+       if (sb) {
+               drop_super(sb);
+               return 1;
+       }
+       return 0;
+ }
+ unsigned long generate_cluster(kdev_t, int b[], int);
+ unsigned long generate_cluster_swab32(kdev_t, int b[], int);
+ extern kdev_t ROOT_DEV;
+ extern char root_device_name[];
+
+
+ extern void show_buffers(void);
+ extern void mount_root(void);
+
+ #ifdef CONFIG_BLK_DEV_INITRD
+ extern kdev_t real_root_dev;
+ extern int change_root(kdev_t, const char *);
+ #endif
+
+ extern ssize_t char_read(struct file *, char *, size_t, loff_t *);
+ extern ssize_t block_read(struct file *, char *, size_t, loff_t *);
+ extern int read_ahead[];
+
+ extern ssize_t char_write(struct file *, const char *, size_t, loff_t *);
+ extern ssize_t block_write(struct file *, const char *, size_t, loff_t *);
+
+ extern int file_fsync(struct file *, struct dentry *, int);
+ extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx);
+ extern int generic_osync_inode(struct inode *, int);
+ #define OSYNC_METADATA (1<<0)
+ #define OSYNC_DATA (1<<1)
+ #define OSYNC_INODE (1<<2)
+
+ extern int inode_change_ok(struct inode *, struct iattr *);
+ extern int inode_setattr(struct inode *, struct iattr *);
+
+ /*
+  * Common dentry functions for inclusion in the VFS
+  * or in other stackable file systems.  Some of these
+  * functions were in linux/fs/ C (VFS) files.
+  *
+  */
+
+ /*
+  * Locking the parent is needed to:
+  *  - serialize directory operations
+  *  - make sure the parent doesn't change from
+  *    under us in the middle of an operation.
+  *
+  * NOTE! Right now we'd rather use a "struct inode"
+  * for this, but as I expect things to move toward
+  * using dentries instead for most things it is
+  * probably better to start with the conceptually
+  * better interface of relying on a path of dentries.
+  */
+ static inline struct dentry *lock_parent(struct dentry *dentry)
+ {
+       struct dentry *dir = dget(dentry->d_parent);
+
+       down(&dir->d_inode->i_sem);
+       return dir;
+ }
+
+ static inline struct dentry *get_parent(struct dentry *dentry)
+ {
+       return dget(dentry->d_parent);
+ }
+
+ static inline void unlock_dir(struct dentry *dir)
+ {
+       up(&dir->d_inode->i_sem);
+       dput(dir);
+ }
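+
+ /*
+  * Usage sketch: directory modifications lock the parent for the
+  * duration of the operation:
+  *
+  *	struct dentry *dir = lock_parent(dentry);
+  *	... operate on dir->d_inode ...
+  *	unlock_dir(dir);
+  */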
+
+ /*
+  * Whee.. Deadlock country. Happily there are only two VFS
+  * operations that do this..
+  */
+ static inline void double_down(struct semaphore *s1, struct semaphore *s2)
+ {
+       if (s1 != s2) {
+               if ((unsigned long) s1 < (unsigned long) s2) {
+                       struct semaphore *tmp = s2;
+                       s2 = s1; s1 = tmp;
+               }
+               down(s1);
+       }
+       down(s2);
+ }
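+
+ /*
+  * Note: ordering the two semaphores by address gives a global lock
+  * order, so double_down(&a, &b) and double_down(&b, &a) both take
+  * the higher-addressed semaphore first and cannot deadlock against
+  * each other.
+  */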
+
+ /*
+  * Ewwwwwwww... _triple_ lock. We are guaranteed that the 3rd argument is
+  * not equal to the 1st and not equal to the 2nd - the first case (target is
+  * parent of source) would already have been caught, and the second is plain
+  * impossible (target is its own parent, which would be caught even earlier).
+  * Very messy. I _think_ it works, but no warranties - please look it through.
+  * Pox on bloody lusers who mandated overwriting rename() for directories...
+  */
+
+ static inline void triple_down(struct semaphore *s1,
+                              struct semaphore *s2,
+                              struct semaphore *s3)
+ {
+       if (s1 != s2) {
+               if ((unsigned long) s1 < (unsigned long) s2) {
+                       if ((unsigned long) s1 < (unsigned long) s3) {
+                               struct semaphore *tmp = s3;
+                               s3 = s1; s1 = tmp;
+                       }
+                       if ((unsigned long) s1 < (unsigned long) s2) {
+                               struct semaphore *tmp = s2;
+                               s2 = s1; s1 = tmp;
+                       }
+               } else {
+                       if ((unsigned long) s1 < (unsigned long) s3) {
+                               struct semaphore *tmp = s3;
+                               s3 = s1; s1 = tmp;
+                       }
+                       if ((unsigned long) s2 < (unsigned long) s3) {
+                               struct semaphore *tmp = s3;
+                               s3 = s2; s2 = tmp;
+                       }
+               }
+               down(s1);
+       } else if ((unsigned long) s2 < (unsigned long) s3) {
+               struct semaphore *tmp = s3;
+               s3 = s2; s2 = tmp;
+       }
+       down(s2);
+       down(s3);
+ }
+
+ static inline void double_up(struct semaphore *s1, struct semaphore *s2)
+ {
+       up(s1);
+       if (s1 != s2)
+               up(s2);
+ }
+
+ static inline void triple_up(struct semaphore *s1,
+                            struct semaphore *s2,
+                            struct semaphore *s3)
+ {
+       up(s1);
+       if (s1 != s2)
+               up(s2);
+       up(s3);
+ }
+
+ static inline void double_lock(struct dentry *d1, struct dentry *d2)
+ {
+       double_down(&d1->d_inode->i_sem, &d2->d_inode->i_sem);
+ }
+
+ static inline void double_unlock(struct dentry *d1, struct dentry *d2)
+ {
+       double_up(&d1->d_inode->i_sem,&d2->d_inode->i_sem);
+       dput(d1);
+       dput(d2);
+ }
+
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_FS_H */
diff -rc2P linux/include/linux/jbd.h linux-2.4.13/include/linux/jbd.h
*** linux/include/linux/jbd.h   Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/jbd.h    Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,878 ----
+ /*
+  * linux/include/linux/jbd.h
+  *
+  * Written by Stephen C. Tweedie <[email protected]>
+  *
+  * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Definitions for transaction data structures for the buffer cache
+  * filesystem journaling support.
+  */
+
+ #ifndef _LINUX_JBD_H
+ #define _LINUX_JBD_H
+
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || !defined(__KERNEL__)
+
+ /* Allow this file to be included directly into e2fsprogs */
+ #ifndef __KERNEL__
+ #include "jfs_compat.h"
+ #define JFS_DEBUG
+ #define jfs_debug jbd_debug
+ #else
+
+ #include <linux/journal-head.h>
+ #include <linux/stddef.h>
+ #include <asm/semaphore.h>
+ #endif
+
+ extern int journal_oom_retry;
+
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
+  * consistency checks.  By default we don't do this unless
+  * CONFIG_JBD_DEBUG is on.
+  */
+ #define JBD_EXPENSIVE_CHECKING
+
+ extern int journal_enable_debug;
+ extern int journal_no_write[2];
+
+ #define jbd_debug(n, f, a...)                                         \
+       do {                                                            \
+               if ((n) <= journal_enable_debug) {                      \
+                       printk (KERN_DEBUG "(%s, %d): %s: ",            \
+                               __FILE__, __LINE__, __FUNCTION__);      \
+                       printk (f, ## a);                               \
+               }                                                       \
+       } while (0)
+ #else
+ #define jbd_debug(f, a...)    /**/
+ #endif
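+
+ /*
+  * Example (sketch): with CONFIG_JBD_DEBUG enabled, a call such as
+  *
+  *	jbd_debug(2, "committing transaction %d\n", tid);
+  *
+  * produces output once journal_enable_debug (settable through
+  * /proc/sys/fs/jbd-debug) is at least 2.
+  */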
+
+ extern void * __jbd_kmalloc (char *where, size_t size, int flags, int retry);
+ #define jbd_kmalloc(size, flags) \
+       __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
+ #define jbd_rep_kmalloc(size, flags) \
+       __jbd_kmalloc(__FUNCTION__, (size), (flags), 1)
+
+ #define JFS_MIN_JOURNAL_BLOCKS 1024
+
+ #ifdef __KERNEL__
+ typedef struct handle_s               handle_t;       /* Atomic operation type */
+ typedef struct journal_s      journal_t;      /* Journal control structure */
+ #endif
+
+ /*
+  * Internal structures used by the logging mechanism:
+  */
+
+ #define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */
+
+ /*
+  * On-disk structures
+  */
+
+ /*
+  * Descriptor block types:
+  */
+
+ #define JFS_DESCRIPTOR_BLOCK  1
+ #define JFS_COMMIT_BLOCK      2
+ #define JFS_SUPERBLOCK_V1     3
+ #define JFS_SUPERBLOCK_V2     4
+ #define JFS_REVOKE_BLOCK      5
+
+ /*
+  * Standard header for all descriptor blocks:
+  */
+ typedef struct journal_header_s
+ {
+       __u32           h_magic;
+       __u32           h_blocktype;
+       __u32           h_sequence;
+ } journal_header_t;
+
+
+ /*
+  * The block tag: used to describe a single buffer in the journal
+  */
+ typedef struct journal_block_tag_s
+ {
+       __u32           t_blocknr;      /* The on-disk block number */
+       __u32           t_flags;        /* See below */
+ } journal_block_tag_t;
+
+ /*
+  * The revoke descriptor: used on disk to describe a series of blocks to
+  * be revoked from the log
+  */
+ typedef struct journal_revoke_header_s
+ {
+       journal_header_t r_header;
+       int              r_count;       /* Count of bytes used in the block */
+ } journal_revoke_header_t;
+
+
+ /* Definitions for the journal tag flags word: */
+ #define JFS_FLAG_ESCAPE               1       /* on-disk block is escaped */
+ #define JFS_FLAG_SAME_UUID    2       /* block has same uuid as previous */
+ #define JFS_FLAG_DELETED      4       /* block deleted by this transaction */
+ #define JFS_FLAG_LAST_TAG     8       /* last tag in this descriptor block */
+
+
+ /*
+  * The journal superblock.  All fields are in big-endian byte order.
+  */
+ typedef struct journal_superblock_s
+ {
+ /* 0x0000 */
+       journal_header_t s_header;
+
+ /* 0x000C */
+       /* Static information describing the journal */
+       __u32   s_blocksize;            /* journal device blocksize */
+       __u32   s_maxlen;               /* total blocks in journal file */
+       __u32   s_first;                /* first block of log information */
+
+ /* 0x0018 */
+       /* Dynamic information describing the current state of the log */
+       __u32   s_sequence;             /* first commit ID expected in log */
+       __u32   s_start;                /* blocknr of start of log */
+
+ /* 0x0020 */
+       /* Error value, as set by journal_abort(). */
+       __s32   s_errno;
+
+ /* 0x0024 */
+       /* Remaining fields are only valid in a version-2 superblock */
+       __u32   s_feature_compat;       /* compatible feature set */
+       __u32   s_feature_incompat;     /* incompatible feature set */
+       __u32   s_feature_ro_compat;    /* readonly-compatible feature set */
+ /* 0x0030 */
+       __u8    s_uuid[16];             /* 128-bit uuid for journal */
+
+ /* 0x0040 */
+       __u32   s_nr_users;             /* Nr of filesystems sharing log */
+
+       __u32   s_dynsuper;             /* Blocknr of dynamic superblock copy*/
+
+ /* 0x0048 */
+       __u32   s_max_transaction;      /* Limit of journal blocks per trans.*/
+       __u32   s_max_trans_data;       /* Limit of data blocks per trans. */
+
+ /* 0x0050 */
+       __u32   s_padding[44];
+
+ /* 0x0100 */
+       __u8    s_users[16*48];         /* ids of all fs'es sharing the log */
+ /* 0x0400 */
+ } journal_superblock_t;
+
+ #define JFS_HAS_COMPAT_FEATURE(j,mask)                                        \
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
+ #define JFS_HAS_RO_COMPAT_FEATURE(j,mask)                             \
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
+ #define JFS_HAS_INCOMPAT_FEATURE(j,mask)                              \
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
+
+ #define JFS_FEATURE_INCOMPAT_REVOKE   0x00000001
+
+ /* Features known to this kernel version: */
+ #define JFS_KNOWN_COMPAT_FEATURES     0
+ #define JFS_KNOWN_ROCOMPAT_FEATURES   0
+ #define JFS_KNOWN_INCOMPAT_FEATURES   JFS_FEATURE_INCOMPAT_REVOKE
+
+ #ifdef __KERNEL__
+
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+
+ #define JBD_ASSERTIONS
+ #ifdef JBD_ASSERTIONS
+ #define J_ASSERT(assert)                                              \
+ do {                                                                  \
+       if (!(assert)) {                                                \
+               printk (KERN_EMERG                                      \
+                       "Assertion failure in %s() at %s:%d: \"%s\"\n", \
+                       __FUNCTION__, __FILE__, __LINE__, # assert);    \
+               BUG();                                                  \
+       }                                                               \
+ } while (0)
+
+ #if defined(CONFIG_BUFFER_DEBUG)
+ void buffer_assertion_failure(struct buffer_head *bh);
+ #define J_ASSERT_BH(bh, expr)                                         \
+       do {                                                            \
+               if (!(expr))                                            \
+                       buffer_assertion_failure(bh);                   \
+               J_ASSERT(expr);                                         \
+       } while (0)
+ #define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr)
+ #else
+ #define J_ASSERT_BH(bh, expr) J_ASSERT(expr)
+ #define J_ASSERT_JH(jh, expr) J_ASSERT(expr)
+ #endif
+
+ #else
+ #define J_ASSERT(assert)
+ #endif                /* JBD_ASSERTIONS */
+
+ enum jbd_state_bits {
+       BH_JWrite
+         = BH_PrivateStart,    /* 1 if being written to log (@@@ DEBUGGING) */
+       BH_Freed,               /* 1 if buffer has been freed (truncated) */
+       BH_Revoked,             /* 1 if buffer has been revoked from the log */
+       BH_RevokeValid,         /* 1 if buffer revoked flag is valid */
+       BH_JBDDirty,            /* 1 if buffer is dirty but journaled */
+ };
+
+ /* Return true if the buffer is one which JBD is managing */
+ static inline int buffer_jbd(struct buffer_head *bh)
+ {
+       return __buffer_state(bh, JBD);
+ }
+
+ static inline struct buffer_head *jh2bh(struct journal_head *jh)
+ {
+       return jh->b_bh;
+ }
+
+ static inline struct journal_head *bh2jh(struct buffer_head *bh)
+ {
+       return bh->b_private;
+ }
+
+ struct jbd_revoke_table_s;
+
+ /* The handle_t type represents a single atomic update being performed
+  * by some process.  All filesystem modifications made by the process go
+  * through this handle.  Recursive operations (such as quota operations)
+  * are gathered into a single update.
+  *
+  * The buffer credits field is used to account for journaled buffers
+  * being modified by the running process.  To ensure that there is
+  * enough log space for all outstanding operations, we need to limit the
+  * number of outstanding buffers possible at any time.  When the
+  * operation completes, any buffer credits not used are credited back to
+  * the transaction, so that at all times we know how many buffers the
+  * outstanding updates on a transaction might possibly touch. */
+
+ struct handle_s
+ {
+       /* Which compound transaction is this update a part of? */
+       transaction_t         * h_transaction;
+
+       /* Number of remaining buffers we are allowed to dirty: */
+       int                     h_buffer_credits;
+
+       /* Reference count on this handle */
+       int                     h_ref;
+
+       /* Field for caller's use to track errors through large fs
+          operations */
+       int                     h_err;
+
+       /* Flags */
+       unsigned int    h_sync:         1;      /* sync-on-close */
+       unsigned int    h_jdata:        1;      /* force data journaling */
+       unsigned int    h_aborted:      1;      /* fatal error on handle */
+ };
+
+
+ /* The transaction_t type is the guts of the journaling mechanism.  It
+  * tracks a compound transaction through its various states:
+  *
+  * RUNNING:   accepting new updates
+  * LOCKED:    Updates still running but we don't accept new ones
+  * RUNDOWN:   Updates are tidying up but have finished requesting
+  *            new buffers to modify (state not used for now)
+  * FLUSH:       All updates complete, but we are still writing to disk
+  * COMMIT:      All data on disk, writing commit record
+  * FINISHED:  We still have to keep the transaction for checkpointing.
+  *
+  * The transaction keeps track of all of the buffers modified by a
+  * running transaction, and all of the buffers committed but not yet
+  * flushed to home for finished transactions.
+  */
+
+ struct transaction_s
+ {
+       /* Pointer to the journal for this transaction. */
+       journal_t *             t_journal;
+
+       /* Sequence number for this transaction */
+       tid_t                   t_tid;
+
+       /* Transaction's current state */
+       enum {
+               T_RUNNING,
+               T_LOCKED,
+               T_RUNDOWN,
+               T_FLUSH,
+               T_COMMIT,
+               T_FINISHED
+       }                       t_state;
+
+       /* Where in the log does this transaction's commit start? */
+       unsigned long           t_log_start;
+
+       /* Doubly-linked circular list of all inodes owned by this
+            transaction */     /* AKPM: unused */
+       struct inode *          t_ilist;
+
+       /* Number of buffers on the t_buffers list */
+       int                     t_nr_buffers;
+
+       /* Doubly-linked circular list of all buffers reserved but not
+            yet modified by this transaction */
+       struct journal_head *   t_reserved_list;
+
+       /* Doubly-linked circular list of all metadata buffers owned by this
+            transaction */
+       struct journal_head *   t_buffers;
+
+       /*
+        * Doubly-linked circular list of all data buffers still to be
+        * flushed before this transaction can be committed.
+        * Protected by journal_datalist_lock.
+        */
+       struct journal_head *   t_sync_datalist;
+
+       /*
+        * Doubly-linked circular list of all writepage data buffers
+        * still to be written before this transaction can be committed.
+        * Protected by journal_datalist_lock.
+        */
+       struct journal_head *   t_async_datalist;
+
+       /* Doubly-linked circular list of all forget buffers (superseded
+            buffers which we can un-checkpoint once this transaction
+            commits) */
+       struct journal_head *   t_forget;
+
+       /*
+        * Doubly-linked circular list of all buffers still to be
+        * flushed before this transaction can be checkpointed.
+        */
+       /* Protected by journal_datalist_lock */
+       struct journal_head *   t_checkpoint_list;
+
+       /* Doubly-linked circular list of temporary buffers currently
+            undergoing IO in the log */
+       struct journal_head *   t_iobuf_list;
+
+       /* Doubly-linked circular list of metadata buffers being
+            shadowed by log IO.  The IO buffers on the iobuf list and the
+            shadow buffers on this list match each other one for one at
+            all times. */
+       struct journal_head *   t_shadow_list;
+
+       /* Doubly-linked circular list of control buffers being written
+            to the log. */
+       struct journal_head *   t_log_list;
+
+       /* Number of outstanding updates running on this transaction */
+       int                     t_updates;
+
+       /* Number of buffers reserved for use by all handles in this
+        * transaction but not yet modified. */
+       int                     t_outstanding_credits;
+
+       /*
+        * Forward and backward links for the circular list of all
+        * transactions awaiting checkpoint.
+        */
+       /* Protected by journal_datalist_lock */
+       transaction_t           *t_cpnext, *t_cpprev;
+
+       /* When will the transaction expire (become due for commit), in
+        * jiffies ? */
+       unsigned long           t_expires;
+
+       /* How many handles used this transaction? */
+       int t_handle_count;
+ };
+
+
+ /* The journal_t maintains all of the journaling state information for a
+  * single filesystem.  It is linked to from the fs superblock structure.
+  *
+  * We use the journal_t to keep track of all outstanding transaction
+  * activity on the filesystem, and to manage the state of the log
+  * writing process. */
+
+ struct journal_s
+ {
+       /* General journaling state flags */
+       unsigned long           j_flags;
+
+       /* Is there an outstanding uncleared error on the journal (from
+        * a prior abort)? */
+       int                     j_errno;
+
+       /* The superblock buffer */
+       struct buffer_head *    j_sb_buffer;
+       journal_superblock_t *  j_superblock;
+
+       /* Version of the superblock format */
+       int                     j_format_version;
+
+       /* Number of processes waiting to create a barrier lock */
+       int                     j_barrier_count;
+
+       /* The barrier lock itself */
+       struct semaphore        j_barrier;
+
+       /* Transactions: The current running transaction... */
+       transaction_t *         j_running_transaction;
+
+       /* ... the transaction we are pushing to disk ... */
+       transaction_t *         j_committing_transaction;
+
+       /* ... and a linked circular list of all transactions waiting
+        * for checkpointing. */
+       /* Protected by journal_datalist_lock */
+       transaction_t *         j_checkpoint_transactions;
+
+       /* Wait queue for waiting for a locked transaction to start
+            committing, or for a barrier lock to be released */
+       wait_queue_head_t       j_wait_transaction_locked;
+
+       /* Wait queue for waiting for checkpointing to complete */
+       wait_queue_head_t       j_wait_logspace;
+
+       /* Wait queue for waiting for commit to complete */
+       wait_queue_head_t       j_wait_done_commit;
+
+       /* Wait queue to trigger checkpointing */
+       wait_queue_head_t       j_wait_checkpoint;
+
+       /* Wait queue to trigger commit */
+       wait_queue_head_t       j_wait_commit;
+
+       /* Wait queue to wait for updates to complete */
+       wait_queue_head_t       j_wait_updates;
+
+       /* Semaphore for locking against concurrent checkpoints */
+       struct semaphore        j_checkpoint_sem;
+
+       /* The main journal lock, used by lock_journal() */
+       struct semaphore        j_sem;
+
+       /* Journal head: identifies the first unused block in the journal. */
+       unsigned long           j_head;
+
+       /* Journal tail: identifies the oldest still-used block in the
+        * journal. */
+       unsigned long           j_tail;
+
+       /* Journal free: how many free blocks are there in the journal? */
+       unsigned long           j_free;
+
+       /* Journal start and end: the block numbers of the first usable
+        * block and one beyond the last usable block in the journal. */
+       unsigned long           j_first, j_last;
+
+       /* Device, blocksize and starting block offset for the location
+        * where we store the journal. */
+       kdev_t                  j_dev;
+       int                     j_blocksize;
+       unsigned int            j_blk_offset;
+
+       /* Device which holds the client fs.  For internal journal this
+        * will be equal to j_dev. */
+       kdev_t                  j_fs_dev;
+
+       /* Total maximum capacity of the journal region on disk. */
+       unsigned int            j_maxlen;
+
+       /* Optional inode where we store the journal.  If present, all
+        * journal block numbers are mapped into this inode via
+        * bmap(). */
+       struct inode *          j_inode;
+
+       /* Sequence number of the oldest transaction in the log */
+       tid_t                   j_tail_sequence;
+       /* Sequence number of the next transaction to grant */
+       tid_t                   j_transaction_sequence;
+       /* Sequence number of the most recently committed transaction */
+       tid_t                   j_commit_sequence;
+       /* Sequence number of the most recent transaction wanting commit */
+       tid_t                   j_commit_request;
+
+       /* Journal uuid: identifies the object (filesystem, LVM volume
+        * etc) backed by this journal.  This will eventually be
+        * replaced by an array of uuids, allowing us to index multiple
+        * devices within a single journal and to perform atomic updates
+        * across them.  */
+
+       __u8                    j_uuid[16];
+
+       /* Pointer to the current commit thread for this journal */
+       struct task_struct *    j_task;
+
+       /* Maximum number of metadata buffers to allow in a single
+        * compound commit transaction */
+       int                     j_max_transaction_buffers;
+
+       /* What is the maximum transaction lifetime before we begin a
+        * commit? */
+       unsigned long           j_commit_interval;
+
+       /* The timer used to wakeup the commit thread: */
+       struct timer_list *     j_commit_timer;
+       int                     j_commit_timer_active;
+
+       /* Link all journals together - system-wide */
+       struct list_head        j_all_journals;
+
+       /* The revoke table: maintains the list of revoked blocks in the
+            current transaction. */
+       struct jbd_revoke_table_s *j_revoke;
+ };
+
+ /*
+  * Journal flag definitions
+  */
+ #define JFS_UNMOUNT   0x001   /* Journal thread is being destroyed */
+ #define JFS_ABORT     0x002   /* Journaling has been aborted for errors. */
+ #define JFS_ACK_ERR   0x004   /* The errno in the sb has been acked */
+ #define JFS_FLUSHED   0x008   /* The journal superblock has been flushed */
+ #define JFS_LOADED    0x010   /* The journal superblock has been loaded */
+
+ /*
+  * Function declarations for the journaling transaction and buffer
+  * management
+  */
+
+ /* Filing buffers */
+ extern void __journal_unfile_buffer(struct journal_head *);
+ extern void journal_unfile_buffer(struct journal_head *);
+ extern void __journal_refile_buffer(struct journal_head *);
+ extern void journal_refile_buffer(struct journal_head *);
+ extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
+ extern void __journal_free_buffer(struct journal_head *bh);
+ extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
+ extern void __journal_clean_data_list(transaction_t *transaction);
+
+ /* Log buffer allocation */
+ extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
+ extern unsigned long journal_next_log_block(journal_t *);
+
+ /* Commit management */
+ extern void journal_commit_transaction(journal_t *);
+
+ /* Checkpoint list management */
+ int __journal_clean_checkpoint_list(journal_t *journal);
+ extern void journal_remove_checkpoint(struct journal_head *);
+ extern void __journal_remove_checkpoint(struct journal_head *);
+ extern void journal_insert_checkpoint(struct journal_head *, transaction_t *);
+ extern void __journal_insert_checkpoint(struct journal_head *,transaction_t *);
+
+ /* Buffer IO */
+ extern int
+ journal_write_metadata_buffer(transaction_t     *transaction,
+                             struct journal_head  *jh_in,
+                             struct journal_head **jh_out,
+                             int                  blocknr);
+
+ /* Transaction locking */
+ extern void           __wait_on_journal (journal_t *);
+
+ /*
+  * Journal locking.
+  *
+  * We need to lock the journal during transaction state changes so that
+  * nobody ever tries to take a handle on the running transaction while
+  * we are in the middle of moving it to the commit phase.
+  *
+  * Note that the locking is completely interrupt unsafe.  We never touch
+  * journal structures from interrupts.
+  *
+  * In 2.2, the BKL was required for lock_journal.  This is no longer
+  * the case.
+  */
+
+ static inline void lock_journal(journal_t *journal)
+ {
+       down(&journal->j_sem);
+ }
+
+ /* This returns zero if we acquired the semaphore */
+ static inline int try_lock_journal(journal_t * journal)
+ {
+       return down_trylock(&journal->j_sem);
+ }
+
+ static inline void unlock_journal(journal_t * journal)
+ {
+       up(&journal->j_sem);
+ }
+
+
+ static inline handle_t *journal_current_handle(void)
+ {
+       return current->journal_info;
+ }
+
+ /* The journaling code user interface:
+  *
+  * Create and destroy handles
+  * Register buffer modifications against the current transaction.
+  */
+
+ extern handle_t *journal_start(journal_t *, int nblocks);
+ extern handle_t *journal_try_start(journal_t *, int nblocks);
+ extern int     journal_restart (handle_t *, int nblocks);
+ extern int     journal_extend (handle_t *, int nblocks);
+ extern int     journal_get_write_access (handle_t *, struct buffer_head *);
+ extern int     journal_get_create_access (handle_t *, struct buffer_head *);
+ extern int     journal_get_undo_access (handle_t *, struct buffer_head *);
+ extern int     journal_dirty_data (handle_t *,
+                               struct buffer_head *, int async);
+ extern int     journal_dirty_metadata (handle_t *, struct buffer_head *);
+ extern void    journal_release_buffer (handle_t *, struct buffer_head *);
+ extern void    journal_forget (handle_t *, struct buffer_head *);
+ extern void    journal_sync_buffer (struct buffer_head *);
+ extern int     journal_flushpage(journal_t *, struct page *, unsigned long);
+ extern int     journal_try_to_free_buffers(journal_t *, struct page *, int);
+ extern int     journal_stop(handle_t *);
+ extern int     journal_flush (journal_t *);
+
+ extern void    journal_lock_updates (journal_t *);
+ extern void    journal_unlock_updates (journal_t *);
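+
+ /*
+  * Sketch of the usual update sequence (error handling omitted;
+  * journal_start() returns an ERR_PTR value on failure):
+  *
+  *	handle_t *handle = journal_start(journal, nblocks);
+  *	err = journal_get_write_access(handle, bh);
+  *	... modify bh->b_data ...
+  *	err = journal_dirty_metadata(handle, bh);
+  *	err = journal_stop(handle);
+  */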
+
+ extern journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
+                               int start, int len, int bsize);
+ extern journal_t * journal_init_inode (struct inode *);
+ extern int       journal_update_format (journal_t *);
+ extern int       journal_check_used_features
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_check_available_features
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_set_features
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_create     (journal_t *);
+ extern int       journal_load       (journal_t *journal);
+ extern void      journal_destroy    (journal_t *);
+ extern int       journal_recover    (journal_t *journal);
+ extern int       journal_wipe       (journal_t *, int);
+ extern int       journal_skip_recovery (journal_t *);
+ extern void      journal_update_superblock (journal_t *, int);
+ extern void      __journal_abort      (journal_t *);
+ extern void      journal_abort      (journal_t *, int);
+ extern int       journal_errno      (journal_t *);
+ extern void      journal_ack_err    (journal_t *);
+ extern int       journal_clear_err  (journal_t *);
+ extern unsigned long journal_bmap(journal_t *journal, unsigned long blocknr);
+ extern int        journal_force_commit(journal_t *journal);
+
+ /*
+  * journal_head management
+  */
+ extern struct journal_head
+               *journal_add_journal_head(struct buffer_head *bh);
+ extern void   journal_remove_journal_head(struct buffer_head *bh);
+ extern void   __journal_remove_journal_head(struct buffer_head *bh);
+ extern void   journal_unlock_journal_head(struct journal_head *jh);
+
+ /* Primary revoke support */
+ #define JOURNAL_REVOKE_DEFAULT_HASH 256
+ extern int       journal_init_revoke(journal_t *, int);
+ extern void      journal_destroy_revoke_caches(void);
+ extern int       journal_init_revoke_caches(void);
+
+ extern void      journal_destroy_revoke(journal_t *);
+ extern int       journal_revoke (handle_t *,
+                               unsigned long, struct buffer_head *);
+ extern int       journal_cancel_revoke(handle_t *, struct journal_head *);
+ extern void      journal_write_revoke_records(journal_t *, transaction_t *);
+
+ /* Recovery revoke support */
+ extern int       journal_set_revoke(journal_t *, unsigned long, tid_t);
+ extern int       journal_test_revoke(journal_t *, unsigned long, tid_t);
+ extern void      journal_clear_revoke(journal_t *);
+ extern void      journal_brelse_array(struct buffer_head *b[], int n);
+
+ /* The log thread user interface:
+  *
+  * Request space in the current transaction, and force transaction commit
+  * transitions on demand.
+  */
+
+ extern int    log_space_left (journal_t *); /* Called with journal locked */
+ extern tid_t  log_start_commit (journal_t *, transaction_t *);
+ extern void   log_wait_commit (journal_t *, tid_t);
+ extern int    log_do_checkpoint (journal_t *, int);
+
+ extern void   log_wait_for_space(journal_t *, int nblocks);
+ extern void   __journal_drop_transaction(journal_t *, transaction_t *);
+ extern int    cleanup_journal_tail(journal_t *);
+
+ /* Reduce journal memory usage by flushing */
+ extern void shrink_journal_memory(void);
+
+ /* Debugging code only: */
+
+ #define jbd_ENOSYS() \
+ do {                                                                \
+       printk (KERN_ERR "JBD unimplemented function " __FUNCTION__); \
+       current->state = TASK_UNINTERRUPTIBLE;                        \
+       schedule();                                                   \
+ } while (1)
+
+ /*
+  * is_journal_aborted
+  *
+  * Simple test wrapper function to test the JFS_ABORT state flag.  This
+  * bit, when set, indicates that we have had a fatal error somewhere,
+  * either inside the journaling layer or indicated to us by the client
+  * (eg. ext3), and that we should not commit any further
+  * transactions.
+  */
+
+ static inline int is_journal_aborted(journal_t *journal)
+ {
+       return journal->j_flags & JFS_ABORT;
+ }
+
+ static inline int is_handle_aborted(handle_t *handle)
+ {
+       if (handle->h_aborted)
+               return 1;
+       return is_journal_aborted(handle->h_transaction->t_journal);
+ }
+
+ static inline void journal_abort_handle(handle_t *handle)
+ {
+       handle->h_aborted = 1;
+ }
+
+ /* Not all architectures define BUG() */
+ #ifndef BUG
+  #define BUG() do { \
+         printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+       * ((char *) 0) = 0; \
+  } while (0)
+ #endif /* BUG */
+
+ #endif /* __KERNEL__   */
+
+ /* Comparison functions for transaction IDs: perform comparisons using
+  * modulo arithmetic so that they work over sequence number wraps. */
+
+ static inline int tid_gt(tid_t x, tid_t y)
+ {
+       int difference = (x - y);
+       return (difference > 0);
+ }
+
+ static inline int tid_geq(tid_t x, tid_t y)
+ {
+       int difference = (x - y);
+       return (difference >= 0);
+ }
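+
+ /*
+  * Example: with 32-bit tids, tid_gt(1, 0xfffffffe) is true - the
+  * unsigned difference (3) is positive as a signed int - so the
+  * comparison still gives the right answer after the sequence
+  * counter wraps.
+  */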
+
+ extern int journal_blocks_per_page(struct inode *inode);
+
+ /*
+  * Definitions which augment the buffer_head layer
+  */
+
+ /* JBD additions */
+
+ /* journaling buffer types */
+ #define BJ_None               0       /* Not journaled */
+ #define BJ_SyncData   1       /* Normal data: flush before commit */
+ #define BJ_AsyncData  2       /* writepage data: wait on it before commit */
+ #define BJ_Metadata   3       /* Normal journaled metadata */
+ #define BJ_Forget     4       /* Buffer superseded by this transaction */
+ #define BJ_IO         5       /* Buffer is for temporary IO use */
+ #define BJ_Shadow     6       /* Buffer contents being shadowed to the log */
+ #define BJ_LogCtl     7       /* Buffer contains log descriptors */
+ #define BJ_Reserved   8       /* Buffer is reserved for access by journal */
+ #define BJ_Types      9
+
+ extern int jbd_blocks_per_page(struct inode *inode);
+
+ #ifdef __KERNEL__
+
+ extern spinlock_t jh_splice_lock;
+ /*
+  * Once `expr1' has been found true, take jh_splice_lock
+  * and then reevaluate everything.
+  */
+ #define SPLICE_LOCK(expr1, expr2)                             \
+       ({                                                      \
+               int ret = (expr1);                              \
+               if (ret) {                                      \
+                       spin_lock(&jh_splice_lock);             \
+                       ret = (expr1) && (expr2);               \
+                       spin_unlock(&jh_splice_lock);           \
+               }                                               \
+               ret;                                            \
+       })
+
+ /*
+  * A number of buffer state predicates.  They test for
+  * buffer_jbd() because they are used in core kernel code.
+  *
+  * These will be racy on SMP unless we're *sure* that the
+  * buffer won't be detached from the journalling system
+  * in parallel.
+  */
+
+ /* Return true if the buffer is on journal list `list' */
+ static inline int buffer_jlist_eq(struct buffer_head *bh, int list)
+ {
+       return SPLICE_LOCK(buffer_jbd(bh), bh2jh(bh)->b_jlist == list);
+ }
+
+ /* Return true if this buffer is dirty wrt the journal */
+ static inline int buffer_jdirty(struct buffer_head *bh)
+ {
+       return buffer_jbd(bh) && __buffer_state(bh, JBDDirty);
+ }
+
+ /* Return true if it's a data buffer which journalling is managing */
+ static inline int buffer_jbd_data(struct buffer_head *bh)
+ {
+       return SPLICE_LOCK(buffer_jbd(bh),
+                       bh2jh(bh)->b_jlist == BJ_SyncData ||
+                       bh2jh(bh)->b_jlist == BJ_AsyncData);
+ }
+
+ #ifdef CONFIG_SMP
+ #define assert_spin_locked(lock)      J_ASSERT(spin_is_locked(lock))
+ #else
+ #define assert_spin_locked(lock)      do {} while(0)
+ #endif
+
+ #endif        /* __KERNEL__ */
+
+ #endif        /* CONFIG_JBD || CONFIG_JBD_MODULE || !__KERNEL__ */
+
+ /*
+  * Compatibility no-ops which allow the kernel to compile without CONFIG_JBD
+  * go here.
+  */
+
+ #if defined(__KERNEL__) && !(defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE))
+
+ #define J_ASSERT(expr)                        do {} while (0)
+ #define J_ASSERT_BH(bh, expr)         do {} while (0)
+ #define buffer_jbd(bh)                        0
+ #define buffer_jlist_eq(bh, val)      0
+ #define journal_buffer_journal_lru(bh)        0
+
+ #endif        /* defined(__KERNEL__) && !defined(CONFIG_JBD) */
+ #endif        /* _LINUX_JBD_H */
diff -rc2P linux/include/linux/journal-head.h linux-2.4.13/include/linux/journal-head.h
*** linux/include/linux/journal-head.h  Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/journal-head.h   Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,70 ----
+ /*
+  * include/linux/journal-head.h
+  *
+  * buffer_head fields for JBD
+  *
+  * 27 May 2001 Andrew Morton <[email protected]>
+  *    Created - pulled out of fs.h
+  */
+
+ #ifndef JOURNAL_HEAD_H_INCLUDED
+ #define JOURNAL_HEAD_H_INCLUDED
+
+ typedef unsigned int          tid_t;          /* Unique transaction ID */
+ typedef struct transaction_s  transaction_t;  /* Compound transaction type */
+ struct buffer_head;
+
+ struct journal_head {
+ #ifndef CONFIG_JBD_UNIFIED_BUFFERS
+       /* Points back to our buffer_head. */
+       struct buffer_head *b_bh;
+ #endif
+
+       /* Reference count - see description in journal.c */
+       int b_jcount;
+
+       /* Journaling list for this buffer */
+       unsigned b_jlist;
+
+       /* Copy of the buffer data frozen for writing to the log. */
+       char * b_frozen_data;
+
+       /* Pointer to a saved copy of the buffer containing no
+            uncommitted deallocation references, so that allocations can
+            avoid overwriting uncommitted deletes. */
+       char * b_committed_data;
+
+       /* Pointer to the compound transaction which owns this buffer's
+            metadata: either the running transaction or the committing
+            transaction (if there is one).  Only applies to buffers on a
+            transaction's data or metadata journaling list. */
+       /* Protected by journal_datalist_lock */
+       transaction_t * b_transaction;
+
+       /* Pointer to the running compound transaction which is
+            currently modifying the buffer's metadata, if there was
+            already a transaction committing it when the new transaction
+            touched it. */
+       transaction_t * b_next_transaction;
+
+       /* Doubly-linked list of buffers on a transaction's data,
+            metadata or forget queue. */
+       /* Protected by journal_datalist_lock */
+       struct journal_head *b_tnext, *b_tprev;
+
+       /*
+        * Pointer to the compound transaction against which this buffer
+        * is checkpointed.  Only dirty buffers can be checkpointed.
+        */
+       /* Protected by journal_datalist_lock */
+       transaction_t * b_cp_transaction;
+
+       /*
+        * Doubly-linked list of buffers still remaining to be flushed
+        * before an old transaction can be checkpointed.
+        */
+       /* Protected by journal_datalist_lock */
+       struct journal_head *b_cpnext, *b_cpprev;
+ };
+
+ #endif                /* JOURNAL_HEAD_H_INCLUDED */
diff -rc2P linux/include/linux/sched.h linux-2.4.13/include/linux/sched.h
*** linux/include/linux/sched.h Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/sched.h  Fri Nov  9 16:58:32 2001
***************
*** 420,423 ****
--- 420,425 ----
 /* Protection of (de-)allocation: mm, files, fs, tty */
       spinlock_t alloc_lock;
+ /* journalling filesystem info */
+        void *journal_info;
 /* Field to make virtual server running in chroot more  isolated */
       int s_context;  /* Process can only deal with other processes */
***************
*** 513,516 ****
--- 515,519 ----
     blocked:          {{0}},                                          \
     alloc_lock:               SPIN_LOCK_UNLOCKED,                             \
+     journal_info:             NULL,                                           \
     cap_bset:         CAP_INIT_EFF_SET,                               \
 }
diff -rc2P linux/include/linux/sched.h.orig linux-2.4.13/include/linux/sched.h.orig
*** linux/include/linux/sched.h.orig    Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/sched.h.orig     Fri Nov  9 16:15:08 2001
***************
*** 0 ****
--- 1,936 ----
+ #ifndef _LINUX_SCHED_H
+ #define _LINUX_SCHED_H
+
+ #include <asm/param.h>        /* for HZ */
+
+ extern unsigned long event;
+
+ #include <linux/config.h>
+ #include <linux/binfmts.h>
+ #include <linux/threads.h>
+ #include <linux/kernel.h>
+ #include <linux/types.h>
+ #include <linux/times.h>
+ #include <linux/timex.h>
+ #include <linux/rbtree.h>
+
+ #include <asm/system.h>
+ #include <asm/semaphore.h>
+ #include <asm/page.h>
+ #include <asm/ptrace.h>
+ #include <asm/mmu.h>
+
+ #include <linux/smp.h>
+ #include <linux/tty.h>
+ #include <linux/sem.h>
+ #include <linux/signal.h>
+ #include <linux/securebits.h>
+ #include <linux/fs_struct.h>
+
+ struct exec_domain;
+
+ /*
+  * cloning flags:
+  */
+ #define CSIGNAL               0x000000ff      /* signal mask to be sent at exit */
+ #define CLONE_VM      0x00000100      /* set if VM shared between processes */
+ #define CLONE_FS      0x00000200      /* set if fs info shared between processes */
+ #define CLONE_FILES   0x00000400      /* set if open files shared between processes */
+ #define CLONE_SIGHAND 0x00000800      /* set if signal handlers and blocked signals shared */
+ #define CLONE_PID     0x00001000      /* set if pid shared */
+ #define CLONE_PTRACE  0x00002000      /* set if we want to let tracing continue on the child too */
+ #define CLONE_VFORK   0x00004000      /* set if the parent wants the child to wake it up on mm_release */
+ #define CLONE_PARENT  0x00008000      /* set if we want to have the same parent as the cloner */
+ #define CLONE_THREAD  0x00010000      /* Same thread group? */
+
+ #define CLONE_SIGNAL  (CLONE_SIGHAND | CLONE_THREAD)
+
+ /*
+  * These are the constants used to fake the fixed-point load-average
+  * counting. Some notes:
+  *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
+  *    a load-average precision of 10 bits integer + 11 bits fractional
+  *  - if you want to count load-averages more often, you need more
+  *    precision, or rounding will get you. With 2-second counting freq,
+  *    the EXP_n values would be 1981, 2034 and 2043 if still using only
+  *    11 bit fractions.
+  */
+ extern unsigned long avenrun[];               /* Load averages */
+
+ #define FSHIFT                11              /* nr of bits of precision */
+ #define FIXED_1               (1<<FSHIFT)     /* 1.0 as fixed-point */
+ #define LOAD_FREQ     (5*HZ)          /* 5 sec intervals */
+ #define EXP_1         1884            /* 1/exp(5sec/1min) as fixed-point */
+ #define EXP_5         2014            /* 1/exp(5sec/5min) */
+ #define EXP_15                2037            /* 1/exp(5sec/15min) */
+
+ #define CALC_LOAD(load,exp,n) \
+       load *= exp; \
+       load += n*(FIXED_1-exp); \
+       load >>= FSHIFT;
+
+ #define CT_TO_SECS(x) ((x) / HZ)
+ #define CT_TO_USECS(x)        (((x) % HZ) * 1000000/HZ)
+
+ extern int nr_running, nr_threads;
+ extern int last_pid;
+
+ #include <linux/fs.h>
+ #include <linux/time.h>
+ #include <linux/param.h>
+ #include <linux/resource.h>
+ #include <linux/timer.h>
+
+ #include <asm/processor.h>
+
+ #define TASK_RUNNING          0
+ #define TASK_INTERRUPTIBLE    1
+ #define TASK_UNINTERRUPTIBLE  2
+ #define TASK_ZOMBIE           4
+ #define TASK_STOPPED          8
+
+ #define __set_task_state(tsk, state_value)            \
+       do { (tsk)->state = (state_value); } while (0)
+ #ifdef CONFIG_SMP
+ #define set_task_state(tsk, state_value)              \
+       set_mb((tsk)->state, (state_value))
+ #else
+ #define set_task_state(tsk, state_value)              \
+       __set_task_state((tsk), (state_value))
+ #endif
+
+ #define __set_current_state(state_value)                      \
+       do { current->state = (state_value); } while (0)
+ #ifdef CONFIG_SMP
+ #define set_current_state(state_value)                \
+       set_mb(current->state, (state_value))
+ #else
+ #define set_current_state(state_value)                \
+       __set_current_state(state_value)
+ #endif
+
+ /*
+  * Scheduling policies
+  */
+ #define SCHED_OTHER           0
+ #define SCHED_FIFO            1
+ #define SCHED_RR              2
+
+ /*
+  * This is an additional bit set when we want to
+  * yield the CPU for one re-schedule..
+  */
+ #define SCHED_YIELD           0x10
+
+ struct sched_param {
+       int sched_priority;
+ };
+
+ struct completion;
+
+ #ifdef __KERNEL__
+
+ #include <linux/spinlock.h>
+
+ /*
+  * This serializes "schedule()" and also protects
+  * the run-queue from deletions/modifications (but
+  * _adding_ to the beginning of the run-queue has
+  * a separate lock).
+  */
+ extern rwlock_t tasklist_lock;
+ extern spinlock_t runqueue_lock;
+ extern spinlock_t mmlist_lock;
+
+ extern void sched_init(void);
+ extern void init_idle(void);
+ extern void show_state(void);
+ extern void cpu_init (void);
+ extern void trap_init(void);
+ extern void update_process_times(int user);
+ extern void update_one_process(struct task_struct *p, unsigned long user,
+                              unsigned long system, int cpu);
+
+ #define       MAX_SCHEDULE_TIMEOUT    LONG_MAX
+ extern signed long FASTCALL(schedule_timeout(signed long timeout));
+ asmlinkage void schedule(void);
+
+ extern int schedule_task(struct tq_struct *task);
+ extern void flush_scheduled_tasks(void);
+ extern int start_context_thread(void);
+ extern int current_is_keventd(void);
+
+ /*
+  * The default fd array needs to be at least BITS_PER_LONG,
+  * as this is the granularity returned by copy_fdset().
+  */
+ #define NR_OPEN_DEFAULT BITS_PER_LONG
+
+ /*
+  * Open file table structure
+  */
+ struct files_struct {
+       atomic_t count;
+       rwlock_t file_lock;     /* Protects all the below members.  Nests inside tsk->alloc_lock */
+       int max_fds;
+       int max_fdset;
+       int next_fd;
+       struct file ** fd;      /* current fd array */
+       fd_set *close_on_exec;
+       fd_set *open_fds;
+       fd_set close_on_exec_init;
+       fd_set open_fds_init;
+       struct file * fd_array[NR_OPEN_DEFAULT];
+ };
+
+ #define INIT_FILES \
+ {                                                     \
+       count:          ATOMIC_INIT(1),                 \
+       file_lock:      RW_LOCK_UNLOCKED,               \
+       max_fds:        NR_OPEN_DEFAULT,                \
+       max_fdset:      __FD_SETSIZE,                   \
+       next_fd:        0,                              \
+       fd:             &init_files.fd_array[0],        \
+       close_on_exec:  &init_files.close_on_exec_init, \
+       open_fds:       &init_files.open_fds_init,      \
+       close_on_exec_init: { { 0, } },                 \
+       open_fds_init:  { { 0, } },                     \
+       fd_array:       { NULL, }                       \
+ }
+
+ /* Maximum number of active map areas.. This is a random (large) number */
+ #define MAX_MAP_COUNT (65536)
+
+ struct mm_struct {
+       struct vm_area_struct * mmap;           /* list of VMAs */
+       rb_root_t mm_rb;
+       struct vm_area_struct * mmap_cache;     /* last find_vma result */
+       pgd_t * pgd;
+       atomic_t mm_users;                      /* How many users with user space? */
+       atomic_t mm_count;                      /* How many references to "struct mm_struct" (users count as 1) */
+       int map_count;                          /* number of VMAs */
+       struct rw_semaphore mmap_sem;
+       spinlock_t page_table_lock;             /* Protects task page tables and mm->rss */
+
+       struct list_head mmlist;                /* List of all active mm's.  These are globally strung
+                                                * together off init_mm.mmlist, and are protected
+                                                * by mmlist_lock
+                                                */
+
+       unsigned long start_code, end_code, start_data, end_data;
+       unsigned long start_brk, brk, start_stack;
+       unsigned long arg_start, arg_end, env_start, env_end;
+       unsigned long rss, total_vm, locked_vm;
+       unsigned long def_flags;
+       unsigned long cpu_vm_mask;
+       unsigned long swap_address;
+
+       unsigned dumpable:1;
+
+       /* Architecture-specific MM context */
+       mm_context_t context;
+ };
+
+ extern int mmlist_nr;
+
+ #define INIT_MM(name) \
+ {                                                     \
+       mm_rb:          RB_ROOT,                        \
+       pgd:            swapper_pg_dir,                 \
+       mm_users:       ATOMIC_INIT(2),                 \
+       mm_count:       ATOMIC_INIT(1),                 \
+       mmap_sem:       __RWSEM_INITIALIZER(name.mmap_sem), \
+       page_table_lock: SPIN_LOCK_UNLOCKED,            \
+       mmlist:         LIST_HEAD_INIT(name.mmlist),    \
+ }
+
+ struct signal_struct {
+       atomic_t                count;
+       struct k_sigaction      action[_NSIG];
+       spinlock_t              siglock;
+ };
+
+
+ #define INIT_SIGNALS {        \
+       count:          ATOMIC_INIT(1),                 \
+       action:         { {{0,}}, },                    \
+       siglock:        SPIN_LOCK_UNLOCKED              \
+ }
+
+ /*
+  * Some day this will be a full-fledged user tracking system..
+  */
+ struct user_struct {
+       atomic_t __count;       /* reference count */
+       atomic_t processes;     /* How many processes does this user have? */
+       atomic_t files;         /* How many open files does this user have? */
+
+       /* Hash table maintenance information */
+       struct user_struct *next, **pprev;
+       uid_t uid;
+ };
+
+ #define get_current_user() ({                                 \
+       struct user_struct *__user = current->user;     \
+       atomic_inc(&__user->__count);                   \
+       __user; })
+
+
+ /*
+       We may have a different domainname and nodename for each security
+       context. By default, a security context share the same as its
+       parent, potentially the information in system_utsname
+ */
+ #define S_CTX_INFO_LOCK               1       /* Can't request a new s_context */
+ #define S_CTX_INFO_SCHED      2       /* All process in the s_context */
+                                       /* Contribute to the schedular */
+ struct context_info{
+       int refcount;
+       int s_context;
+       char nodename[65];
+       char domainname[65];
+       int flags;              /* S_CTX_INFO_xxx */
+       atomic_t ticks;         /* Number of ticks used by all process */
+                               /* in the s_context */
+ };
+
+
+ extern struct user_struct root_user;
+ #define INIT_USER (&root_user)
+
+ struct task_struct {
+       /*
+        * offsets of these are hardcoded elsewhere - touch with care
+        */
+       volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
+       unsigned long flags;    /* per process flags, defined below */
+       int sigpending;
+       mm_segment_t addr_limit;        /* thread address space:
+                                               0-0xBFFFFFFF for user-thead
+                                               0-0xFFFFFFFF for kernel-thread
+                                        */
+       struct exec_domain *exec_domain;
+       volatile long need_resched;
+       unsigned long ptrace;
+
+       int lock_depth;         /* Lock depth */
+
+ /*
+  * offset 32 begins here on 32-bit platforms. We keep
+  * all fields in a single cacheline that are needed for
+  * the goodness() loop in schedule().
+  */
+       long counter;
+       long nice;
+       unsigned long policy;
+       struct mm_struct *mm;
+       int has_cpu, processor;
+       unsigned long cpus_allowed;
+       /*
+        * (only the 'next' pointer fits into the cacheline, but
+        * that's just fine.)
+        */
+       struct list_head run_list;
+       unsigned long sleep_time;
+
+       struct task_struct *next_task, *prev_task;
+       struct mm_struct *active_mm;
+       struct list_head local_pages;
+       unsigned int allocation_order, nr_local_pages;
+
+ /* task state */
+       struct linux_binfmt *binfmt;
+       int exit_code, exit_signal;
+       int pdeath_signal;  /*  The signal sent when the parent dies  */
+       /* ??? */
+       unsigned long personality;
+       int did_exec:1;
+       pid_t pid;
+       pid_t pgrp;
+       pid_t tty_old_pgrp;
+       pid_t session;
+       pid_t tgid;
+       /* boolean value for session group leader */
+       int leader;
+       /*
+        * pointers to (original) parent process, youngest child, younger sibling,
+        * older sibling, respectively.  (p->father can be replaced with
+        * p->p_pptr->pid)
+        */
+       struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+       struct list_head thread_group;
+
+       /* PID hash table linkage. */
+       struct task_struct *pidhash_next;
+       struct task_struct **pidhash_pprev;
+
+       wait_queue_head_t wait_chldexit;        /* for wait4() */
+       struct completion *vfork_done;          /* for vfork() */
+       unsigned long rt_priority;
+       unsigned long it_real_value, it_prof_value, it_virt_value;
+       unsigned long it_real_incr, it_prof_incr, it_virt_incr;
+       struct timer_list real_timer;
+       struct tms times;
+       unsigned long start_time;
+       long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+       unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+       int swappable:1;
+ /* process credentials */
+       uid_t uid,euid,suid,fsuid;
+       gid_t gid,egid,sgid,fsgid;
+       int ngroups;
+       gid_t   groups[NGROUPS];
+       kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
+       int keep_capabilities:1;
+       struct user_struct *user;
+ /* limits */
+       struct rlimit rlim[RLIM_NLIMITS];
+       unsigned short used_math;
+       char comm[16];
+ /* file system info */
+       int link_count, total_link_count;
+       struct tty_struct *tty; /* NULL if no tty */
+       unsigned int locks; /* How many file locks are being held */
+ /* ipc stuff */
+       struct sem_undo *semundo;
+       struct sem_queue *semsleeping;
+ /* CPU-specific state of this task */
+       struct thread_struct thread;
+ /* filesystem information */
+       struct fs_struct *fs;
+ /* open file information */
+       struct files_struct *files;
+ /* signal handlers */
+       spinlock_t sigmask_lock;        /* Protects signal and blocked */
+       struct signal_struct *sig;
+
+       sigset_t blocked;
+       struct sigpending pending;
+
+       unsigned long sas_ss_sp;
+       size_t sas_ss_size;
+       int (*notifier)(void *priv);
+       void *notifier_data;
+       sigset_t *notifier_mask;
+
+ /* Thread group tracking */
+       u32 parent_exec_id;
+       u32 self_exec_id;
+ /* Protection of (de-)allocation: mm, files, fs, tty */
+       spinlock_t alloc_lock;
+ /* Field to make virtual server running in chroot more  isolated */
+       int s_context;  /* Process can only deal with other processes */
+                       /* with the same s_context */
+       __u32 cap_bset; /* Maximum capability of this process and children */
+       unsigned long ipv4root; /* Process can only bind to this iP */
+       struct context_info *s_info;
+ };
+
+ /*
+  * Per process flags
+  */
+ #define PF_ALIGNWARN  0x00000001      /* Print alignment warning msgs */
+                                       /* Not implemented yet, only for 486*/
+ #define PF_STARTING   0x00000002      /* being created */
+ #define PF_EXITING    0x00000004      /* getting shut down */
+ #define PF_FORKNOEXEC 0x00000040      /* forked but didn't exec */
+ #define PF_SUPERPRIV  0x00000100      /* used super-user privileges */
+ #define PF_DUMPCORE   0x00000200      /* dumped core */
+ #define PF_SIGNALED   0x00000400      /* killed by a signal */
+ #define PF_MEMALLOC   0x00000800      /* Allocating memory */
+ #define PF_FREE_PAGES 0x00002000      /* per process page freeing */
+
+ #define PF_USEDFPU    0x00100000      /* task used FPU this quantum (SMP) */
+
+ /*
+  * Ptrace flags
+  */
+
+ #define PT_PTRACED    0x00000001
+ #define PT_TRACESYS   0x00000002
+ #define PT_DTRACE     0x00000004      /* delayed trace (used on m68k, i386) */
+ #define PT_TRACESYSGOOD       0x00000008
+ #define PT_PTRACE_CAP 0x00000010      /* ptracer can follow suid-exec */
+
+ /*
+  * Limit the stack by to some sane default: root can always
+  * increase this limit if needed..  8MB seems reasonable.
+  */
+ #define _STK_LIM      (8*1024*1024)
+
+ #define DEF_COUNTER   (10*HZ/100)     /* 100 ms time slice */
+ #define MAX_COUNTER   (20*HZ/100)
+ #define DEF_NICE      (0)
+
+
+ /*
+  * The default (Linux) execution domain.
+  */
+ extern struct exec_domain     default_exec_domain;
+
+ /*
+  *  INIT_TASK is used to set up the first task table, touch at
+  * your own risk!. Base=0, limit=0x1fffff (=2MB)
+  */
+ #define INIT_TASK(tsk)        \
+ {                                                                     \
+     state:            0,                                              \
+     flags:            0,                                              \
+     sigpending:               0,                                              \
+     addr_limit:               KERNEL_DS,                                      \
+     exec_domain:      &default_exec_domain,                           \
+     lock_depth:               -1,                                             \
+     counter:          DEF_COUNTER,                                    \
+     nice:             DEF_NICE,                                       \
+     policy:           SCHED_OTHER,                                    \
+     mm:                       NULL,                                           \
+     active_mm:                &init_mm,                                       \
+     cpus_allowed:     -1,                                             \
+     run_list:         LIST_HEAD_INIT(tsk.run_list),                   \
+     next_task:                &tsk,                                           \
+     prev_task:                &tsk,                                           \
+     p_opptr:          &tsk,                                           \
+     p_pptr:           &tsk,                                           \
+     thread_group:     LIST_HEAD_INIT(tsk.thread_group),               \
+     wait_chldexit:    __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
+     real_timer:               {                                               \
+       function:               it_real_fn                              \
+     },                                                                        \
+     cap_effective:    CAP_INIT_EFF_SET,                               \
+     cap_inheritable:  CAP_INIT_INH_SET,                               \
+     cap_permitted:    CAP_FULL_SET,                                   \
+     keep_capabilities:        0,                                              \
+     rlim:             INIT_RLIMITS,                                   \
+     user:             INIT_USER,                                      \
+     comm:             "swapper",                                      \
+     thread:           INIT_THREAD,                                    \
+     fs:                       &init_fs,                                       \
+     files:            &init_files,                                    \
+     sigmask_lock:     SPIN_LOCK_UNLOCKED,                             \
+     sig:              &init_signals,                                  \
+     pending:          { NULL, &tsk.pending.head, {{0}}},              \
+     blocked:          {{0}},                                          \
+     alloc_lock:               SPIN_LOCK_UNLOCKED,                             \
+     cap_bset:         CAP_INIT_EFF_SET,                               \
+ }
+
+
+ #ifndef INIT_TASK_SIZE
+ # define INIT_TASK_SIZE       2048*sizeof(long)
+ #endif
+
+ union task_union {
+       struct task_struct task;
+       unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
+ };
+
+ extern union task_union init_task_union;
+
+ extern struct   mm_struct init_mm;
+ extern struct task_struct *init_tasks[NR_CPUS];
+
+ /* PID hashing. (shouldnt this be dynamic?) */
+ #define PIDHASH_SZ (4096 >> 2)
+ extern struct task_struct *pidhash[PIDHASH_SZ];
+
+ #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
+
+ static inline void hash_pid(struct task_struct *p)
+ {
+       struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
+
+       if((p->pidhash_next = *htable) != NULL)
+               (*htable)->pidhash_pprev = &p->pidhash_next;
+       *htable = p;
+       p->pidhash_pprev = htable;
+ }
+
+ static inline void unhash_pid(struct task_struct *p)
+ {
+       if(p->pidhash_next)
+               p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
+       *p->pidhash_pprev = p->pidhash_next;
+ }
+
+ static inline struct task_struct *find_task_by_pid(int pid)
+ {
+       struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
+
+       for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
+               ;
+
+       return p;
+ }
+
+ /* per-UID process charging. */
+ extern struct user_struct * alloc_uid(uid_t);
+ extern void free_uid(struct user_struct *);
+
+ #include <asm/current.h>
+
+ extern unsigned long volatile jiffies;
+ extern unsigned long itimer_ticks;
+ extern unsigned long itimer_next;
+ extern struct timeval xtime;
+ extern void do_timer(struct pt_regs *);
+
+ extern unsigned int * prof_buffer;
+ extern unsigned long prof_len;
+ extern unsigned long prof_shift;
+
+ #define CURRENT_TIME (xtime.tv_sec)
+
+ extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
+ extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
+ extern void FASTCALL(sleep_on(wait_queue_head_t *q));
+ extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
+                                     signed long timeout));
+ extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
+ extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
+                                                   signed long timeout));
+ extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+
+ #define wake_up(x)                    __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+ #define wake_up_nr(x, nr)             __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+ #define wake_up_all(x)                        __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
+ #define wake_up_sync(x)                       __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+ #define wake_up_sync_nr(x, nr)                __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+ #define wake_up_interruptible(x)      __wake_up((x),TASK_INTERRUPTIBLE, 1)
+ #define wake_up_interruptible_nr(x, nr)       __wake_up((x),TASK_INTERRUPTIBLE, nr)
+ #define wake_up_interruptible_all(x)  __wake_up((x),TASK_INTERRUPTIBLE, 0)
+ #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+ #define wake_up_interruptible_sync_nr(x) __wake_up_sync((x),TASK_INTERRUPTIBLE,  nr)
+ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
+
+ extern int in_group_p(gid_t);
+ extern int in_egroup_p(gid_t);
+
+ extern void proc_caches_init(void);
+ extern void flush_signals(struct task_struct *);
+ extern void flush_signal_handlers(struct task_struct *);
+ extern int dequeue_signal(sigset_t *, siginfo_t *);
+ extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+                             sigset_t *mask);
+ extern void unblock_all_signals(void);
+ extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+ extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+ extern int kill_pg_info(int, struct siginfo *, pid_t);
+ extern int kill_sl_info(int, struct siginfo *, pid_t);
+ extern int kill_proc_info(int, struct siginfo *, pid_t);
+ extern void notify_parent(struct task_struct *, int);
+ extern void do_notify_parent(struct task_struct *, int);
+ extern void force_sig(int, struct task_struct *);
+ extern int send_sig(int, struct task_struct *, int);
+ extern int kill_pg(pid_t, int, int);
+ extern int kill_sl(pid_t, int, int);
+ extern int kill_proc(pid_t, int, int);
+ extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
+ extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
+
+ static inline int signal_pending(struct task_struct *p)
+ {
+       return (p->sigpending != 0);
+ }
+
+ /*
+  * Re-calculate pending state from the set of locally pending
+  * signals, globally pending signals, and blocked signals.
+  */
+ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+ {
+       unsigned long ready;
+       long i;
+
+       switch (_NSIG_WORDS) {
+       default:
+               for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+                       ready |= signal->sig[i] &~ blocked->sig[i];
+               break;
+
+       case 4: ready  = signal->sig[3] &~ blocked->sig[3];
+               ready |= signal->sig[2] &~ blocked->sig[2];
+               ready |= signal->sig[1] &~ blocked->sig[1];
+               ready |= signal->sig[0] &~ blocked->sig[0];
+               break;
+
+       case 2: ready  = signal->sig[1] &~ blocked->sig[1];
+               ready |= signal->sig[0] &~ blocked->sig[0];
+               break;
+
+       case 1: ready  = signal->sig[0] &~ blocked->sig[0];
+       }
+       return ready != 0;
+ }
+
+ /* Reevaluate whether the task has signals pending delivery.
+    This is required every time the blocked sigset_t changes.
+    All callers should have t->sigmask_lock.  */
+
+ static inline void recalc_sigpending(struct task_struct *t)
+ {
+       t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+ }
+
+ /* True if we are on the alternate signal stack.  */
+
+ static inline int on_sig_stack(unsigned long sp)
+ {
+       return (sp - current->sas_ss_sp < current->sas_ss_size);
+ }
+
+ static inline int sas_ss_flags(unsigned long sp)
+ {
+       return (current->sas_ss_size == 0 ? SS_DISABLE
+               : on_sig_stack(sp) ? SS_ONSTACK : 0);
+ }
+
+ extern int request_irq(unsigned int,
+                      void (*handler)(int, void *, struct pt_regs *),
+                      unsigned long, const char *, void *);
+ extern void free_irq(unsigned int, void *);
+
+ /*
+  * This has now become a routine instead of a macro, it sets a flag if
+  * it returns true (to do BSD-style accounting where the process is flagged
+  * if it uses root privs). The implication of this is that you should do
+  * normal permissions checks first, and check suser() last.
+  *
+  * [Dec 1997 -- Chris Evans]
+  * For correctness, the above considerations need to be extended to
+  * fsuser(). This is done, along with moving fsuser() checks to be
+  * last.
+  *
+  * These will be removed, but in the mean time, when the SECURE_NOROOT
+  * flag is set, uids don't grant privilege.
+  */
+ static inline int suser(void)
+ {
+       if (!issecure(SECURE_NOROOT) && current->euid == 0) {
+               current->flags |= PF_SUPERPRIV;
+               return 1;
+       }
+       return 0;
+ }
+
+ static inline int fsuser(void)
+ {
+       if (!issecure(SECURE_NOROOT) && current->fsuid == 0) {
+               current->flags |= PF_SUPERPRIV;
+               return 1;
+       }
+       return 0;
+ }
+
+ /*
+  * capable() checks for a particular capability.
+  * New privilege checks should use this interface, rather than suser() or
+  * fsuser(). See include/linux/capability.h for defined capabilities.
+  */
+
+ static inline int capable(int cap)
+ {
+ #if 1 /* ok now */
+       if (cap_raised(current->cap_effective, cap))
+ #else
+       if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0)
+ #endif
+       {
+               current->flags |= PF_SUPERPRIV;
+               return 1;
+       }
+       return 0;
+ }
+
+ /*
+  * Routines for handling mm_structs
+  */
+ extern struct mm_struct * mm_alloc(void);
+
+ extern struct mm_struct * start_lazy_tlb(void);
+ extern void end_lazy_tlb(struct mm_struct *mm);
+
+ /* mmdrop drops the mm and the page tables */
+ extern inline void FASTCALL(__mmdrop(struct mm_struct *));
+ static inline void mmdrop(struct mm_struct * mm)
+ {
+       if (atomic_dec_and_test(&mm->mm_count))
+               __mmdrop(mm);
+ }
+
+ /* mmput gets rid of the mappings and all user-space */
+ extern void mmput(struct mm_struct *);
+ /* Remove the current tasks stale references to the old mm_struct */
+ extern void mm_release(void);
+
+ /*
+  * Routines for handling the fd arrays
+  */
+ extern struct file ** alloc_fd_array(int);
+ extern int expand_fd_array(struct files_struct *, int nr);
+ extern void free_fd_array(struct file **, int);
+
+ extern fd_set *alloc_fdset(int);
+ extern int expand_fdset(struct files_struct *, int nr);
+ extern void free_fdset(fd_set *, int);
+
+ extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+ extern void flush_thread(void);
+ extern void exit_thread(void);
+
+ extern void exit_mm(struct task_struct *);
+ extern void exit_files(struct task_struct *);
+ extern void exit_sighand(struct task_struct *);
+
+ extern void reparent_to_init(void);
+ extern void daemonize(void);
+
+ extern int do_execve(char *, char **, char **, struct pt_regs *);
+ extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
+
+ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+ extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+
+ #define __wait_event(wq, condition)                                   \
+ do {                                                                  \
+       wait_queue_t __wait;                                            \
+       init_waitqueue_entry(&__wait, current);                         \
+                                                                       \
+       add_wait_queue(&wq, &__wait);                                   \
+       for (;;) {                                                      \
+               set_current_state(TASK_UNINTERRUPTIBLE);                \
+               if (condition)                                          \
+                       break;                                          \
+               schedule();                                             \
+       }                                                               \
+       current->state = TASK_RUNNING;                                  \
+       remove_wait_queue(&wq, &__wait);                                \
+ } while (0)
+
+ #define wait_event(wq, condition)                                     \
+ do {                                                                  \
+       if (condition)                                                  \
+               break;                                                  \
+       __wait_event(wq, condition);                                    \
+ } while (0)
+
+ #define __wait_event_interruptible(wq, condition, ret)                        \
+ do {                                                                  \
+       wait_queue_t __wait;                                            \
+       init_waitqueue_entry(&__wait, current);                         \
+                                                                       \
+       add_wait_queue(&wq, &__wait);                                   \
+       for (;;) {                                                      \
+               set_current_state(TASK_INTERRUPTIBLE);                  \
+               if (condition)                                          \
+                       break;                                          \
+               if (!signal_pending(current)) {                         \
+                       schedule();                                     \
+                       continue;                                       \
+               }                                                       \
+               ret = -ERESTARTSYS;                                     \
+               break;                                                  \
+       }                                                               \
+       current->state = TASK_RUNNING;                                  \
+       remove_wait_queue(&wq, &__wait);                                \
+ } while (0)
+
+ #define wait_event_interruptible(wq, condition)                               \
+ ({                                                                    \
+       int __ret = 0;                                                  \
+       if (!(condition))                                               \
+               __wait_event_interruptible(wq, condition, __ret);       \
+       __ret;                                                          \
+ })
+
+ #define REMOVE_LINKS(p) do { \
+       (p)->next_task->prev_task = (p)->prev_task; \
+       (p)->prev_task->next_task = (p)->next_task; \
+       if ((p)->p_osptr) \
+               (p)->p_osptr->p_ysptr = (p)->p_ysptr; \
+       if ((p)->p_ysptr) \
+               (p)->p_ysptr->p_osptr = (p)->p_osptr; \
+       else \
+               (p)->p_pptr->p_cptr = (p)->p_osptr; \
+       } while (0)
+
+ #define SET_LINKS(p) do { \
+       (p)->next_task = &init_task; \
+       (p)->prev_task = init_task.prev_task; \
+       init_task.prev_task->next_task = (p); \
+       init_task.prev_task = (p); \
+       (p)->p_ysptr = NULL; \
+       if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
+               (p)->p_osptr->p_ysptr = p; \
+       (p)->p_pptr->p_cptr = p; \
+       } while (0)
+
+ #define for_each_task(p) \
+       for (p = &init_task ; (p = p->next_task) != &init_task ; )
+
+ #define next_thread(p) \
+       list_entry((p)->thread_group.next, struct task_struct, thread_group)
+
+ static inline void del_from_runqueue(struct task_struct * p)
+ {
+       nr_running--;
+       p->sleep_time = jiffies;
+       list_del(&p->run_list);
+       p->run_list.next = NULL;
+ }
+
+ static inline int task_on_runqueue(struct task_struct *p)
+ {
+       return (p->run_list.next != NULL);
+ }
+
+ static inline void unhash_process(struct task_struct *p)
+ {
+       if (task_on_runqueue(p)) BUG();
+       write_lock_irq(&tasklist_lock);
+       nr_threads--;
+       unhash_pid(p);
+       REMOVE_LINKS(p);
+       list_del(&p->thread_group);
+       write_unlock_irq(&tasklist_lock);
+ }
+
+ /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
+ static inline void task_lock(struct task_struct *p)
+ {
+       spin_lock(&p->alloc_lock);
+ }
+
+ static inline void task_unlock(struct task_struct *p)
+ {
+       spin_unlock(&p->alloc_lock);
+ }
+
+ /* write full pathname into buffer and return start of pathname */
+ static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
+                               char *buf, int buflen)
+ {
+       char *res;
+       struct vfsmount *rootmnt;
+       struct dentry *root;
+       read_lock(&current->fs->lock);
+       rootmnt = mntget(current->fs->rootmnt);
+       root = dget(current->fs->root);
+       read_unlock(&current->fs->lock);
+       spin_lock(&dcache_lock);
+       res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen);
+       spin_unlock(&dcache_lock);
+       dput(root);
+       mntput(rootmnt);
+       return res;
+ }
+
+ /* Manage the reference count of the context_info pointer */
+ void sys_release_s_info (struct task_struct *);
+ void sys_assign_s_info (struct task_struct *);
+ void sys_alloc_s_info (void);
+
+ #endif /* __KERNEL__ */
+
+ #endif
diff -rc2P linux/kernel/sysctl.c linux-2.4.13/kernel/sysctl.c
*** linux/kernel/sysctl.c       Fri Nov  9 16:15:08 2001
--- linux-2.4.13/kernel/sysctl.c        Fri Nov  9 16:58:00 2001
***************
*** 30,33 ****
--- 30,35 ----
 #include <linux/init.h>
 #include <linux/sysrq.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
 #include <linux/highuid.h>

***************
*** 303,306 ****
--- 305,316 ----
       {FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int),
        0644, NULL, &proc_dointvec},
+ #ifdef CONFIG_JBD_DEBUG
+       {FS_LEASE_TIME+1, "jbd-debug", &journal_enable_debug, sizeof (int),
+        0644, NULL, &proc_dointvec},
+ #endif
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+       {FS_LEASE_TIME+2, "jbd-oom-retry", &journal_oom_retry, sizeof (int),
+        0644, NULL, &proc_dointvec},
+ #endif
       {0}
 };
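
Once these entries are registered, the knobs appear as /proc/sys/fs/jbd-debug and /proc/sys/fs/jbd-oom-retry, so "echo 2 > /proc/sys/fs/jbd-debug" raises the JBD debug level exactly as the CONFIG_JBD_DEBUG help text describes. As a sketch of the same mechanism under stated assumptions (all "my_" names are hypothetical, and FS_LEASE_TIME+3 merely continues the numbering used above), a module could export its own integer through the 2.4 sysctl API like this:

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sysctl.h>

static int my_debug;		/* appears as /proc/sys/fs/my-debug */

static ctl_table my_fs_table[] = {
	{FS_LEASE_TIME+3, "my-debug", &my_debug, sizeof(int),
	 0644, NULL, &proc_dointvec},
	{0}
};
static ctl_table my_root_table[] = {
	{CTL_FS, "fs", NULL, 0, 0555, my_fs_table},
	{0}
};
static struct ctl_table_header *my_header;

static int __init my_sysctl_init(void)
{
	my_header = register_sysctl_table(my_root_table, 0);
	return my_header ? 0 : -ENOMEM;
}

static void __exit my_sysctl_exit(void)
{
	unregister_sysctl_table(my_header);
}
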
diff -rc2P linux/mm/filemap.c linux-2.4.13/mm/filemap.c
*** linux/mm/filemap.c  Tue Oct 23 20:52:48 2001
--- linux-2.4.13/mm/filemap.c   Fri Nov  9 16:58:00 2001
***************
*** 201,211 ****
 }

 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
       memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-
       if (page->buffers)
!               block_flushpage(page, partial);
!
 }

--- 201,218 ----
 }

+ static int do_flushpage(struct page *page, unsigned long offset)
+ {
+       int (*flushpage) (struct page *, unsigned long);
+       flushpage = page->mapping->a_ops->flushpage;
+       if (flushpage)
+               return (*flushpage)(page, offset);
+       return block_flushpage(page, offset);
+ }
+
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
       memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
       if (page->buffers)
!               do_flushpage(page, partial);
 }

***************
*** 213,217 ****
 {
       /* Leave it on the LRU if it gets converted into anonymous buffers */
!       if (!page->buffers || block_flushpage(page, 0))
               lru_cache_del(page);

--- 220,224 ----
 {
       /* Leave it on the LRU if it gets converted into anonymous buffers */
!       if (!page->buffers || do_flushpage(page, 0))
               lru_cache_del(page);

***************
*** 1119,1122 ****
--- 1126,1130 ----
 }

+
 /*
  * Mark a page as having seen activity.
***************
*** 2817,2821 ****
       err = written ? written : status;
 out:
-
       up(&inode->i_sem);
       return err;
--- 2825,2828 ----
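
do_flushpage() above adds one level of indirection: if the page's mapping supplies its own flushpage operation, truncation goes through it instead of the generic block_flushpage(). That is what lets a journalled filesystem keep its journal consistent across truncates. A hedged sketch of how such a hook could be wired up (the myfs_* names are illustrative, not part of this patch; the flushpage slot in address_space_operations is added elsewhere in the patch):

#include <linux/fs.h>
#include <linux/mm.h>

/* A journalling filesystem would tell its journal that these buffers
 * are being invalidated before falling back to the generic path. */
static int myfs_flushpage(struct page *page, unsigned long offset)
{
	/* ... journal-specific bookkeeping would go here ... */
	return block_flushpage(page, offset);
}

static struct address_space_operations myfs_aops = {
	/* readpage, writepage, etc. omitted */
	flushpage:	myfs_flushpage,
};
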
diff -rc2P linux/mm/memory.c linux-2.4.13/mm/memory.c
*** linux/mm/memory.c   Mon Oct 15 15:09:50 2001
--- linux-2.4.13/mm/memory.c    Fri Nov  9 16:58:00 2001
***************
*** 1243,1250 ****
       struct page * new_page;
       pte_t entry;
!
       if (!vma->vm_ops || !vma->vm_ops->nopage)
               return do_anonymous_page(mm, vma, page_table, write_access, address);
       spin_unlock(&mm->page_table_lock);

       /*
--- 1243,1256 ----
       struct page * new_page;
       pte_t entry;
!       int ret;
!       struct inode *inode = NULL;
!
       if (!vma->vm_ops || !vma->vm_ops->nopage)
               return do_anonymous_page(mm, vma, page_table, write_access, address);
       spin_unlock(&mm->page_table_lock);
+       if (vma->vm_file && vma->vm_file->f_dentry)
+               inode = vma->vm_file->f_dentry->d_inode;
+       if (inode)
+               down_read(&inode->i_truncate_sem);

       /*
***************
*** 1256,1263 ****

       spin_lock(&mm->page_table_lock);
!       if (new_page == NULL)   /* no page was available -- SIGBUS */
!               return 0;
!       if (new_page == NOPAGE_OOM)
!               return -1;
       /*
        * This silly early PAGE_DIRTY setting removes a race
--- 1262,1275 ----

       spin_lock(&mm->page_table_lock);
!       if (new_page == NULL) { /* no page was available -- SIGBUS */
!               ret = 0;
!               goto out;
!       }
!
!       if (new_page == NOPAGE_OOM) {
!               ret =  -1;
!               goto out;
!       }
!
       /*
        * This silly early PAGE_DIRTY setting removes a race
***************
*** 1285,1294 ****
               /* One of our sibling threads was faster, back out. */
               page_cache_release(new_page);
!               return 1;
       }

       /* no need to invalidate: a not-present page shouldn't be cached */
       update_mmu_cache(vma, address, entry);
!       return 2;       /* Major fault */
 }

--- 1297,1311 ----
               /* One of our sibling threads was faster, back out. */
               page_cache_release(new_page);
!               ret = 1;
!               goto out;
       }

       /* no need to invalidate: a not-present page shouldn't be cached */
       update_mmu_cache(vma, address, entry);
!       ret = 2;        /* Major fault */
! out:
!       if (inode)
!               up_read(&inode->i_truncate_sem);
!       return ret;
 }
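
The fault path now holds the inode's i_truncate_sem for reading across the ->nopage call, so a concurrent truncate cannot free the page between the moment ->nopage returns it and the moment it is mapped. The counterpart, sketched below on the assumption that i_truncate_sem is the rw_semaphore this patch adds to struct inode, is that the truncate path takes the same semaphore for writing:

#include <linux/fs.h>
#include <linux/mm.h>

/* Writer side (sketch): exclude all faults while the file shrinks. */
static void truncate_pages_sketch(struct inode *inode, loff_t offset)
{
	down_write(&inode->i_truncate_sem);
	truncate_inode_pages(inode->i_mapping, offset);
	up_write(&inode->i_truncate_sem);
}
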

diff -rc2P linux/mm/vmscan.c linux-2.4.13/mm/vmscan.c
*** linux/mm/vmscan.c   Wed Oct 24 00:48:55 2001
--- linux-2.4.13/mm/vmscan.c    Fri Nov  9 16:58:00 2001
***************
*** 8,12 ****
  *  Removed kswapd_ctl limits, and swap out as many pages as needed
  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
-  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  *  Zone aware kswapd started 02/00, Kanoj Sarcar ([email protected]).
  *  Multiqueue VM started 5.8.00, Rik van Riel.
--- 8,11 ----
***************
*** 415,419 ****
                       page_cache_get(page);

!                       if (try_to_free_buffers(page, gfp_mask)) {
                               if (!page->mapping) {
                                       /*
--- 414,418 ----
                       page_cache_get(page);

!                       if (try_to_release_page(page, gfp_mask)) {
                               if (!page->mapping) {
                                       /*
***************
*** 436,440 ****
                                       /*
                                        * The page is still in pagecache so undo the stuff
!                                        * before the try_to_free_buffers since we've not
                                        * finished and we can now try the next step.
                                        */
--- 435,439 ----
                                       /*
                                        * The page is still in pagecache so undo the stuff
!                                        * before the try_to_release_page since we've not
                                        * finished and we can now try the next step.
                                        */
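
try_to_release_page() itself is defined elsewhere in this patch; from the call sites here its rough shape can be reconstructed as a dispatcher that gives the owning filesystem first refusal on freeing a page's buffers (so ext3 can refuse while the journal still needs them) and otherwise falls back to the old behaviour. A sketch along those lines, not copied from the patch:

#include <linux/fs.h>
#include <linux/mm.h>

int try_to_release_page(struct page *page, int gfp_mask)
{
	if (!PageLocked(page))
		BUG();

	/* Let the owning filesystem decide, if it cares... */
	if (page->mapping && page->mapping->a_ops->releasepage)
		return page->mapping->a_ops->releasepage(page, gfp_mask);
	/* ...otherwise fall back to generic buffer stripping. */
	return try_to_free_buffers(page, gfp_mask);
}
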