From 8f72fbdf0d92e6127583cc548bf043c60cd4720f Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Wed, 29 Oct 2008 17:13:08 -0400 Subject: [PATCH 01/57] ext4: fix printk format warning fs/ext4/balloc.c:607: warning: format '%lld' expects type 'long long int', but argument 2 has type 's64' fs/ext4/inode.c:1822: warning: format '%lld' expects type 'long long int', but argument 2 has type 's64' fs/ext4/inode.c:1824: warning: format '%lld' expects type 'long long int', but argument 2 has type 's64' Signed-off-by: Alexander Beregalov Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 2 +- fs/ext4/inode.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 38b3acf5683..152c390f3c3 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -614,7 +614,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) if (dirty_blocks < 0) { printk(KERN_CRIT "Dirty block accounting " "went wrong %lld\n", - dirty_blocks); + (long long)dirty_blocks); } } /* Check whether we have space after diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6702a49992a..5b088121686 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1831,9 +1831,9 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_count_free_blocks(inode->i_sb)); printk(KERN_EMERG "Free/Dirty block details\n"); printk(KERN_EMERG "free_blocks=%lld\n", - percpu_counter_sum(&sbi->s_freeblocks_counter)); + (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); printk(KERN_EMERG "dirty_blocks=%lld\n", - percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); printk(KERN_EMERG "Block reservation details\n"); printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", EXT4_I(inode)->i_reserved_data_blocks); From 5e1f8c9e20a92743eefc9a82c2db835213905e26 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 28 Oct 2008 13:21:55 -0400 Subject: [PATCH 02/57] ext3: Add support for non-native signed/unsigned htree hash algorithms The original ext3 hash algorithms assumed that variables of type char were signed, as God and K&R intended. Unfortunately, this assumption is not true on some architectures. Userspace support for marking filesystems with non-native signed/unsigned chars was added two years ago, but the kernel-side support was never added (until now). 
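As a standalone illustration of the problem described above (not part of the patch, and the names are purely illustrative): for any name byte >= 0x80, a plain "char" promotes to a negative int on signed-char architectures and to a large positive int where char is unsigned, so the multiply inside the legacy dx_hack_hash() is fed different values and the two architectures compute different directory hashes for the same name.

    #include <stdio.h>

    int main(void)
    {
            char c = (char) 0xe9;                   /* a name byte >= 0x80 */
            int native_c = c;                       /* -23 or 233, depending on the arch */
            int signed_c = (signed char) c;         /* always -23 */
            int unsigned_c = (unsigned char) c;     /* always 233 */

            /* 7152373 is the multiplier used by the legacy hash */
            printf("native=%d signed=%d unsigned=%d\n",
                   native_c * 7152373, signed_c * 7152373, unsigned_c * 7152373);
            return 0;
    }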
Signed-off-by: "Theodore Ts'o" Cc: akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org --- fs/ext3/hash.c | 81 ++++++++++++++++++++++++++++++++------ fs/ext3/namei.c | 7 ++++ fs/ext3/super.c | 12 ++++++ include/linux/ext3_fs.h | 28 ++++++++++++- include/linux/ext3_fs_sb.h | 1 + 5 files changed, 116 insertions(+), 13 deletions(-) diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index c30e149fbd2..7d215b4d4f2 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; - if (hash & 0x80000000) hash -= 0x7fffffff; + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i=0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { - case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct 
dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1dd2abe6313..287b304d42a 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -368,6 +368,8 @@ dx_probe(struct qstr *entry, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; if (entry) ext3fs_dirhash(entry->name, entry->len, hinfo); @@ -636,6 +638,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dir = dir_file->f_path.dentry->d_inode; if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -1398,6 +1403,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; ext3fs_dirhash(name, namelen, &hinfo); frame = frames; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f6c94f232ec..541d5e4f7f6 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1744,6 +1744,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) for (i=0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } if (sbi->s_blocks_per_group > blocksize * 8) { printk (KERN_ERR diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d14f0291848..9004794a35f 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -353,6 +353,13 @@ struct ext3_inode { #define EXT3_ERROR_FS 0x0002 /* Errors detected */ #define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + /* * Mount flags */ @@ -489,7 +496,23 @@ struct ext3_super_block { __u16 s_reserved_word_pad; __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ - __u32 s_reserved[190]; /* Padding to the end of the block */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -694,6 +717,9 @@ static inline __le16 ext3_rec_len_to_disk(unsigned len) #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 #ifdef __KERNEL__ diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index e024e38248f..a4e9216b3a6 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -57,6 +57,7 @@ struct ext3_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; From f99b25897a86fcfff9140396a97261ae65fed872 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 28 Oct 2008 13:21:44 -0400 Subject: [PATCH 03/57] ext4: Add support for non-native signed/unsigned htree hash algorithms The original ext3 hash algorithms assumed that variables of type char were signed, as God and K&R intended. Unfortunately, this assumption is not true on some architectures. Userspace support for marking filesystems with non-native signed/unsigned chars was added two years ago, but the kernel-side support was never added (until now). 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++ fs/ext4/ext4_sb.h | 1 + fs/ext4/hash.c | 81 ++++++++++++++++++++++++++++++++++++++++------- fs/ext4/namei.c | 7 ++++ fs/ext4/super.c | 12 +++++++ 5 files changed, 92 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0537c82702..8370ffd2d62 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -891,6 +891,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len) #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 #ifdef __KERNEL__ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index b21f16713db..ad7ea09baa7 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -57,6 +57,7 @@ struct ext4_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 556ca8eba3d..ac8f168c8ab 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -35,23 +35,43 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash(const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; - if (hash & 0x80000000) hash -= 0x7fffffff; + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const signed char *scp = (const signed char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +82,35 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + 
str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { - case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9fd2a5e1be4..315858db807 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -372,6 +372,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (d_name) ext4fs_dirhash(d_name->name, d_name->len, hinfo); @@ -641,6 +643,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, dir = dir_file->f_path.dentry->d_inode; if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -1408,6 +1413,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; ext4fs_dirhash(name, namelen, &hinfo); frame = frames; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04158ad74db..08fc86a358d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2118,6 +2118,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } if (sbi->s_blocks_per_group > blocksize * 8) { printk(KERN_ERR From 59e315b4c410b00a9acd0f24a00dbadbe81ce692 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 6 Dec 2008 16:58:39 -0500 Subject: [PATCH 04/57] ext3/4: Fix loop index in do_split() so it is signed This fixes a gcc warning but it doesn't appear able to result in a failure, since the primary way the loop is exited is the first conditional in the for loop, and at least for a consistent filesystem, the signed/unsigned 
mismatch should in practice never be exposed. Signed-off-by: Roel Kluin Signed-off-by: "Theodore Ts'o" --- fs/ext3/namei.c | 4 ++-- fs/ext4/namei.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 287b304d42a..2c2d700c1cc 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1161,9 +1161,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext3_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext3_append (handle, dir, &newblock, &err); if (!(bh2)) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 315858db807..84a68ae623c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1171,9 +1171,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext4_append (handle, dir, &newblock, &err); if (!(bh2)) { From 8e1a4857cd92e32e642b3e7184c7f6bf85c96e2e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 14:53:06 -0500 Subject: [PATCH 05/57] Update Documentation/filesystems/ext4.txt Fix paragraph with recommendations on how to tune ext4 for benchmarks. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 42 +++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 174eaff7ded..f75ab101c00 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be # mount -t ext4 /dev/hda1 /wherever - - When comparing performance with other filesystems, remember that - ext3/4 by default offers higher data integrity guarantees than most. - So when comparing with a metadata-only journalling filesystem, such - as ext3, use `mount -o data=writeback'. And you might as well use - `mount -o nobh' too along with it. Making the journal larger than - the mke2fs default often helps performance with metadata-intensive - workloads. + - When comparing performance with other filesystems, it's always + important to try multiple workloads; very often a subtle change in a + workload parameter can completely change the ranking of which + filesystems do well compared to others. When comparing versus ext3, + note that ext4 enables write barriers by default, while ext3 does + not enable write barriers by default. So it is useful to + explicitly specify whether barriers are enabled via the + '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems + for a fair comparison. When tuning ext3 for best benchmark numbers, + it is often worthwhile to try changing the data journaling mode; '-o + data=writeback,nobh' can be faster for some workloads. (Note + however that running mounted with data=writeback can potentially + leave stale data exposed in recently written files in case of an + unclean shutdown, which could be a security exposure in some + situations.) Configuring the filesystem with a large journal can + also be helpful for metadata-intensive workloads. 2.
Features =========== @@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be * ability to use filesystems > 16TB (e2fsprogs support not available yet) * extent format reduces metadata overhead (RAM, IO for access, transactions) * extent format more robust in face of on-disk corruption due to magics, -* internal redunancy in tree +* internal redundancy in tree * improved file allocation (multi-block alloc) * fix 32000 subdirectory limit * nsec timestamps for mtime, atime, ctime, create time @@ -116,6 +125,12 @@ grouping of bitmaps and inode tables. Some test results available here: When mounting an ext4 filesystem, the following option are accepted: (*) == default +ro Mount filesystem read only. Note that ext4 will + replay the journal (and thus write to the + partition) even when mounted "read only". The + mount options "ro,noload" can be used to prevent + writes to the filesystem. + extents (*) ext4 will use extents to address file data. The file system will no longer be mountable by ext3. @@ -144,7 +159,11 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. +noload Don't load the journal on mounting. Note that + if the filesystem was not unmounted cleanly, + skipping the journal replay will lead to the + filesystem containing inconsistencies that can + lead to any number of problems. data=journal All data are committed into the journal prior to being written into the main file system. @@ -219,9 +238,12 @@ minixdf Make 'df' act like Minix. debug Extra debugging information is sent to syslog. -errors=remount-ro(*) Remount the filesystem read-only on an error. +errors=remount-ro Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. + (These mount options override the errors behavior + specified in the superblock, which can be configured + using tune2fs) data_err=ignore(*) Just print an error message if an error occurs in a file data buffer in ordered mode. From 815a1130687ffac2c3e91513ce64aab629d6a54d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 1 Jan 2009 23:59:43 -0500 Subject: [PATCH 06/57] ext4: remove ext4_new_blocks() and call ext4_mb_new_blocks() directly There was only one caller of the compatibility function ext4_new_blocks(), in balloc.c's ext4_alloc_blocks(). Change it to call ext4_mb_new_blocks() directly, and remove ext4_new_blocks() altogether. This cleans up the code, by removing two extra functions from the call chain, and hopefully saving some stack usage. 
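The resulting call pattern in ext4_alloc_blocks() is easiest to see in isolation; the sketch below is a simplified rendering of the hunk that follows, using the same field names, not a drop-in function:

    static ext4_fsblk_t alloc_data_blocks_sketch(handle_t *handle,
                                                 struct inode *inode,
                                                 ext4_lblk_t iblock,
                                                 ext4_fsblk_t goal,
                                                 unsigned long *count, int *errp)
    {
            struct ext4_allocation_request ar;
            ext4_fsblk_t block;

            memset(&ar, 0, sizeof(ar));
            ar.inode   = inode;     /* file being extended */
            ar.goal    = goal;      /* preferred physical block */
            ar.len     = *count;    /* number of blocks wanted */
            ar.logical = iblock;    /* logical block within the file */
            if (S_ISREG(inode->i_mode))
                    /* in-core preallocation only for regular files */
                    ar.flags = EXT4_MB_HINT_DATA;

            block = ext4_mb_new_blocks(handle, &ar, errp);
            *count = ar.len;        /* blocks actually allocated */
            return block;
    }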
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 20 -------------------- fs/ext4/ext4.h | 3 --- fs/ext4/inode.c | 18 +++++++++++++----- 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 152c390f3c3..10ce275ebbf 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -741,26 +741,6 @@ ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, return ext4_new_meta_blocks(handle, inode, goal, &count, errp); } -/* - * ext4_new_blocks() -- allocate data blocks - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: total number of blocks need - * @errp: error code - * - * Return 1st allocated block numberon success, *count stores total account - * error stores in errp pointer - */ - -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp) -{ - return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); -} - /** * ext4_count_free_blocks() -- count filesystem free blocks * @sb: superblock diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8370ffd2d62..74cb395e689 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1002,9 +1002,6 @@ extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, int *errp); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b088121686..5120243024f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -547,6 +547,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, int indirect_blks, int blks, ext4_fsblk_t new_blocks[4], int *err) { + struct ext4_allocation_request ar; int target, i; unsigned long count = 0, blk_allocated = 0; int index = 0; @@ -595,10 +596,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (!target) goto allocated; /* Now allocate data blocks */ - count = target; - /* allocating blocks for data blocks */ - current_block = ext4_new_blocks(handle, inode, iblock, - goal, &count, err); + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (*err && (target == blks)) { /* * if the allocation failed and we didn't allocate @@ -614,7 +622,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, */ new_blocks[index] = current_block; } - blk_allocated += count; + blk_allocated += ar.len; } allocated: /* total number of blocks allocated for direct blocks */ From cfe82c856747b7841a3a00d591ce9ed46f579d27 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 7 Dec 2008 14:10:54 -0500 Subject: [PATCH 07/57] ext4: remove ext4_new_meta_block() There were only two one callers of the function ext4_new_meta_block(), which just a very simpler wrapper function around ext4_new_meta_blocks(). 
Change those two functions to call ext4_new_meta_blocks() directly, to save code and stack space usage. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 17 ----------------- fs/ext4/ext4.h | 2 -- fs/ext4/extents.c | 3 ++- fs/ext4/xattr.c | 5 +++-- 4 files changed, 5 insertions(+), 22 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 10ce275ebbf..98a97129fc5 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -724,23 +724,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, return ret; } -/* - * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @errp: error code - * - * Return allocated block number on success - */ -ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) -{ - unsigned long count = 1; - return ext4_new_meta_blocks(handle, inode, goal, &count, errp); -} - /** * ext4_count_free_blocks() -- count filesystem free blocks * @sb: superblock diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 74cb395e689..ac8551e0b70 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -998,8 +998,6 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea2ce3c0ae6..e5b169b44b4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -189,9 +189,10 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_extent *ex, int *err) { ext4_fsblk_t goal, newblock; + unsigned long count = 1; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_block(handle, inode, goal, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, &count, err); return newblock; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 80626d516fe..f896e2c452f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -689,6 +689,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { + unsigned long count = 1; struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search *s = &bs->s; @@ -810,8 +811,8 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_meta_block(handle, inode, - goal, &error); + ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, + goal, &count, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); From 97df5d155dee478efe33b001f502e9630e1bba92 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 12 Dec 2008 12:41:28 -0500 Subject: [PATCH 08/57] ext4: remove do_blk_alloc() The convenience function do_blk_alloc() is a static function with only one caller, so fold it into ext4_new_meta_blocks() to simplify the code and to make it easier to understand. 
To save more stack space, if count is a null pointer in ext4_new_meta_blocks() assume that caller wanted a single block (and if there is an error, no blocks were allocated). Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 49 +++++++++++++++-------------------------------- fs/ext4/extents.c | 3 +-- fs/ext4/xattr.c | 3 +-- 3 files changed, 17 insertions(+), 38 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 98a97129fc5..35f5f9a2772 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -666,59 +666,40 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); } -#define EXT4_META_BLOCK 0x1 - -static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp, int flags) -{ - struct ext4_allocation_request ar; - ext4_fsblk_t ret; - - memset(&ar, 0, sizeof(ar)); - /* Fill with neighbour allocated blocks */ - - ar.inode = inode; - ar.goal = goal; - ar.len = *count; - ar.logical = iblock; - - if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) - /* enable in-core preallocation for data block allocation */ - ar.flags = EXT4_MB_HINT_DATA; - else - /* disable in-core preallocation for non-regular files */ - ar.flags = 0; - - ret = ext4_mb_new_blocks(handle, &ar, errp); - *count = ar.len; - return ret; -} - /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: total number of blocks need + * @count: pointer to total number of blocks needed * @errp: error code * - * Return 1st allocated block numberon success, *count stores total account + * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { + struct ext4_allocation_request ar; ext4_fsblk_t ret; - ret = do_blk_alloc(handle, inode, 0, goal, - count, errp, EXT4_META_BLOCK); + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? 
*count : 1; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* * Account for the allocated meta blocks */ if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += *count; + EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } return ret; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e5b169b44b4..59401d057c6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -189,10 +189,9 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_extent *ex, int *err) { ext4_fsblk_t goal, newblock; - unsigned long count = 1; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_blocks(handle, inode, goal, &count, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); return newblock; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f896e2c452f..9b4a368c572 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -689,7 +689,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { - unsigned long count = 1; struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search *s = &bs->s; @@ -812,7 +811,7 @@ inserted: ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, - goal, &count, &error); + goal, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); From 2a21e37e48b94388f2cc8c0392f104f5443d4bb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Nov 2008 09:22:24 -0500 Subject: [PATCH 09/57] ext4: tone down ext4_da_writepages warnings If the filesystem has errors, ext4_da_writepages() will return a *lot* of errors, including lots and lots of stack dumps. While it's true that we are dropping user data on the floor, which is unfortunate, the stack dumps aren't helpful, and they tend to obscure the true original root cause of the problem. So in the case where the filesystem has aborted, return an EROFS right away. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5120243024f..ac97348f85b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2397,6 +2397,20 @@ static int ext4_da_writepages(struct address_space *mapping, */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_da_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. 
+ */ + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) + return -EROFS; + /* * Make sure nr_to_write is >= sbi->s_mb_stream_request * This make sure small files blocks are allocated in @@ -2441,7 +2455,7 @@ static int ext4_da_writepages(struct address_space *mapping, handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - printk(KERN_EMERG "%s: jbd2_start: " + printk(KERN_CRIT "%s: jbd2_start: " "%ld pages, ino %lu; err %d\n", __func__, wbc->nr_to_write, inode->i_ino, ret); dump_stack(); From 791b7f08954869d7b8ff438f3dac3cfb39778297 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:50:43 -0500 Subject: [PATCH 10/57] ext4: Fix the delalloc writepages to allocate blocks at the right offset. When iterating through the pages which have mapped buffer_heads, we failed to update the b_state value. This results in allocating blocks at logical offset 0. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/inode.c | 56 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ac97348f85b..c77a7ac753f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1653,35 +1653,39 @@ struct mpage_da_data { */ static int mpage_da_submit_io(struct mpage_da_data *mpd) { - struct address_space *mapping = mpd->inode->i_mapping; - int ret = 0, err, nr_pages, i; - unsigned long index, end; - struct pagevec pvec; long pages_skipped; + struct pagevec pvec; + unsigned long index, end; + int ret = 0, err, nr_pages, i; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; BUG_ON(mpd->next_page <= mpd->first_page); - pagevec_init(&pvec, 0); + /* + * We need to start from the first_page to the next_page - 1 + * to make sure we also write the mapped dirty buffer_heads. + * If we look at mpd->lbh.b_blocknr we would only be looking + * at the currently mapped buffer_heads. + */ index = mpd->first_page; end = mpd->next_page - 1; + pagevec_init(&pvec, 0); while (index <= end) { - /* - * We can use PAGECACHE_TAG_DIRTY lookup here because - * even though we have cleared the dirty flag on the page - * We still keep the page in the radix tree with tag - * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. - * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback - * which is called via the below writepage callback. - */ - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); if (!err && (pages_skipped == mpd->wbc->pages_skipped)) @@ -2095,11 +2099,29 @@ static int __mpage_da_writepage(struct page *page, bh = head; do { BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_da_writepage + */ if (buffer_dirty(bh) && (!buffer_mapped(bh) || buffer_delay(bh))) { mpage_add_bh_to_extent(mpd, logical, bh); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. 
We need to update * the b_state because we look at * b_state in mpage_da_map_blocks. We don't * update b_size because if we find an * unmapped buffer_head later we need to * use the b_state flag of that buffer_head. + */ + if (mpd->lbh.b_size == 0) + mpd->lbh.b_state = + bh->b_state & BH_FLAGS; } logical++; } while ((bh = bh->b_this_page) != head); From 565a9617b2151e21b22700e97a8b04e70e103153 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:51:07 -0500 Subject: [PATCH 11/57] ext4: avoid ext4_error when mounting a fs with a single bg Remove some completely unneeded code which caused an ext4_error to be generated when mounting a file system with only a single block group. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 08fc86a358d..81aed8b825a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1445,7 +1445,6 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; - __u64 block_bitmap = 0; int i; if (!sbi->s_es->s_log_groups_per_flex) { @@ -1468,9 +1467,6 @@ static int ext4_fill_flex_info(struct super_block *sb) goto failed; } - gdp = ext4_get_group_desc(sb, 1, &bh); - block_bitmap = ext4_block_bitmap(sb, gdp) - 1; - for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, &bh); From 25f1ee3aba17584ba4810da892175acab7fff9c8 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 25 Nov 2008 17:24:23 -0500 Subject: [PATCH 12/57] ext4: fix build warning Replace `if' with `goto' to assure gcc that ix has been initialized. Signed-off-by: Wu Fengguang --- fs/ext4/extents.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 59401d057c6..0917be51f10 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1160,15 +1160,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) - break; + goto got_index; } - if (depth < 0) { - /* we've gone up to the root and - * found no index to the right */ - return 0; - } + /* we've gone up to the root and found no index to the right */ + return 0; +got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ @@ -1201,7 +1199,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, *phys = ext_pblock(ex); put_bh(bh); return 0; - } /* From 171bbfbeab7730031eec8025341401fabe540bd5 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 25 Nov 2008 17:42:31 -0500 Subject: [PATCH 13/57] jbd2: Add BH_JBDPrivateStart Add this so that file systems using JBD2 can safely allocate unused b_state bits. In this case, we add it so that Ocfs2 can define a single bit for tracking the validation state of a buffer.
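To show what this enables (a sketch modeled on the ocfs2 use case above; the filesystem name and bit name are illustrative, not part of this patch), a jbd2 client can alias its first private flag to BH_JBDPrivateStart and let BUFFER_FNS() generate the usual accessors:

    #include <linux/buffer_head.h>
    #include <linux/jbd2.h>

    enum myfs_state_bits {
            BH_MyfsValidated = BH_JBDPrivateStart,  /* first bit free for FS use */
    };

    /* Generates set_buffer_myfs_validated(), clear_buffer_myfs_validated()
     * and buffer_myfs_validated(). */
    BUFFER_FNS(MyfsValidated, myfs_validated)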
Signed-off-by: Mark Fasheh Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index c7d106ef22e..f3664574548 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -329,6 +329,7 @@ enum jbd_state_bits { BH_State, /* Pins most journal_head state */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ + BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) From 23475e264c4f5c8b635a31924851287ead1ebe32 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 26 Nov 2008 02:23:19 -0500 Subject: [PATCH 14/57] ext4: Use simple_strtoul() instead of simple_strtol() in ext4_ui_proc_write Signed-off-by: Roel Kluin Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 81aed8b825a..8a0ae883f56 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3521,18 +3521,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file) static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, size_t cnt, loff_t *ppos) { - unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; + unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; char str[32]; - unsigned long value; if (cnt >= sizeof(str)) return -EINVAL; if (copy_from_user(str, buf, cnt)) return -EFAULT; - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - *p = value; + + *p = simple_strtoul(str, NULL, 0); return cnt; } From 93c0d86371a5b2e68473752a6e54ff03185c473e Mon Sep 17 00:00:00 2001 From: "Solofo.Ramangalahy@bull.net" <> Date: Wed, 26 Nov 2008 23:44:10 -0500 Subject: [PATCH 15/57] ext4: When resizing set the EXT4_BG_INODE_ZEROED flag for new block groups The inode table has been zeroed in setup_new_group_blocks(). Mark it as such in ext4_group_add(). Since we are currently clearing the inode table for the new block group, we should set the EXT4_BG_INODE_ZEROED flag. If at some point in the future we don't immediately zero out the inode table as part of the resize operation, then obviously we shouldn't do this. Signed-off-by: Solofo.Ramangalahy@bull.net Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6ec1843a01..d448eb1d9ba 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -864,6 +864,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* From ff7ef329b268b603ea4a2303241ef1c3829fd574 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Wed, 17 Dec 2008 00:48:39 -0500 Subject: [PATCH 16/57] ext4: Widen type of ext4_sb_info.s_mb_maxs[] I chased the cause of the following ext4 oops report, which was tested on an ia64 box. http://bugzilla.kernel.org/show_bug.cgi?id=12018 The cause is the size of the s_mb_maxs array, which is defined as "unsigned short" in the ext4_sb_info structure. If the file system's block size is 8k or greater, an unsigned short is not wide enough to contain the value fs->blocksize << 3.
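The arithmetic is easy to check in isolation (userspace sketch, not kernel code): with an 8k block size, blocksize << 3 is 65536, one more than USHRT_MAX, so an unsigned short silently wraps it to 0.

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
            unsigned int blocksize = 8192;          /* 8k block size */
            unsigned short narrow = blocksize << 3; /* wraps to 0 */
            unsigned int wide = blocksize << 3;     /* 65536, as intended */

            printf("USHRT_MAX=%u narrow=%u wide=%u\n",
                   (unsigned int) USHRT_MAX, narrow, wide);
            return 0;
    }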
Signed-off-by: Yasunori Goto Signed-off-by: "Theodore Ts'o" Cc: Li Zefan Cc: Miao Xie Cc: stable@kernel.org --- fs/ext4/ext4_sb.h | 3 ++- fs/ext4/mballoc.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index ad7ea09baa7..3db800f399a 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -102,7 +102,8 @@ struct ext4_sb_info { spinlock_t s_reserve_lock; spinlock_t s_md_lock; tid_t s_last_transaction; - unsigned short *s_mb_offsets, *s_mb_maxs; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; /* tunables */ unsigned long s_stripe; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 444ad998f72..7beab7141dd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2493,6 +2493,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_mb_offsets == NULL) { return -ENOMEM; } + + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { kfree(sbi->s_mb_maxs); From 0390131ba84fd3f726f9e24fc4553828125700bb Mon Sep 17 00:00:00 2001 From: Frank Mayhar Date: Wed, 7 Jan 2009 00:06:22 -0500 Subject: [PATCH 17/57] ext4: Allow ext4 to run without a journal A few weeks ago I posted a patch for discussion that allowed ext4 to run without a journal. Since that time I've integrated the excellent comments from Andreas and fixed several serious bugs. We're currently running with this patch and generating some performance numbers against both ext2 (with backported reservations code) and ext4 with and without a journal. It just so happens that running without a journal is slightly faster for most everything. We did iozone -T -t 4 s 2g -r 256k -T -I -i0 -i1 -i2 which creates 4 threads, each of which create and do reads and writes on a 2G file, with a buffer size of 256K, using O_DIRECT for all file opens to bypass the page cache. Results:

                     ext2          ext4, default   ext4, no journal
    initial writes   13.0 MB/s     15.4 MB/s       15.7 MB/s
    rewrites         13.1 MB/s     15.6 MB/s       15.9 MB/s
    reads            15.2 MB/s     16.9 MB/s       17.2 MB/s
    re-reads         15.3 MB/s     16.9 MB/s       17.2 MB/s
    random readers    5.6 MB/s      5.6 MB/s        5.7 MB/s
    random writers    5.1 MB/s      5.3 MB/s        5.4 MB/s

So it seems that, so far, this was a useful exercise.
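The mechanism that makes this possible shows up throughout the diff below: when there is no journal, the handle passed around is a sentinel pseudo-value, and every jbd2 call is guarded by a validity check. In miniature (a sketch of the pattern only, not the actual helpers in fs/ext4/ext4_jbd2.[ch]):

    #define EXT4_NOJOURNAL_HANDLE   ((handle_t *) 0x1)

    static inline int ext4_handle_valid(handle_t *handle)
    {
            return handle != EXT4_NOJOURNAL_HANDLE;
    }

    static int dirty_metadata_sketch(handle_t *handle, struct buffer_head *bh)
    {
            if (ext4_handle_valid(handle))          /* journaled path */
                    return jbd2_journal_dirty_metadata(handle, bh);
            mark_buffer_dirty(bh);                  /* no journal: just dirty the buffer */
            return 0;
    }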
Signed-off-by: Frank Mayhar Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 4 +- fs/ext4/ext4_jbd2.c | 83 +++++++++++++----- fs/ext4/ext4_jbd2.h | 83 ++++++++++++++---- fs/ext4/extents.c | 12 +-- fs/ext4/ialloc.c | 25 +++--- fs/ext4/inode.c | 130 +++++++++++++++++++--------- fs/ext4/ioctl.c | 2 +- fs/ext4/mballoc.c | 17 ++-- fs/ext4/migrate.c | 5 +- fs/ext4/namei.c | 56 ++++++------ fs/ext4/resize.c | 31 +++---- fs/ext4/super.c | 207 ++++++++++++++++++++++++++++++-------------- fs/ext4/xattr.c | 21 +++-- 13 files changed, 452 insertions(+), 224 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 35f5f9a2772..31ebeb5e7b0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -531,11 +531,11 @@ do_more: /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; *pdquot_freed_blocks += group_freed; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index c75384b34f2..ad13a84644e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -7,53 +7,96 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_undo_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_undo_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_write_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_revoke(const char *where, handle_t *handle, ext4_fsblk_t blocknr, struct buffer_head *bh) { - int err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } -int __ext4_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head 
*bh) +int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, + struct inode *inode, struct buffer_head *bh) { - int err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } else { + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long) bh->b_blocknr); + err = -EIO; + } + } + } return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b455c685a98..663197adae5 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); * been done yet. */ -static inline void ext4_journal_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - jbd2_journal_release_buffer(handle, bh); -} - void ext4_journal_abort_handle(const char *caller, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); @@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh); -int __ext4_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh); +int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, + struct inode *inode, struct buffer_head *bh); #define ext4_journal_get_undo_access(handle, bh) \ __ext4_journal_get_undo_access(__func__, (handle), (bh)) @@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where, __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) -#define ext4_journal_dirty_metadata(handle, bh) \ - __ext4_journal_dirty_metadata(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ __ext4_journal_forget(__func__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); +#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) + +static inline int ext4_handle_valid(handle_t *handle) +{ + if (handle == EXT4_NOJOURNAL_HANDLE) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +static inline void ext4_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) { 
return ext4_journal_start_sb(inode->i_sb, nblocks); @@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void) static inline int ext4_journal_extend(handle_t *handle, int nblocks) { - return jbd2_journal_extend(handle, nblocks); + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks) { - return jbd2_journal_restart(handle, nblocks); + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; } static inline int ext4_journal_blocks_per_page(struct inode *inode) { - return jbd2_journal_blocks_per_page(inode); + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { - return jbd2_journal_force_commit(journal); + if (journal) + return jbd2_journal_force_commit(journal); + return 0; } static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return 0; } /* super.c */ @@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb); static inline int ext4_should_journal_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 1; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode) static inline int ext4_should_order_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 0; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) @@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 0; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0917be51f10..743e3feb3e5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) { int err; + if (!ext4_handle_valid(handle)) + return 0; if (handle->h_buffer_credits > needed) return 0; err = ext4_journal_extend(handle, needed); @@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode, int err; if (path->p_bh) { /* path points to block */ - err = ext4_journal_dirty_metadata(handle, path->p_bh); + err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = 
ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; @@ -2947,7 +2949,7 @@ void ext4_ext_truncate(struct inode *inode) * transaction synchronous. */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); out_stop: up_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6e6052879aa..9dd21b75f4b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -253,12 +253,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) spin_unlock(sb_bgl_lock(sbi, flex_group)); } } - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bh2); if (!fatal) fatal = err; } - BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; sb->s_dirt = 1; @@ -656,15 +656,16 @@ repeat_in_this_group: ino, bitmap_bh->b_data)) { /* we won it */ BUFFER_TRACE(bitmap_bh, - "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, + "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, + inode, bitmap_bh); if (err) goto fail; goto got; } /* we lost it */ - jbd2_journal_release_buffer(handle, bitmap_bh); + ext4_handle_release_buffer(handle, bitmap_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -726,7 +727,8 @@ got: /* Don't need to dirty bitmap block if we didn't change it */ if (free) { BUFFER_TRACE(block_bh, "dirty block bitmap"); - err = ext4_journal_dirty_metadata(handle, block_bh); + err = ext4_handle_dirty_metadata(handle, + NULL, block_bh); } brelse(block_bh); @@ -771,8 +773,8 @@ got: } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bh2); if (err) goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); @@ -825,7 +827,7 @@ got: ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { err = -EINVAL; goto fail_drop; @@ -1028,4 +1030,3 @@ unsigned long ext4_count_dirs(struct super_block * sb) } return count; } - diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c77a7ac753f..45d0f70a1f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. + * + * If the handle isn't valid we're not journaling so there's nothing to do. 
*/ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; + if (!ext4_handle_valid(handle)) + return 0; + might_sleep(); BUFFER_TRACE(bh, "enter"); @@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode) */ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) { - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) return 0; @@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) */ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) { + BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); return ext4_journal_restart(handle, blocks_for_truncate(inode)); } @@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode) } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode) * enough credits left in the handle to remove the inode from * the orphan list and set the dtime field. */ - if (handle->h_buffer_credits < 3) { + if (!ext4_handle_has_enough_credits(handle, 3)) { err = ext4_journal_extend(handle, 3); if (err > 0) err = ext4_journal_restart(handle, 3); @@ -717,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto failed; } @@ -800,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 
*/ jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, where->bh); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); if (err) goto err_out; } else { @@ -1229,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); } unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (!fatal) fatal = err; } else { @@ -1395,7 +1403,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_journal_dirty_metadata(handle, bh); + return ext4_handle_dirty_metadata(handle, NULL, bh); } /* @@ -2762,7 +2770,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + BUG_ON(!EXT4_JOURNAL(inode) && + EXT4_I(inode)->i_state & EXT4_STATE_JDATA); + + if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -3033,7 +3044,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) if (offset == 0) ClearPageChecked(page); - jbd2_journal_invalidatepage(journal, page, offset); + if (journal) + jbd2_journal_invalidatepage(journal, page, offset); + else + block_invalidatepage(page, offset); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3043,7 +3057,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait) WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; - return jbd2_journal_try_to_free_buffers(journal, page, wait); + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, wait); + else + return try_to_free_buffers(page); } /* @@ -3315,7 +3332,7 @@ int ext4_block_truncate_page(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); } else { if (ext4_should_order_data(inode)) err = ext4_jbd2_file_inode(handle, inode); @@ -3439,8 +3456,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *p; if (try_to_extend_transaction(handle, inode)) { if (bh) { - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, inode, bh); } ext4_mark_inode_dirty(handle, inode); ext4_journal_test_restart(handle, inode); @@ -3540,7 +3557,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, count, block_to_free_p, p); if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this @@ -3549,7 +3566,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, * the block was cleared. Check for this instead of OOPSing. 
*/ if (bh2jh(this_bh)) - ext4_journal_dirty_metadata(handle, this_bh); + ext4_handle_dirty_metadata(handle, inode, this_bh); else ext4_error(inode->i_sb, __func__, "circular indirect block detected, " @@ -3579,7 +3596,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, ext4_fsblk_t nr; __le32 *p; - if (is_handle_aborted(handle)) + if (ext4_handle_is_aborted(handle)) return; if (depth--) { @@ -3649,7 +3666,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * will merely complain about releasing a free block, * rather than leaking blocks. */ - if (is_handle_aborted(handle)) + if (ext4_handle_is_aborted(handle)) return; if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); @@ -3668,9 +3685,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, parent_bh)){ *p = 0; BUFFER_TRACE(parent_bh, - "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, - parent_bh); + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); } } } @@ -3858,7 +3876,7 @@ do_indirects: * synchronous */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate(), and the file will remain alive @@ -4357,8 +4375,8 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); sb->s_dirt = 1; - handle->h_sync = 1; - err = ext4_journal_dirty_metadata(handle, + ext4_handle_sync(handle); + err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); } } @@ -4385,9 +4403,8 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } - - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - rc = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, inode, bh); if (!err) err = rc; ei->i_state &= ~EXT4_STATE_NEW; @@ -4450,6 +4467,25 @@ int ext4_write_inode(struct inode *inode, int wait) return ext4_force_commit(inode->i_sb); } +int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) +{ + int err = 0; + + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)bh->b_blocknr); + err = -EIO; + } + } + return err; +} + /* * ext4_setattr() * @@ -4754,16 +4790,15 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, iloc); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); - if (err) { - brelse(iloc->bh); - iloc->bh = NULL; - } + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; } } ext4_std_error(inode->i_sb, err); @@ -4835,7 +4870,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); err = ext4_reserve_inode_write(handle, inode, &iloc); - if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !(EXT4_I(inode)->i_state & 
EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block @@ -4887,6 +4923,11 @@ void ext4_dirty_inode(struct inode *inode) handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; + if (!ext4_handle_valid(current_handle)) { + ext4_mark_inode_dirty(current_handle, inode); + return; + } + handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; @@ -4924,8 +4965,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) BUFFER_TRACE(iloc.bh, "get_write_access"); err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) - err = ext4_journal_dirty_metadata(handle, - iloc.bh); + err = ext4_handle_dirty_metadata(handle, + inode, + iloc.bh); brelse(iloc.bh); } } @@ -4951,6 +4993,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; if (is_journal_aborted(journal)) return -EROFS; @@ -4980,7 +5024,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); err = ext4_mark_inode_dirty(handle, inode); - handle->h_sync = 1; + ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index dc99b4776d5..42dc83fb247 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto flags_out; } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7beab7141dd..edb512b2ec4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2553,7 +2553,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); - sbi->s_journal->j_commit_callback = release_blocks_on_commit; + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = release_blocks_on_commit; printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; @@ -2854,7 +2855,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; @@ -2901,10 +2902,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, spin_unlock(sb_bgl_lock(sbi, flex_group)); } - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; - err = ext4_journal_dirty_metadata(handle, gdp_bh); + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: sb->s_dirt = 1; @@ -4414,7 +4415,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; - + BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); @@ -4600,7 +4601,7 @@ do_more: /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (ac) { ac->ac_b_ex.fe_group = block_group; @@ -4609,7 +4610,7 @@ do_more: ext4_mb_store_history(ac); } - if (metadata) { + if (metadata && ext4_handle_valid(handle)) { /* blocks 
being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); @@ -4639,7 +4640,7 @@ do_more: /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f2a9cf498ec..e7cd488da4b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * Make sure the credit we accumalated is not really high */ - if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { retval = ext4_journal_restart(handle, needed); if (retval) goto err_out; @@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) { int retval = 0, needed; - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; /* * We are freeing a blocks. During this we touch diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 84a68ae623c..08873e938ab 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1233,10 +1233,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, de = de2; } dx_insert_block(frame, hash2 + continued, newblock); - err = ext4_journal_dirty_metadata(handle, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; - err = ext4_journal_dirty_metadata(handle, frame->bh); + err = ext4_handle_dirty_metadata(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); @@ -1340,8 +1340,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); brelse(bh); @@ -1581,7 +1581,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_journal_dirty_metadata(handle, bh2); + err = ext4_handle_dirty_metadata(handle, inode, bh2); if (err) goto journal_error; brelse (bh2); @@ -1607,7 +1607,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_journal_dirty_metadata(handle, frames[0].bh); + ext4_handle_dirty_metadata(handle, inode, frames[0].bh); } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1653,8 +1653,8 @@ static int ext4_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, bh); return 0; } i += ext4_rec_len_from_disk(de->rec_len); @@ -1732,7 +1732,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode (handle, dir, mode); err = PTR_ERR(inode); @@ -1766,7 +1766,7 @@ retry: return PTR_ERR(handle); if 
(IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, mode); err = PTR_ERR(inode); @@ -1802,7 +1802,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode); err = PTR_ERR(inode); @@ -1831,8 +1831,8 @@ retry: strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; - BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_block); + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, dir_block); brelse(dir_block); ext4_mark_inode_dirty(handle, inode); err = ext4_add_entry(handle, dentry, inode); @@ -1944,6 +1944,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0, rc; + if (!ext4_handle_valid(handle)) + return 0; + lock_super(sb); if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; @@ -1972,7 +1975,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -2010,6 +2013,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; + if (!ext4_handle_valid(handle)) + return 0; + lock_super(inode->i_sb); if (list_empty(&ei->i_orphan)) { unlock_super(inode->i_sb); @@ -2028,7 +2034,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. 
*/ - if (!handle) + if (sbi->s_journal && !handle) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); @@ -2042,7 +2048,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = @@ -2093,7 +2099,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = dentry->d_inode; @@ -2147,7 +2153,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de); @@ -2204,7 +2210,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); @@ -2267,7 +2273,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); @@ -2316,7 +2322,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, return PTR_ERR(handle); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); /* @@ -2370,8 +2376,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, new_bh); + BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, new_dir, new_bh); brelse(new_bh); new_bh = NULL; } @@ -2421,8 +2427,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, BUFFER_TRACE(dir_bh, "get_write_access"); ext4_journal_get_write_access(handle, dir_bh); PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_bh); + BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, old_dir, dir_bh); ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index d448eb1d9ba..1665aa131d1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, { int err; - if (handle->h_buffer_credits >= thresh) + if (ext4_handle_has_enough_credits(handle, thresh)) return 0; err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); @@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext4_journal_dirty_metadata(handle, gdb); + ext4_handle_dirty_metadata(handle, NULL, gdb); ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(bh); goto exit_bh; } - ext4_journal_dirty_metadata(handle, gdb); + ext4_handle_dirty_metadata(handle, NULL, gdb); ext4_set_bit(bit, 
bh->b_data); brelse(gdb); } @@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(it); goto exit_bh; } - ext4_journal_dirty_metadata(handle, it); + ext4_handle_dirty_metadata(handle, NULL, it); brelse(it); ext4_set_bit(bit, bh->b_data); } @@ -286,7 +286,7 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), bh->b_data); - ext4_journal_dirty_metadata(handle, bh); + ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); /* Mark unused entries in inode bitmap used */ @@ -299,7 +299,7 @@ static int setup_new_group_blocks(struct super_block *sb, mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), bh->b_data); - ext4_journal_dirty_metadata(handle, bh); + ext4_handle_dirty_metadata(handle, NULL, bh); exit_bh: brelse(bh); @@ -486,12 +486,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_journal_dirty_metadata(handle, dind); + ext4_handle_dirty_metadata(handle, NULL, dind); brelse(dind); inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_journal_dirty_metadata(handle, *primary); + ext4_handle_dirty_metadata(handle, NULL, *primary); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -502,7 +502,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); return 0; @@ -618,7 +618,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, primary[i]->b_blocknr, gdbackups, blk + primary[i]->b_blocknr); */ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); - err2 = ext4_journal_dirty_metadata(handle, primary[i]); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) err = err2; } @@ -676,7 +676,8 @@ static void update_backups(struct super_block *sb, struct buffer_head *bh; /* Out of journal space, and can't get more - abort - so sad */ - if (handle->h_buffer_credits == 0 && + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) break; @@ -696,7 +697,7 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_journal_dirty_metadata(handle, bh); + ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); } if ((err2 = ext4_journal_stop(handle)) && !err) @@ -916,7 +917,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext4_journal_dirty_metadata(handle, primary); + ext4_handle_dirty_metadata(handle, NULL, primary); /* Update the reserved block counts only once the new group is * active. 
*/ @@ -938,7 +939,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) EXT4_INODES_PER_GROUP(sb); } - ext4_journal_dirty_metadata(handle, sbi->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); sb->s_dirt = 1; exit_journal: @@ -1072,7 +1073,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); sb->s_dirt = 1; unlock_super(sb); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a0ae883f56..9b9076d9c4f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -136,13 +136,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; - if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, - "Detected aborted journal"); - return ERR_PTR(-EROFS); + if (journal) { + if (is_journal_aborted(journal)) { + ext4_abort(sb, __func__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2_journal_start(journal, nblocks); } - - return jbd2_journal_start(journal, nblocks); + /* + * We're not journaling, return the appropriate indication. + */ + current->journal_info = EXT4_NOJOURNAL_HANDLE; + return current->journal_info; } /* @@ -157,6 +163,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle) int err; int rc; + if (!ext4_handle_valid(handle)) { + /* + * Do this here since we don't call jbd2_journal_stop() in + * no-journal mode. + */ + current->journal_info = NULL; + return 0; + } sb = handle->h_transaction->t_journal->j_private; err = handle->h_err; rc = jbd2_journal_stop(handle); @@ -174,6 +188,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); + BUG_ON(!ext4_handle_valid(handle)); + if (bh) BUFFER_TRACE(bh, "abort"); @@ -448,11 +464,13 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; - if (err < 0) - ext4_abort(sb, __func__, "Couldn't clean up the journal"); - + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, __func__, + "Couldn't clean up the journal"); + } if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -522,6 +540,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + /* + * Note: We can be called before EXT4_SB(sb)->s_journal is set, + * therefore it can be null here. Don't check it, just initialize + * jinode. 
+ */ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; @@ -588,7 +611,8 @@ static void ext4_clear_inode(struct inode *inode) } #endif ext4_discard_preallocations(inode); - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + if (EXT4_JOURNAL(inode)) + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode); } @@ -1406,20 +1430,15 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, printk(KERN_WARNING "EXT4-fs warning: checktime reached, " "running e2fsck is recommended\n"); -#if 0 - /* @@@ We _will_ want to clear the valid bit if we find - * inconsistencies, to force a fsck at reboot. But for - * a plain journaled filesystem we can keep it set as - * valid forever! :) - */ - es->s_state &= cpu_to_le16(~EXT4_VALID_FS); -#endif + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, es, 1); if (test_opt(sb, DEBUG)) @@ -1431,9 +1450,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", - sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : - "external", EXT4_SB(sb)->s_journal->j_devname); + if (EXT4_SB(sb)->s_journal) { + printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", + sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? 
"internal" : + "external", EXT4_SB(sb)->s_journal->j_devname); + } else { + printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); + } return res; } @@ -1867,6 +1890,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned long def_mount_opts; struct inode *root; char *cp; + const char *descr; int ret = -EINVAL; int blocksize; int db_count; @@ -2278,21 +2302,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); ext4_commit_super(sb, es, 1); - printk(KERN_CRIT - "EXT4-fs (device %s): mount failed\n", - sb->s_id); goto failed_mount4; } } } else if (journal_inum) { if (ext4_create_journal(sb, es, journal_inum)) goto failed_mount3; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + printk(KERN_ERR "EXT4-fs: required journal recovery " + "suppressed and not mounted read-only\n"); + goto failed_mount4; } else { - if (!silent) - printk(KERN_ERR - "ext4: No journal on filesystem on %s\n", - sb->s_id); - goto failed_mount3; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; } if (ext4_blocks_count(es) > 0xffffffffULL && @@ -2344,6 +2370,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) break; } +no_journal: + if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " @@ -2428,13 +2456,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; - if (needs_recovery) + if (needs_recovery) { printk(KERN_INFO "EXT4-fs: recovery complete.\n"); - ext4_mark_recovery_complete(sb, es); - printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", - test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": - "writeback"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", + sb->s_id, descr); lock_kernel(); return 0; @@ -2446,8 +2483,11 @@ cantfind_ext4: goto failed_mount; failed_mount4: - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -2508,6 +2548,8 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + /* First, test for the existence of a valid inode on disk. Bad * things happen if we iget() an unused inode, as the subsequent * iput() will try to delete it. 
*/ @@ -2556,6 +2598,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + bdev = ext4_blkdev_get(j_dev); if (bdev == NULL) return NULL; @@ -2643,6 +2687,8 @@ static int ext4_load_journal(struct super_block *sb, int err = 0; int really_read_only; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { printk(KERN_INFO "EXT4-fs: external journal device major/minor " @@ -2817,6 +2863,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb, { journal_t *journal = EXT4_SB(sb)->s_journal; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } jbd2_journal_lock_updates(journal); if (jbd2_journal_flush(journal) < 0) goto out; @@ -2846,6 +2896,8 @@ static void ext4_clear_journal_err(struct super_block *sb, int j_errno; const char *errstr; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + journal = EXT4_SB(sb)->s_journal; /* @@ -2878,14 +2930,17 @@ static void ext4_clear_journal_err(struct super_block *sb, int ext4_force_commit(struct super_block *sb) { journal_t *journal; - int ret; + int ret = 0; if (sb->s_flags & MS_RDONLY) return 0; journal = EXT4_SB(sb)->s_journal; - sb->s_dirt = 0; - ret = ext4_journal_force_commit(journal); + if (journal) { + sb->s_dirt = 0; + ret = ext4_journal_force_commit(journal); + } + return ret; } @@ -2897,9 +2952,13 @@ int ext4_force_commit(struct super_block *sb) */ static void ext4_write_super(struct super_block *sb) { - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; + if (EXT4_SB(sb)->s_journal) { + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; + } else { + ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + } } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -2908,10 +2967,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; - if (wait) - ret = ext4_force_commit(sb); - else - jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + if (EXT4_SB(sb)->s_journal) { + if (wait) + ret = ext4_force_commit(sb); + else + jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + } else { + ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); + } return ret; } @@ -2926,15 +2989,17 @@ static void ext4_write_super_lockfs(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { journal_t *journal = EXT4_SB(sb)->s_journal; - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* - * We don't want to clear needs_recovery flag when we failed - * to flush the journal. - */ - if (jbd2_journal_flush(journal) < 0) - return; + /* + * We don't want to clear needs_recovery flag when we + * failed to flush the journal. + */ + if (jbd2_journal_flush(journal) < 0) + return; + } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -2948,7 +3013,7 @@ static void ext4_write_super_lockfs(struct super_block *sb) */ static void ext4_unlockfs(struct super_block *sb) { - if (!(sb->s_flags & MS_RDONLY)) { + if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. 
*/ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -2999,7 +3064,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) es = sbi->s_es; - ext4_init_journal_params(sb, sbi->s_journal); + if (sbi->s_journal) + ext4_init_journal_params(sb, sbi->s_journal); if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { @@ -3028,9 +3094,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * We have to unlock super so that we can wait for * transactions. */ - unlock_super(sb); - ext4_mark_recovery_complete(sb, es); - lock_super(sb); + if (sbi->s_journal) { + unlock_super(sb); + ext4_mark_recovery_complete(sb, es); + lock_super(sb); + } } else { __le32 ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -3084,7 +3152,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * been changed by e2fsck since we originally mounted * the partition.) */ - ext4_clear_journal_err(sb, es); + if (sbi->s_journal) + ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); if ((err = ext4_group_extend(sb, es, n_blocks_count))) goto restore_opts; @@ -3092,6 +3161,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } } + if (sbi->s_journal == NULL) + ext4_commit_super(sb, es, 1); + #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) @@ -3368,7 +3440,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext4_should_journal_data(path.dentry->d_inode)) { + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(path.dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
@@ -3442,7 +3515,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - if (!handle) { + if (EXT4_SB(sb)->s_journal && !handle) { printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); @@ -3467,7 +3540,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); unlock_buffer(bh); if (journal_quota) - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); else { /* Always do at least ordered writes for quotas */ err = ext4_jbd2_file_inode(handle, inode); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 9b4a368c572..157ce6589c5 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); sb->s_dirt = 1; - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } @@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_forget(handle, 1, inode, bh, bh->b_blocknr); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); - error = ext4_journal_dirty_metadata(handle, bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); DQUOT_FREE_BLOCK(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); @@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (error == -EIO) goto bad_block; if (!error) - error = ext4_journal_dirty_metadata(handle, - bs->bh); + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -794,8 +795,9 @@ inserted: ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); - error = ext4_journal_dirty_metadata(handle, - new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -833,7 +835,8 @@ getblk_failed: set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_cache_insert(new_bh); - error = ext4_journal_dirty_metadata(handle, new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); if (error) goto cleanup; } @@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, */ is.iloc.bh = NULL; if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); } cleanup: From fd98496f467b3d26d05ab1498f41718b5ef13de5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 21:34:13 -0500 Subject: [PATCH 18/57] jbd2: Add barrier not supported test to journal_wait_on_commit_record Xen doesn't report that barriers are not supported until buffer I/O is reported as completed, instead of when the buffer I/O is submitted. Add a check and a fallback codepath to journal_wait_on_commit_record() to detect this case, so that attempts to mount ext4 filesystems on LVM/devicemapper devices on Xen guests don't blow up with an "Aborting journal on device XXX"; "Remounting filesystem read-only" error. Thanks to Andreas Sundstrom for reporting this issue. 
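In short, the recovery path added below is: if the commit block's I/O completes with "operation not supported" while barriers are still enabled, disable barriers for that journal and resubmit the same buffer as an ordinary synchronous write. A condensed sketch of the hunk that follows (the j_state_lock locking, the warning printk and the retry label are elided here):

    wait_on_buffer(bh);
    if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
            journal->j_flags &= ~JBD2_BARRIER;      /* stop issuing barriers */
            lock_buffer(bh);
            clear_buffer_dirty(bh);
            set_buffer_uptodate(bh);
            bh->b_end_io = journal_end_buffer_io_sync;
            submit_bh(WRITE_SYNC, bh);              /* resubmit without a barrier */
            wait_on_buffer(bh);                     /* and wait for it once more */
    }
    if (unlikely(!buffer_uptodate(bh)))
            ret = -EIO;                             /* a real I/O error, not EOPNOTSUPP */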
Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/jbd2/commit.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ebc667bc54a..6393fd0d804 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal, * This function along with journal_submit_commit_record * allows to write the commit record asynchronously. */ -static int journal_wait_on_commit_record(struct buffer_head *bh) +static int journal_wait_on_commit_record(journal_t *journal, + struct buffer_head *bh) { int ret = 0; +retry: clear_buffer_dirty(bh); wait_on_buffer(bh); + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { + printk(KERN_WARNING + "JBD2: wait_on_commit_record: sync failed on %s - " + "disabling barriers\n", journal->j_devname); + spin_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_BARRIER; + spin_unlock(&journal->j_state_lock); + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + ret = submit_bh(WRITE_SYNC, bh); + if (ret) { + unlock_buffer(bh); + return ret; + } + goto retry; + } if (unlikely(!buffer_uptodate(bh))) ret = -EIO; @@ -799,7 +822,7 @@ wait_for_iobuf: __jbd2_journal_abort_hard(journal); } if (!err && !is_journal_aborted(journal)) - err = journal_wait_on_commit_record(cbh); + err = journal_wait_on_commit_record(journal, cbh); if (err) jbd2_journal_abort(journal, err); From fde4d95ad8711c84a36735a17136c45b19746af9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:17:35 -0500 Subject: [PATCH 19/57] ext4: remove extraneous newlines from calls to ext4_error() and ext4_warning() This removes annoying blank syslog entries emitted by ext4_error() or ext4_warning(), since these functions add their own newline. Signed-off-by: Nick Warne Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/mballoc.c | 24 ++++++++++++------------ fs/ext4/resize.c | 7 +++---- fs/ext4/super.c | 4 ++-- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 31ebeb5e7b0..0cb1c4572f5 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -100,7 +100,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, __func__, - "Checksum bad for group %lu\n", block_group); + "Checksum bad for group %lu", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; gdp->bg_itable_unused = 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9dd21b75f4b..4794d2ce613 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -74,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %lu\n", + ext4_error(sb, __func__, "Checksum bad for group %lu", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index edb512b2ec4..48d606cd740 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -447,7 +447,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", + " %lu's block %llu(bit %u in group %lu)", inode ? inode->i_ino : 0, blocknr, first + i, e4b->bd_group); } @@ -691,7 +691,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_error(sb, __func__, - "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", + "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd", group, free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -1096,7 +1096,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_unlock_group(sb, e4b->bd_group); ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", + " %lu's block %llu(bit %u in group %lu)", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); ext4_lock_group(sb, e4b->bd_group); @@ -1576,7 +1576,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * we have free blocks */ ext4_error(sb, __func__, "%d free blocks as per " - "group info. But bitmap says 0\n", + "group info. But bitmap says 0", free); break; } @@ -1585,7 +1585,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { ext4_error(sb, __func__, "%d free blocks as per " - "group info. But got %d blocks\n", + "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. 
This mostly @@ -3629,7 +3629,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_error(sb, __func__, "free %u, pa_free %u\n", + ext4_error(sb, __func__, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained @@ -3703,14 +3703,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu\n", group); + "bitmap for %lu", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %lu", group); put_bh(bitmap_bh); return 0; } @@ -3877,14 +3877,14 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %lu", group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu\n", group); + "bitmap for %lu", group); ext4_mb_release_desc(&e4b); continue; } @@ -4149,7 +4149,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %lu", group); continue; } ext4_lock_group(sb, group); @@ -4446,7 +4446,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else { ext4_unlock_group(sb, group); ext4_error(sb, __func__, - "Double free of blocks %d (%d %d)\n", + "Double free of blocks %d (%d %d)", block, entry->start_blk, entry->count); return 0; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1665aa131d1..41133811744 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -762,13 +762,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __func__, "blocks_count overflow\n"); + ext4_warning(sb, __func__, "blocks_count overflow"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __func__, "inodes_count overflow\n"); + ext4_warning(sb, __func__, "inodes_count overflow"); return -EINVAL; } @@ -999,8 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, - "CONFIG_LBD not enabled\n"); + ext4_warning(sb, __func__, "CONFIG_LBD not enabled"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9b9076d9c4f..dc27d4c613c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1309,7 +1309,7 @@ set_qf_format: EXT4_FEATURE_INCOMPAT_EXTENTS)) { ext4_warning(sb, __func__, "extents feature not enabled " - "on this filesystem, use tune2fs\n"); + "on this filesystem, use tune2fs"); return 0; } set_opt(sbi->s_mount_opt, EXTENTS); @@ -1993,7 +1993,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) else ext4_warning(sb, __func__, "extents feature not enabled on this filesystem, " - "use tune2fs.\n"); + "use tune2fs."); /* * enable delayed 
allocation by default From 032115fcef837a00336ddf7bda584e89789ea498 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:34:30 -0500 Subject: [PATCH 20/57] ext4: Don't overwrite allocation_context ac_status We can call ext4_mb_check_limits even after successfully allocating the requested blocks. In that case, make sure we don't overwrite ac_status if it already has the status AC_STATUS_FOUND. This fixes the lockdep warning: ============================================= [ INFO: possible recursive locking detected ] 2.6.28-rc6-autokern1 #1 --------------------------------------------- fsstress/11948 is trying to acquire lock: (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0x9f/0x278 ..... stack backtrace: ..... [] ext4_mb_regular_allocator+0xbb5/0xd44 ..... but task is already holding lock: (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0x9f/0x278 Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 48d606cd740..6dea637b020 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1326,6 +1326,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_free_extent ex; int max; + if (ac->ac_status == AC_STATUS_FOUND) + return; /* * We don't want to scan for a whole year */ From e07f7183a486cf9783d1f8c9d2997b5b39eeb2d4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Nov 2008 01:14:26 -0500 Subject: [PATCH 21/57] jbd2: improve jbd2 fsync batching This patch removes the static sleep time in favor of a more self-optimizing approach where we measure the average amount of time it takes to commit a transaction to disk and the amount of time a transaction has been running. If somebody does a sync write or an fsync(), traditionally we would sleep for 1 jiffy, which depending on the value of HZ could be a significant amount of time compared to how long it takes to commit a transaction to the underlying storage. With this patch, instead of sleeping for a jiffy, we check to see if the amount of time this transaction has been running is less than the average commit time, and if it is we sleep for the delta using schedule_hrtimeout to give us a higher-precision sleep time. This greatly benefits high-end storage, where you could otherwise end up sleeping for longer than it takes to commit the transaction and therefore sitting idle instead of letting the commit go ahead; keeping the sleep time to a minimum makes sure you are always doing something.
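Condensed to its core, the new logic in jbd2_journal_stop() looks like the sketch below (taken from the diff that follows, with the j_state_lock access to the average simplified away):

    pid_t pid = current->pid;

    if (handle->h_sync && journal->j_last_sync_writer != pid) {
            u64 commit_time = journal->j_average_commit_time;       /* ns */
            u64 trans_time  = ktime_to_ns(ktime_sub(ktime_get(),
                                          transaction->t_start_time));

            journal->j_last_sync_writer = pid;
            /* never sleep longer than the old fixed 1-jiffy wait */
            commit_time = min_t(u64, commit_time, 1000 * jiffies_to_usecs(1));

            if (trans_time < commit_time) {
                    ktime_t expires = ktime_add_ns(ktime_get(), commit_time);
                    set_current_state(TASK_UNINTERRUPTIBLE);
                    schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
            }
    }

The average itself is updated at commit time as a 3:1 weighted moving average, j_average_commit_time = (commit_time + 3 * j_average_commit_time) / 4, so a single unusually fast or slow commit does not swing the estimate too far.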
Signed-off-by: Josef Bacik Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 14 +++++++++++ fs/jbd2/transaction.c | 58 ++++++++++++++++++++++++++++++++----------- include/linux/jbd2.h | 15 +++++++++++ 3 files changed, 73 insertions(+), 14 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6393fd0d804..f22d1828ea8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -355,6 +355,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) int flags; int err; unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; @@ -481,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); @@ -995,6 +998,17 @@ restart_loop: J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; spin_unlock(&journal->j_state_lock); if (journal->j_commit_callback) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 39b7805a599..13dcbc990f4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -25,6 +25,7 @@ #include #include #include +#include static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); @@ -1193,7 +1195,7 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1216,24 +1218,52 @@ int jbd2_journal_stop(handle_t *handle) /* * Implement synchronous transaction batching. If the handle * was synchronous, don't force a commit immediately. Let's - * yield and let another thread piggyback onto this transaction. - * Keep doing that while new threads continue to arrive. - * It doesn't cost much - we're about to run a commit and sleep - * on IO anyway. Speeds up many-threaded, many-dir operations - * by 30x or more... + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... * - * But don't do this if this process was the most recent one to - * perform a synchronous write. We do this to detect the case where a - * single process is doing a stream of sync writes. No point in waiting - * for joiners in that case. 
+ * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + journal->j_last_sync_writer = pid; - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f3664574548..ab8cef130c2 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -637,6 +637,11 @@ struct transaction_s */ unsigned long t_expires; + /* + * When this transaction started, in nanoseconds [no locking] + */ + ktime_t t_start_time; + /* * How many handles used this transaction? [t_handle_lock] */ @@ -939,8 +944,18 @@ struct journal_s struct buffer_head **j_wbuf; int j_wbufsize; + /* + * this is the pid of hte last person to run a synchronous operation + * through the journal + */ pid_t j_last_sync_writer; + /* + * the average amount of time in nanoseconds it takes to commit a + * transaction to disk. [j_state_lock] + */ + u64 j_average_commit_time; + /* This function is called when a transaction is closed */ void (*j_commit_callback)(journal_t *, transaction_t *); From d7cfa4684d82f58e5d7cb73b8a3c88c169937f25 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 17 Dec 2008 00:20:45 -0500 Subject: [PATCH 22/57] ext4: display average commit time Display the average commit time (which is used by the ext4 fsync batching patch) in /proc/fs/jbd2/*/info for performance tuning purposes. 
Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e70d657a19f..74d87290381 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -40,6 +40,7 @@ #include #include +#include EXPORT_SYMBOL(jbd2_journal_start); EXPORT_SYMBOL(jbd2_journal_restart); @@ -824,6 +825,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %luus average transaction commit time\n", + do_div(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", s->stats->u.run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", From 30773840c19cea60dcef39545960d541b1ac1cf8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 20:27:38 -0500 Subject: [PATCH 23/57] ext4: add fsync batch tuning knobs Add new mount options, min_batch_time and max_batch_time, which controls how long the jbd2 layer should wait for additional filesystem operations to get batched with a synchronous write transaction. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 29 ++++++++++++++++++ fs/ext4/ext4.h | 7 +++++ fs/ext4/ext4_sb.h | 2 ++ fs/ext4/super.c | 47 +++++++++++++++++++++++++----- fs/jbd2/journal.c | 2 ++ fs/jbd2/transaction.c | 4 ++- include/linux/jbd2.h | 8 +++++ 7 files changed, 91 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index f75ab101c00..e3fcbea3ec8 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time. nodelalloc Disable delayed allocation. Blocks are allocation when data is copied from user to page cache. +max_batch_time=usec Maximum amount of time ext4 should wait for + additional filesystem operations to be batch + together with a synchronous write operation. + Since a synchronous write operation is going to + force a commit and then a wait for the I/O + complete, it doesn't cost much, and can be a + huge throughput win, we wait for a small amount + of time to see if any other transactions can + piggyback on the synchronous write. The + algorithm used is designed to automatically tune + for the speed of the disk, by measuring the + amount of time (on average) that it takes to + finish committing a transaction. Call this time + the "commit time". If the time that the + transactoin has been running is less than the + commit time, ext4 will try sleeping for the + commit time to see if other operations will join + the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us + (15ms). This optimization can be turned off + entirely by setting max_batch_time to 0. + +min_batch_time=usec This parameter sets the commit time (as + described above) to be at least min_batch_time. + It defaults to zero microseconds. Increasing + this parameter may improve the throughput of + multi-threaded, synchronous workloads on very + fast disks, at the cost of increasing latency. 
+ Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ac8551e0b70..9ba9fd6d14d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -328,6 +328,7 @@ struct ext4_mount_options { uid_t s_resuid; gid_t s_resgid; unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[MAXQUOTAS]; @@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + /* * Structure of a directory entry */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 3db800f399a..039b6ea1a04 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -74,6 +74,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_JBD2_DEBUG struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dc27d4c613c..da377f9521b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { + seq_printf(seq, ",min_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + /* * We're changing the default of barrier mount option, so * let's always display its mount state so it's clear what its @@ -874,7 +883,8 @@ enum { Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, @@ -913,6 +923,8 @@ static const match_table_t tokens = { {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, @@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb, option = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * option; break; + case Opt_max_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = option; + break; + case Opt_min_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_min_batch_time = option; + break; case Opt_data_journal: data_opt = EXT4_MOUNT_JOURNAL_DATA; goto datacheck; @@ -1979,6 +2007,9 @@ static int 
ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); @@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext4-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; spin_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) @@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) @@ -3178,6 +3209,8 @@ restore_opts: sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 74d87290381..fd1d7557a09 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -964,6 +964,8 @@ static journal_t * journal_init_common (void) spin_lock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ /* The journal is marked for error until we succeed with recovery! 
*/ journal->j_flags = JBD2_ABORT; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 13dcbc990f4..48c21bac5a5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle) trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); commit_time = min_t(u64, commit_time, - 1000*jiffies_to_usecs(1)); + 1000*journal->j_max_batch_time); if (trans_time < commit_time) { ktime_t expires = ktime_add_ns(ktime_get(), diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ab8cef130c2..a3cd647ea1b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -956,6 +956,14 @@ struct journal_s */ u64 j_average_commit_time; + /* + * minimum and maximum times that we should wait for + * additional filesystem operations to get batched into a + * synchronous handle in microseconds + */ + u32 j_min_batch_time; + u32 j_max_batch_time; + /* This function is called when a transaction is closed */ void (*j_commit_callback)(journal_t *, transaction_t *); From cde6436004ad9cd8cab5a874b6fa8b01f1da91bf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 4 Nov 2008 18:46:03 -0500 Subject: [PATCH 24/57] ext4: Remove i_ext_generation from ext4_inode_info structure The i_ext_generation was incremented, but never used. Remove it to slim down the ext4_inode_info structure. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 5 ----- fs/ext4/ext4_i.h | 1 - fs/ext4/extents.c | 2 -- 3 files changed, 8 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index bec7ce59fc0..18cb67b2cbb 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void ext4_ext_tree_changed(struct inode *inode) -{ - EXT4_I(inode)->i_ext_generation++; -} - static inline void ext4_ext_invalidate_cache(struct inode *inode) { diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 5c124c0ac6d..acc0b726d8a 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -117,7 +117,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode jinode; - unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; /* * File creation time. Its function is same as that of diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 743e3feb3e5..b9e27bc3155 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1621,7 +1621,6 @@ cleanup: ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_tree_changed(inode); ext4_ext_invalidate_cache(inode); return err; } @@ -2232,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) } } out: - ext4_ext_tree_changed(inode); ext4_ext_drop_refs(path); kfree(path); ext4_journal_stop(handle); From a9df9a49102f3578909cba7bd33784eb3b9caaa4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:18:16 -0500 Subject: [PATCH 25/57] ext4: Make ext4_group_t be an unsigned int Nearly every place in the ext3/4 code that uses "unsigned long" is probably a bug, since on 32-bit systems a ulong is only 32 bits anyway, which means we are just wasting stack space on 64-bit systems.
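To see why this matters, here is a trivial userspace illustration (not part of the patch); it simply assumes a typical LP64 build such as x86_64, where unsigned long is 8 bytes while unsigned int is 4, so every on-stack group number and every array of them shrinks by half once ext4_group_t becomes an unsigned int.

#include <stdio.h>

int main(void)
{
	/* On LP64 systems: unsigned long is 8 bytes, unsigned int is 4. */
	printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
	printf("sizeof(unsigned int)  = %zu\n", sizeof(unsigned int));
	return 0;
}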
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 10 ++++---- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_i.h | 2 +- fs/ext4/ialloc.c | 8 +++---- fs/ext4/mballoc.c | 58 +++++++++++++++++++++++------------------------ fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 14 ++++++------ 7 files changed, 49 insertions(+), 49 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0cb1c4572f5..a711898923f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -100,7 +100,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, __func__, - "Checksum bad for group %lu", block_group); + "Checksum bad for group %u", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; gdp->bg_itable_unused = 0; @@ -213,7 +213,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, if (block_group >= sbi->s_groups_count) { ext4_error(sb, "ext4_get_group_desc", "block_group >= groups_count - " - "block_group = %lu, groups_count = %lu", + "block_group = %u, groups_count = %u", block_group, sbi->s_groups_count); return NULL; @@ -225,7 +225,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, if (!sbi->s_group_desc[group_desc]) { ext4_error(sb, "ext4_get_group_desc", "Group descriptor not loaded - " - "block_group = %lu, group_desc = %lu, desc = %lu", + "block_group = %u, group_desc = %lu, desc = %lu", block_group, group_desc, offset); return NULL; } @@ -315,7 +315,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %lu, block_bitmap = %llu", + "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -337,7 +337,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) put_bh(bh); ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %lu, block_bitmap = %llu", + "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9ba9fd6d14d..e9aacecfbf4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -965,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) #define ERR_BAD_DX_DIR -75000 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - unsigned long *blockgrpp, ext4_grpblk_t *offsetp); + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern struct proc_dir_entry *ext4_proc_root; diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index acc0b726d8a..0a9ebe58019 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t; typedef __u32 ext4_lblk_t; /* data type for block group number */ -typedef unsigned long ext4_group_t; +typedef unsigned int ext4_group_t; #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4794d2ce613..cac3617ec78 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -74,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %lu", + ext4_error(sb, __func__, "Checksum bad for group %u", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; @@ -111,7 +111,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -133,7 +133,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) put_bh(bh); ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -690,7 +690,7 @@ got: ino > EXT4_INODES_PER_GROUP(sb)) { ext4_error(sb, __func__, "reserved inode or inode > inodes count - " - "block_group = %lu, inode=%lu", group, + "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); err = -EIO; goto fail; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6dea637b020..6cfe68a7e07 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -447,7 +447,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)", + " %lu's block %llu(bit %u in group %u)", inode ? inode->i_ino : 0, blocknr, first + i, e4b->bd_group); } @@ -477,7 +477,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %lu " + printk(KERN_ERR "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc\n", e4b->bd_group, i, i * 8, b1[i], b2[i]); @@ -691,7 +691,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_error(sb, __func__, - "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd", + "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", group, free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -800,7 +800,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); - mb_debug("read bitmap for group %lu\n", first_group + i); + mb_debug("read bitmap for group %u\n", first_group + i); } /* wait for I/O completion */ @@ -895,7 +895,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct page *page; int ret; - mb_debug("load group %lu\n", group); + mb_debug("load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; @@ -1096,7 +1096,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_unlock_group(sb, e4b->bd_group); ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)", + " %lu's block %llu(bit %u in group %u)", inode ? 
inode->i_ino : 0, blocknr, block, e4b->bd_group); ext4_lock_group(sb, e4b->bd_group); @@ -1934,13 +1934,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) if (hs->op == EXT4_MB_HISTORY_ALLOC) { fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " "%-5u %-5s %-5u %-6u\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len, hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, hs->orig.fe_start, hs->orig.fe_len, hs->orig.fe_logical); - sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, + sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, hs->goal.fe_start, hs->goal.fe_len, hs->goal.fe_logical); seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, @@ -1949,20 +1949,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) hs->buddy ? 1 << hs->buddy : 0); } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len, hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, hs->orig.fe_start, hs->orig.fe_len, hs->orig.fe_logical); seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len); seq_printf(seq, "%-5u %-8u %-23s discard\n", hs->pid, hs->ino, buf2); } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len); seq_printf(seq, "%-5u %-8u %-23s free\n", hs->pid, hs->ino, buf2); @@ -2075,7 +2075,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) return NULL; group = *pos + 1; - return (void *) group; + return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) @@ -2088,13 +2088,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) if (*pos < 0 || *pos >= sbi->s_groups_count) return NULL; group = *pos + 1; - return (void *) group;; + return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = seq->private; - long group = (long) v; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err; struct ext4_buddy e4b; @@ -2116,7 +2116,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) sizeof(struct ext4_group_info); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5lu: I/O error\n", group); + seq_printf(seq, "#%-5u: I/O error\n", group); return 0; } ext4_lock_group(sb, group); @@ -2124,7 +2124,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ext4_unlock_group(sb, group); ext4_mb_release_desc(&e4b); - seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
@@ -2459,7 +2459,7 @@ static int ext4_mb_init_backend(struct super_block *sb) desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR - "EXT4-fs: can't read descriptor %lu\n", i); + "EXT4-fs: can't read descriptor %u\n", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2657,7 +2657,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) list_for_each_safe(l, ltmp, &txn->t_private_list) { entry = list_entry(l, struct ext4_free_data, list); - mb_debug("gonna free %u blocks in group %lu (0x%p):", + mb_debug("gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); err = ext4_mb_load_buddy(sb, entry->group, &e4b); @@ -2829,7 +2829,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!gdp) goto out_err; - ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, gdp->bg_free_blocks_count); err = ext4_journal_get_write_access(handle, gdp_bh); @@ -3351,7 +3351,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, preallocated += len; count++; } - mb_debug("prellocated %u for group %lu\n", preallocated, group); + mb_debug("prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3368,7 +3368,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head) static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { - unsigned long grp; + ext4_group_t grp; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3697,7 +3697,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug("discard preallocation for group %lu\n", group); + mb_debug("discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; @@ -3705,14 +3705,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu", group); + "bitmap for %u", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu", group); + "information for %u", group); put_bh(bitmap_bh); return 0; } @@ -3879,14 +3879,14 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu", group); + "information for %u", group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu", group); + "bitmap for %u", group); ext4_mb_release_desc(&e4b); continue; } @@ -4151,7 +4151,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu", group); + "information for %u", group); continue; } ext4_lock_group(sb, group); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 41133811744..1865d6a53de 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) ext4_warning(sb, __func__, - "Cannot add at group %u (only %lu 
groups)", + "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); else if (offset != 0) ext4_warning(sb, __func__, "Last group not full"); @@ -716,7 +716,7 @@ static void update_backups(struct super_block *sb, exit_err: if (err) { ext4_warning(sb, __func__, - "can't update backup for group %lu (err %d), " + "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index da377f9521b..8fa57be5040 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1470,7 +1470,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_commit_super(sb, es, 1); if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", sb->s_blocksize, sbi->s_groups_count, @@ -1514,7 +1514,7 @@ static int ext4_fill_flex_info(struct super_block *sb) sizeof(struct flex_groups), GFP_KERNEL); if (sbi->s_flex_groups == NULL) { printk(KERN_ERR "EXT4-fs: not enough memory for " - "%lu flex groups\n", flex_group_count); + "%u flex groups\n", flex_group_count); goto failed; } @@ -1599,14 +1599,14 @@ static int ext4_check_descriptors(struct super_block *sb) block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Block bitmap for group %lu not in group " + "Block bitmap for group %u not in group " "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Inode bitmap for group %lu not in group " + "Inode bitmap for group %u not in group " "(block %llu)!\n", i, inode_bitmap); return 0; } @@ -1614,14 +1614,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Inode table for group %lu not in group " + "Inode table for group %u not in group " "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Checksum for group %lu failed (%u!=%u)\n", + "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { @@ -3154,7 +3154,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_remount: " - "Checksum for group %lu failed (%u!=%u)\n", + "Checksum for group %u failed (%u!=%u)\n", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EINVAL; From 498e5f24158da7bf8fa48074a70e370e22844492 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Nov 2008 00:14:04 -0500 Subject: [PATCH 26/57] ext4: Change unsigned long to unsigned int Convert the unsigned longs that are most responsible for bloating the stack usage on 64-bit systems. Nearly every place in the ext3/4 code that uses "unsigned long" is probably a bug, since on 32-bit systems a ulong is only 32 bits anyway, which means we are just wasting stack space on 64-bit systems.
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 14 +++++++------- fs/ext4/bitmap.c | 5 ++--- fs/ext4/dir.c | 8 ++++---- fs/ext4/ext4.h | 22 ++++++++++------------ fs/ext4/ext4_i.h | 13 +++++++------ fs/ext4/extents.c | 24 ++++++++++++------------ fs/ext4/inode.c | 25 +++++++++++++------------ fs/ext4/mballoc.c | 20 ++++++++++---------- fs/ext4/namei.c | 25 ++++++++++++------------- 9 files changed, 77 insertions(+), 79 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a711898923f..a0c23b03a26 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -205,8 +205,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { - unsigned long group_desc; - unsigned long offset; + unsigned int group_desc; + unsigned int offset; struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -225,7 +225,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, if (!sbi->s_group_desc[group_desc]) { ext4_error(sb, "ext4_get_group_desc", "Group descriptor not loaded - " - "block_group = %u, group_desc = %lu, desc = %lu", + "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } @@ -372,8 +372,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; - unsigned long i; - unsigned long overflow; + unsigned int i; + unsigned int overflow; struct ext4_group_desc *desc; struct ext4_super_block *es; struct ext4_sb_info *sbi; @@ -720,7 +720,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; - unsigned long x; + unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; @@ -740,7 +740,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh, sb->s_blocksize); - printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", i, le16_to_cpu(gdp->bg_free_blocks_count), x); bitmap_count += x; } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 0a7a6663c19..fa3af81ac56 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,10 +15,9 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) +unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) { - unsigned int i; - unsigned long sum = 0; + unsigned int i, sum = 0; if (!map) return 0; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index fed5b610df5..cf3ccf4a94b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) int ext4_check_dir_entry(const char *function, struct inode *dir, struct ext4_dir_entry_2 *de, struct buffer_head *bh, - unsigned long offset) + unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len); @@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, if (error_msg != NULL) ext4_error(dir->i_sb, function, "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + "offset=%u, inode=%u, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, - (unsigned long) le32_to_cpu(de->inode), + le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 
1 : 0; } @@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp, void *dirent, filldir_t filldir) { int error = 0; - unsigned long offset; + unsigned int offset; int i, stored; struct ext4_dir_entry_2 *de; struct super_block *sb; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e9aacecfbf4..558545d1fea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -94,9 +94,9 @@ struct ext4_allocation_request { /* phys. block for ^^^ */ ext4_fsblk_t pright; /* how many blocks we want to allocate */ - unsigned long len; + unsigned int len; /* flags. see above EXT4_MB_HINT_* */ - unsigned long flags; + unsigned int flags; }; /* @@ -997,6 +997,9 @@ do { \ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, +/* bitmap.c */ +extern unsigned int ext4_count_free(struct buffer_head *, unsigned); + /* balloc.c */ extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); @@ -1024,7 +1027,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned long); + struct buffer_head *, unsigned int); extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1044,7 +1047,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); -extern unsigned long ext4_count_free(struct buffer_head *, unsigned); /* mballoc.c */ extern long ext4_mb_stats; @@ -1074,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize); extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, int); @@ -1276,16 +1274,16 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize); + ext4_lblk_t iblock, unsigned int max_blocks, + struct buffer_head *bh_result, + int create, int extend_disksize); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, - sector_t block, unsigned long max_blocks, + sector_t block, unsigned int max_blocks, struct buffer_head *bh, int create, int extend_disksize, int flag); #endif /* __KERNEL__ */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 0a9ebe58019..e69acc16f5c 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -100,9 +100,6 @@ struct ext4_inode_info { */ loff_t i_disksize; - /* on-disk additional length */ - __u16 i_extra_isize; - /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). 
In the 2.4 ext2 design, great chunks of inode's @@ -129,10 +126,14 @@ struct ext4_inode_info { spinlock_t i_prealloc_lock; /* allocation reservation info for delalloc */ - unsigned long i_reserved_data_blocks; - unsigned long i_reserved_meta_blocks; - unsigned long i_allocated_meta_blocks; + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; unsigned short i_delalloc_reserved_flag; + + /* on-disk additional length */ + __u16 i_extra_isize; + spinlock_t i_block_reservation_lock; }; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b9e27bc3155..b92cb60737b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2377,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t iblock, - unsigned long max_blocks) + unsigned int max_blocks) { struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; @@ -2675,26 +2675,26 @@ fix_extent_len: */ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, + unsigned int max_blocks, struct buffer_head *bh_result, int create, int extend_disksize) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; struct ext4_extent newex, *ex; - ext4_fsblk_t goal, newblock; - int err = 0, depth, ret; - unsigned long allocated = 0; + ext4_fsblk_t newblock; + int err = 0, depth, ret, cache_type; + unsigned int allocated = 0; struct ext4_allocation_request ar; loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); - ext_debug("blocks %u/%lu requested for inode %u\n", + ext_debug("blocks %u/%u requested for inode %u\n", iblock, max_blocks, inode->i_ino); /* check in cache */ - goal = ext4_ext_in_cache(inode, iblock, &newex); - if (goal) { - if (goal == EXT4_EXT_CACHE_GAP) { + cache_type = ext4_ext_in_cache(inode, iblock, &newex); + if (cache_type) { + if (cache_type == EXT4_EXT_CACHE_GAP) { if (!create) { /* * block isn't allocated yet and @@ -2703,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (goal == EXT4_EXT_CACHE_EXTENT) { + } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { /* block is already allocated */ newblock = iblock - le32_to_cpu(newex.ee_block) @@ -2851,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (!newblock) goto out2; ext_debug("allocate new block: goal %llu, found %llu/%lu\n", - goal, newblock, allocated); + ar.goal, newblock, allocated); /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); @@ -3001,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) handle_t *handle; ext4_lblk_t block; loff_t new_size; - unsigned long max_blocks; + unsigned int max_blocks; int ret = 0; int ret2 = 0; int retries = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 45d0f70a1f0..91e06f88f08 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -514,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, * return the total number of blocks to be allocate, including the * direct and indirect blocks. 
*/ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { - unsigned long count = 0; + unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet @@ -856,10 +856,10 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) */ -int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize) +static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -1061,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * It returns the error in case of allocation failure. */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned long max_blocks, struct buffer_head *bh, + unsigned int max_blocks, struct buffer_head *bh, int create, int extend_disksize, int flag) { int retval; @@ -1641,7 +1641,7 @@ struct mpage_da_data { get_block_t *get_block; struct writeback_control *wbc; int io_done; - long pages_written; + int pages_written; int retval; }; @@ -1855,9 +1855,9 @@ static void ext4_print_free_blocks(struct inode *inode) printk(KERN_EMERG "dirty_blocks=%lld\n", (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); printk(KERN_EMERG "Block reservation details\n"); - printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", + printk(KERN_EMERG "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", + printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2307,7 +2307,7 @@ static int ext4_da_writepage(struct page *page, { int ret = 0; loff_t size; - unsigned long len; + unsigned int len; struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; @@ -2416,7 +2416,8 @@ static int ext4_da_writepages(struct address_space *mapping, struct mpage_da_data mpd; struct inode *inode = mapping->host; int no_nrwrite_index_update; - long pages_written = 0, pages_skipped; + int pages_written = 0; + long pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6cfe68a7e07..1d78435ce38 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2796,7 +2796,7 @@ void exit_ext4_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned long reserv_blks) + handle_t *handle, unsigned int reserv_blks) { struct buffer_head *bitmap_bh = NULL; struct ext4_super_block *es; @@ -3036,7 +3036,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - unsigned long pa_end; + ext4_lblk_t pa_end; if (pa->pa_deleted) continue; @@ -3080,7 +3080,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - 
unsigned long pa_end; + ext4_lblk_t pa_end; spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { pa_end = pa->pa_lstart + pa->pa_len; @@ -3584,8 +3584,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long end; - unsigned long next; + unsigned int end; + unsigned int next; ext4_group_t group; ext4_grpblk_t bit; sector_t start; @@ -4029,8 +4029,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; - unsigned long len; - unsigned long goal; + unsigned int len; + ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ @@ -4291,8 +4291,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; - unsigned long inquota; - unsigned long reserv_blks = 0; + unsigned int inquota; + unsigned int reserv_blks = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4504,7 +4504,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; struct ext4_super_block *es; - unsigned long overflow; + unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 08873e938ab..183a09a8b14 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -811,7 +811,7 @@ static inline int ext4_match (int len, const char * const name, static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, const struct qstr *d_name, - unsigned long offset, + unsigned int offset, struct ext4_dir_entry_2 ** res_dir) { struct ext4_dir_entry_2 * de; @@ -1048,11 +1048,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru bh = ext4_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { - unsigned long ino = le32_to_cpu(de->inode); + __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { ext4_error(dir->i_sb, "ext4_lookup", - "bad inode number: %lu", ino); + "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); @@ -1065,7 +1065,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { - unsigned long ino; + __u32 ino; struct inode *inode; static const struct qstr dotdot = { .name = "..", @@ -1083,7 +1083,7 @@ struct dentry *ext4_get_parent(struct dentry *child) if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { ext4_error(child->d_inode->i_sb, "ext4_get_parent", - "bad inode number: %lu", ino); + "bad inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1271,7 +1271,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned long offset = 0; + unsigned int offset = 0; unsigned short reclen; int nlen, rlen, err; char *top; @@ -1444,7 +1444,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de; struct super_block *sb; @@ -1466,7 +1465,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_mark_inode_dirty(handle, dir); } blocks = 
dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { + for (block = 0; block < blocks; block++) { bh = ext4_bread(handle, dir, block, 0, &retval); if(!bh) return retval; @@ -1861,7 +1860,7 @@ out_stop: */ static int empty_dir(struct inode *inode) { - unsigned long offset; + unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de, *de1; struct super_block *sb; @@ -1906,7 +1905,7 @@ static int empty_dir(struct inode *inode) if (err) ext4_error(sb, __func__, "error %d reading directory" - " #%lu offset %lu", + " #%lu offset %u", err, inode->i_ino, offset); offset += sb->s_blocksize; continue; @@ -2009,7 +2008,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi; - unsigned long ino_next; + __u32 ino_next; struct ext4_iloc iloc; int err = 0; @@ -2042,7 +2041,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) goto out_err; if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %lu\n", ino_next); + jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) @@ -2054,7 +2053,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - jbd_debug(4, "orphan inode %lu will point to %lu\n", + jbd_debug(4, "orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) From 1a0d3786dd57dbd74f340322054c3d618b999dcf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Nov 2008 00:09:22 -0500 Subject: [PATCH 27/57] jbd2: Remove a large array of bh's from the stack of the checkpoint routine jbd2_log_do_checkpoint() is one of the kernel's largest stack users. Move the array of buffer heads from the stack of jbd2_log_do_checkpoint() to the in-core journal structure.
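For a sense of scale, here is a small userspace illustration (not part of the patch, and assuming 64-bit pointers): the batch array being moved off the stack holds 64 buffer_head pointers (NR_BATCH, renamed JBD2_NR_BATCH by this patch in the diff below), i.e. half a kilobyte of stack per call.

#include <stdio.h>

struct buffer_head;	/* opaque here; the real definition lives in the kernel */

int main(void)
{
	/* 64 pointers of 8 bytes each on a 64-bit kernel: 512 bytes of
	 * stack saved by moving the batch array into journal_t. */
	printf("%zu bytes\n", (size_t)64 * sizeof(struct buffer_head *));
	return 0;
}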
Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 22 +++++++++------------- fs/jbd2/journal.c | 2 ++ include/linux/jbd2.h | 10 ++++++++++ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9497718fe92..adc08ec875e 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -249,16 +249,14 @@ restart: return ret; } -#define NR_BATCH 64 - static void -__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +__flush_batch(journal_t *journal, int *batch_count) { int i; - ll_rw_block(SWRITE, *batch_count, bhs); + ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); for (i = 0; i < *batch_count; i++) { - struct buffer_head *bh = bhs[i]; + struct buffer_head *bh = journal->j_chkpt_bhs[i]; clear_buffer_jwrite(bh); BUFFER_TRACE(bh, "brelse"); __brelse(bh); @@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - transaction_t *transaction) + int *batch_count, transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; @@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); - bhs[*batch_count] = bh; + journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); jbd_unlock_bh_state(bh); transaction->t_chp_stats.cs_written++; (*batch_count)++; - if (*batch_count == NR_BATCH) { + if (*batch_count == JBD2_NR_BATCH) { spin_unlock(&journal->j_list_lock); - __flush_batch(journal, bhs, batch_count); + __flush_batch(journal, batch_count); ret = 1; } } @@ -388,7 +385,6 @@ restart: if (journal->j_checkpoint_transactions == transaction && transaction->t_tid == this_tid) { int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; struct journal_head *jh; int retry = 0, err; @@ -402,7 +398,7 @@ restart: retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, &batch_count, + retry = __process_buffer(journal, jh, &batch_count, transaction); if (retry < 0 && !result) result = retry; @@ -419,7 +415,7 @@ restart: spin_unlock(&journal->j_list_lock); retry = 1; } - __flush_batch(journal, bhs, &batch_count); + __flush_batch(journal, &batch_count); } if (retry) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fd1d7557a09..34ef9805720 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1477,7 +1477,9 @@ int jbd2_journal_destroy(journal_t *journal) spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a3cd647ea1b..004c9a8d63e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -687,6 +687,8 @@ jbd2_time_diff(unsigned long start, unsigned long end) return end + (MAX_JIFFY_OFFSET - start); } +#define JBD2_NR_BATCH 64 + /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. @@ -830,6 +832,14 @@ struct journal_s /* Semaphore for locking against concurrent checkpoints */ struct mutex j_checkpoint_mutex; + /* + * List of buffer heads used by the checkpoint routine. 
This + * was moved from jbd2_log_do_checkpoint() to reduce stack + * usage. Access to this array is controlled by the + * j_checkpoint_mutex. [j_checkpoint_mutex] + */ + struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; + /* * Journal head: identifies the first unused block in the journal. * [j_state_lock] From 3a06d778dfeda7eaeeb79bfa49cf97f2aae132b4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 22 Nov 2008 15:04:59 -0500 Subject: [PATCH 28/57] ext4: sparse fixes * Change EXT4_HAS_*_FEATURE to return a boolean * Add a function prototype for ext4_fiemap() in ext4.h * Make ext4_ext_fiemap_cb() and ext4_xattr_fiemap() be static functions * Add lock annotations to mb_free_blocks() Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 9 ++++++--- fs/ext4/extents.c | 5 +++-- fs/ext4/file.c | 3 --- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 4 +++- fs/ext4/super.c | 19 +++++++++++-------- 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 558545d1fea..5125c1f6e7e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -727,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ @@ -1286,6 +1286,9 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int create, int extend_disksize, int flag); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b92cb60737b..c64080e4949 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3080,7 +3080,7 @@ retry: /* * Callback function called for each extent to gather FIEMAP information. 
*/ -int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, +static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { @@ -3149,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) -int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) { __u64 physical = 0; __u64 length; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6bd11fba71f..f731cb545a0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len); - const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 91e06f88f08..bcd5ffa76c0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3907,7 +3907,7 @@ static int __ext4_get_inode_loc(struct inode *inode, ext4_fsblk_t block; int inodes_per_block, inode_offset; - iloc->bh = 0; + iloc->bh = NULL; if (!ext4_valid_inum(sb, inode->i_ino)) return -EIO; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1d78435ce38..edf9730ba72 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1056,6 +1056,8 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) +__releases(bitlock) +__acquires(bitlock) { int block = 0; int max = 0; @@ -2244,7 +2246,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) /* Create and initialize ext4_group_info data for the given group. 
*/ -int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, +static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i, len; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8fa57be5040..a9dd1170bfe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1924,7 +1924,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int db_count; int i; int needs_recovery, has_huge_files; - __le32 features; + int features; __u64 blocks_count; int err; @@ -2056,15 +2056,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); if (features) { printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + "unsupported optional features (%x).\n", sb->s_id, + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); goto failed_mount; } features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + "unsupported optional features (%x).\n", sb->s_id, + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); goto failed_mount; } has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -3131,13 +3133,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) lock_super(sb); } } else { - __le32 ret; + int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP))) { printk(KERN_WARNING "EXT4-fs: %s: couldn't " "remount RDWR because of unsupported " - "optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + "optional features (%x).\n", sb->s_id, + (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); err = -EROFS; goto restore_opts; } From e21675d4b63975d09eb75c443c48ebe663d23e18 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:36:02 -0500 Subject: [PATCH 29/57] ext4: Add blocks added during resize to bitmap With this change new blocks added during resize are marked as free in the block bitmap and the group is flagged with EXT4_GROUP_INFO_NEED_INIT_BIT flag. This makes sure when mballoc tries to allocate blocks from the new group we would reload the buddy information using the bitmap present in the disk. 
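To make the mechanism concrete, here is a rough stand-alone sketch of the
flag-and-reload pattern (plain user-space C with invented names; it models
only the idea, not the kernel code, its locking, or the on-disk layout):

#include <stdio.h>
#include <stdint.h>

#define NEED_INIT 0x1UL         /* stands in for EXT4_GROUP_INFO_NEED_INIT_BIT */

struct group_info {
        unsigned long state;            /* NEED_INIT flag lives here */
        unsigned int free_blocks;       /* cached summary, can go stale */
};

/* "On-disk" block bitmap for one 32-block group: bit set == block in use. */
static uint32_t disk_bitmap = 0xFFFFFFF0;       /* blocks 0-3 free */

static unsigned int count_free(uint32_t bitmap)
{
        return 32u - (unsigned int)__builtin_popcount(bitmap);
}

/* Resize side: free the added blocks in the bitmap and flag the group. */
static void add_group_blocks(struct group_info *grp, uint32_t freed_mask)
{
        disk_bitmap &= ~freed_mask;     /* clear bits in the on-disk bitmap */
        grp->state |= NEED_INIT;        /* force a reload of the buddy data */
}

/* Allocator side: rebuild the cached state from the bitmap when flagged. */
static void load_group(struct group_info *grp)
{
        if (grp->state & NEED_INIT) {
                grp->free_blocks = count_free(disk_bitmap);
                grp->state &= ~NEED_INIT;
        }
}

int main(void)
{
        struct group_info grp = { 0, count_free(disk_bitmap) };

        printf("before resize: %u blocks free\n", grp.free_blocks);
        add_group_blocks(&grp, 0x00000F00);     /* resize frees blocks 8-11 */
        load_group(&grp);                       /* next allocation reloads  */
        printf("after resize:  %u blocks free\n", grp.free_blocks);
        return 0;
}

The point is simply that the cached per-group summary is never trusted once
the flag is set; it is regenerated from the on-disk bitmap on the next load.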
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/balloc.c | 136 +++++++++++------------------------------------ fs/ext4/ext4.h | 5 +- fs/ext4/resize.c | 11 +--- 3 files changed, 34 insertions(+), 118 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a0c23b03a26..c54192e2384 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -20,6 +20,7 @@ #include "ext4.h" #include "ext4_jbd2.h" #include "group.h" +#include "mballoc.h" /* * balloc.c contains the blocks allocation and deallocation routines @@ -350,62 +351,43 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_free_blocks_sb() -- Free given blocks and update quota + * ext4_add_groupblocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to free + * @block: start physcial block to add to the block group * @count: number of blocks to free - * @pdquot_freed_blocks: pointer to quota * - * XXX This function is only used by the on-line resizing code, which - * should probably be fixed up to call the mballoc variant. There - * this needs to be cleaned up later; in fact, I'm not convinced this - * is 100% correct in the face of the mballoc code. The online resizing - * code needs to be fixed up to more tightly (and correctly) interlock - * with the mballoc code. + * This marks the blocks as free in the bitmap. We ask the + * mballoc to reload the buddy after this by setting group + * EXT4_GROUP_INFO_NEED_INIT_BIT flag */ -void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks) +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; unsigned int i; - unsigned int overflow; struct ext4_group_desc *desc; struct ext4_super_block *es; struct ext4_sb_info *sbi; int err = 0, ret; - ext4_grpblk_t group_freed; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; - *pdquot_freed_blocks = 0; sbi = EXT4_SB(sb); es = sbi->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { - ext4_error(sb, "ext4_free_blocks", - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); - goto error_return; - } + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1); - -do_more: - overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. 
*/ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); - count -= overflow; + goto error_return; } - brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; @@ -418,18 +400,17 @@ do_more: in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error(sb, "ext4_free_blocks", - "Freeing blocks in system zones - " + ext4_error(sb, __func__, + "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); goto error_return; } /* - * We are about to start releasing blocks in the bitmap, + * We are about to add blocks to the bitmap, * so we need undo access. */ - /* @@@ check errors */ BUFFER_TRACE(bitmap_bh, "getting undo access"); err = ext4_journal_get_undo_access(handle, bitmap_bh); if (err) @@ -445,87 +426,28 @@ do_more: if (err) goto error_return; - jbd_lock_bh_state(bitmap_bh); - - for (i = 0, group_freed = 0; i < count; i++) { - /* - * An HJ special. This is expensive... - */ -#ifdef CONFIG_JBD2_DEBUG - jbd_unlock_bh_state(bitmap_bh); - { - struct buffer_head *debug_bh; - debug_bh = sb_find_get_block(sb, block + i); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "Deleted!"); - if (!bh2jh(bitmap_bh)->b_committed_data) - BUFFER_TRACE(debug_bh, - "No commited data in bitmap"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); - __brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); -#endif - if (need_resched()) { - jbd_unlock_bh_state(bitmap_bh); - cond_resched(); - jbd_lock_bh_state(bitmap_bh); - } - /* @@@ This prevents newly-allocated data from being - * freed and then reallocated within the same - * transaction. - * - * Ideally we would want to allow that to happen, but to - * do so requires making jbd2_journal_forget() capable of - * revoking the queued write of a data block, which - * implies blocking on the journal lock. *forget() - * cannot block due to truncate races. - * - * Eventually we can fix this by making jbd2_journal_forget() - * return a status indicating whether or not it was able - * to revoke the buffer. On successful revoke, it is - * safe not to set the allocation bit in the committed - * bitmap, because we know that there is no outstanding - * activity on the buffer any more and so it is safe to - * reallocate it. - */ - BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); - J_ASSERT_BH(bitmap_bh, - bh2jh(bitmap_bh)->b_committed_data != NULL); - ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, - bh2jh(bitmap_bh)->b_committed_data); - - /* - * We clear the bit in the bitmap after setting the committed - * data bit, because this is the reverse order to that which - * the allocator uses. 
- */ + for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { - jbd_unlock_bh_state(bitmap_bh); ext4_error(sb, __func__, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); - jbd_lock_bh_state(bitmap_bh); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { - group_freed++; + blocks_freed++; } } - jbd_unlock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&desc->bg_free_blocks_count, group_freed); + le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += count; + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; spin_unlock(sb_bgl_lock(sbi, flex_group)); } @@ -536,15 +458,17 @@ do_more: /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) err = ret; - *pdquot_freed_blocks += group_freed; - - if (overflow && !err) { - block += count; - count = overflow; - goto do_more; - } + if (!err) + err = ret; sb->s_dirt = 1; + /* + * request to reload the buddy with the + * new bitmap information + */ + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + ext4_mb_update_group_info(grp, blocks_freed); + error_return: brelse(bitmap_bh); ext4_std_error(sb, err); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5125c1f6e7e..8021bf558d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1014,9 +1014,8 @@ extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); -extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1865d6a53de..526db73701b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -977,9 +977,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, struct buffer_head *bh; handle_t *handle; int err; - unsigned long freed_blocks; ext4_group_t group; - struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -1077,7 +1075,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, unlock_super(sb); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + /* We add the blocks to the bitmap and set the group need init bit */ + 
ext4_add_groupblocks(handle, sb, o_blocks_count, add); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) @@ -1120,12 +1119,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ClearPageUptodate(page); page_cache_release(page); } - - /* Get the info on the last group */ - grp = ext4_get_group_info(sb, group); - - /* Update free blocks in group info */ - ext4_mb_update_group_info(grp, add); } if (test_opt(sb, DEBUG)) From 920313a726e04fef0f2c0bcb04ad8229c0e700d8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:36:19 -0500 Subject: [PATCH 30/57] ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize The new groups added during resize are flagged as need_init group. Make sure we properly initialize these groups. When we have block size < page size and we are adding new groups the page may still be marked uptodate even though we haven't initialized the group. While forcing the init of buddy cache we need to make sure other groups part of the same page of buddy cache is not using the cache. group_info->alloc_sem is added to ensure the same. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" cc: stable@kernel.org --- fs/ext4/balloc.c | 21 ++-- fs/ext4/ext4.h | 7 +- fs/ext4/mballoc.c | 261 +++++++++++++++++++++++++++++++++++----------- fs/ext4/mballoc.h | 3 + fs/ext4/resize.c | 49 ++------- 5 files changed, 230 insertions(+), 111 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c54192e2384..404d81cc915 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -381,6 +381,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. 
@@ -425,7 +426,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - + /* + * make sure we don't allow a parallel init on other groups in the + * same buddy cache + */ + down_write(&grp->alloc_sem); for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), @@ -450,6 +455,13 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; spin_unlock(sb_bgl_lock(sbi, flex_group)); } + /* + * request to reload the buddy with the + * new bitmap information + */ + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + ext4_mb_update_group_info(grp, blocks_freed); + up_write(&grp->alloc_sem); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -461,13 +473,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, if (!err) err = ret; sb->s_dirt = 1; - /* - * request to reload the buddy with the - * new bitmap information - */ - grp = ext4_get_group_info(sb, block_group); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); - ext4_mb_update_group_info(grp, blocks_freed); error_return: brelse(bitmap_bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8021bf558d1..8152b5603f0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1060,12 +1060,13 @@ extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); -extern int ext4_mb_add_more_groupinfo(struct super_block *sb, +extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add); - - +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); +extern void ext4_mb_put_buddy_cache_lock(struct super_block *, + ext4_group_t, int); /* inode.c */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index edf9730ba72..d2b1bcaf88e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -886,18 +886,20 @@ static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; int blocks_per_page; int block; int pnum; int poff; struct page *page; int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; mb_debug("load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = ext4_get_group_info(sb, group); @@ -905,6 +907,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + e4b->alloc_semp = &grp->alloc_sem; + + /* Take the read lock on the group alloc + * sem. 
This would make sure a parallel + * ext4_mb_init_group happening on other + * groups mapped by the page is blocked + * till we are done with allocation + */ + down_read(e4b->alloc_semp); /* * the buddy cache inode stores the block bitmap @@ -920,6 +931,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, page = find_get_page(inode->i_mapping, pnum); if (page == NULL || !PageUptodate(page)) { if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { @@ -985,6 +1004,9 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; + + /* Done with the buddy cache */ + up_read(e4b->alloc_semp); return ret; } @@ -994,6 +1016,8 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); + /* Done with the buddy cache */ + up_read(e4b->alloc_semp); } @@ -1696,6 +1720,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } +/* + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. This + * make sure other parallel operation on the buddy + * cache doesn't happen whild holding the buddy cache + * lock + */ +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) +{ + int i; + int block, pnum; + int blocks_per_page; + int groups_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + down_write(&grp->alloc_sem); + } + return i; +} + +void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) +{ + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. 
This make sure there is + * no block allocation going on in any + * of that groups + */ + up_write(&grp->alloc_sem); + } + +} + +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + int ret; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; + + mb_debug("init group %lu\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures we don't add group + * to this buddy cache via resize + */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + ret = 0; + goto err; + } + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); + + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); + return ret; +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -1779,7 +1970,7 @@ repeat: group = 0; /* quick check to skip empty groups */ - grp = ext4_get_group_info(ac->ac_sb, group); + grp = ext4_get_group_info(sb, group); if (grp->bb_free == 0) continue; @@ -1792,10 +1983,9 @@ repeat: * we need full data about the group * to make a good selection */ - err = ext4_mb_load_buddy(sb, group, &e4b); + err = ext4_mb_init_group(sb, group); if (err) goto out; - ext4_mb_release_desc(&e4b); } /* @@ -2246,7 +2436,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) /* Create and initialize ext4_group_info data for the given group. 
*/ -static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i, len; @@ -2304,6 +2494,7 @@ static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK @@ -2330,54 +2521,6 @@ exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ -/* - * Add a group to the existing groups. - * This function is used for online resize - */ -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, - struct ext4_group_desc *desc) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - int blocks_per_page; - int block; - int pnum; - struct page *page; - int err; - - /* Add group based on group descriptor*/ - err = ext4_mb_add_groupinfo(sb, group, desc); - if (err) - return err; - - /* - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap - * datas) are set not up to date so that they will be re-initilaized - * during the next call to ext4_mb_load_buddy - */ - - /* Set buddy page as not up to date */ - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - block = group * 2; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Set bitmap page as not up to date */ - block++; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - return 0; -} - /* * Update an existing group. * This function is used for online resize @@ -4588,11 +4731,6 @@ do_more: err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - #ifdef AGGRESSIVE_CHECK { int i; @@ -4606,6 +4744,8 @@ do_more: /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto error_return; if (ac) { ac->ac_b_ex.fe_group = block_group; @@ -4614,6 +4754,9 @@ do_more: ext4_mb_store_history(ac); } + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; if (metadata && ext4_handle_valid(handle)) { /* blocks being freed are metadata. 
these blocks shouldn't * be used until this transaction is committed */ diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b5dff1fff1e..a931b6b4f6a 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -130,6 +131,7 @@ struct ext4_group_info { #ifdef DOUBLE_CHECK void *bb_bitmap; #endif + struct rw_semaphore alloc_sem; unsigned short bb_counters[]; }; @@ -250,6 +252,7 @@ struct ext4_buddy { struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; + struct rw_semaphore *alloc_semp; }; #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 526db73701b..92034d2c8a7 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -748,6 +748,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) struct inode *inode = NULL; handle_t *handle; int gdb_off, gdb_num; + int num_grp_locked = 0; int err, err2; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); @@ -788,6 +789,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } } + if ((err = verify_group_input(sb, input))) goto exit_put; @@ -856,6 +858,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * using the new disk blocks. */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdb_off * EXT4_DESC_SIZE(sb)); @@ -872,9 +875,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * We can allocate memory for mb_alloc based on the new group * descriptor */ - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); - if (err) + err = ext4_mb_add_groupinfo(sb, input->group, gdp); + if (err) { + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); goto exit_journal; + } /* * Make the new blocks and inodes valid next. We do this before @@ -916,6 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); ext4_handle_dirty_metadata(handle, NULL, primary); @@ -1082,45 +1088,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if ((err = ext4_journal_stop(handle))) goto exit_put; - /* - * Mark mballoc pages as not up to date so that they will be updated - * next time they are loaded by ext4_mb_load_buddy. - * - * XXX Bad, Bad, BAD!!! We should not be overloading the - * Uptodate flag, particularly on thte bitmap bh, as way of - * hinting to ext4_mb_load_buddy() that it needs to be - * overloaded. A user could take a LVM snapshot, then do an - * on-line fsck, and clear the uptodate flag, and this would - * not be a bug in userspace, but a bug in the kernel. FIXME!!! 
- */ - { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - int blocks_per_page; - int block; - int pnum; - struct page *page; - - /* Set buddy page as not up to date */ - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - block = group * 2; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Set bitmap page as not up to date */ - block++; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - } - if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); From c3a326a657562dab81acf05aee106dc1fe345eb4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Nov 2008 15:11:52 -0500 Subject: [PATCH 31/57] ext4: cleanup mballoc header files Move some of the forward declaration of the static functions to mballoc.c where they are used. This enables us to include mballoc.h in other .c files. Also correct the buddy cache documentation. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 23 +++++++++++++++++++---- fs/ext4/mballoc.h | 18 +----------------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d2b1bcaf88e..c17063ddb30 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -100,7 +100,7 @@ * inode as: * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we @@ -330,6 +330,16 @@ * object * */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); + + static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -716,7 +726,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, * stored in the inode as * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. @@ -1322,8 +1332,13 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; - /* XXXXXXX: SUCH A HORRIBLE **CK */ - /*FIXME!! Why ? */ + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. 
The reference is dropped + * in ext4_mb_release_context + */ ac->ac_bitmap_page = e4b->bd_bitmap_page; get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index a931b6b4f6a..997f78fff12 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -99,9 +99,6 @@ */ #define MB_DEFAULT_GROUP_PREALLOC 512 -static struct kmem_cache *ext4_pspace_cachep; -static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; struct ext4_free_data { /* this links the free block information from group_info */ @@ -262,25 +259,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) { return; } -#else -static void ext4_mb_store_history(struct ext4_allocation_context *ac); #endif #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group); -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, sector_t block, - int count); -static void ext4_mb_put_pa(struct ext4_allocation_context *, - struct super_block *, struct ext4_prealloc_space *pa); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); - static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { @@ -306,7 +290,7 @@ static inline int ext4_is_group_locked(struct super_block *sb, &(grinfo->bb_state)); } -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { ext4_fsblk_t block; From fb68407b0d9efba962c03f55009c797e22f024bc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Nov 2008 17:50:21 -0500 Subject: [PATCH 32/57] jbd2: Call journal commit callback without holding j_list_lock Avoid freeing the transaction in __jbd2_journal_drop_transaction() so the journal commit callback can run without holding j_list_lock, to avoid lock contention on this spinlock. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 13 ++++++++----- include/linux/jbd2.h | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index adc08ec875e..17159cacbd9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -682,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) safely remove this transaction from the log */ __jbd2_journal_drop_transaction(journal, transaction); + kfree(transaction); /* Just in case anybody was waiting for more transactions to be checkpointed... 
*/ @@ -756,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(journal->j_running_transaction != transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); - kfree(transaction); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f22d1828ea8..0ad84162c42 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -363,7 +363,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i; + int i, to_free = 0; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -1011,12 +1011,10 @@ restart_loop: journal->j_average_commit_time = commit_time; spin_unlock(&journal->j_state_lock); - if (journal->j_commit_callback) - journal->j_commit_callback(journal, commit_transaction); - if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); + to_free = 1; } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; @@ -1035,11 +1033,16 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", - journal->j_devname, journal->j_commit_sequence, + journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); + if (to_free) + kfree(commit_transaction); wake_up(&journal->j_wait_done_commit); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 004c9a8d63e..9d82084a160 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1179,8 +1179,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); void __jbd2_log_wait_for_space(journal_t *journal); -extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); -extern int jbd2_cleanup_journal_tail(journal_t *); +extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); +extern int jbd2_cleanup_journal_tail(journal_t *); /* Debugging code only: */ From 7a2fcbf7f85737735fd44eb34b62315bccf6d6e4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:36:55 -0500 Subject: [PATCH 33/57] ext4: don't use blocks freed but not yet committed in buddy cache init When we generate buddy cache (especially during resize) we need to make sure we don't use the blocks freed but not yet comitted. This makes sure we have the right value of free blocks count in the group info and also in the bitmap. 
This also ensures the ordered mode consistency Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 82 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c17063ddb30..860766421fe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -335,6 +335,8 @@ static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_ext_cachep; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); @@ -858,7 +860,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* * incore got set to the group block bitmap below */ + ext4_lock_group(sb, group); ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ @@ -872,6 +876,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -3471,6 +3476,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) return 0; } +/* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. + * buddy must be generated from this bitmap + * Need to be called with ext4 group lock (ext4_lock_group) + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, node); + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), + bitmap, entry->start_blk, + entry->count); + n = rb_next(n); + } + return; +} + /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap @@ -4568,12 +4599,13 @@ static int can_merge(struct ext4_free_data *entry1, static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, - ext4_group_t group, ext4_grpblk_t block, int count) + struct ext4_free_data *new_entry) { + ext4_grpblk_t block; + struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_data *entry, *new_entry; struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; @@ -4581,14 +4613,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); - new_entry->start_blk = block; - new_entry->group = group; - new_entry->count = count; - new_entry->t_tid = handle->h_transaction->t_tid; new_node = &new_entry->node; + block = new_entry->start_blk; - ext4_lock_group(sb, group); if (!*n) { /* first free block exent. 
We need to protect buddy cache from being freed, @@ -4606,7 +4633,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - ext4_unlock_group(sb, group); ext4_error(sb, __func__, "Double free of blocks %d (%d %d)", block, entry->start_blk, entry->count); @@ -4648,7 +4674,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, spin_lock(&sbi->s_md_lock); list_add(&new_entry->list, &handle->h_transaction->t_private_list); spin_unlock(&sbi->s_md_lock); - ext4_unlock_group(sb, group); return 0; } @@ -4753,15 +4778,6 @@ do_more: BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (err) - goto error_return; - if (ac) { ac->ac_b_ex.fe_group = block_group; ac->ac_b_ex.fe_start = bit; @@ -4773,11 +4789,29 @@ do_more: if (err) goto error_return; if (metadata && ext4_handle_valid(handle)) { - /* blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed */ - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = bit; + new_entry->group = block_group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); + ext4_mb_free_metadata(handle, &e4b, new_entry); + ext4_unlock_group(sb, block_group); } else { ext4_lock_group(sb, block_group); + /* need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); ext4_unlock_group(sb, block_group); @@ -4800,6 +4834,10 @@ do_more: *freed += count; + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); From b7be019e80da4db96d283734d55366014509911c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 23 Nov 2008 23:51:53 -0500 Subject: [PATCH 34/57] ext4: Fix lockdep recursive locking warning In ext4_mb_init_group(), if the filesystem block size is less than PAGE_SIZE/2, the code tries to grab alloc_sem for multiple block groups in a loop. We need to allow for this by using down_write_nested() and passing in the loop index as a lock subclass number. This works because no other code path needs to take multiple alloc_sem's. Note that lockdep will fail for filesystem blocksize smaller than to PAGE_SIZE/16k. (e.g., a 1k filesystem blocksize with a 32k page size, or a 2k filesystem blocksize with a 64k blocksize, etc.) 
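The arithmetic behind that note: each group keeps its block bitmap and its
buddy data in two consecutive buddy-cache blocks, so a single page maps
PAGE_SIZE / blocksize / 2 groups, and that is how many alloc_sem's the init
path nests inside each other. A small stand-alone sketch of the arithmetic
(user-space C; the limit of 8 subclasses is an assumption about lockdep's
MAX_LOCKDEP_SUBCLASSES, not something this patch defines):

#include <stdio.h>

#define MAX_LOCKDEP_SUBCLASSES 8        /* assumed lockdep limit */

static void check(unsigned int page_size, unsigned int blocksize)
{
        /* two buddy-cache blocks (bitmap + buddy) per group */
        unsigned int groups_per_page = page_size / blocksize / 2;

        if (groups_per_page == 0)
                groups_per_page = 1;
        printf("page %6u, block %5u -> %2u groups/page: %s\n",
               page_size, blocksize, groups_per_page,
               groups_per_page <= MAX_LOCKDEP_SUBCLASSES ?
                        "within lockdep's subclasses" :
                        "too many nested alloc_sem's for lockdep");
}

int main(void)
{
        check(4096, 1024);      /* typical case: fine             */
        check(32768, 1024);     /* 1k blocks, 32k pages: too many */
        check(65536, 2048);     /* 2k blocks, 64k pages: too many */
        return 0;
}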
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 860766421fe..0bf4c4c06b1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1780,7 +1780,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) * no block allocation going on in any * of that groups */ - down_write(&grp->alloc_sem); + down_write_nested(&grp->alloc_sem, i); } return i; } From 5d1b1b3f492f8696ea18950a454a141381b0f926 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 22:19:52 -0500 Subject: [PATCH 35/57] ext4: fix BUG when calling ext4_error with locked block group The mballoc code likes to call ext4_error while it is holding locked block groups. This can causes a scheduling in atomic context BUG. We can't just unlock the block group and relock it after/if ext4_error returns since that might result in race conditions in the case where the filesystem is set to continue after finding errors. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/mballoc.c | 30 +++++++++++++++--------------- fs/ext4/mballoc.h | 47 ----------------------------------------------- fs/ext4/super.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 105 insertions(+), 64 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8152b5603f0..f0b1db6acf8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1126,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext4_warning(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, + const char *, const char *, ...) 
+ __attribute__ ((format (printf, 4, 5))); extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -1249,6 +1252,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) return ; } +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + unsigned short bb_counters[]; +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_LOCKED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline int ext4_is_group_locked(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, + &(grinfo->bb_state)); +} + /* * Inodes and files operations */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0bf4c4c06b1..cda69632eea 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -457,8 +457,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr += first + i; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - - ext4_error(sb, __func__, "double-free of inode" + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", inode ? inode->i_ino : 0, blocknr, first + i, e4b->bd_group); @@ -702,7 +702,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_error(sb, __func__, + ext4_grp_locked_error(sb, group, __func__, "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", group, free, grp->bb_free); /* @@ -1095,8 +1095,6 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) -__releases(bitlock) -__acquires(bitlock) { int block = 0; int max = 0; @@ -1135,12 +1133,11 @@ __acquires(bitlock) blocknr += block; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_unlock_group(sb, e4b->bd_group); - ext4_error(sb, __func__, "double-free of inode" + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); - ext4_lock_group(sb, e4b->bd_group); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1623,7 +1620,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_error(sb, __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "%d free blocks as per " "group info. 
But bitmap says 0", free); break; @@ -1632,7 +1630,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_error(sb, __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "%d free blocks as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -3822,8 +3821,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_error(sb, __func__, "free %u, pa_free %u", - free, pa->pa_free); + ext4_grp_locked_error(sb, group, + __func__, "free %u, pa_free %u", + free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. @@ -4633,9 +4633,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - ext4_error(sb, __func__, - "Double free of blocks %d (%d %d)", - block, entry->start_blk, entry->count); + ext4_grp_locked_error(sb, e4b->bd_group, __func__, + "Double free of blocks %d (%d %d)", + block, entry->start_blk, entry->count); return 0; } } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 997f78fff12..95d4c7f29a8 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -118,27 +118,6 @@ struct ext4_free_data { tid_t t_tid; }; -struct ext4_group_info { - unsigned long bb_state; - struct rb_root bb_free_root; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - struct rw_semaphore alloc_sem; - unsigned short bb_counters[]; -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - - struct ext4_prealloc_space { struct list_head pa_inode_list; struct list_head pa_group_list; @@ -264,32 +243,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); - - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); -} - static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9dd1170bfe..2415e2b0970 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -366,6 +366,44 @@ void ext4_warning(struct super_block *sb, const char *function, va_end(args); } +void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, + const char *function, const char *fmt, ...) 
+__releases(bitlock) +__acquires(bitlock) +{ + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (test_opt(sb, ERRORS_CONT)) { + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, es, 0); + return; + } + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might what to use the return value from + * ext4_grp_locked_error() to distinguish beween the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + */ + ext4_lock_group(sb, grp); + return; +} + + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -2868,8 +2906,11 @@ static void ext4_commit_super(struct super_block *sb, set_buffer_uptodate(sbh); } es->s_wtime = cpu_to_le32(get_seconds()); - ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); - es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); + BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { From e8134b27e351e813414da3b95aa8eac6d3908088 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:38:26 -0500 Subject: [PATCH 36/57] ext4: Fix race between read_block_bitmap() and mark_diskspace_used() We need to make sure we update the block bitmap and clear EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held, since ext4_read_block_bitmap() looks at EXT4_BG_BLOCK_UNINIT to decide whether to initialize the block bitmap each time it is called (introduced by commit c806e68f), and this can race with block allocations in ext4_mb_mark_diskspace_used(). ext4_read_block_bitmap does: spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); Now on the block allocation side we do mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); .... spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ie on allocation we update the bitmap then we take the sb_bgl_lock and clear the EXT4_BG_BLOCK_UNINIT flag. What can happen is a parallel ext4_read_block_bitmap can zero out the bitmap in between the above mb_set_bits and spin_lock(sb_bg_lock..) The race results in below user visible errors EXT4-fs error (device sdb1): ext4_mb_release_inode_pa: free 100, pa_free 105 EXT4-fs error (device sdb1): mb_free_blocks: double-free of inode 0's block .. 
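As a deterministic stand-alone sketch of that window (user-space C,
illustration only; the real paths are the ext4_read_block_bitmap() and
ext4_mb_mark_diskspace_used() excerpts above, and the boolean here stands in
for EXT4_BG_BLOCK_UNINIT):

#include <stdio.h>
#include <stdbool.h>

static unsigned int bitmap;             /* in-core copy of the block bitmap */
static bool block_uninit = true;        /* models EXT4_BG_BLOCK_UNINIT      */

/* Reader: initialize the bitmap if the group is still marked uninit. */
static void read_block_bitmap(void)
{
        /* in the kernel this runs under sb_bgl_lock() */
        if (block_uninit)
                bitmap = 0;             /* ext4_init_block_bitmap()         */
}

/* Old ordering: bits set before the lock is taken and the flag cleared. */
static void alloc_old(void)
{
        bitmap |= 0x0000000F;           /* mb_set_bits() outside the lock   */
        read_block_bitmap();            /* racing reader slips in here      */
        block_uninit = false;           /* flag cleared under the lock      */
}

/* New ordering: bit update and flag clear both under sb_bgl_lock(). */
static void alloc_new(void)
{
        bitmap |= 0x0000000F;
        block_uninit = false;
        read_block_bitmap();            /* reader sees an initialized group */
}

int main(void)
{
        bitmap = 0; block_uninit = true;
        alloc_old();
        printf("old ordering: bitmap=%#x (allocated bits lost)\n", bitmap);

        bitmap = 0; block_uninit = true;
        alloc_new();
        printf("new ordering: bitmap=%#x (allocated bits kept)\n", bitmap);
        return 0;
}

Once both updates happen under the same sb_bgl_lock, a concurrent reader
either runs before the allocation (and initializes a bitmap that does not
yet contain the new bits) or after it (and finds the flag already cleared),
so the allocated bits can no longer be wiped by a re-initialization.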
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cda69632eea..d559a03f3eb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1070,7 +1070,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_clear_bit_atomic(lock, cur, bm); + if (lock) + mb_clear_bit_atomic(lock, cur, bm); + else + mb_clear_bit(cur, bm); cur++; } } @@ -1088,7 +1091,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_set_bit_atomic(lock, cur, bm); + if (lock) + mb_set_bit_atomic(lock, cur, bm); + else + mb_set_bit(cur, bm); cur++; } } @@ -3035,10 +3041,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + mb_set_bits(NULL, bitmap_bh->b_data, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); gdp->bg_free_blocks_count = From 560671a0d3c9ad2d647fa6d09375a262e1f19c4f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 22:20:24 -0500 Subject: [PATCH 37/57] ext4: Use high 16 bits of the block group descriptor's free counts fields Rename the lower bits with suffix _lo and add helper to access the values. Also rename bg_itable_unused_hi to bg_pad as in e2fsprogs. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 13 ++++---- fs/ext4/ext4.h | 26 ++++++++++++--- fs/ext4/ialloc.c | 83 +++++++++++++++++++++++++---------------------- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 15 +++++---- fs/ext4/resize.c | 4 +-- fs/ext4/super.c | 68 ++++++++++++++++++++++++++++++++++++-- 7 files changed, 149 insertions(+), 62 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 404d81cc915..902bf66c8df 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -102,9 +102,9 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, __func__, "Checksum bad for group %u", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); return 0; } @@ -372,7 +372,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, struct ext4_group_desc *desc; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int err = 0, ret; + int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; struct ext4_group_info *grp; @@ -444,7 +444,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, } } spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); @@ -685,7 +686,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, 
NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + desc_count += ext4_free_blks_count(sb, gdp); } return desc_count; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f0b1db6acf8..ec862f4ca89 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -156,12 +156,12 @@ struct ext4_group_desc __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ - __le16 bg_free_blocks_count; /* Free blocks count */ - __le16 bg_free_inodes_count; /* Free inodes count */ - __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ - __le16 bg_itable_unused; /* Unused inodes count */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ @@ -169,7 +169,7 @@ struct ext4_group_desc __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ - __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __u32 bg_reserved2[3]; }; @@ -1142,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); +extern __u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index cac3617ec78..11c4f6f5bd6 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,9 +76,9 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, __func__, "Checksum bad for group %u", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); return 0; } 
@@ -168,7 +168,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err; + int fatal = 0, err, count; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -236,9 +236,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (gdp) { spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_inodes_count, 1); - if (is_directory) - le16_add_cpu(&gdp->bg_used_dirs_count, -1); + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); @@ -291,13 +294,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, for (group = 0; group < ngroups; group++) { desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { + (ext4_free_blks_count(sb, desc) > + ext4_free_blks_count(sb, best_desc))) { *best_group = group; best_desc = desc; ret = 0; @@ -369,7 +372,7 @@ found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { desc = ext4_get_group_desc(sb, i, &bh); - if (le16_to_cpu(desc->bg_free_inodes_count)) { + if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; } @@ -443,17 +446,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + if (ext4_used_dirs_count(sb, desc) >= best_ndir) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + if (ext4_free_blks_count(sb, desc) < avefreeb) continue; *group = grp; ret = 0; - best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_ndir = ext4_used_dirs_count(sb, desc); } if (ret == 0) return ret; @@ -479,13 +482,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + if (ext4_used_dirs_count(sb, desc) >= max_dirs) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + if (ext4_free_inodes_count(sb, desc) < min_inodes) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + if (ext4_free_blks_count(sb, desc) < min_blocks) continue; return 0; } @@ -494,8 +497,8 @@ fallback: for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && desc->bg_free_inodes_count && - le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + if (desc && ext4_free_inodes_count(sb, 
desc) && + ext4_free_inodes_count(sb, desc) >= avefreei) return 0; } @@ -524,8 +527,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; /* @@ -548,8 +551,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; } @@ -562,7 +565,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && ext4_free_inodes_count(sb, desc)) return 0; } @@ -591,7 +594,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; - int ret2, err = 0; + int ret2, err = 0, count; struct inode *ret; ext4_group_t i; int free = 0; @@ -718,7 +721,7 @@ got: if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); - gdp->bg_free_blocks_count = cpu_to_le16(free); + ext4_free_blks_set(sb, gdp, free); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } @@ -753,7 +756,7 @@ got: free = 0; } else { free = EXT4_INODES_PER_GROUP(sb) - - le16_to_cpu(gdp->bg_itable_unused); + ext4_itable_unused_count(sb, gdp); } /* @@ -763,13 +766,15 @@ got: * */ if (ino > free) - gdp->bg_itable_unused = - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); } - le16_add_cpu(&gdp->bg_free_inodes_count, -1); + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); if (S_ISDIR(mode)) { - le16_add_cpu(&gdp->bg_used_dirs_count, 1); + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); @@ -987,7 +992,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (!bitmap_bh) @@ -995,7 +1000,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_inodes_count), x); + i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); @@ -1009,7 +1014,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; @@ -1026,7 +1031,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) 
continue; - count += le16_to_cpu(gdp->bg_used_dirs_count); + count += ext4_used_dirs_count(sb, gdp); } return count; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bcd5ffa76c0..56142accf5c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4014,7 +4014,7 @@ make_io: num = EXT4_INODES_PER_GROUP(sb); if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) - num -= le16_to_cpu(gdp->bg_itable_unused); + num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d559a03f3eb..3809a9348f2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2515,7 +2515,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, ext4_free_blocks_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); + ext4_free_blks_count(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); @@ -3046,12 +3046,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - gdp->bg_free_blocks_count = - cpu_to_le16(ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, - gdp)); + ext4_free_blks_set(sb, gdp, + ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); + len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); @@ -4823,7 +4823,8 @@ do_more: } spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_blocks_count, count); + ret = ext4_free_blks_count(sb, gdp) + count; + ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 92034d2c8a7..2d24f423fd8 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -866,8 +866,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); - gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); + ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2415e2b0970..a3321bf2231 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -93,6 +93,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } +__u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { @@ -117,6 +149,38 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } +void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + /* * Wrappers for jbd2_journal_start/end. * @@ -1561,9 +1625,9 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); sbi->s_flex_groups[flex_group].free_inodes += - le16_to_cpu(gdp->bg_free_inodes_count); + ext4_free_inodes_count(sb, gdp); sbi->s_flex_groups[flex_group].free_blocks += - le16_to_cpu(gdp->bg_free_blocks_count); + ext4_free_blks_count(sb, gdp); } return 1; From 3300beda523136f9f87821e4fba85c5c9e319645 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 3 Jan 2009 22:33:39 -0500 Subject: [PATCH 38/57] ext4: code cleanup Rename some variables. We also unlock locks in the reverse order we acquired as a part of cleanup. 
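For example, in ext4_read_block_bitmap() the uninitialized bitmap is filled in while both the buffer lock and the group spinlock are held; after this cleanup the locks are dropped in the reverse of the order they were taken. A rough sketch of the resulting shape (uptodate checks and error handling omitted):

	lock_buffer(bh);					/* acquire 1st */
	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));	/* acquire 2nd */
	ext4_init_block_bitmap(sb, bh, block_group, desc);
	set_buffer_uptodate(bh);
	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));	/* release 2nd */
	unlock_buffer(bh);					/* release 1st */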
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- fs/ext4/ialloc.c | 65 +++++++++++++++++++++++++---------------------- fs/ext4/mballoc.c | 2 +- 3 files changed, 37 insertions(+), 32 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 902bf66c8df..1b26b68aa42 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -329,8 +329,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_buffer_uptodate(bh); - unlock_buffer(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 11c4f6f5bd6..b47427a21f1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -124,8 +124,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); set_buffer_uptodate(bh); - unlock_buffer(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); @@ -585,8 +585,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) { struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; ext4_group_t group = 0; unsigned long ino = 0; struct inode *inode; @@ -634,41 +634,44 @@ got_group: for (i = 0; i < sbi->s_groups_count; i++) { err = -EIO; - gdp = ext4_get_group_desc(sb, group, &bh2); + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto fail; - brelse(bitmap_bh); - bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!bitmap_bh) + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!inode_bitmap_bh) goto fail; ino = 0; repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) - bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + if (ino < EXT4_INODES_PER_GROUP(sb)) { - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + inode_bitmap_bh); if (err) goto fail; if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) { + ino, inode_bitmap_bh->b_data)) { /* we won it */ - BUFFER_TRACE(bitmap_bh, + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, - inode, - bitmap_bh); + inode, + inode_bitmap_bh); if (err) goto fail; goto got; } /* we lost it */ - ext4_handle_release_buffer(handle, bitmap_bh); + ext4_handle_release_buffer(handle, inode_bitmap_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -699,19 +702,21 @@ got: goto fail; } - BUFFER_TRACE(bh2, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh2); - if (err) goto fail; + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) + goto fail; /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); + struct buffer_head *block_bitmap_bh; - BUFFER_TRACE(block_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bh); + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { - brelse(block_bh); + brelse(block_bitmap_bh); goto fail; } @@ -719,8 +724,8 @@ got: spin_lock(sb_bgl_lock(sbi, group)); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, free); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); @@ -729,12 +734,12 @@ got: /* Don't need to dirty bitmap block if we didn't change it */ if (free) { - BUFFER_TRACE(block_bh, "dirty block bitmap"); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, - NULL, block_bh); + NULL, block_bitmap_bh); } - brelse(block_bh); + brelse(block_bitmap_bh); if (err) goto fail; } @@ -778,8 +783,8 @@ got: } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, bh2); + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); @@ -881,7 +886,7 @@ out: iput(inode); ret = ERR_PTR(err); really_out: - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ret; fail_free_drop: @@ -893,7 +898,7 @@ fail_drop: inode->i_nlink = 0; unlock_new_inode(inode); iput(inode); - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ERR_PTR(err); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3809a9348f2..aac33590ac6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -804,8 +804,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_buffer_uptodate(bh[i]); - unlock_buffer(bh[i]); spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + unlock_buffer(bh[i]); continue; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); From 393418676a7602e1d7d3f6e560159c65c8cbd50e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:38:14 -0500 Subject: [PATCH 39/57] ext4: Fix the race between read_inode_bitmap() and ext4_new_inode() We need to make sure we update the inode bitmap and clear EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held, since ext4_read_inode_bitmap() looks at EXT4_BG_INODE_UNINIT to decide whether to initialize the inode bitmap each time it is called. (introduced by commit c806e68f.) ext4_read_inode_bitmap does: spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); and ext4_new_inode does if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), ino, inode_bitmap_bh->b_data)) ...... ... spin_lock(sb_bgl_lock(sbi, group)); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); i.e., on allocation we update the bitmap then we take the sb_bgl_lock and clear the EXT4_BG_INODE_UNINIT flag. 
What can happen is that a parallel ext4_read_inode_bitmap can zero out the bitmap in between the above ext4_set_bit_atomic and spin_lock(sb_bgl_lock..) The race results in the following user-visible errors: EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449 EXT4-fs warning (device sdb1): ext4_unlink: Deleting nonexistent file ... EXT4-fs warning (device sdb1): ext4_rmdir: empty directory has too many links ... # ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71 ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ialloc.c | 146 ++++++++++++++++++++++++++++------------------- 1 file changed, 86 insertions(+), 60 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b47427a21f1..d4e544f30be 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -572,6 +572,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent, return -1; } +/* + * claim the inode from the inode bitmap. If the group + * is uninit we need to take the group's sb_bgl_lock + * and clear the uninit flag. The inode bitmap update + * and group desc uninit flag clear should be done + * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * doesn't race with the ext4_claim_inode + */ +static int ext4_claim_inode(struct super_block *sb, + struct buffer_head *inode_bitmap_bh, + unsigned long ino, ext4_group_t group, int mode) +{ + int free = 0, retval = 0, count; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + spin_lock(sb_bgl_lock(sbi, group)); + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + /* not a free inode */ + retval = 1; + goto err_ret; + } + ino++; + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || + ino > EXT4_INODES_PER_GROUP(sb)) { + spin_unlock(sb_bgl_lock(sbi, group)); + ext4_error(sb, __func__, + "reserved inode or inode > inodes count - " + "block_group = %u, inode=%lu", group, + ino + group * EXT4_INODES_PER_GROUP(sb)); + return 1; + } + /* If we didn't allocate from within the initialized part of the inode + * table then we need to initialize up to this inode. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + /* When marking the block group with + * ~EXT4_BG_INODE_UNINIT we don't want to depend + * on the value of bg_itable_unused even though + * mke2fs could have initialized the same for us. + * Instead we calculated the value below + */ + + free = 0; + } else { + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + * + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + } + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); + if (S_ISDIR(mode)) { + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +err_ret: + spin_unlock(sb_bgl_lock(sbi, group)); + return retval; +} + /* * There are two policies for allocating an inode.
If the new inode is * a directory, then a forward search is made for a block group with both @@ -594,7 +667,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; - int ret2, err = 0, count; + int ret2, err = 0; struct inode *ret; ext4_group_t i; int free = 0; @@ -658,8 +731,13 @@ repeat_in_this_group: if (err) goto fail; - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, inode_bitmap_bh->b_data)) { + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + group_desc_bh); + if (err) + goto fail; + if (!ext4_claim_inode(sb, inode_bitmap_bh, + ino, group, mode)) { /* we won it */ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); @@ -668,10 +746,13 @@ repeat_in_this_group: inode_bitmap_bh); if (err) goto fail; + /* zero bit is inode number 1*/ + ino++; goto got; } /* we lost it */ ext4_handle_release_buffer(handle, inode_bitmap_bh); + ext4_handle_release_buffer(handle, group_desc_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -691,22 +772,6 @@ repeat_in_this_group: goto out; got: - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_error(sb, __func__, - "reserved inode or inode > inodes count - " - "block_group = %u, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - err = -EIO; - goto fail; - } - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; - /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -743,49 +808,10 @@ got: if (err) goto fail; } - - spin_lock(sb_bgl_lock(sbi, group)); - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - } - - /* - * Check the relative inode number against the last used - * relative inode number in this group. 
if it is greater - * we need to update the bg_itable_unused count - * - */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - } - - count = ext4_free_inodes_count(sb, gdp) - 1; - ext4_free_inodes_set(sb, gdp, count); - if (S_ISDIR(mode)) { - count = ext4_used_dirs_count(sb, gdp) + 1; - ext4_used_dirs_set(sb, gdp, count); - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - spin_unlock(sb_bgl_lock(sbi, group)); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) goto fail; + if (err) + goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) From 2ccb5fb9f113dae969d1ae9b6c10e80fa34f8cd3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:49:55 -0500 Subject: [PATCH 40/57] ext4: Use new buffer_head flag to check uninit group bitmaps initialization For uninit block group, the on-disk bitmap is not initialized. That implies we cannot depend on the uptodate flag on the bitmap buffer_head to find bitmap validity. Use a new buffer_head flag which would be set after we properly initialize the bitmap. This also prevents (re-)initializing the uninit group bitmap every time we call ext4_read_block_bitmap(). Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/balloc.c | 25 +++++++++++++++++++++++-- fs/ext4/ext4.h | 18 ++++++++++++++++++ fs/ext4/ialloc.c | 24 ++++++++++++++++++++++-- fs/ext4/mballoc.c | 24 ++++++++++++++++++++++-- 4 files changed, 85 insertions(+), 6 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1b26b68aa42..6bba06b09dd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -320,20 +320,41 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) block_group, bitmap_blk); return NULL; } - if (buffer_uptodate(bh) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) + + if (bitmap_uptodate(bh)) return bh; lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ec862f4ca89..695b45cc34e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -19,6 +19,7 @@ #include #include #include +#include #include "ext4_i.h" /* @@ -1352,6 +1353,23 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +/* + * Add new method to test wether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. 
We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index d4e544f30be..7b12aedc531 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -115,20 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) block_group, bitmap_blk); return NULL; } - if (buffer_uptodate(bh) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + if (bitmap_uptodate(bh)) return bh; lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index aac33590ac6..18a52d39d09 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -794,22 +794,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (bh[i] == NULL) goto out; - if (buffer_uptodate(bh[i]) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) + if (bitmap_uptodate(bh[i])) continue; lock_buffer(bh[i]); + if (bitmap_uptodate(bh[i])) { + unlock_buffer(bh[i]); + continue; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); + set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); unlock_buffer(bh[i]); continue; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + if (buffer_uptodate(bh[i])) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh[i]); + unlock_buffer(bh[i]); + continue; + } get_bh(bh[i]); + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); mb_debug("read bitmap for group %u\n", first_group + i); From 648f5879f5892dddd3ba71cd0d285599f40f2512 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:46:04 -0500 Subject: [PATCH 41/57] ext4: mark the blocks/inode bitmap beyond end of group as used We need to mark the block/inode bitmap beyond the end of the group with '1'. 
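Both bitmaps are now padded out to the end of the on-disk bitmap block (sb->s_blocksize * 8 bits) rather than stopping at the per-group count, so every bit past the last valid block or inode always reads as allocated. A simplified sketch of what the mark_bitmap_end() call is asked to do; mark_bitmap_end_sketch is a hypothetical stand-in, and the in-tree helper byte-fills the aligned middle of the range instead of looping bit by bit:

	static void mark_bitmap_end_sketch(int start_bit, int end_bit, char *bitmap)
	{
		int i;

		/* set every padding bit past the last valid entry */
		for (i = start_bit; i < end_bit; i++)
			ext4_set_bit(i, bitmap);
	}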
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ialloc.c | 2 +- fs/ext4/mballoc.c | 4 ++-- fs/ext4/resize.c | 6 ++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7b12aedc531..e3aa3fa3860 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); return EXT4_INODES_PER_GROUP(sb); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 18a52d39d09..7d7f6f91d55 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3038,8 +3038,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, in_range(block + len - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, __func__, - "Allocating block in system zone - block = %llu", - block); + "Allocating block %llu in system zone of %d group\n", + block, ac->ac_b_ex.fe_group); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 2d24f423fd8..c328be5d688 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb, if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), - bh->b_data); + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); - /* Mark unused entries in inode bitmap used */ ext4_debug("clear inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, input->inode_bitmap - start); @@ -297,7 +295,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_journal; } - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); ext4_handle_dirty_metadata(handle, NULL, bh); exit_bh: From 8556e8f3b6c4c11601ce1e9ea8090a6d8bd5daae Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:46:55 -0500 Subject: [PATCH 42/57] ext4: Don't allow new groups to be added during block allocation After we mark the blocks in the buddy cache as allocated, we need to ensure that we don't reinit the buddy cache until the block bitmap is updated. 
This commit achieves this by holding the group_info alloc_semaphore till ext4_mb_release_context Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 16 +++++++++++++--- fs/ext4/mballoc.h | 5 +++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7d7f6f91d55..0c7e247f714 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1052,7 +1052,8 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b) if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); /* Done with the buddy cache */ - up_read(e4b->alloc_semp); + if (e4b->alloc_semp) + up_read(e4b->alloc_semp); } @@ -1371,7 +1372,9 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - + /* on allocation we use ac to track the held semaphore */ + ac->alloc_semp = e4b->alloc_semp; + e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { spin_lock(&sbi->s_md_lock); @@ -4289,6 +4292,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_pa = NULL; ac->ac_bitmap_page = NULL; ac->ac_buddy_page = NULL; + ac->alloc_semp = NULL; ac->ac_lg = NULL; /* we have to define context: we'll we work with a file or @@ -4469,6 +4473,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) } ext4_mb_put_pa(ac, ac->ac_sb, pa); } + if (ac->alloc_semp) + up_read(ac->alloc_semp); if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); if (ac->ac_buddy_page) @@ -4569,10 +4575,14 @@ repeat: ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } - if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 95d4c7f29a8..10a2921baf1 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -195,6 +195,11 @@ struct ext4_allocation_context { __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; + /* + * pointer to the held semaphore upon successful + * block allocation + */ + struct rw_semaphore *alloc_semp; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; From 29eaf024980e07cc01f31ae4ea5d68c917f4b7da Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:48:56 -0500 Subject: [PATCH 43/57] ext4: Init the complete page while building buddy cache We need to init the complete page during buddy cache init by setting the contents to '1'. 
Otherwise we can see the following errors after doing an online resize of the filesystem: EXT4-fs error (device sdb1): ext4_mb_mark_diskspace_used: Allocating block 1040385 in system zone of 127 group Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0c7e247f714..fd2294de404 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -846,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) err = 0; first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { int group; struct ext4_group_info *grinfo; @@ -872,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore == NULL); mb_debug("put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); - memset(data, 0xff, blocksize); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, From 0087d9fb3f29f59e8d42c8b058376d80e5adde4c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 5 Jan 2009 21:49:12 -0500 Subject: [PATCH 44/57] ext4: Fix s_dirty_blocks_counter if block allocation failed with nodelalloc With nodelalloc option we need to update the dirty block counter on block allocation failure. This is needed because we increment the dirty block counter early in the block allocation phase. Without the patch s_dirty_blocks_counter goes wrong so that filesystem's free blocks decreases incorrectly. Tested-by: Akira Fujita Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index fd2294de404..05d9f81956c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4541,7 +4541,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } if (ar->len == 0) { *errp = -EDQUOT; - return 0; + goto out3; } inquota = ar->len; @@ -4614,6 +4614,13 @@ out2: out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); +out3: + if (!ar->len) { + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + reserv_blks); + } return block; } From 87d8fe1ee6b8d2f95076142d58c440dba4e7bdc2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 09:47:09 -0500 Subject: [PATCH 45/57] add releasepage hooks to block devices which can be used by file systems Implement blkdev_releasepage() to release the buffer_heads and pages after we release private data belonging to a mounted filesystem. Cc: Toshiyuki Okajima Cc: linux-fsdevel@vger.kernel.org Signed-off-by: "Theodore Ts'o" --- fs/block_dev.c | 15 +++++++++++++++ fs/super.c | 2 ++ include/linux/fs.h | 2 ++ 3 files changed, 19 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 349a26c1000..1dd07e66e98 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1220,6 +1220,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +/* + * Try to release a page associated with block device when the system + * is under memory pressure. 
+ */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .writepage = blkdev_writepage, @@ -1227,6 +1241,7 @@ static const struct address_space_operations def_blk_aops = { .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = generic_writepages, + .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, }; diff --git a/fs/super.c b/fs/super.c index ddba069d7a9..d5fd4498548 100644 --- a/fs/super.c +++ b/fs/super.c @@ -800,6 +800,7 @@ int get_sb_bdev(struct file_system_type *fs_type, } s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; } return simple_set_mnt(mnt, s); @@ -819,6 +820,7 @@ void kill_block_super(struct super_block *sb) struct block_device *bdev = sb->s_bdev; fmode_t mode = sb->s_mode; + bdev->bd_super = 0; generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_exclusive(bdev, mode); diff --git a/include/linux/fs.h b/include/linux/fs.h index f2a3010140e..0f54ae0f0cc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -565,6 +565,7 @@ struct address_space { struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ struct inode * bd_inode; /* will die */ + struct super_block * bd_super; int bd_openers; struct mutex bd_mutex; /* open/close mutex */ struct semaphore bd_mount_sem; @@ -1385,6 +1386,7 @@ struct super_operations { ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif + int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); }; /* From 6b082b531228c43d454c082fc0f969da1695b060 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Mon, 5 Jan 2009 22:38:14 -0500 Subject: [PATCH 46/57] ext3: provide function to release metadata pages under memory pressure Pages in the page cache belonging to ext3 data files are released via the ext3_releasepage() function specified in the ext3 inode's address_space_ops. However, metadata blocks (such as indirect blocks, directory blocks, etc) are managed via the block device address_space_ops, and they can not be released by try_to_free_buffers() if they have a journal head attached to them. To address this, we supply a try_to_free_pages() function which calls journal_try_to_free_buffers() function to free the metadata, and which is called by the block device's blkdev_releasepage() function. Signed-off-by: Toshiyuki Okajima Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org --- fs/ext3/super.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 541d5e4f7f6..6900ff05e3a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -682,6 +682,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, ext3_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. 
+ */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) @@ -746,6 +766,7 @@ static const struct super_operations ext3_sops = { .quota_read = ext3_quota_read, .quota_write = ext3_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext3_export_ops = { From c39a7f84d7845aa95d1c7c168f38215aedcc13c2 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Mon, 5 Jan 2009 22:38:48 -0500 Subject: [PATCH 47/57] ext4: provide function to release metadata pages under memory pressure Pages in the page cache belonging to ext4 data files are released via the ext4_releasepage() function specified in the ext4 inode's address_space_ops. However, metadata blocks (such as indirect blocks, directory blocks, etc) are managed via the block device address_space_ops, and they can not be released by try_to_free_buffers() if they have a journal head attached to them. To address this, we supply a release_metadata function which calls jbd2_journal_try_to_free_buffers() function to free the metadata, and which is called by the block device's blkdev_releasepage() function. Signed-off-by: Toshiyuki Okajima Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org --- fs/ext4/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a3321bf2231..e5ab520724d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -907,6 +907,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) @@ -971,6 +990,7 @@ static const struct super_operations ext4_sops = { .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext4_export_ops = { From c31910672376dfb8d020e32afa7249763bcd924a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 11:14:25 -0500 Subject: [PATCH 48/57] ext4: Remove code to create the journal inode This code has been obsolete in quite some time, since the supported method for adding a journal inode is to use tune2fs (or to creating new filesystem with a journal via mke2fs or mkfs.ext4). 
Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 4 -- fs/ext4/super.c | 68 ++-------------------------- fs/jbd2/journal.c | 72 ------------------------------ include/linux/jbd2.h | 1 - 4 files changed, 4 insertions(+), 141 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index e3fcbea3ec8..9ec29d86ff8 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -149,10 +149,6 @@ journal_async_commit Commit block can be written to disk without waiting journal=update Update the ext4 file system's journal to the current format. -journal=inum When a journal already exists, this option is ignored. - Otherwise, it specifies the number of the inode which - will represent the ext4 file system's journal file. - journal_dev=devnum When the external journal device's major/minor numbers have changed, this option allows the user to specify the new journal location. The journal device is diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5ab520724d..8036392b212 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_create_journal(struct super_block *, struct ext4_super_block *, - unsigned int); static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, @@ -1006,7 +1004,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, @@ -1048,7 +1046,6 @@ static const match_table_t tokens = { {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -1102,7 +1099,7 @@ static ext4_fsblk_t get_sb_block(void **data) } static int parse_options(char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, + unsigned long *journal_devnum, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1226,16 +1223,6 @@ static int parse_options(char *options, struct super_block *sb, } set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; - case Opt_journal_inum: - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; case Opt_journal_dev: if (is_remount) { printk(KERN_ERR "EXT4-fs: cannot specify " @@ -2035,7 +2022,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned int journal_inum = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; @@ -2155,8 +2141,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, DELALLOC); - if 
(!parse_options((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2460,9 +2445,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } } - } else if (journal_inum) { - if (ext4_create_journal(sb, es, journal_inum)) - goto failed_mount3; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { printk(KERN_ERR "EXT4-fs: required journal recovery " @@ -2926,48 +2908,6 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static int ext4_create_journal(struct super_block *sb, - struct ext4_super_block *es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " - "create journal.\n"); - return -EROFS; - } - - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", - journal_inum); - - err = jbd2_journal_create(journal); - if (err) { - printk(KERN_ERR "EXT4-fs: error creating journal.\n"); - jbd2_journal_destroy(journal); - return -EIO; - } - - EXT4_SB(sb)->s_journal = journal; - - ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; - - /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); - - return 0; -} - static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { @@ -3209,7 +3149,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 34ef9805720..b10d7283ba5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -66,7 +66,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); -EXPORT_SYMBOL(jbd2_journal_create); EXPORT_SYMBOL(jbd2_journal_load); EXPORT_SYMBOL(jbd2_journal_destroy); EXPORT_SYMBOL(jbd2_journal_abort); @@ -1162,77 +1161,6 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -/** - * int jbd2_journal_create() - Initialise the new journal file - * @journal: Journal to create. This structure must have been initialised - * - * Given a journal_t structure which tells us which disk blocks we can - * use, create a new journal superblock and initialise all of the - * journal fields from scratch. - **/ -int jbd2_journal_create(journal_t *journal) -{ - unsigned long long blocknr; - struct buffer_head *bh; - journal_superblock_t *sb; - int i, err; - - if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) { - printk (KERN_ERR "Journal length (%d blocks) too short.\n", - journal->j_maxlen); - journal_fail_superblock(journal); - return -EINVAL; - } - - if (journal->j_inode == NULL) { - /* - * We don't know what block to start at! 
- */ - printk(KERN_EMERG - "%s: creation of journal on external device!\n", - __func__); - BUG(); - } - - /* Zero out the entire journal on disk. We cannot afford to - have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */ - jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); - for (i = 0; i < journal->j_maxlen; i++) { - err = jbd2_journal_bmap(journal, i, &blocknr); - if (err) - return err; - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - lock_buffer(bh); - memset (bh->b_data, 0, journal->j_blocksize); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - __brelse(bh); - } - - sync_blockdev(journal->j_dev); - jbd_debug(1, "JBD: journal cleared.\n"); - - /* OK, fill in the initial static fields in the new superblock */ - sb = journal->j_superblock; - - sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - - sb->s_blocksize = cpu_to_be32(journal->j_blocksize); - sb->s_maxlen = cpu_to_be32(journal->j_maxlen); - sb->s_first = cpu_to_be32(1); - - journal->j_transaction_sequence = 1; - - journal->j_flags &= ~JBD2_ABORT; - journal->j_format_version = 2; - - return journal_reset(journal); -} - /** * void jbd2_journal_update_superblock() - Update journal sb on disk. * @journal: The journal to update. diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 9d82084a160..adef1c9940d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1104,7 +1104,6 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); -extern int jbd2_journal_create (journal_t *); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); From ba80b1019aa722b24506db1ee755e0bb2f513022 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 20:03:21 -0500 Subject: [PATCH 49/57] ext4: Add markers for better debuggability Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 9 ++++++++ fs/ext4/inode.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/mballoc.c | 54 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 116 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e3aa3fa3860..369c34c6429 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -210,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); + trace_mark(ext4_free_inode, + "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu", + sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long) inode->i_uid, (unsigned long) inode->i_gid, + (unsigned long long) inode->i_blocks); /* * Note: we must free any quota before locking the superblock, @@ -698,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, + dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -925,6 +932,8 @@ got: } ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", + sb->s_id, inode->i_ino, dir->i_ino, mode); goto really_out; fail: ext4_std_error(sb, err); diff --git a/fs/ext4/inode.c 
b/fs/ext4/inode.c index 56142accf5c..4cac8da4e0c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1351,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; + trace_mark(ext4_write_begin, + "dev %s ino %lu pos %llu len %u flags %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, flags); index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1422,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file, struct inode *inode = mapping->host; int ret = 0, ret2; + trace_mark(ext4_ordered_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1460,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file, int ret = 0, ret2; loff_t new_i_size; + trace_mark(ext4_writeback_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); @@ -1495,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; loff_t new_i_size; + trace_mark(ext4_journalled_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -2311,6 +2327,9 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; + trace_mark(ext4_da_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2421,6 +2440,20 @@ static int ext4_da_writepages(struct address_space *mapping, int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + trace_mark(ext4_da_writepages, + "dev %s ino %lu nr_t_write %ld " + "pages_skipped %ld range_start %llu " + "range_end %llu nonblocking %d " + "for_kupdate %d for_reclaim %d " + "for_writepages %d range_cyclic %d", + inode->i_sb->s_id, inode->i_ino, + wbc->nr_to_write, wbc->pages_skipped, + (unsigned long long) wbc->range_start, + (unsigned long long) wbc->range_end, + wbc->nonblocking, wbc->for_kupdate, + wbc->for_reclaim, wbc->for_writepages, + wbc->range_cyclic); + /* * No pages to write? 
This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -2539,6 +2572,14 @@ out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; + trace_mark(ext4_da_writepage_result, + "dev %s ino %lu ret %d pages_written %d " + "pages_skipped %ld congestion %d " + "more_io %d no_nrwrite_index_update %d", + inode->i_sb->s_id, inode->i_ino, ret, + pages_written, wbc->pages_skipped, + wbc->encountered_congestion, wbc->more_io, + wbc->no_nrwrite_index_update); return ret; } @@ -2590,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; + + trace_mark(ext4_da_write_begin, + "dev %s ino %lu pos %llu len %u flags %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, flags); retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2679,6 +2725,10 @@ static int ext4_da_write_end(struct file *file, } } + trace_mark(ext4_da_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -2892,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; + trace_mark(ext4_normal_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2977,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; + trace_mark(ext4_journalled_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 05d9f81956c..918aec0c8a1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2878,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + entry->start_blk + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, - (unsigned long long) discard_block, entry->count); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", + sb->s_id, (unsigned long long) discard_block, + entry->count); sb_issue_discard(sb, discard_block, entry->count); kmem_cache_free(ext4_free_ext_cachep, entry); @@ -3697,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_mark(ext4_mb_new_inode_pa, + "dev %s ino %lu pstart %llu len %u lstart %u", + sb->s_id, ac->ac_inode->i_ino, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3754,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_linear = 1; mb_debug("new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", + sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, 
&EXT4_SB(sb)->s_mb_preallocated); @@ -3807,12 +3814,14 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, unsigned int next; ext4_group_t group; ext4_grpblk_t bit; + unsigned long long grp_blk_start; sector_t start; int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; @@ -3842,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_mb_store_history(ac); } + trace_mark(ext4_mb_release_inode_pa, + "dev %s ino %lu block %llu count %u", + sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3875,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; + trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", + sb->s_id, pa->pa_pstart, pa->pa_len); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -4040,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode) } mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, + inode->i_ino); INIT_LIST_HEAD(&list); @@ -4492,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) int ret; int freed = 0; + trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", + sb->s_id, needed); for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; @@ -4520,6 +4539,18 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, sb = ar->inode->i_sb; sbi = EXT4_SB(sb); + trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " + "lblk %llu goal %llu lleft %llu lright %llu " + "pleft %llu pright %llu ", + sb->s_id, ar->flags, ar->len, + ar->inode ? ar->inode->i_ino : 0, + (unsigned long long) ar->logical, + (unsigned long long) ar->goal, + (unsigned long long) ar->lleft, + (unsigned long long) ar->lright, + (unsigned long long) ar->pleft, + (unsigned long long) ar->pright); + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { /* * With delalloc we already reserved the blocks @@ -4622,6 +4653,19 @@ out3: reserv_blks); } + trace_mark(ext4_allocate_blocks, + "dev %s block %llu flags %u len %u ino %lu " + "logical %llu goal %llu lleft %llu lright %llu " + "pleft %llu pright %llu ", + sb->s_id, (unsigned long long) block, + ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0, + (unsigned long long) ar->logical, + (unsigned long long) ar->goal, + (unsigned long long) ar->lleft, + (unsigned long long) ar->lright, + (unsigned long long) ar->pleft, + (unsigned long long) ar->pright); + return block; } @@ -4755,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %lu\n", block); + trace_mark(ext4_free_blocks, + "dev %s block %llu count %lu metadata %d ino %lu", + sb->s_id, (unsigned long long) block, count, metadata, + inode ? 
inode->i_ino : 0); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { From 4a9bf99b205448ec1f0cbdee1776a29f9c503ce4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 22:56:44 -0500 Subject: [PATCH 50/57] jbd2: Add pid and journal device name to the "kjournald2 starting" message Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b10d7283ba5..fe20e40ee7c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -131,8 +131,9 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); - printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", - journal->j_commit_interval / HZ); + printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, " + "commit interval %ld seconds\n", current->pid, + journal->j_devname, journal->j_commit_interval / HZ); /* * And now, wait forever for commit wakeup events. From 40a1984d22294ab202f616e432bb8d3481897675 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 4 Jan 2009 19:55:57 -0500 Subject: [PATCH 51/57] jbd2: Submit writes to the journal using WRITE_SYNC Since we will be waiting the write of the commit record to the journal to complete in journal_submit_commit_record(), submit it using WRITE_SYNC. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0ad84162c42..073124a29b8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal, set_buffer_ordered(bh); barrier_done = 1; } - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); if (barrier_done) clear_buffer_ordered(bh); @@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal, lock_buffer(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } *cbh = bh; return ret; From b3881f74b31b7d47d0f1c4d89ac3e7f0b9c05e3e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:46:26 -0500 Subject: [PATCH 52/57] ext4: Add mount option to set kjournald's I/O priority Signed-off-by: "Theodore Ts'o" Cc: Jens Axboe --- Documentation/filesystems/ext4.txt | 7 +++++++ fs/ext4/super.c | 29 +++++++++++++++++++++++++---- fs/ioprio.c | 3 ++- include/linux/ioprio.h | 2 ++ 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 9ec29d86ff8..8938949b201 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -308,6 +308,13 @@ min_batch_time=usec This parameter sets the commit time (as multi-threaded, synchronous workloads on very fast disks, at the cost of increasing latency. +journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the + highest priorty) which should be used for I/O + operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is + a slightly higher priority than the default I/O + priority. 
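To make the prio numbers concrete: the mount option's 0-7 level is packed together with an I/O scheduling class before being applied to the kjournald2 task via set_task_ioprio(), as the hunks below show. The lines that follow are an illustrative worked example using the existing macros from include/linux/ioprio.h; nothing in them is introduced by this patch.

#include <linux/ioprio.h>

/* journal_ioprio=3 (the default) in the best-effort class packs to:
 *   IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)
 *     == (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 3
 *     == (2 << 13) | 3 == 0x4003
 * journal_ioprio=0 would give 0x4000, the most favoured best-effort level. */
unsigned int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3);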
+ Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8036392b212..8ff8709828f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1013,7 +1013,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_inode_readahead_blks + Opt_inode_readahead_blks, Opt_journal_ioprio }; static const match_table_t tokens = { @@ -1074,6 +1074,7 @@ static const match_table_t tokens = { {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_err, NULL}, }; @@ -1098,8 +1099,11 @@ static ext4_fsblk_t get_sb_block(void **data) return sb_block; } +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static int parse_options(char *options, struct super_block *sb, unsigned long *journal_devnum, + unsigned int *journal_ioprio, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1492,6 +1496,14 @@ set_qf_format: return 0; sbi->s_inode_readahead_blks = option; break; + case Opt_journal_ioprio: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 7) + break; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, + option); + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -2035,6 +2047,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int features; __u64 blocks_count; int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2141,7 +2154,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2506,6 +2520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: @@ -3127,6 +3142,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err; #ifdef CONFIG_QUOTA int i; @@ -3145,11 +3161,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) for (i = 0; i < MAXQUOTAS; i++) old_opts.s_qf_names[i] = sbi->s_qf_names[i]; #endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; /* * Allow the "check" option to be passed as a remount option. 
*/ - if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, + &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } @@ -3162,8 +3181,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) es = sbi->s_es; - if (sbi->s_journal) + if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { diff --git a/fs/ioprio.c b/fs/ioprio.c index 3569e0ad86a..1a39ac37094 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -27,7 +27,7 @@ #include #include -static int set_task_ioprio(struct task_struct *task, int ioprio) +int set_task_ioprio(struct task_struct *task, int ioprio) { int err; struct io_context *ioc; @@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) task_unlock(task); return err; } +EXPORT_SYMBOL_GPL(set_task_ioprio); asmlinkage long sys_ioprio_set(int which, int who, int ioprio) { diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index f98a656b17e..76dad480884 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -86,4 +86,6 @@ static inline int task_nice_ioclass(struct task_struct *task) */ extern int ioprio_best(unsigned short aprio, unsigned short bprio); +extern int set_task_ioprio(struct task_struct *task, int ioprio); + #endif From 4ec110281379826c5cf6ed14735e47027c3c5765 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 14:53:26 -0500 Subject: [PATCH 53/57] ext4: Add sanity checks for the superblock before mounting the filesystem This avoids insane superblock configurations that could lead to kernel oops due to null pointer derefences. http://bugzilla.kernel.org/show_bug.cgi?id=12371 Thanks to David Maciejak at Fortinet's FortiGuard Global Security Research Team who discovered this bug independently (but at approximately the same time) as Thiemo Nagel, who submitted the patch. Signed-off-by: Thiemo Nagel Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8ff8709828f..517c90ad25b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2041,8 +2041,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) const char *descr; int ret = -EINVAL; int blocksize; - int db_count; - int i; + unsigned int db_count; + unsigned int i; int needs_recovery, has_huge_files; int features; __u64 blocks_count; @@ -2331,20 +2331,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; - /* ensure blocks_count calculation below doesn't sign-extend */ - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < - le32_to_cpu(es->s_first_data_block) + 1) { - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " - "first data block %u, blocks per group %lu\n", - ext4_blocks_count(es), - le32_to_cpu(es->s_first_data_block), - EXT4_BLOCKS_PER_GROUP(sb)); + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. 
+ */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + printk(KERN_WARNING "EXT4-fs: bad geometry: first data" + "block %u is beyond end of filesystem (%llu)\n", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); goto failed_mount; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + printk(KERN_WARNING "EXT4-fs: groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)\n", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } sbi->s_groups_count = blocks_count; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); From abda14189251563a50f56da5ea2e37e904ac4cba Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 00:20:32 -0500 Subject: [PATCH 54/57] ext4: Make printk's consistently prefixed with "EXT4-fs: " Previously, some were "ext4: ", and some were "EXT4: "; change them to be consistent with most ext4 printk's, which is to use "EXT4-fs: ". Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 2 +- fs/ext4/super.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index cf3ccf4a94b..2df2e40b01a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent, sb = inode->i_sb; if (!fname) { - printk(KERN_ERR "ext4: call_filldir: called with " + printk(KERN_ERR "EXT4-fs: call_filldir: called with " "null fname?!?\n"); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 517c90ad25b..b69d0920386 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -505,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev) return bdev; fail: - printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", + printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; } @@ -2485,7 +2485,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_blocks_count(es) > 0xffffffffULL && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { - printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); + printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); goto failed_mount4; } @@ -2766,7 +2766,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bd_claim(bdev, sb)) { printk(KERN_ERR - "EXT4: failed to claim external journal device.\n"); + "EXT4-fs: failed to claim external journal device.\n"); blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -2949,7 +2949,7 @@ static void ext4_commit_super(struct super_block *sb, * be remapped. Nothing we can do but to retry the * write and hope for the best. 
*/ - printk(KERN_ERR "ext4: previous I/O error to " + printk(KERN_ERR "EXT4-fs: previous I/O error to " "superblock detected for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); @@ -2965,7 +2965,7 @@ static void ext4_commit_super(struct super_block *sb, if (sync) { sync_dirty_buffer(sbh); if (buffer_write_io_error(sbh)) { - printk(KERN_ERR "ext4: I/O error while writing " + printk(KERN_ERR "EXT4-fs: I/O error while writing " "superblock for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); From 4d783b093cd4f3e2738527365979cbd1c5101065 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 15:16:33 -0500 Subject: [PATCH 55/57] block: Add Kconfig help which notes that ext4 needs CONFIG_LBD Signed-off-by: "Theodore Ts'o" Cc: Jens Axboe --- block/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/Kconfig b/block/Kconfig index ac0956f7778..0cbb3b88b59 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -36,6 +36,12 @@ config LBD This option also enables support for single files larger than 2TB. + The ext4 filesystem requires that this feature be enabled in + order to support filesystems that have the huge_file feature + enabled. Otherwise, it will refuse to mount any filesystems + that use the huge_file feature, which is enabled by default + by mke2fs.ext4. The GFS2 filesystem also requires this feature. + If unsure, say N. config BLK_DEV_IO_TRACE From 83982b6f47201c4c7767210d24d7d8c99567a0b3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 14:53:16 -0500 Subject: [PATCH 56/57] ext4: Remove "extents" mount option This mount option is largely superfluous, and in fact the way it was implemented was buggy; if a filesystem which did not have the extents feature flag was mounted -o extents, the filesystem would attempt to create and use extents-based file even though the extents feature flag was not eabled. The simplest thing to do is to nuke the mount option entirely. It's not all that useful to force the non-creation of new extent-based files if the filesystem can support it. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 5 ---- fs/ext4/ext4.h | 1 - fs/ext4/ext4_jbd2.h | 4 +-- fs/ext4/extents.c | 4 +-- fs/ext4/ialloc.c | 2 +- fs/ext4/migrate.c | 14 ++++----- fs/ext4/super.c | 48 ++---------------------------- 7 files changed, 14 insertions(+), 64 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 8938949b201..cec829bc729 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -131,11 +131,6 @@ ro Mount filesystem read only. Note that ext4 will mount options "ro,noload" can be used to prevent writes to the filesystem. -extents (*) ext4 will use extents to address file data. The - file system will no longer be mountable by ext3. - -noextents ext4 will not use extents for newly created files - journal_checksum Enable checksumming of the journal transactions. This will allow the recovery code in e2fsck and the kernel to detect corruption in the kernel. 
It is a diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 695b45cc34e..db1718833f5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -536,7 +536,6 @@ do { \ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 663197adae5..be2f426f680 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -32,8 +32,8 @@ * 5 levels of tree + root which are stored in the inode. */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - || test_opt(sb, EXTENTS) ? 27U : 8U) + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c64080e4949..240cf0daad4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2247,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); @@ -2272,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!test_opt(sb, EXTENTS)) + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) return; #ifdef EXTENTS_STATS diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 369c34c6429..4fb86a0061d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -917,7 +917,7 @@ got: if (err) goto fail_free_drop; - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7cd488da4b..734abca25e3 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -459,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode) struct list_blocks_struct lb; unsigned long max_entries; - if (!test_opt(inode->i_sb, EXTENTS)) - /* - * if mounted with noextents we don't allow the migrate - */ - return -EINVAL; - - if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. 
+ */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b69d0920386..acb69c00fd4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -829,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",journal_async_commit"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (!test_opt(sb, EXTENTS)) - seq_puts(seq, ",noextents"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); if (!test_opt(sb, DELALLOC)) @@ -1011,7 +1009,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1066,8 +1064,6 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, @@ -1115,7 +1111,6 @@ static int parse_options(char *options, struct super_block *sb, int qtype, qfmt; char *qname; #endif - ext4_fsblk_t last_block; if (!options) return 1; @@ -1445,33 +1440,6 @@ set_qf_format: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; - case Opt_extents: - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { - ext4_warning(sb, __func__, - "extents feature not enabled " - "on this filesystem, use tune2fs"); - return 0; - } - set_opt(sbi->s_mount_opt, EXTENTS); - break; - case Opt_noextents: - /* - * When e2fsprogs support resizing an already existing - * ext3 file system to greater than 2**32 we need to - * add support to block allocator to handle growing - * already existing block mapped inode so that blocks - * allocated for them fall within 2**32 - */ - last_block = ext4_blocks_count(sbi->s_es) - 1; - if (last_block > 0xffffffffULL) { - printk(KERN_ERR "EXT4-fs: Filesystem too " - "large to mount with " - "-o noextents options\n"); - return 0; - } - clear_opt(sbi->s_mount_opt, EXTENTS); - break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; @@ -2135,18 +2103,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); - /* - * turn on extents feature by default in ext4 filesystem - * only if feature flag already set by mkfs or tune2fs. 
- * Use -o noextents to turn it off - */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) - set_opt(sbi->s_mount_opt, EXTENTS); - else - ext4_warning(sb, __func__, - "extents feature not enabled on this filesystem, " - "use tune2fs."); - /* * enable delayed allocation by default * Use -o nodelalloc to turn it off @@ -3825,7 +3781,7 @@ static void __exit exit_ext4_fs(void) } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); module_init(init_ext4_fs) module_exit(exit_ext4_fs) From 4b905671d2ea09fd48fed72c581df17e40823f39 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 Jan 2009 14:53:35 -0500 Subject: [PATCH 57/57] jbd2: Fix oops in jbd2_journal_init_inode() on corrupted fs On 32-bit system with CONFIG_LBD getblk can fail because provided block number is too big. Add error checks so we fail gracefully if getblk() returns NULL (which can also happen on memory allocation failures). Thanks to David Maciejak from Fortinet's FortiGuard Global Security Research Team for reporting this bug. http://bugzilla.kernel.org/show_bug.cgi?id=12370 Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" cc: stable@kernel.org --- fs/jbd2/journal.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fe20e40ee7c..2932c8f5519 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -632,6 +632,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); set_buffer_uptodate(bh); @@ -1021,15 +1023,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, /* journal descriptor can store up to n blocks -bzzz */ journal->j_blocksize = blocksize; + jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - kfree(journal); - journal = NULL; - goto out; + goto out_err; } journal->j_dev = bdev; journal->j_fs_dev = fs_dev; @@ -1039,14 +1040,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, p = journal->j_devname; while ((p = strchr(p, '/'))) *p = '!'; - jbd2_stats_proc_init(journal); bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; -out: + return journal; +out_err: + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; } /** @@ -1094,9 +1103,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; + goto out_err; } err = jbd2_journal_bmap(journal, 0, &blocknr); @@ -1104,17 +1111,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", __func__); - jbd2_stats_proc_exit(journal); - kfree(journal); - return 
NULL; + goto out_err; } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; +out_err: + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; } /*