From 2de770a406b06dfc619faabbf5d85c835ed3f2e1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:25:49 -0500 Subject: [PATCH 01/47] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC Previously add_dirent_to_buf() did not free its passed-in buffer head in the case of ENOSPC, since in some cases the caller still needed it. However, this led to potential buffer head leaks since not all callers dealt with this correctly. Fix this by making simplifying the freeing convention; now add_dirent_to_buf() *never* frees the passed-in buffer head, and leaves that to the responsibility of its caller. This makes things cleaner and easier to prove that the code is neither leaking buffer heads or calling brelse() one time too many. Signed-off-by: "Theodore Ts'o" Cc: Curt Wohlgemuth Cc: stable@kernel.org --- fs/ext4/namei.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6d2c1b897fc..fde08c919d1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1292,9 +1292,6 @@ errout: * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) { - brelse(bh); + bh, offset)) return -EIO; - } - if (ext4_match(namelen, name, de)) { - brelse(bh); + if (ext4_match(namelen, name, de)) return -EEXIST; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if ((de->inode? rlen - nlen: rlen) >= reclen) @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_journal_get_write_access(handle, bh); if (err) { ext4_std_error(dir->i_sb, err); - brelse(bh); return err; } @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - brelse(bh); return 0; } @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (!(de)) return retval; - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if(!bh) return retval; retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) + if (retval != -ENOSPC) { + brelse(bh); return retval; + } if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto journal_error; err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; + if (err != -ENOSPC) goto cleanup; - } /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; goto cleanup; journal_error: From 503358ae01b70ce6909d19dd01287093f6b6271c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:46 -0500 Subject: [PATCH 02/47] ext4: avoid divide by zero when trying to mount a corrupted file system If s_log_groups_per_flex is greater than 31, then groups_per_flex will will overflow and cause a divide by zero error. This can cause kernel BUG if such a file system is mounted. Thanks to Nageswara R Sastry for analyzing the failure and providing an initial patch. http://bugzilla.kernel.org/show_bug.cgi?id=14287 Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d4ca92aab51..8662b2e6e9f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1673,14 +1673,14 @@ static int ext4_fill_flex_info(struct super_block *sb) size_t size; int i; - if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + if (groups_per_flex < 2) { sbi->s_log_groups_per_flex = 0; return 1; } - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << From f868a48d06f8886cb0367568a12367fa4f21ea0d Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 23 Nov 2009 07:25:48 -0500 Subject: [PATCH 03/47] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails If the EXT4_IOC_MOVE_EXT ioctl fails, the number of blocks that were exchanged before the failure should be returned to the userspace caller. Unfortunately, currently if the block size is not the same as the page size, the returned block count that is returned is the page-aligned block count instead of the actual block count. This commit addresses this bug. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 139 ++++++++++++++++++++++-------------------- 1 file changed, 73 insertions(+), 66 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 25b6b145736..83f8c9e47c6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -661,6 +661,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @donor_inode: donor inode * @from: block offset of orig_inode * @count: block count to be replaced + * @err: pointer to save return value * * Replace original inode extents and donor inode extents page by page. * We implement this replacement in the following three steps: @@ -671,19 +672,18 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * 3. Change the block information of donor inode to point at the saved * original inode blocks in the dummy extents. * - * Return 0 on success, or a negative error value on failure. + * Return replaced block count. */ static int mext_replace_branches(handle_t *handle, struct inode *orig_inode, struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count) + ext4_lblk_t count, int *err) { struct ext4_ext_path *orig_path = NULL; struct ext4_ext_path *donor_path = NULL; struct ext4_extent *oext, *dext; struct ext4_extent tmp_dext, tmp_oext; ext4_lblk_t orig_off = from, donor_off = from; - int err = 0; int depth; int replaced_count = 0; int dext_alen; @@ -691,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, mext_double_down_write(orig_inode, donor_inode); /* Get the original extent for the block "orig_off" */ - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; /* Get the donor extent for the head */ - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -707,9 +707,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, dext = donor_path[depth].p_ext; tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count); - if (err) + if (*err) goto out; /* Loop for the donor extents */ @@ -718,7 +718,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (!dext) { ext4_error(donor_inode->i_sb, __func__, "The extent for donor must be found"); - err = -EIO; + *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { ext4_error(donor_inode->i_sb, __func__, @@ -726,20 +726,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, "extent(%u) should be equal", donor_off, le32_to_cpu(tmp_dext.ee_block)); - err = -EIO; + *err = -EIO; goto out; } /* Set donor extent to orig extent */ - err = mext_leaf_block(handle, orig_inode, + *err = mext_leaf_block(handle, orig_inode, orig_path, &tmp_dext, &orig_off); - if (err < 0) + if (*err) goto out; /* Set orig extent to donor extent */ - err = mext_leaf_block(handle, donor_inode, + *err = mext_leaf_block(handle, donor_inode, donor_path, &tmp_oext, &donor_off); - if (err < 0) + if (*err) goto out; dext_alen = ext4_ext_get_actual_len(&tmp_dext); @@ -753,35 +753,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (orig_path) ext4_ext_drop_refs(orig_path); - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; - if (le32_to_cpu(oext->ee_block) + - ext4_ext_get_actual_len(oext) <= orig_off) { - err = 0; - goto out; - } tmp_oext = *oext; if (donor_path) ext4_ext_drop_refs(donor_path); - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; - if (le32_to_cpu(dext->ee_block) + - ext4_ext_get_actual_len(dext) <= donor_off) { - err = 0; - goto out; - } tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count - replaced_count); - if (err) + if (*err) goto out; } @@ -796,7 +786,7 @@ out: } mext_double_up_write(orig_inode, donor_inode); - return err; + return replaced_count; } /** @@ -808,16 +798,17 @@ out: * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling mext_replace_branches(). - * Finally, write out the saved data in new original inode blocks. Return 0 - * on success, or a negative error value on failure. + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit) + int block_len_in_page, int uninit, int *err) { struct inode *orig_inode = o_filp->f_dentry->d_inode; struct address_space *mapping = orig_inode->i_mapping; @@ -829,9 +820,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; - unsigned int tmp_data_len, data_len; + unsigned int tmp_data_size, data_size, replaced_size; void *fsdata; - int ret, i, jblocks; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* @@ -841,8 +834,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, jblocks); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; + *err = PTR_ERR(handle); + return 0; } if (segment_eq(get_fs(), KERNEL_DS)) @@ -858,9 +851,9 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { - ret = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page); + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); @@ -870,27 +863,28 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_len */ + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ - tmp_data_len = orig_inode->i_size & (blocksize - 1); + tmp_data_size = orig_inode->i_size & (blocksize - 1); /* - * If data_len equal zero, it shows data_len is multiples of + * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ - if (tmp_data_len == 0) - tmp_data_len = blocksize; + if (tmp_data_size == 0) + tmp_data_size = blocksize; - data_len = tmp_data_len + + data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); - } else { - data_len = block_len_in_page << orig_inode->i_blkbits; - } + } else + data_size = block_len_in_page << orig_inode->i_blkbits; - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, + replaced_size = data_size; + + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, &page, &fsdata); - if (unlikely(ret < 0)) + if (unlikely(*err < 0)) goto out; if (!PageUptodate(page)) { @@ -911,10 +905,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); - ret = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page); - if (ret < 0) - goto out; + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); @@ -928,16 +929,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { - ret = ext4_get_block(orig_inode, + *err = ext4_get_block(orig_inode, (sector_t)(orig_blk_offset + i), bh, 0); - if (ret < 0) + if (*err < 0) goto out; if (bh->b_this_page != NULL) bh = bh->b_this_page; } - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, page, fsdata); page = NULL; @@ -951,7 +952,10 @@ out: out2: ext4_journal_stop(handle); - return ret < 0 ? ret : 0; + if (err2) + *err = err2; + + return replaced_count; } /** @@ -1367,15 +1371,17 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, while (orig_page_offset <= seq_end_page) { /* Swap original branches with new branches */ - ret1 = move_extent_per_page(o_filp, donor_inode, + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit); - if (ret1 < 0) - goto out; - orig_page_offset++; + block_len_in_page, uninit, + &ret1); + /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; + if (ret1 < 0) + goto out; if (*moved_len > len) { ext4_error(orig_inode->i_sb, __func__, "We replaced blocks too much! " @@ -1385,6 +1391,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, goto out; } + orig_page_offset++; data_offset_in_page = 0; rest_blocks -= block_len_in_page; if (rest_blocks > blocks_per_page) From fc04cb49a898c372a22b21fffc47f299d8710801 Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 23 Nov 2009 07:24:43 -0500 Subject: [PATCH 04/47] ext4: fix lock order problem in ext4_move_extents() ext4_move_extents() checks the logical block contiguousness of original file with ext4_find_extent() and mext_next_extent(). Therefore the extent which ext4_ext_path structure indicates must not be changed between above functions. But in current implementation, there is no i_data_sem protection between ext4_ext_find_extent() and mext_next_extent(). So the extent which ext4_ext_path structure indicates may be overwritten by delalloc. As a result, ext4_move_extents() will exchange wrong blocks between original and donor files. I change the place where acquire/release i_data_sem to solve this problem. Moreover, I changed move_extent_per_page() to start transaction first, and then acquire i_data_sem. Without this change, there is a possibility of the deadlock between mmap() and ext4_move_extents(): * NOTE: "A", "B" and "C" mean different processes A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes. B: do_page_fault() starts the transaction (T), and then tries to acquire i_data_sem. But process "A" is already holding it, so it is kept waiting. C: While "A" and "B" running, kjournald2 tries to commit transaction (T) but it is under updating, so kjournald2 waits for it. A-2: Call ext4_journal_start with holding i_data_sem, but transaction (T) is locked. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 117 +++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 64 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 83f8c9e47c6..a7410b34c5e 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -77,12 +77,14 @@ static int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { + struct ext4_extent_header *eh; int ppos, leaf_ppos = path->p_depth; ppos = leaf_ppos; if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { /* leaf block */ *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext_pblock(path[ppos].p_ext); return 0; } @@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, ext_block_hdr(path[cur_ppos+1].p_bh); } + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + /* leaf block */ path[leaf_ppos].p_ext = *extent = EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext_pblock(path[leaf_ppos].p_ext); return 0; } } @@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, } /** - * mext_double_down_read - Acquire two inodes' read semaphore + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem * * @orig_inode: original inode structure * @donor_inode: donor inode structure - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. */ static void -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) -{ - struct inode *first = orig_inode, *second = donor_inode; - - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; - } - - down_read(&EXT4_I(first)->i_data_sem); - down_read(&EXT4_I(second)->i_data_sem); -} - -/** - * mext_double_down_write - Acquire two inodes' write semaphore - * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. - */ -static void -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; @@ -207,28 +193,14 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) } /** - * mext_double_up_read - Release two inodes' read semaphore + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second - * Release read semaphore of two inodes (orig and donor). + * Release write lock of i_data_sem of two inodes (orig and donor). */ static void -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) -{ - up_read(&EXT4_I(orig_inode)->i_data_sem); - up_read(&EXT4_I(donor_inode)->i_data_sem); -} - -/** - * mext_double_up_write - Release two inodes' write semaphore - * - * @orig_inode: original inode structure to be released its lock first - * @donor_inode: donor inode structure to be released its lock second - * Release write semaphore of two inodes (orig and donor). - */ -static void -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; - mext_double_down_write(orig_inode, donor_inode); - /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -785,7 +755,6 @@ out: kfree(donor_path); } - mext_double_up_write(orig_inode, donor_inode); return replaced_count; } @@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { + /* + * Protect extent trees against block allocations + * via delalloc + */ + double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, err); @@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); ext4_ext_invalidate_cache(donor_inode); + double_up_write_data_sem(orig_inode, donor_inode); goto out2; } @@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, &err2); @@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; - } else + } else { + double_up_write_data_sem(orig_inode, donor_inode); goto out; + } } /* Clear the inode cache not to refer to the old data */ ext4_ext_invalidate_cache(orig_inode); ext4_ext_invalidate_cache(donor_inode); + double_up_write_data_sem(orig_inode, donor_inode); + if (!page_has_buffers(page)) create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); @@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } - /* protect orig and donor against a truncate */ + /* Protect orig and donor inodes against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) return ret1; - mext_double_down_read(orig_inode, donor_inode); + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len, *moved_len); - mext_double_up_read(orig_inode, donor_inode); if (ret1) goto out; @@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_ext_get_actual_len(ext_cur), block_end + 1) - max(le32_to_cpu(ext_cur->ee_block), block_start); + /* Discard preallocations of two inodes */ + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { seq_blocks += add_blocks; @@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, seq_start = le32_to_cpu(ext_cur->ee_block); rest_blocks = seq_blocks; - /* Discard preallocations of two inodes */ - down_write(&EXT4_I(orig_inode)->i_data_sem); - ext4_discard_preallocations(orig_inode); - up_write(&EXT4_I(orig_inode)->i_data_sem); - - down_write(&EXT4_I(donor_inode)->i_data_sem); - ext4_discard_preallocations(donor_inode); - up_write(&EXT4_I(donor_inode)->i_data_sem); + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { @@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; if (ret1 < 0) - goto out; + break; if (*moved_len > len) { ext4_error(orig_inode->i_sb, __func__, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); ret1 = -EIO; - goto out; + break; } orig_page_offset++; @@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); @@ -1429,7 +1418,7 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - + double_up_write_data_sem(orig_inode, donor_inode); ret2 = mext_inode_double_unlock(orig_inode, donor_inode); if (ret1) From 49bd22bc4d603a2a4fc2a6a60e156cbea52eb494 Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 23 Nov 2009 07:24:41 -0500 Subject: [PATCH 05/47] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT If CONFIG_PROVE_LOCKING is enabled, the double_down_write_data_sem() will trigger a false-positive warning of a recursive lock. Since we take i_data_sem for the two inodes ordered by their inode numbers, this isn't a problem. Use of down_write_nested() will notify the lock dependency checker machinery that there is no problem here. This problem was reported by Brian Rogers: http://marc.info/?l=linux-ext4&m=125115356928011&w=1 Reported-by: Brian Rogers Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a7410b34c5e..2ca6aa3f34e 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -189,7 +189,7 @@ double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) } down_write(&EXT4_I(first)->i_data_sem); - down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** From 92c28159dce22913aef6aa811ce6fb0f7f3790b1 Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 23 Nov 2009 07:24:50 -0500 Subject: [PATCH 06/47] ext4: fix spelling typos in move_extent.c Fix a few spelling typos in move_extent.c Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2ca6aa3f34e..5a106e02fd9 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -568,7 +568,7 @@ out: * @tmp_oext: the extent that will belong to the donor inode * @orig_off: block offset of original inode * @donor_off: block offset of donor inode - * @max_count: the maximun length of extents + * @max_count: the maximum length of extents * * Return 0 on success, or a negative error value on failure. */ @@ -1073,7 +1073,7 @@ mext_check_arguments(struct inode *orig_inode, } if (!*len) { - ext4_debug("ext4 move extent: len shoudld not be 0 " + ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; From 567f3e9a70d71e5c9be03701b8578be77857293b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Nov 2009 08:19:05 -0500 Subject: [PATCH 07/47] ext4: plug a buffer_head leak in an error path of ext4_iget() One of the invalid error paths in ext4_iget() forgot to brelse() the inode buffer head. Fix it by adding a brelse() in the common error return path, which also simplifies function. Thanks to Andi Kleen reporting the problem. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c8caa51add..554c6798597 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4781,7 +4781,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; - struct buffer_head *bh; struct inode *inode; long ret; int block; @@ -4793,11 +4792,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); + iloc.bh = 0; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; - bh = iloc.bh; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); @@ -4820,7 +4819,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -4852,7 +4850,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse(bh); ret = -EIO; goto bad_inode; } @@ -4905,10 +4902,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } - if (ret) { - brelse(bh); + if (ret) goto bad_inode; - } if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; @@ -4936,7 +4931,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { - brelse(bh); ret = -EIO; ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", @@ -4949,6 +4943,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; bad_inode: + brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } From e6a47428de84e19fda52f21ab73fde2906c40d09 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 15 Nov 2009 15:31:37 -0500 Subject: [PATCH 08/47] jbd2: don't wipe the journal on a failed journal checksum If there is a failed journal checksum, don't reset the journal. This allows for userspace programs to decide how to recover from this situation. It may be that ignoring the journal checksum failure might be a better way of recovering the file system. Once we add per-block checksums, we can definitely do better. Until then, a system administrator can try backing up the file system image (or taking a snapshot) and and trying to determine experimentally whether ignoring the checksum failure or aborting the journal replay results in less data loss. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/jbd2/journal.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fed85388ee8..af60d98ddd2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1248,6 +1248,13 @@ int jbd2_journal_load(journal_t *journal) if (jbd2_journal_recover(journal)) goto recovery_error; + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EIO; + } + /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ From cf40db137cc2b2a1b3f6850247ac2b181d9d3847 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 21:00:01 -0500 Subject: [PATCH 09/47] ext4: remove failed journal checksum check Now that we are checking for failed journal checksums in the jbd2 layer, we don't need to check in the ext4 mount path --- since a checksum fail will result in ext4_load_journal() returning an error, causing the file system to refuse to be mounted until e2fsck can deal with the problem. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8662b2e6e9f..04c66907b2f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2721,26 +2721,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3; - if (!(sb->s_flags & MS_RDONLY) && - EXT4_SB(sb)->s_journal->j_failed_commit) { - ext4_msg(sb, KERN_CRIT, "error: " - "ext4_fill_super: Journal transaction " - "%u is corrupt", - EXT4_SB(sb)->s_journal->j_failed_commit); - if (test_opt(sb, ERRORS_RO)) { - ext4_msg(sb, KERN_CRIT, - "Mounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - } - if (test_opt(sb, ERRORS_PANIC)) { - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, 1); - goto failed_mount4; - } - } } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { ext4_msg(sb, KERN_ERR, "required journal recovery " From beac2da7565e42be59963824899825d0cc624295 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:25:08 -0500 Subject: [PATCH 10/47] ext4: add tracepoint for ext4_forget() Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 1 + include/trace/events/ext4.h | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 554c6798597..13de1dd751f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -89,6 +89,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d09550bf3f9..b390e1fc4a7 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -907,6 +907,32 @@ TRACE_EVENT(ext4_mballoc_free, __entry->result_len, __entry->result_logical) ); +TRACE_EVENT(ext4_forget, + TP_PROTO(struct inode *inode, int is_metadata, __u64 block), + + TP_ARGS(inode, is_metadata, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( int, is_metadata ) + __field( __u64, block ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->is_metadata = is_metadata; + __entry->block = block; + ), + + TP_printk("dev %s ino %lu mode %d is_metadata %d block %llu", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, __entry->is_metadata, __entry->block) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ From 50689696867d95b38d9c7be640a311494a04fb86 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:17:34 -0500 Subject: [PATCH 11/47] ext4: make sure directory and symlink blocks are revoked When an inode gets unlinked, the functions ext4_clear_blocks() and ext4_remove_blocks() call ext4_forget() for all the buffer heads corresponding to the deleted inode's data blocks. If the inode is a directory or a symlink, the is_metadata parameter must be non-zero so ext4_forget() will revoke them via jbd2_journal_revoke(). Otherwise, if these blocks are reused for a data file, and the system crashes before a journal checkpoint, the journal replay could end up corrupting these data blocks. Thanks to Curt Wohlgemuth for pointing out potential problems in this area. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 715264b4bae..74dcff84c3a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2074,7 +2074,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext_debug("free last %u blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, 0, inode, bh, start + i); + ext4_forget(handle, metadata, inode, bh, start + i); } ext4_free_blocks(handle, inode, start, num, metadata); } else if (from == le32_to_cpu(ex->ee_block) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 13de1dd751f..c420aaba6e9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4121,6 +4121,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4151,11 +4153,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, *p = 0; tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, 0, inode, tbh, nr); + ext4_forget(handle, is_metadata, inode, tbh, nr); } } - ext4_free_blocks(handle, inode, block_to_free, count, 0); + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); } /** From 30c6e07a92ea4cb87160d32ffa9bce172576ae4c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 15 Nov 2009 15:30:58 -0500 Subject: [PATCH 12/47] ext4: fix i_flags access in ext4_da_writepages_trans_blocks() We need to be testing the i_flags field in the ext4 specific portion of the inode, instead of the (confusingly aliased) i_flags field in the generic struct inode. Signed-off-by: Julia Lawall Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c420aaba6e9..9c097489af8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2789,7 +2789,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. So we will limit * number of contiguous block to a sane value */ - if (!(inode->i_flags & EXT4_EXTENTS_FL) && + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; From 86ebfd08a1930ccedb8eac0aeb1ed4b8b6a41dbc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 15 Nov 2009 15:30:52 -0500 Subject: [PATCH 13/47] ext4: journal all modifications in ext4_xattr_set_handle ext4_xattr_set_handle() was zeroing out an inode outside of journaling constraints; this is one of the accesses that was causing the crc errors in journal replay as seen in kernel.org bugzilla #14354. Reviewed-by: Andreas Dilger Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/xattr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fed5b01d7a8..025701926f9 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -988,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -1013,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); From 3f8fb9490efbd300887470a2a880a64e04dcc3f5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:52 -0500 Subject: [PATCH 14/47] ext4: don't update the superblock in ext4_statfs() commit a71ce8c6c9bf269b192f352ea555217815cf027e updated ext4_statfs() to update the on-disk superblock counters, but modified this buffer directly without any journaling of the change. This is one of the accesses that was causing the crc errors in journal replay as seen in kernel.org bugzilla #14354. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04c66907b2f..f2d5ec77c1e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3648,13 +3648,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); - ext4_free_blocks_count_set(es, buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); From 8dadb198cb70ef811916668fe67eeec82e8858dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:38 -0500 Subject: [PATCH 15/47] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero The number of old-style block group descriptor blocks is s_meta_first_bg when the meta_bg feature flag is set. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/balloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1d0418980f8..f3032c919a2 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; } /** From 1032988c71f3f85483b2b4319684d1205a704c02 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 15 Nov 2009 15:29:56 -0500 Subject: [PATCH 16/47] ext4: fix block validity checks so they work correctly with meta_bg The block validity checks used by ext4_data_block_valid() wasn't correctly written to check file systems with the meta_bg feature. Fix this. Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/block_validity.c | 2 +- fs/ext4/inode.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 50784ef0756..dc79b75d8f7 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb) if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), - sbi->s_gdb_count + 1); + ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); if (ret) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c097489af8..0c0ddc1401e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4884,10 +4884,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - EXT4_SB(sb)->s_gdb_count)) || - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { ext4_error(sb, __func__, "bad extended attribute block %llu in inode #%lu", ei->i_file_acl, inode->i_ino); From 6b17d902fdd241adfa4ce780df20547b28bf5801 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:24:57 -0500 Subject: [PATCH 17/47] ext4: avoid issuing unnecessary barriers We don't to issue an I/O barrier on an error or if we force commit because we are doing data journaling. Signed-off-by: "Theodore Ts'o" Cc: Jan Kara Cc: stable@kernel.org --- fs/ext4/fsync.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 2b1531266ee..a3c25076aef 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -60,7 +60,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) ret = flush_aio_dio_completed_IO(inode); if (ret < 0) - goto out; + return ret; /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -79,10 +79,8 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - goto out; - } + if (ext4_should_journal_data(inode)) + return ext4_force_commit(inode->i_sb); if (!journal) ret = sync_mapping_buffers(inode->i_mapping); From 2bba702d4f88d7b010ec37e2527b552588404ae7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 23 Nov 2009 07:24:48 -0500 Subject: [PATCH 18/47] ext4: fix error handling in ext4_ind_get_blocks() When an error happened in ext4_splice_branch we failed to notice that in ext4_ind_get_blocks and mapped the buffer anyway. Fix the problem by checking for error properly. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0c0ddc1401e..3673ec7b1c9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1022,7 +1022,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - else + if (err) goto cleanup; set_buffer_new(bh_result); From 5328e635315734d42080de9a5a1ee87bf4cae0a4 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 19 Nov 2009 14:25:42 -0500 Subject: [PATCH 19/47] ext4: make trim/discard optional (and off by default) It is anticipated that when sb_issue_discard starts doing real work on trim-capable devices, we may see issues. Make this mount-time optional, and default it to off until we know that things are working out OK. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 6 ++++++ fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 19 ++++++++++++------- fs/ext4/super.c | 14 +++++++++++++- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6d94e0696f8..26904ff6d61 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -353,6 +353,12 @@ noauto_da_alloc replacing existing files via patterns such as system crashes before the delayed allocation blocks are forced to disk. +discard Controls whether ext4 should issue discard/TRIM +nodiscard(*) commands to the underlying block device when + blocks are freed. This is useful for SSD devices + and sparse/thinly-provisioned LUNs, but it is off + by default until sufficient testing has been done. + Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8825515eedd..05ce38b981c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -750,6 +750,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bba12824def..6e5a23a2cc2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct ext4_group_info *db; int err, count = 0, count2 = 0; struct ext4_free_data *entry; - ext4_fsblk_t discard_block; struct list_head *l, *ltmp; list_for_each_safe(l, ltmp, &txn->t_private_list) { @@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); + if (test_opt(sb, DISCARD)) { + ext4_fsblk_t discard_block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + discard_block = (ext4_fsblk_t)entry->group * + EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(es->s_first_data_block); + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); + sb_issue_discard(sb, discard_block, entry->count); + } kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f2d5ec77c1e..2b382f6f80c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -899,6 +899,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); + if (test_opt(sb, DISCARD)) + seq_puts(seq, ",discard"); + ext4_show_quota_options(seq, sb); return 0; @@ -1079,7 +1082,8 @@ enum { Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, - Opt_inode_readahead_blks, Opt_journal_ioprio + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, }; static const match_table_t tokens = { @@ -1144,6 +1148,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, }; @@ -1565,6 +1571,12 @@ set_qf_format: else set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; + case Opt_discard: + set_opt(sbi->s_mount_opt, DISCARD); + break; + case Opt_nodiscard: + clear_opt(sbi->s_mount_opt, DISCARD); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " From e3bb52ae2bb9573e84c17b8e3560378d13a5c798 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 19 Nov 2009 14:28:50 -0500 Subject: [PATCH 20/47] ext4: make "norecovery" an alias for "noload" Users on the linux-ext4 list recently complained about differences across filesystems w.r.t. how to mount without a journal replay. In the discussion it was noted that xfs's "norecovery" option is perhaps more descriptively accurate than "noload," so let's make that an alias for ext4. Also show this status in /proc/mounts Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 4 ++-- fs/ext4/super.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 26904ff6d61..af6885c3c82 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. Note that - if the filesystem was not unmounted cleanly, +norecovery Don't load the journal on mounting. Note that +noload if the filesystem was not unmounted cleanly, skipping the journal replay will lead to the filesystem containing inconsistencies that can lead to any number of problems. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2b382f6f80c..5a2db612950 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -902,6 +902,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DISCARD)) seq_puts(seq, ",discard"); + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + ext4_show_quota_options(seq, sb); return 0; @@ -1108,6 +1111,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, From d6797d14b1640d088652c72508b529a3aea479e3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 20:52:12 -0500 Subject: [PATCH 21/47] ext4: move ext4_forget() to ext4_jbd2.c The ext4_forget() function better belongs in ext4_jbd2.c. This will allow us to do some cleanup of the ext4_journal_revoke() and ext4_journal_forget() functions, as well as giving us better error reporting since we can report the caller of ext4_forget() when things go wrong. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 -- fs/ext4/ext4_jbd2.c | 56 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4_jbd2.h | 7 ++++++ fs/ext4/inode.c | 53 ------------------------------------------ 4 files changed, 63 insertions(+), 55 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 05ce38b981c..57c4e03afa0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1393,8 +1393,6 @@ extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct super_block *, ext4_group_t, int); /* inode.c */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6a9409920de..913f8571543 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -4,6 +4,8 @@ #include "ext4_jbd2.h" +#include + int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { @@ -64,6 +66,60 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, return err; } +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. + */ +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + return __ext4_journal_forget(where, handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext4_journal_revoke"); + err = __ext4_journal_revoke(where, handle, blocknr, bh); + if (err) + ext4_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a2865980342..dc0b34a903e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -139,6 +139,10 @@ int __ext4_journal_forget(const char *where, handle_t *handle, int __ext4_journal_revoke(const char *where, handle_t *handle, ext4_fsblk_t blocknr, struct buffer_head *bh); +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr); + int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh); @@ -151,6 +155,9 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, __ext4_journal_get_write_access(__func__, (handle), (bh)) #define ext4_journal_revoke(handle, blocknr, bh) \ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ + (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3673ec7b1c9..fa37f9504ec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -70,59 +70,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } -/* - * The ext4 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. - * - * If the handle isn't valid we're not journaling, but we still need to - * call into ext4_journal_revoke() to put the buffer head. - */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - trace_ext4_forget(inode, is_metadata, blocknr); - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext4_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return ext4_journal_forget(handle, bh); - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext4_journal_revoke"); - err = ext4_journal_revoke(handle, blocknr, bh); - if (err) - ext4_abort(inode->i_sb, __func__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; -} - /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. From e4684b3fbb848446683feecb4aee133344c93933 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 24 Nov 2009 11:05:59 -0500 Subject: [PATCH 22/47] ext4: fold ext4_journal_revoke() into ext4_forget() The only caller of ext4_journal_revoke() is ext4_forget(), so we can fold ext4_journal_revoke() into ext4_forget() to simplify the code and shorten the call stack. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 30 +++++++++++------------------- fs/ext4/ext4_jbd2.h | 12 +----------- 2 files changed, 12 insertions(+), 30 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 913f8571543..92c88a8f734 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -50,22 +50,6 @@ int __ext4_journal_forget(const char *where, handle_t *handle, return err; } -int __ext4_journal_revoke(const char *where, handle_t *handle, - ext4_fsblk_t blocknr, struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); - } - else - bforget(bh); - return err; -} - /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be @@ -94,6 +78,12 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { + bforget(bh); + return 0; + } + /* Never use the revoke function if we are doing full data * journaling: there is no need to, and a V1 superblock won't * support it. Otherwise, only skip the revoke on un-journaled @@ -111,11 +101,13 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, /* * data!=journal && (is_metadata || should_journal_data(inode)) */ - BUFFER_TRACE(bh, "call ext4_journal_revoke"); - err = __ext4_journal_revoke(where, handle, blocknr, bh); - if (err) + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, __func__, bh, handle, err); ext4_abort(inode->i_sb, __func__, "error %d when attempting revoke", err); + } BUFFER_TRACE(bh, "exit"); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index dc0b34a903e..f9fb4bb6957 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -116,12 +116,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* - * Wrapper functions with which ext4 calls into JBD. The intent here is - * to allow these to be turned into appropriate stubs so ext4 can control - * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't - * been done yet. + * Wrapper functions with which ext4 calls into JBD. */ - void ext4_journal_abort_handle(const char *caller, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); @@ -135,10 +131,6 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, int __ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh); -/* When called with an invalid handle, this will still do a put on the BH */ -int __ext4_journal_revoke(const char *where, handle_t *handle, - ext4_fsblk_t blocknr, struct buffer_head *bh); - int __ext4_forget(const char *where, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); @@ -153,8 +145,6 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, __ext4_journal_get_undo_access(__func__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, (handle), (bh)) -#define ext4_journal_revoke(handle, blocknr, bh) \ - __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ (block_nr)) From b7e57e7c2a41826e51fe060fae5158bfc7a04e81 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 21:00:13 -0500 Subject: [PATCH 23/47] ext4: fold ext4_journal_forget() into ext4_forget() Convert the last two callers of ext4_journal_forget() to use ext4_forget() instead, and then fold ext4_journal_forget() into ext4_forget(). This reduces are code complexity and shortens our call stack. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 22 +++++----------------- fs/ext4/ext4_jbd2.h | 6 ------ fs/ext4/inode.c | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 92c88a8f734..b57e5c711b6 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -34,22 +34,6 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, return err; } -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); - } - else - bforget(bh); - return err; -} - /* * The ext4 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be @@ -93,7 +77,11 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, (!is_metadata && !ext4_should_journal_data(inode))) { if (bh) { BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return __ext4_journal_forget(where, handle, bh); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + return err; } return 0; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index f9fb4bb6957..84bc98ab9f0 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -127,10 +127,6 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh); -/* When called with an invalid handle, this will still do a put on the BH */ -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh); - int __ext4_forget(const char *where, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); @@ -150,8 +146,6 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) -#define ext4_journal_forget(handle, bh) \ - __ext4_journal_forget(__func__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fa37f9504ec..72c69432349 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -767,7 +767,13 @@ failed: /* Allocation failed, free what we already allocated */ for (i = 1; i <= n ; i++) { BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, branch[i].bh); + /* + * Note: is_metadata is 0 because branch[i].bh is + * newly allocated, so there is no need to revoke the + * block. If we do, it's harmless, but not necessary. + */ + ext4_forget(handle, 0, inode, branch[i].bh, + branch[i].bh->b_blocknr); } for (i = 0; i < indirect_blks; i++) ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); @@ -852,7 +858,13 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, where[i].bh); + /* + * Note: is_metadata is 0 because branch[i].bh is + * newly allocated, so there is no need to revoke the + * block. If we do, it's harmless, but not necessary. + */ + ext4_forget(handle, 0, inode, where[i].bh, + where[i].bh->b_blocknr); ext4_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1, 0); } From 4433871130f36585fde38e7dd817433296648945 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 07:44:56 -0500 Subject: [PATCH 24/47] ext4: fold ext4_free_blocks() and ext4_mb_free_blocks() ext4_mb_free_blocks() is only called by ext4_free_blocks(), and the latter function doesn't really do much. So merge the two functions together, such that ext4_free_blocks() is now found in fs/ext4/mballoc.c. This saves about 200 bytes of compiled text space. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 38 -------------------------------------- fs/ext4/ext4.h | 7 +++---- fs/ext4/mballoc.c | 30 +++++++++++++++++++++++------- 3 files changed, 26 insertions(+), 49 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f3032c919a2..22bc7435d91 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -498,44 +498,6 @@ error_return: return; } -/** - * ext4_free_blocks() -- Free given blocks and update quota - * @handle: handle for this transaction - * @inode: inode - * @block: start physical block to free - * @count: number of blocks to count - * @metadata: Are these metadata blocks - */ -void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata) -{ - struct super_block *sb; - unsigned long dquot_freed_blocks; - - /* this isn't the right place to decide whether block is metadata - * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; - - /* We need to make sure we don't reuse - * block released untill the transaction commit. - * writeback mode have weak data consistency so - * don't force data as metadata when freeing block - * for writeback mode. - */ - if (metadata == 0 && !ext4_should_writeback_data(inode)) - metadata = 1; - - sb = inode->i_sb; - - ext4_mb_free_blocks(handle, inode, block, count, - metadata, &dquot_freed_blocks); - if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); - return; -} - /** * ext4_has_free_blocks() * @sbi: in-core super block structure. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 57c4e03afa0..210e1b53e91 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1325,8 +1325,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); -extern void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, int metadata); extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); @@ -1385,8 +1383,9 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); -extern void ext4_mb_free_blocks(handle_t *, struct inode *, - ext4_fsblk_t, unsigned long, int, unsigned long *); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t block, unsigned long count, + int metadata); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6e5a23a2cc2..0dca90be1af 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4427,18 +4427,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, return 0; } -/* - * Main entry point into mballoc to free blocks +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @metadata: Are these metadata blocks */ -void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata, unsigned long *freed) +void ext4_free_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t block, unsigned long count, + int metadata) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; struct ext4_super_block *es; + unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4448,7 +4454,15 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; - *freed = 0; + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + metadata = 1; sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; @@ -4577,7 +4591,7 @@ do_more: ext4_mb_release_desc(&e4b); - *freed += count; + freed += count; /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -4597,6 +4611,8 @@ do_more: } sb->s_dirt = 1; error_return: + if (freed) + vfs_dq_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) From e6362609b6c71c5b802026be9cf263bbdd67a50e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Nov 2009 07:17:05 -0500 Subject: [PATCH 25/47] ext4: call ext4_forget() from ext4_free_blocks() Add the facility for ext4_forget() to be called from ext4_free_blocks(). This simplifies the code in a large number of places, and centralizes most of the work of calling ext4_forget() into a single place. Also fix a bug in the extents migration code; it wasn't calling ext4_forget() when releasing the indirect blocks during the conversion. As a result, if the system cashed during or shortly after the extents migration, and the released indirect blocks get reused as data blocks, the journal replay would corrupt the data blocks. With this new patch, fixing this bug was as simple as adding the EXT4_FREE_BLOCKS_FORGET flags to the call to ext4_free_blocks(). Signed-off-by: "Theodore Ts'o" Cc: "Aneesh Kumar K.V" --- fs/ext4/ext4.h | 10 ++++-- fs/ext4/extents.c | 24 +++++-------- fs/ext4/inode.c | 67 ++++++++++++++----------------------- fs/ext4/mballoc.c | 49 ++++++++++++++++++++------- fs/ext4/migrate.c | 23 +++++++++---- fs/ext4/xattr.c | 8 +++-- include/trace/events/ext4.h | 16 +++++---- 7 files changed, 109 insertions(+), 88 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 210e1b53e91..4cfc2f0edb3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -375,6 +375,12 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_DIO_CREATE_EXT) +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 + /* * ioctl commands */ @@ -1384,8 +1390,8 @@ extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata); + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74dcff84c3a..2c4a9321fb1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1007,7 +1007,8 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); + ext4_free_blocks(handle, inode, 0, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); @@ -1957,7 +1958,6 @@ errout: static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { - struct buffer_head *bh; int err; ext4_fsblk_t leaf; @@ -1973,9 +1973,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); - bh = sb_find_get_block(inode->i_sb, leaf); - ext4_forget(handle, 1, inode, bh, leaf); - ext4_free_blocks(handle, inode, leaf, 1, 1); + ext4_free_blocks(handle, inode, 0, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; } @@ -2042,12 +2041,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_lblk_t to) { - struct buffer_head *bh; unsigned short ee_len = ext4_ext_get_actual_len(ex); - int i, metadata = 0; + int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; + flags |= EXT4_FREE_BLOCKS_METADATA; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2072,11 +2070,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, num = le32_to_cpu(ex->ee_block) + ee_len - from; start = ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); - for (i = 0; i < num; i++) { - bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, metadata, inode, bh, start + i); - } - ext4_free_blocks(handle, inode, start, num, metadata); + ext4_free_blocks(handle, inode, 0, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", @@ -3319,8 +3313,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), 0); goto out2; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 72c69432349..3b28e1fbfc9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -669,7 +669,7 @@ allocated: return ret; failed_out: for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); return ret; } @@ -765,20 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return err; failed: /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); /* - * Note: is_metadata is 0 because branch[i].bh is - * newly allocated, so there is no need to revoke the - * block. If we do, it's harmless, but not necessary. + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_forget(handle, 0, inode, branch[i].bh, - branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); } - for (i = 0; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); return err; } @@ -857,18 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); /* - * Note: is_metadata is 0 because branch[i].bh is - * newly allocated, so there is no need to revoke the - * block. If we do, it's harmless, but not necessary. + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_forget(handle, 0, inode, where[i].bh, - where[i].bh->b_blocknr); - ext4_free_blocks(handle, inode, - le32_to_cpu(where[i-1].key), 1, 0); + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), + blks, 0); return err; } @@ -4080,7 +4078,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; - int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; if (try_to_extend_transaction(handle, inode)) { if (bh) { @@ -4096,27 +4097,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } } - /* - * Any buffers which are on the journal will be in memory. We - * find them on the hash table so jbd2_journal_revoke() will - * run jbd2_journal_forget() on them. We've already detached - * each block from the file, so bforget() in - * jbd2_journal_forget() should be safe. - * - * AKPM: turn on bforget in jbd2_journal_forget()!!! - */ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *tbh; + for (p = first; p < last; p++) + *p = 0; - *p = 0; - tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, is_metadata, inode, tbh, nr); - } - } - - ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); } /** @@ -4304,7 +4288,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } - ext4_free_blocks(handle, inode, nr, 1, 1); + ext4_free_blocks(handle, inode, 0, nr, 1, + EXT4_FREE_BLOCKS_METADATA); if (parent_bh) { /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0dca90be1af..78de5d3c5dc 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4436,8 +4436,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * @metadata: Are these metadata blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata) + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; @@ -4454,15 +4454,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; - /* - * We need to make sure we don't reuse the freed block until - * after the transaction is committed, which we can do by - * treating the block as metadata, below. We make an - * exception if the inode is to be written in writeback mode - * since writeback mode has weak data consistency guarantees. - */ - if (!ext4_should_writeback_data(inode)) - metadata = 1; + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; @@ -4476,7 +4473,32 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %llu\n", block); - trace_ext4_free_blocks(inode, block, count, metadata); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { @@ -4552,7 +4574,8 @@ do_more: err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_return; - if (metadata && ext4_handle_valid(handle)) { + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { struct ext4_free_data *new_entry; /* * blocks being freed are metadata. these blocks shouldn't diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a93d5b80f3e..d641e13e740 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle, for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(tmp_idata[i]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(i_data[0]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ @@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return retval; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 025701926f9..910bf9a59cb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ea_bdebug(bh, "refcount now=0; freeing"); if (ce) mb_cache_entry_free(ce); - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); get_bh(bh); - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); error = ext4_handle_dirty_metadata(handle, inode, bh); @@ -832,7 +833,8 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA); error = -EIO; goto cleanup; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b390e1fc4a7..74f628bfdd1 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -650,30 +650,32 @@ TRACE_EVENT(ext4_allocate_blocks, TRACE_EVENT(ext4_free_blocks, TP_PROTO(struct inode *inode, __u64 block, unsigned long count, - int metadata), + int flags), - TP_ARGS(inode, block, count, metadata), + TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) + __field( umode_t, mode ) __field( __u64, block ) __field( unsigned long, count ) - __field( int, metadata ) - + __field( int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; __entry->block = block; __entry->count = count; - __entry->metadata = metadata; + __entry->flags = flags; ), - TP_printk("dev %s ino %lu block %llu count %lu metadata %d", + TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->block, __entry->count, __entry->metadata) + __entry->mode, __entry->block, __entry->count, + __entry->flags) ); TRACE_EVENT(ext4_sync_file, From 6eebee625544ac4ef1d805da942f463275bd6caa Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 20:23:31 -0500 Subject: [PATCH 26/47] ext4: print i_mode in octal in ext4 tracepoints Inode permissions are much easier to understand if they are printed in octal. Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 74f628bfdd1..287347ca71b 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -38,7 +38,7 @@ TRACE_EVENT(ext4_free_inode, __entry->blocks = inode->i_blocks; ), - TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", + TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid, (unsigned long long) __entry->blocks) @@ -61,7 +61,7 @@ TRACE_EVENT(ext4_request_inode, __entry->mode = mode; ), - TP_printk("dev %s dir %lu mode %d", + TP_printk("dev %s dir %lu mode 0%o", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, __entry->mode) ); @@ -85,7 +85,7 @@ TRACE_EVENT(ext4_allocate_inode, __entry->mode = mode; ), - TP_printk("dev %s ino %lu dir %lu mode %d", + TP_printk("dev %s ino %lu dir %lu mode 0%o", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); @@ -930,7 +930,7 @@ TRACE_EVENT(ext4_forget, __entry->block = block; ), - TP_printk("dev %s ino %lu mode %d is_metadata %d block %llu", + TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->is_metadata, __entry->block) ); From 1585d8d89ae1791d8f957731f5655700fbcc5664 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 20:48:34 -0500 Subject: [PATCH 27/47] ext4: add check for wraparound in ext4_data_block_valid() Signed-off-by: "Theodore Ts'o" --- fs/ext4/block_validity.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index dc79b75d8f7..4df8621ec31 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, struct rb_node *n = sbi->system_blks.rb_node; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; while (n) { From 9084d4719784b00ff0bf9c9580007fac8277dbcb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 22 Nov 2009 20:48:42 -0500 Subject: [PATCH 28/47] ext4: use ext4_data_block_valid() in ext4_free_blocks() The block validity framework does a more comprehensive set of checks, and it saves object code space to use the ext4_data_block_valid() than the limited open-coded version that had been in ext4_free_blocks(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 78de5d3c5dc..ab2dad1dfb7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4463,9 +4463,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { + if (!ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, __func__, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); From 94d7c16cbbbd0e03841fcf272bcaf0620ad39618 Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Tue, 24 Nov 2009 10:19:57 -0500 Subject: [PATCH 29/47] ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT At the beginning of ext4_move_extent(), we call ext4_discard_preallocations() to discard inode PAs of orig and donor inodes. But in the following case, blocks can be double freed, so move ext4_discard_preallocations() to the end of ext4_move_extents(). 1. Discard inode PAs of orig and donor inodes with ext4_discard_preallocations() in ext4_move_extents(). orig : [ DATA1 ] donor: [ DATA2 ] 2. While data blocks are exchanging between orig and donor inodes, new inode PAs is created to orig by other process's block allocation. (Since there are semaphore gaps in ext4_move_extents().) And new inode PAs is used partially (2-1). 2-1 Create new inode PAs to orig inode orig : [ DATA1 | used PA1 | free PA1 ] donor: [ DATA2 ] 3. Donor inode which has old orig inode's blocks is deleted after EXT4_IOC_MOVE_EXT finished (3-1, 3-2). So the block bitmap corresponds to old orig inode's blocks are freed. 3-1 After EXT4_IOC_MOVE_EXT finished orig : [ DATA2 | free PA1 ] donor: [ DATA1 | used PA1 ] 3-2 Delete donor inode orig : [ DATA2 | free PA1 ] donor: [ FREE SPACE(DATA1) | FREE SPACE(used PA1) ] 4. The double-free of blocks is occurred, when close() is called to orig inode. Because ext4_discard_preallocations() for orig inode frees used PA1 and free PA1, though used PA1 is already freed in 3. 4-1 Double-free of blocks is occurred orig : [ DATA2 | FREE SPACE(free PA1) ] donor: [ FREE SPACE(DATA1) | DOUBLE FREE(used PA1) ] Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5a106e02fd9..3478889e00b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1289,10 +1289,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_ext_get_actual_len(ext_cur), block_end + 1) - max(le32_to_cpu(ext_cur->ee_block), block_start); - /* Discard preallocations of two inodes */ - ext4_discard_preallocations(orig_inode); - ext4_discard_preallocations(donor_inode); - while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { seq_blocks += add_blocks; @@ -1410,6 +1406,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + if (orig_path) { ext4_ext_drop_refs(orig_path); kfree(orig_path); From 446aaa6e7e993b38a6f21c6acfa68f3f1af3dbe3 Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Tue, 24 Nov 2009 10:28:48 -0500 Subject: [PATCH 30/47] ext4: initialize moved_len before calling ext4_move_extents() The move_extent.moved_len is used to pass back the number of exchanged blocks count to user space. Currently the caller must clear this field; but we spend more code space checking for this requirement than simply zeroing the field ourselves, so let's just make life easier for everyone all around. Signed-off-by: Kazuya Mio Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 1 + fs/ext4/move_extent.c | 14 +++----------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c1cdf613e72..31e5ee0c858 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -239,6 +239,7 @@ setversion_out: } } + me.moved_len = 0; err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); fput(donor_filp); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3478889e00b..445ecd7616a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -947,7 +947,6 @@ out2: * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved - * @moved_len: moved block length * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. @@ -955,8 +954,8 @@ out2: */ static int mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len, __u64 moved_len) + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) { ext4_lblk_t orig_blocks, donor_blocks; unsigned int blkbits = orig_inode->i_blkbits; @@ -1010,13 +1009,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if (moved_len) { - ext4_debug("ext4 move extent: moved_len should be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); - return -EINVAL; - } - if ((orig_start > EXT_MAX_BLOCK) || (donor_start > EXT_MAX_BLOCK) || (*len > EXT_MAX_BLOCK) || @@ -1226,7 +1218,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len, *moved_len); + donor_start, &len); if (ret1) goto out; From ac48b0a1d068887141581bea8285de5fcab182b0 Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Tue, 24 Nov 2009 10:31:56 -0500 Subject: [PATCH 31/47] ext4: move_extent_per_page() cleanup Integrate duplicate lines (acquire/release semaphore and invalidate extent cache in move_extent_per_page()) into mext_replace_branches(), to reduce source and object code size. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 445ecd7616a..cad1e2edda7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -660,6 +660,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -755,6 +758,11 @@ out: kfree(donor_path); } + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + return replaced_count; } @@ -820,19 +828,9 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { - /* - * Protect extent trees against block allocations - * via delalloc - */ - double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, err); - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); - double_up_write_data_sem(orig_inode, donor_inode); goto out2; } @@ -880,8 +878,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); - /* Protect extent trees against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, &err2); @@ -890,18 +886,10 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; - } else { - double_up_write_data_sem(orig_inode, donor_inode); + } else goto out; - } } - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); - - double_up_write_data_sem(orig_inode, donor_inode); - if (!page_has_buffers(page)) create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); From b4d7241596ffb6398ac5535ae8cf80d845b0c254 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 24 Nov 2009 11:15:08 -0500 Subject: [PATCH 32/47] ext4: remove encountered_congestion trace It is no longer set and scheduled to be removed. Signed-off-by: Wu Fengguang Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 287347ca71b..f4c62d33077 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -310,7 +310,6 @@ TRACE_EVENT(ext4_da_writepages_result, __field( int, ret ) __field( int, pages_written ) __field( long, pages_skipped ) - __field( char, encountered_congestion ) __field( char, more_io ) __field( char, no_nrwrite_index_update ) __field( pgoff_t, writeback_index ) @@ -322,17 +321,16 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; - __entry->encountered_congestion = wbc->encountered_congestion; __entry->more_io = wbc->more_io; __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d no_nrwrite_index_update %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, - __entry->encountered_congestion, __entry->more_io, + __entry->more_io, __entry->no_nrwrite_index_update, (unsigned long) __entry->writeback_index) ); From 3f0ca309858ee186435c608ee9eaafd1c8dcb53a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 24 Nov 2009 11:15:44 -0500 Subject: [PATCH 33/47] ext4: remove unused parameter wbc from __ext4_journalled_writepage() CC: Jan Kara Signed-off-by: Wu Fengguang Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3b28e1fbfc9..d3f99e9e8a3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2558,7 +2558,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) } static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc, unsigned int len) { struct address_space *mapping = page->mapping; @@ -2716,7 +2715,7 @@ static int ext4_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc, len); + return __ext4_journalled_writepage(page, len); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) From e6ec116b67f46e0e7808276476554727b2e6240b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 1 Dec 2009 09:04:42 -0500 Subject: [PATCH 34/47] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer() OOM happens. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 4 ++++ fs/jbd2/journal.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d4cfd6d2779..8896c1d4feb 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index af60d98ddd2..3f473faa466 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -358,6 +358,10 @@ repeat: jbd_unlock_bh_state(bh_in); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + jbd2_journal_put_journal_head(new_jh); + return -ENOMEM; + } jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); From c09eef305dd43846360944ad072f051f964fa383 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 7 Dec 2009 10:38:16 -0500 Subject: [PATCH 35/47] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() Signed-off-by: Roel Kluin Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3cfc343c41b..3b2c5541d8a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); goto exit_bh; } ext4_handle_dirty_metadata(handle, NULL, gdb); From 24b584240a0006ea7436cd35f5e8983eb76f1e6f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 7 Dec 2009 14:08:51 -0500 Subject: [PATCH 36/47] ext4: Use ext4 file system driver for ext2/ext3 file system mounts Add a new config option, CONFIG_EXT4_USE_FOR_EXT23 which if enabled, will cause ext4 to be used for either ext2 or ext3 file system mounts when ext2 or ext3 is not enabled in the configuration. This allows minimalist kernel fanatics to drop to file system drivers from their compiled kernel with out losing functionality. Signed-off-by: "Theodore Ts'o" --- fs/ext4/Kconfig | 10 +++++++++ fs/ext4/super.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9f2d45d75b1..a6b4e93833d 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -26,6 +26,16 @@ config EXT4_FS If unsure, say N. +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on !EXT3_FS || !EXT2_FS + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + config EXT4_FS_XATTR bool "Ext4 extended attributes" depends on EXT4_FS diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5a2db612950..30476daf966 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3960,6 +3960,58 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } +#if !defined(CONTIG_EXT2_FS) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +#endif + +#if !defined(CONTIG_EXT3_FS) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +#endif + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", @@ -3989,11 +4041,15 @@ static int __init init_ext4_fs(void) err = init_inodecache(); if (err) goto out1; + register_as_ext2(); + register_as_ext3(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: + unregister_as_ext2(); + unregister_as_ext3(); destroy_inodecache(); out1: exit_ext4_xattr(); @@ -4009,6 +4065,8 @@ out4: static void __exit exit_ext4_fs(void) { + unregister_as_ext2(); + unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); exit_ext4_xattr(); From b9a4207d5e911b938f73079a83cc2ae10524ec7f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Dec 2009 21:24:33 -0500 Subject: [PATCH 37/47] ext4: Avoid data / filesystem corruption when write fails to copy data When ext4_write_begin fails after allocating some blocks or generic_perform_write fails to copy data to write, we truncate blocks already instantiated beyond i_size. Although these blocks were never inside i_size, we have to truncate the pagecache of these blocks so that corresponding buffers get unmapped. Otherwise subsequent __block_prepare_write (called because we are retrying the write) will find the buffers mapped, not call ->get_block, and thus the page will be backed by already freed blocks leading to filesystem and data corruption. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d3f99e9e8a3..0e2ea572856 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1492,6 +1492,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext4_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1557,7 +1567,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to @@ -1667,7 +1677,7 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1709,7 +1719,7 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1772,7 +1782,7 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -3048,7 +3058,7 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - ext4_truncate(inode); + ext4_truncate_failed_write(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) From d4edac314e9ad0b21ba20ba8bc61b61f186f79e1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 8 Dec 2009 21:48:58 -0500 Subject: [PATCH 38/47] ext4: wait for log to commit when umounting There is a potential race when a transaction is committing right when the file system is being umounting. This could reduce in a race because EXT4_SB(sb)->s_group_info could be freed in ext4_put_super before the commit code calls a callback so the mballoc code can release freed blocks in the transaction, resulting in a panic trying to access the freed s_group_info. The fix is to wait for the transaction to finish committing before we shutdown the multiblock allocator. Signed-off-by: Josef Bacik Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 30476daf966..8ab0c951847 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb) if (sb->s_dirt) ext4_commit_super(sb, 1); - ext4_release_system_zone(sb); - ext4_mb_release(sb); - ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, __func__, "Couldn't clean up the journal"); } + + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); From b844167edc7fcafda9623955c05e4c1b3c32ebc7 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Tue, 8 Dec 2009 22:18:25 -0500 Subject: [PATCH 39/47] ext4: remove blocks from inode prealloc list on failure This fixes a leak of blocks in an inode prealloc list if device failures cause ext4_mb_mark_diskspace_used() to fail. Signed-off-by: Curt Wohlgemuth Acked-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ab2dad1dfb7..19635c34199 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3010,6 +3010,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) trace_ext4_mballoc_prealloc(ac); } +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + /* * use blocks preallocated to inode */ @@ -4295,6 +4313,7 @@ repeat: ac->ac_status = AC_STATUS_CONTINUE; goto repeat; } else if (*errp) { + ext4_discard_allocated_blocks(ac); ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); From 8aa6790f876e81f5a2211fe1711a5fe3fe2d7b20 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:41:52 -0500 Subject: [PATCH 40/47] ext4: ext4_get_reserved_space() must return bytes instead of blocks Signed-off-by: Dmitry Monakhov Reviewed-by: Eric Sandeen Acked-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0e2ea572856..2da74f57a10 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1010,7 +1010,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode) EXT4_I(inode)->i_reserved_meta_blocks; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return total; + return (total << inode->i_blkbits); } /* * Calculate the number of metadata blocks need to reserve From 5aca07eb7d8f14d90c740834d15ca15277f4820c Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:42:15 -0500 Subject: [PATCH 41/47] ext4: quota macros cleanup Currently all quota block reservation macros contains hard-coded "2" aka MAXQUOTAS value. This is no good because in some places it is not obvious to understand what does this digit represent. Let's introduce new macro with self descriptive name. Signed-off-by: Dmitry Monakhov Acked-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.h | 8 ++++++-- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 2 +- fs/ext4/migrate.c | 4 ++-- fs/ext4/namei.c | 8 ++++---- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 84bc98ab9f0..2c2b262bd31 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -49,7 +49,7 @@ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. @@ -57,7 +57,7 @@ * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be @@ -92,6 +92,7 @@ * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else @@ -99,6 +100,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) int ext4_mark_iloc_dirty(handle_t *handle, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2c4a9321fb1..5967f18fd7e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2161,7 +2161,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2da74f57a10..1b1b7d91811 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5175,7 +5175,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d641e13e740..81415814b00 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) * So allocate a credit of 3. We may update * quota (user and group). */ - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); if (ext4_journal_extend(handle, needed) != 0) retval = ext4_journal_restart(handle, needed); @@ -486,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index fde08c919d1..17a17e10dd6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1769,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1803,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1840,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2253,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); From 194074acacebc169ded90a4657193f5180015051 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 8 Dec 2009 22:42:28 -0500 Subject: [PATCH 42/47] ext4: fix incorrect block reservation on quota transfer. Inside ->setattr() call both ATTR_UID and ATTR_GID may be valid This means that we may end-up with transferring all quotas. Add we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in case of QUOTA_INIT_BLOCKS. Signed-off-by: Dmitry Monakhov Reviewed-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1b1b7d91811..958c3ff800e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5176,7 +5176,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; From b436b9bef84de6893e86346d8fbf7104bc520645 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Dec 2009 23:51:10 -0500 Subject: [PATCH 43/47] ext4: Wait for proper transaction commit on fsync We cannot rely on buffer dirty bits during fsync because pdflush can come before fsync is called and clear dirty bits without forcing a transaction commit. What we do is that we track which transaction has last changed the inode and which transaction last changed allocation and force it to disk on fsync. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/ext4_jbd2.h | 13 +++++++++++++ fs/ext4/extents.c | 14 ++++++++++++-- fs/ext4/fsync.c | 46 +++++++++++++++++---------------------------- fs/ext4/inode.c | 29 ++++++++++++++++++++++++++++ fs/ext4/super.c | 2 ++ 6 files changed, 80 insertions(+), 31 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4cfc2f0edb3..ab31e65d46d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -709,6 +709,13 @@ struct ext4_inode_info { struct list_head i_aio_dio_complete_list; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2c2b262bd31..05eca817d70 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) return 0; } +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5967f18fd7e..700206e525d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); goto out2; } /* buffered IO case */ @@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, path, iblock, max_blocks); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out: if (ret <= 0) { err = ret; @@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, allocated = ext4_ext_get_actual_len(&newex); set_buffer_new(bh_result); - /* Cache only when it is _not_ an uninitialized extent */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > max_blocks) allocated = max_blocks; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a3c25076aef..0b22497d92e 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -51,25 +51,30 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int err, ret = 0; + int ret; + tid_t commit_tid; J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file(file, dentry, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + ret = flush_aio_dio_completed_IO(inode); if (ret < 0) return ret; + + if (!journal) + return simple_fsync(file, dentry, datasync); + /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for proper transaction to + * commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ext4_should_journal_data(inode)) return ext4_force_commit(inode->i_sb); - if (!journal) - ret = sync_mapping_buffers(inode->i_mapping); - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - err = sync_inode(inode, &wbc); - if (ret == 0) - ret = err; - } -out: - if (journal && (journal->j_flags & JBD2_BARRIER)) + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (jbd2_log_start_commit(journal, commit_tid)) + jbd2_log_wait_commit(journal, commit_tid); + else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 958c3ff800e..f1bc1e33882 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, goto cleanup; set_buffer_new(bh_result); + + ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; @@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > @@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ei->i_state &= ~EXT4_STATE_NEW; + ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8ab0c951847..2b13dcfcf77 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&(ei->i_block_reservation_lock)); INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; return &ei->vfs_inode; } From 4a58579b9e4e2a35d57e6c9c8483e52f6f1b7fd6 Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Sun, 6 Dec 2009 23:38:31 -0500 Subject: [PATCH 44/47] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT This patch fixes three problems in the handling of the EXT4_IOC_MOVE_EXT ioctl: 1. In current EXT4_IOC_MOVE_EXT, there are read access mode checks for original and donor files, but they allow the illegal write access to donor file, since donor file is overwritten by original file data. To fix this problem, change access mode checks of original (r->r/w) and donor (r->w) files. 2. Disallow the use of donor files that have a setuid or setgid bits. 3. Call mnt_want_write() and mnt_drop_write() before and after ext4_move_extents() calling to get write access to a mount. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 30 ++++++++++++++++++------------ fs/ext4/move_extent.c | 7 +++++++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 31e5ee0c858..b63d193126d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -221,32 +221,38 @@ setversion_out: struct file *donor_filp; int err; + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; + me.moved_len = 0; donor_filp = fget(me.donor_fd); if (!donor_filp) return -EBADF; - if (!capable(CAP_DAC_OVERRIDE)) { - if ((current->real_cred->fsuid != inode->i_uid) || - !(inode->i_mode & S_IRUSR) || - !(donor_filp->f_dentry->d_inode->i_mode & - S_IRUSR)) { - fput(donor_filp); - return -EACCES; - } + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; } - me.moved_len = 0; + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto mext_out; + err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); - fput(donor_filp); + mnt_drop_write(filp->f_path.mnt); + if (me.moved_len > 0) + file_remove_suid(donor_filp); if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) - return -EFAULT; - + err = -EFAULT; +mext_out: + fput(donor_filp); return err; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index cad1e2edda7..82c415be87a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -957,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Ext4 move extent does not support swapfile */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " From 3b799d15f2622c44bae93961892d90ab012ea2be Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 9 Dec 2009 20:42:53 -0500 Subject: [PATCH 45/47] jbd2: Export jbd2_log_start_commit to fix ext4 build This fixes: ERROR: "jbd2_log_start_commit" [fs/ext4/ext4.ko] undefined! Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 3f473faa466..b7ca3a92a4d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); From a214238d3bb03723f820b0a398928d8e1637c987 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 9 Dec 2009 21:09:58 -0500 Subject: [PATCH 46/47] ext4: Do not override ext2 or ext3 if built they are built as modules The CONFIG_EXT4_USE_FOR_EXT23 option must not try to take over the ext2 or ext3 file systems if the those file system drivers are configured to be built as mdoules. Signed-off-by: "Theodore Ts'o" --- fs/ext4/Kconfig | 2 +- fs/ext4/super.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index a6b4e93833d..9acf7e80813 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,7 +28,7 @@ config EXT4_FS config EXT4_USE_FOR_EXT23 bool "Use ext4 for ext2/ext3 file systems" - depends on !EXT3_FS || !EXT2_FS + depends on EXT3_FS=n || EXT2_FS=n default y help Allow the ext4 file system driver code to be used for ext2 or diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2b13dcfcf77..8b58a144c31 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3964,7 +3964,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } -#if !defined(CONTIG_EXT2_FS) && defined(CONFIG_EXT4_USE_FOR_EXT23) +#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", @@ -3990,7 +3990,7 @@ static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } #endif -#if !defined(CONTIG_EXT3_FS) && defined(CONFIG_EXT4_USE_FOR_EXT23) +#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", From fab3a549e204172236779f502eccb4f9bf0dc87d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 9 Dec 2009 21:30:02 -0500 Subject: [PATCH 47/47] ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem) Fix the following potential circular locking dependency between mm->mmap_sem and ei->i_data_sem: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.32-04115-gec044c5 #37 ------------------------------------------------------- ureadahead/1855 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x5c/0xac but task is already holding lock: (&ei->i_data_sem){++++..}, at: [] ext4_fiemap+0x11b/0x159 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&ei->i_data_sem){++++..}: [] __lock_acquire+0xb67/0xd0f [] lock_acquire+0xdc/0x102 [] down_read+0x51/0x84 [] ext4_get_blocks+0x50/0x2a5 [] ext4_get_block+0xab/0xef [] do_mpage_readpage+0x198/0x48d [] mpage_readpages+0xd0/0x114 [] ext4_readpages+0x1d/0x1f [] __do_page_cache_readahead+0x12f/0x1bc [] ra_submit+0x21/0x25 [] filemap_fault+0x19f/0x32c [] __do_fault+0x55/0x3a2 [] handle_mm_fault+0x327/0x734 [] do_page_fault+0x292/0x2aa [] page_fault+0x25/0x30 [] clear_user+0x38/0x3c [] padzero+0x20/0x31 [] load_elf_binary+0x8bc/0x17ed [] search_binary_handler+0xc2/0x259 [] load_script+0x1b8/0x1cc [] search_binary_handler+0xc2/0x259 [] do_execve+0x1ce/0x2cf [] sys_execve+0x43/0x5a [] stub_execve+0x6a/0xc0 -> #0 (&mm->mmap_sem){++++++}: [] __lock_acquire+0xa11/0xd0f [] lock_acquire+0xdc/0x102 [] might_fault+0x89/0xac [] fiemap_fill_next_extent+0x95/0xda [] ext4_ext_fiemap_cb+0x138/0x157 [] ext4_ext_walk_space+0x178/0x1f1 [] ext4_fiemap+0x13c/0x159 [] do_vfs_ioctl+0x348/0x4d6 [] sys_ioctl+0x56/0x79 [] system_call_fastpath+0x16/0x1b other info that might help us debug this: 1 lock held by ureadahead/1855: #0: (&ei->i_data_sem){++++..}, at: [] ext4_fiemap+0x11b/0x159 stack backtrace: Pid: 1855, comm: ureadahead Not tainted 2.6.32-04115-gec044c5 #37 Call Trace: [] print_circular_bug+0xa8/0xb7 [] __lock_acquire+0xa11/0xd0f [] ? sched_clock+0x9/0xd [] lock_acquire+0xdc/0x102 [] ? might_fault+0x5c/0xac [] might_fault+0x89/0xac [] ? might_fault+0x5c/0xac [] ? __kmalloc+0x13b/0x18c [] fiemap_fill_next_extent+0x95/0xda [] ext4_ext_fiemap_cb+0x138/0x157 [] ? ext4_ext_fiemap_cb+0x0/0x157 [] ext4_ext_walk_space+0x178/0x1f1 [] ext4_fiemap+0x13c/0x159 [] ? might_fault+0x5c/0xac [] do_vfs_ioctl+0x348/0x4d6 [] ? __up_read+0x8d/0x95 [] ? retint_swapgs+0x13/0x1b [] sys_ioctl+0x56/0x79 [] system_call_fastpath+0x16/0x1b Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 700206e525d..3a7928f825e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1762,7 +1762,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, while (block < last && block != EXT_MAX_BLOCK) { num = last - block; /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -3724,10 +3726,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_read(&EXT4_I(inode)->i_data_sem); } return error;