Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 * 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (43 commits) ext4: fix a BUG in mb_mark_used during trim. ext4: unused variables cleanup in fs/ext4/extents.c ext4: remove redundant set_buffer_mapped() in ext4_da_get_block_prep() ext4: add more tracepoints and use dev_t in the trace buffer ext4: don't kfree uninitialized s_group_info members ext4: add missing space in printk's in __ext4_grp_locked_error() ext4: add FITRIM to compat_ioctl. ext4: handle errors in ext4_clear_blocks() ext4: unify the ext4_handle_release_buffer() api ext4: handle errors in ext4_rename jbd2: add COW fields to struct jbd2_journal_handle jbd2: add the b_cow_tid field to journal_head struct ext4: Initialize fsync transaction ids in ext4_new_inode() ext4: Use single thread to perform DIO unwritten convertion ext4: optimize ext4_bio_write_page() when no extent conversion is needed ext4: skip orphan cleanup if fs has unknown ROCOMPAT features ext4: use the nblocks arg to ext4_truncate_restart_trans() ext4: fix missing iput of root inode for some mount error paths ext4: make FIEMAP and delayed allocation play well together ext4: suppress verbose debugging information if malloc-debug is off ... Fi up conflicts in fs/ext4/super.c due to workqueue changes

commit: ae005cbed12d0b340b04b59d6f5c56e710b3895d [log] [tgz]
author: Linus Torvalds <torvalds@linux-foundation.org> Fri Mar 25 09:57:40 2011 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> Fri Mar 25 09:57:41 2011 -0700
tree: d464865bcc97bea05eab4eba0d10bcad4ec89b93
parent: 3961cdf85b749f6bab50ad31ee97e9277e7a3b70 [diff]
parent: 0ba0851714beebb800992e5105a79dc3a4c504b0 [diff]
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index adf96b8..97b970e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c

@@ -21,6 +21,8 @@
 #include "ext4_jbd2.h"
 #include "mballoc.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -342,6 +344,7 @@
 	 * We do it here so the bitmap uptodate bit
 	 * get set with buffer lock held.
 	 */
+	trace_ext4_read_block_bitmap_load(sb, block_group);
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d8b992e..e25e99b 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h

@@ -202,13 +202,6 @@
 	return 1;
 }
 
-static inline void ext4_journal_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	if (ext4_handle_valid(handle))
-		jbd2_journal_release_buffer(handle, bh);
-}
-
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
 	return ext4_journal_start_sb(inode->i_sb, nblocks);

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7516fb9..dd2cb50 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c

@@ -44,6 +44,8 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
+#include <trace/events/ext4.h>
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -664,6 +666,8 @@
 		if (unlikely(!bh))
 			goto err;
 		if (!bh_uptodate_or_lock(bh)) {
+			trace_ext4_ext_load_extent(inode, block,
+						path[ppos].p_block);
 			if (bh_submit_read(bh) < 0) {
 				put_bh(bh);
 				goto err;
@@ -1034,7 +1038,7 @@
 		for (i = 0; i < depth; i++) {
 			if (!ablocks[i])
 				continue;
-			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
 					 EXT4_FREE_BLOCKS_METADATA);
 		}
 	}
@@ -2059,7 +2063,7 @@
 	if (err)
 		return err;
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
-	ext4_free_blocks(handle, inode, 0, leaf, 1,
+	ext4_free_blocks(handle, inode, NULL, leaf, 1,
 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return err;
 }
@@ -2156,7 +2160,7 @@
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		start = ext4_ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, 0, start, num, flags);
+		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -3108,14 +3112,13 @@
 {
 	int i, depth;
 	struct ext4_extent_header *eh;
-	struct ext4_extent *ex, *last_ex;
+	struct ext4_extent *last_ex;
 
 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
 		return 0;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
-	ex = path[depth].p_ext;
 
 	if (unlikely(!eh->eh_entries)) {
 		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
@@ -3295,9 +3298,8 @@
 			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
-	ext4_fsblk_t newblock;
+	ext4_fsblk_t newblock = 0;
 	int err = 0, depth, ret;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
@@ -3305,6 +3307,7 @@
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
 		  map->m_lblk, map->m_len, inode->i_ino);
+	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
 	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
@@ -3352,7 +3355,6 @@
 		err = -EIO;
 		goto out2;
 	}
-	eh = path[depth].p_hdr;
 
 	ex = path[depth].p_ext;
 	if (ex) {
@@ -3485,7 +3487,7 @@
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
+		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
 				 ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
@@ -3525,6 +3527,8 @@
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
+	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+		newblock, map->m_len, err ? err : allocated);
 	return err ? err : allocated;
 }
 
@@ -3658,6 +3662,7 @@
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
@@ -3673,6 +3678,7 @@
 	ret = inode_newsize_ok(inode, (len + offset));
 	if (ret) {
 		mutex_unlock(&inode->i_mutex);
+		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
 		return ret;
 	}
 retry:
@@ -3717,6 +3723,8 @@
 		goto retry;
 	}
 	mutex_unlock(&inode->i_mutex);
+	trace_ext4_fallocate_exit(inode, offset, max_blocks,
+				ret > 0 ? ret2 : ret);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -3775,6 +3783,7 @@
 	}
 	return ret > 0 ? ret2 : ret;
 }
+
 /*
  * Callback function called for each extent to gather FIEMAP information.
  */
@@ -3782,38 +3791,162 @@
 		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
 		       void *data)
 {
-	struct fiemap_extent_info *fieinfo = data;
-	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 	__u64	logical;
 	__u64	physical;
 	__u64	length;
+	loff_t	size;
 	__u32	flags = 0;
-	int	error;
+	int		ret = 0;
+	struct fiemap_extent_info *fieinfo = data;
+	unsigned char blksize_bits;
 
-	logical =  (__u64)newex->ec_block << blksize_bits;
+	blksize_bits = inode->i_sb->s_blocksize_bits;
+	logical = (__u64)newex->ec_block << blksize_bits;
 
 	if (newex->ec_start == 0) {
-		pgoff_t offset;
-		struct page *page;
+		/*
+		 * No extent in extent-tree contains block @newex->ec_start,
+		 * then the block may stay in 1)a hole or 2)delayed-extent.
+		 *
+		 * Holes or delayed-extents are processed as follows.
+		 * 1. lookup dirty pages with specified range in pagecache.
+		 *    If no page is got, then there is no delayed-extent and
+		 *    return with EXT_CONTINUE.
+		 * 2. find the 1st mapped buffer,
+		 * 3. check if the mapped buffer is both in the request range
+		 *    and a delayed buffer. If not, there is no delayed-extent,
+		 *    then return.
+		 * 4. a delayed-extent is found, the extent will be collected.
+		 */
+		ext4_lblk_t	end = 0;
+		pgoff_t		last_offset;
+		pgoff_t		offset;
+		pgoff_t		index;
+		struct page	**pages = NULL;
 		struct buffer_head *bh = NULL;
+		struct buffer_head *head = NULL;
+		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
+
+		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (pages == NULL)
+			return -ENOMEM;
 
 		offset = logical >> PAGE_SHIFT;
-		page = find_get_page(inode->i_mapping, offset);
-		if (!page || !page_has_buffers(page))
-			return EXT_CONTINUE;
+repeat:
+		last_offset = offset;
+		head = NULL;
+		ret = find_get_pages_tag(inode->i_mapping, &offset,
+					PAGECACHE_TAG_DIRTY, nr_pages, pages);
 
-		bh = page_buffers(page);
+		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+			/* First time, try to find a mapped buffer. */
+			if (ret == 0) {
+out:
+				for (index = 0; index < ret; index++)
+					page_cache_release(pages[index]);
+				/* just a hole. */
+				kfree(pages);
+				return EXT_CONTINUE;
+			}
 
-		if (!bh)
-			return EXT_CONTINUE;
+			/* Try to find the 1st mapped buffer. */
+			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+				  blksize_bits;
+			if (!page_has_buffers(pages[0]))
+				goto out;
+			head = page_buffers(pages[0]);
+			if (!head)
+				goto out;
 
-		if (buffer_delay(bh)) {
-			flags |= FIEMAP_EXTENT_DELALLOC;
-			page_cache_release(page);
+			bh = head;
+			do {
+				if (buffer_mapped(bh)) {
+					/* get the 1st mapped buffer. */
+					if (end > newex->ec_block +
+						newex->ec_len)
+						/* The buffer is out of
+						 * the request range.
+						 */
+						goto out;
+					goto found_mapped_buffer;
+				}
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
+
+			/* No mapped buffer found. */
+			goto out;
 		} else {
-			page_cache_release(page);
-			return EXT_CONTINUE;
+			/*Find contiguous delayed buffers. */
+			if (ret > 0 && pages[0]->index == last_offset)
+				head = page_buffers(pages[0]);
+			bh = head;
 		}
+
+found_mapped_buffer:
+		if (bh != NULL && buffer_delay(bh)) {
+			/* 1st or contiguous delayed buffer found. */
+			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+				/*
+				 * 1st delayed buffer found, record
+				 * the start of extent.
+				 */
+				flags |= FIEMAP_EXTENT_DELALLOC;
+				newex->ec_block = end;
+				logical = (__u64)end << blksize_bits;
+			}
+			/* Find contiguous delayed buffers. */
+			do {
+				if (!buffer_delay(bh))
+					goto found_delayed_extent;
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
+
+			for (index = 1; index < ret; index++) {
+				if (!page_has_buffers(pages[index])) {
+					bh = NULL;
+					break;
+				}
+				head = page_buffers(pages[index]);
+				if (!head) {
+					bh = NULL;
+					break;
+				}
+				if (pages[index]->index !=
+					pages[0]->index + index) {
+					/* Blocks are not contiguous. */
+					bh = NULL;
+					break;
+				}
+				bh = head;
+				do {
+					if (!buffer_delay(bh))
+						/* Delayed-extent ends. */
+						goto found_delayed_extent;
+					bh = bh->b_this_page;
+					end++;
+				} while (bh != head);
+			}
+		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
+			/* a hole found. */
+			goto out;
+
+found_delayed_extent:
+		newex->ec_len = min(end - newex->ec_block,
+						(ext4_lblk_t)EXT_INIT_MAX_LEN);
+		if (ret == nr_pages && bh != NULL &&
+			newex->ec_len < EXT_INIT_MAX_LEN &&
+			buffer_delay(bh)) {
+			/* Have not collected an extent and continue. */
+			for (index = 0; index < ret; index++)
+				page_cache_release(pages[index]);
+			goto repeat;
+		}
+
+		for (index = 0; index < ret; index++)
+			page_cache_release(pages[index]);
+		kfree(pages);
 	}
 
 	physical = (__u64)newex->ec_start << blksize_bits;
@@ -3822,32 +3955,16 @@
 	if (ex && ext4_ext_is_uninitialized(ex))
 		flags |= FIEMAP_EXTENT_UNWRITTEN;
 
-	/*
-	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
-	 *
-	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
-	 * this also indicates no more allocated blocks.
-	 *
-	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
-	 */
-	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
-	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
-		loff_t size = i_size_read(inode);
-		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
-
+	size = i_size_read(inode);
+	if (logical + length >= size)
 		flags |= FIEMAP_EXTENT_LAST;
-		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
-		    logical+length > size)
-			length = (size - logical + bs - 1) & ~(bs-1);
-	}
 
-	error = fiemap_fill_next_extent(fieinfo, logical, physical,
+	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
 					length, flags);
-	if (error < 0)
-		return error;
-	if (error == 1)
+	if (ret < 0)
+		return ret;
+	if (ret == 1)
 		return EXT_BREAK;
-
 	return EXT_CONTINUE;
 }
 

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7829b28..7f74019 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c

@@ -164,20 +164,20 @@
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
-	trace_ext4_sync_file(file, datasync);
+	trace_ext4_sync_file_enter(file, datasync);
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
 	ret = ext4_flush_completed_IO(inode);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	if (!journal) {
 		ret = generic_file_fsync(file, datasync);
 		if (!ret && !list_empty(&inode->i_dentry))
 			ext4_sync_parent(inode);
-		return ret;
+		goto out;
 	}
 
 	/*
@@ -194,8 +194,10 @@
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext4_should_journal_data(inode))
-		return ext4_force_commit(inode->i_sb);
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
 
 	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
 	if (jbd2_log_start_commit(journal, commit_tid)) {
@@ -215,5 +217,7 @@
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ out:
+	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 78b79e1..21bb2f6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c

@@ -152,6 +152,7 @@
 	 * We do it here so the bitmap uptodate bit
 	 * get set with buffer lock held.
 	 */
+	trace_ext4_load_inode_bitmap(sb, block_group);
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
@@ -649,7 +650,7 @@
 		*group = parent_group + flex_size;
 		if (*group > ngroups)
 			*group = 0;
-		return find_group_orlov(sb, parent, group, mode, 0);
+		return find_group_orlov(sb, parent, group, mode, NULL);
 	}
 
 	/*
@@ -1054,6 +1055,11 @@
 		}
 	}
 
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9297ad4..1a86282 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c

@@ -173,7 +173,7 @@
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
 	up_write(&EXT4_I(inode)->i_data_sem);
-	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+	ret = ext4_journal_restart(handle, nblocks);
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_discard_preallocations(inode);
 
@@ -720,7 +720,7 @@
 	return ret;
 failed_out:
 	for (i = 0; i < index; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 	return ret;
 }
 
@@ -823,20 +823,20 @@
 	return err;
 failed:
 	/* Allocation failed, free what we already allocated */
-	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
 		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
 		 */
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
 	for (i = n+1; i < indirect_blks; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 
-	ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
 
 	return err;
 }
@@ -924,7 +924,7 @@
 		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
-	ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
 			 blks, 0);
 
 	return err;
@@ -973,6 +973,7 @@
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 
+	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1058,6 +1059,8 @@
 		partial--;
 	}
 out:
+	trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
+				map->m_pblk, map->m_len, err);
 	return err;
 }
 
@@ -2060,7 +2063,7 @@
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, redirty_page = 0;
+			int commit_write = 0, skip_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -2086,14 +2089,12 @@
 			 * If the page does not have buffers (for
 			 * whatever reason), try to create them using
 			 * __block_write_begin.  If this fails,
-			 * redirty the page and move on.
+			 * skip the page and move on.
 			 */
 			if (!page_has_buffers(page)) {
 				if (__block_write_begin(page, 0, len,
 						noalloc_get_block_write)) {
-				redirty_page:
-					redirty_page_for_writepage(mpd->wbc,
-								   page);
+				skip_page:
 					unlock_page(page);
 					continue;
 				}
@@ -2104,7 +2105,7 @@
 			block_start = 0;
 			do {
 				if (!bh)
-					goto redirty_page;
+					goto skip_page;
 				if (map && (cur_logical >= map->m_lblk) &&
 				    (cur_logical <= (map->m_lblk +
 						     (map->m_len - 1)))) {
@@ -2120,22 +2121,23 @@
 					clear_buffer_unwritten(bh);
 				}
 
-				/* redirty page if block allocation undone */
+				/* skip page if block allocation undone */
 				if (buffer_delay(bh) || buffer_unwritten(bh))
-					redirty_page = 1;
+					skip_page = 1;
 				bh = bh->b_this_page;
 				block_start += bh->b_size;
 				cur_logical++;
 				pblock++;
 			} while (bh != page_bufs);
 
-			if (redirty_page)
-				goto redirty_page;
+			if (skip_page)
+				goto skip_page;
 
 			if (commit_write)
 				/* mark the buffer_heads as dirty & uptodate */
 				block_commit_write(page, 0, len);
 
+			clear_page_dirty_for_io(page);
 			/*
 			 * Delalloc doesn't support data journalling,
 			 * but eventually maybe we'll lift this
@@ -2165,8 +2167,7 @@
 	return ret;
 }
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
-					sector_t logical, long blk_cnt)
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
@@ -2174,9 +2175,8 @@
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 
-	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end   = (logical + blk_cnt - 1) >>
-				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = mpd->first_page;
+	end   = mpd->next_page - 1;
 	while (index <= end) {
 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
@@ -2279,9 +2279,8 @@
 		err = blks;
 		/*
 		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will call
-		 * ext4_writepage() for all of the pages which will
-		 * just redirty the pages.
+		 * appears to be free blocks we will just let
+		 * mpage_da_submit_io() unlock all of the pages.
 		 */
 		if (err == -EAGAIN)
 			goto submit_io;
@@ -2312,8 +2311,10 @@
 				ext4_print_free_blocks(mpd->inode);
 		}
 		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd, next,
-				mpd->b_size >> mpd->inode->i_blkbits);
+		ext4_da_block_invalidatepages(mpd);
+
+		/* Mark this page range as having been completed */
+		mpd->io_done = 1;
 		return;
 	}
 	BUG_ON(blks == 0);
@@ -2438,102 +2439,6 @@
 }
 
 /*
- * __mpage_da_writepage - finds extent of pages and blocks
- *
- * @page: page to consider
- * @wbc: not used, we just follow rules
- * @data: context
- *
- * The function finds extents of pages and scan them for all blocks.
- */
-static int __mpage_da_writepage(struct page *page,
-				struct writeback_control *wbc,
-				struct mpage_da_data *mpd)
-{
-	struct inode *inode = mpd->inode;
-	struct buffer_head *bh, *head;
-	sector_t logical;
-
-	/*
-	 * Can we merge this page to current extent?
-	 */
-	if (mpd->next_page != page->index) {
-		/*
-		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them
-		 */
-		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_and_submit(mpd);
-			/*
-			 * skip rest of the page in the page_vec
-			 */
-			redirty_page_for_writepage(wbc, page);
-			unlock_page(page);
-			return MPAGE_DA_EXTENT_TAIL;
-		}
-
-		/*
-		 * Start next extent of pages ...
-		 */
-		mpd->first_page = page->index;
-
-		/*
-		 * ... and blocks
-		 */
-		mpd->b_size = 0;
-		mpd->b_state = 0;
-		mpd->b_blocknr = 0;
-	}
-
-	mpd->next_page = page->index + 1;
-	logical = (sector_t) page->index <<
-		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	if (!page_has_buffers(page)) {
-		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
-				       (1 << BH_Dirty) | (1 << BH_Uptodate));
-		if (mpd->io_done)
-			return MPAGE_DA_EXTENT_TAIL;
-	} else {
-		/*
-		 * Page with regular buffer heads, just add all dirty ones
-		 */
-		head = page_buffers(page);
-		bh = head;
-		do {
-			BUG_ON(buffer_locked(bh));
-			/*
-			 * We need to try to allocate
-			 * unmapped blocks in the same page.
-			 * Otherwise we won't make progress
-			 * with the page in ext4_writepage
-			 */
-			if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       bh->b_size,
-						       bh->b_state);
-				if (mpd->io_done)
-					return MPAGE_DA_EXTENT_TAIL;
-			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-				/*
-				 * mapped dirty buffer. We need to update
-				 * the b_state because we look at
-				 * b_state in mpage_da_map_blocks. We don't
-				 * update b_size because if we find an
-				 * unmapped buffer_head later we need to
-				 * use the b_state flag of that buffer_head.
-				 */
-				if (mpd->b_size == 0)
-					mpd->b_state = bh->b_state & BH_FLAGS;
-			}
-			logical++;
-		} while ((bh = bh->b_this_page) != head);
-	}
-
-	return 0;
-}
-
-/*
  * This is a special get_blocks_t callback which is used by
  * ext4_da_write_begin().  It will either return mapped block or
  * reserve space for a single block.
@@ -2597,7 +2502,6 @@
 		 * for partial write.
 		 */
 		set_buffer_new(bh);
-		set_buffer_mapped(bh);
 	}
 	return 0;
 }
@@ -2811,27 +2715,27 @@
 
 /*
  * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and call the callback function (which usually writes
- * the pages).
- *
- * This is a forked version of write_cache_pages().  Differences:
- *	Range cyclic is ignored.
- *	no_nrwrite_index_update is always presumed true
+ * address space and accumulate pages that need writing, and call
+ * mpage_da_map_and_submit to map a single contiguous memory region
+ * and then write them.
  */
 static int write_cache_pages_da(struct address_space *mapping,
 				struct writeback_control *wbc,
 				struct mpage_da_data *mpd,
 				pgoff_t *done_index)
 {
-	int ret = 0;
-	int done = 0;
-	struct pagevec pvec;
-	unsigned nr_pages;
-	pgoff_t index;
-	pgoff_t end;		/* Inclusive */
-	long nr_to_write = wbc->nr_to_write;
-	int tag;
+	struct buffer_head	*bh, *head;
+	struct inode		*inode = mapping->host;
+	struct pagevec		pvec;
+	unsigned int		nr_pages;
+	sector_t		logical;
+	pgoff_t			index, end;
+	long			nr_to_write = wbc->nr_to_write;
+	int			i, tag, ret = 0;
 
+	memset(mpd, 0, sizeof(struct mpage_da_data));
+	mpd->wbc = wbc;
+	mpd->inode = inode;
 	pagevec_init(&pvec, 0);
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -2842,13 +2746,11 @@
 		tag = PAGECACHE_TAG_DIRTY;
 
 	*done_index = index;
-	while (!done && (index <= end)) {
-		int i;
-
+	while (index <= end) {
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
-			break;
+			return 0;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -2860,60 +2762,100 @@
 			 * mapping. However, page->index will not change
 			 * because we have a reference on the page.
 			 */
-			if (page->index > end) {
-				done = 1;
-				break;
-			}
+			if (page->index > end)
+				goto out;
 
 			*done_index = page->index + 1;
 
+			/*
+			 * If we can't merge this page, and we have
+			 * accumulated an contiguous region, write it
+			 */
+			if ((mpd->next_page != page->index) &&
+			    (mpd->next_page != mpd->first_page)) {
+				mpage_da_map_and_submit(mpd);
+				goto ret_extent_tail;
+			}
+
 			lock_page(page);
 
 			/*
-			 * Page truncated or invalidated. We can freely skip it
-			 * then, even for data integrity operations: the page
-			 * has disappeared concurrently, so there could be no
-			 * real expectation of this data interity operation
-			 * even if there is now a new, dirty page at the same
-			 * pagecache address.
+			 * If the page is no longer dirty, or its
+			 * mapping no longer corresponds to inode we
+			 * are writing (which means it has been
+			 * truncated or invalidated), or the page is
+			 * already under writeback and we are not
+			 * doing a data integrity writeback, skip the page
 			 */
-			if (unlikely(page->mapping != mapping)) {
-continue_unlock:
+			if (!PageDirty(page) ||
+			    (PageWriteback(page) &&
+			     (wbc->sync_mode == WB_SYNC_NONE)) ||
+			    unlikely(page->mapping != mapping)) {
 				unlock_page(page);
 				continue;
 			}
 
-			if (!PageDirty(page)) {
-				/* someone wrote it for us */
-				goto continue_unlock;
-			}
-
-			if (PageWriteback(page)) {
-				if (wbc->sync_mode != WB_SYNC_NONE)
-					wait_on_page_writeback(page);
-				else
-					goto continue_unlock;
-			}
+			if (PageWriteback(page))
+				wait_on_page_writeback(page);
 
 			BUG_ON(PageWriteback(page));
-			if (!clear_page_dirty_for_io(page))
-				goto continue_unlock;
 
-			ret = __mpage_da_writepage(page, wbc, mpd);
-			if (unlikely(ret)) {
-				if (ret == AOP_WRITEPAGE_ACTIVATE) {
-					unlock_page(page);
-					ret = 0;
-				} else {
-					done = 1;
-					break;
-				}
+			if (mpd->next_page != page->index)
+				mpd->first_page = page->index;
+			mpd->next_page = page->index + 1;
+			logical = (sector_t) page->index <<
+				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+			if (!page_has_buffers(page)) {
+				mpage_add_bh_to_extent(mpd, logical,
+						       PAGE_CACHE_SIZE,
+						       (1 << BH_Dirty) | (1 << BH_Uptodate));
+				if (mpd->io_done)
+					goto ret_extent_tail;
+			} else {
+				/*
+				 * Page with regular buffer heads,
+				 * just add all dirty ones
+				 */
+				head = page_buffers(page);
+				bh = head;
+				do {
+					BUG_ON(buffer_locked(bh));
+					/*
+					 * We need to try to allocate
+					 * unmapped blocks in the same page.
+					 * Otherwise we won't make progress
+					 * with the page in ext4_writepage
+					 */
+					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+						mpage_add_bh_to_extent(mpd, logical,
+								       bh->b_size,
+								       bh->b_state);
+						if (mpd->io_done)
+							goto ret_extent_tail;
+					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+						/*
+						 * mapped dirty buffer. We need
+						 * to update the b_state
+						 * because we look at b_state
+						 * in mpage_da_map_blocks.  We
+						 * don't update b_size because
+						 * if we find an unmapped
+						 * buffer_head later we need to
+						 * use the b_state flag of that
+						 * buffer_head.
+						 */
+						if (mpd->b_size == 0)
+							mpd->b_state = bh->b_state & BH_FLAGS;
+					}
+					logical++;
+				} while ((bh = bh->b_this_page) != head);
 			}
 
 			if (nr_to_write > 0) {
 				nr_to_write--;
 				if (nr_to_write == 0 &&
-				    wbc->sync_mode == WB_SYNC_NONE) {
+				    wbc->sync_mode == WB_SYNC_NONE)
 					/*
 					 * We stop writing back only if we are
 					 * not doing integrity sync. In case of
@@ -2924,14 +2866,18 @@
 					 * pages, but have not synced all of the
 					 * old dirty pages.
 					 */
-					done = 1;
-					break;
-				}
+					goto out;
 			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
+	return 0;
+ret_extent_tail:
+	ret = MPAGE_DA_EXTENT_TAIL;
+out:
+	pagevec_release(&pvec);
+	cond_resched();
 	return ret;
 }
 
@@ -2945,7 +2891,6 @@
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	int pages_written = 0;
-	long pages_skipped;
 	unsigned int max_pages;
 	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0;
@@ -3028,11 +2973,6 @@
 		wbc->nr_to_write = desired_nr_to_write;
 	}
 
-	mpd.wbc = wbc;
-	mpd.inode = mapping->host;
-
-	pages_skipped = wbc->pages_skipped;
-
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		tag_pages_for_writeback(mapping, index, end);
@@ -3059,22 +2999,10 @@
 		}
 
 		/*
-		 * Now call __mpage_da_writepage to find the next
+		 * Now call write_cache_pages_da() to find the next
 		 * contiguous region of logical blocks that need
-		 * blocks to be allocated by ext4.  We don't actually
-		 * submit the blocks for I/O here, even though
-		 * write_cache_pages thinks it will, and will set the
-		 * pages as clean for write before calling
-		 * __mpage_da_writepage().
+		 * blocks to be allocated by ext4 and submit them.
 		 */
-		mpd.b_size = 0;
-		mpd.b_state = 0;
-		mpd.b_blocknr = 0;
-		mpd.first_page = 0;
-		mpd.next_page = 0;
-		mpd.io_done = 0;
-		mpd.pages_written = 0;
-		mpd.retval = 0;
 		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
 		/*
 		 * If we have a contiguous extent of pages and we
@@ -3096,7 +3024,6 @@
 			 * and try again
 			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
-			wbc->pages_skipped = pages_skipped;
 			ret = 0;
 		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
@@ -3104,7 +3031,6 @@
 			 * rest of the pages
 			 */
 			pages_written += mpd.pages_written;
-			wbc->pages_skipped = pages_skipped;
 			ret = 0;
 			io_done = 1;
 		} else if (wbc->nr_to_write)
@@ -3122,11 +3048,6 @@
 		wbc->range_end  = mapping->writeback_index - 1;
 		goto retry;
 	}
-	if (pages_skipped != wbc->pages_skipped)
-		ext4_msg(inode->i_sb, KERN_CRIT,
-			 "This should not happen leaving %s "
-			 "with nr_to_write = %ld ret = %d",
-			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
 	wbc->range_cyclic = range_cyclic;
@@ -3460,6 +3381,7 @@
 
 static int ext4_readpage(struct file *file, struct page *page)
 {
+	trace_ext4_readpage(page);
 	return mpage_readpage(page, ext4_get_block);
 }
 
@@ -3494,6 +3416,8 @@
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
+	trace_ext4_invalidatepage(page, offset);
+
 	/*
 	 * free any io_end structure allocated for buffers to be discarded
 	 */
@@ -3515,6 +3439,8 @@
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
+	trace_ext4_releasepage(page);
+
 	WARN_ON(PageChecked(page));
 	if (!page_has_buffers(page))
 		return 0;
@@ -3873,11 +3799,16 @@
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
 
+	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
-
-	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+	else
+		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+	trace_ext4_direct_IO_exit(inode, offset,
+				iov_length(iov, nr_segs), rw, ret);
+	return ret;
 }
 
 /*
@@ -4173,6 +4104,9 @@
  *
  * We release `count' blocks on disk, but (last - first) may be greater
  * than `count' because there can be holes in there.
+ *
+ * Return 0 on success, 1 on invalid block range
+ * and < 0 on fatal error.
  */
 static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh,
@@ -4199,33 +4133,32 @@
 		if (bh) {
 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 			err = ext4_handle_dirty_metadata(handle, inode, bh);
-			if (unlikely(err)) {
-				ext4_std_error(inode->i_sb, err);
-				return 1;
-			}
+			if (unlikely(err))
+				goto out_err;
 		}
 		err = ext4_mark_inode_dirty(handle, inode);
-		if (unlikely(err)) {
-			ext4_std_error(inode->i_sb, err);
-			return 1;
-		}
+		if (unlikely(err))
+			goto out_err;
 		err = ext4_truncate_restart_trans(handle, inode,
 						  blocks_for_truncate(inode));
-		if (unlikely(err)) {
-			ext4_std_error(inode->i_sb, err);
-			return 1;
-		}
+		if (unlikely(err))
+			goto out_err;
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
-			ext4_journal_get_write_access(handle, bh);
+			err = ext4_journal_get_write_access(handle, bh);
+			if (unlikely(err))
+				goto out_err;
 		}
 	}
 
 	for (p = first; p < last; p++)
 		*p = 0;
 
-	ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+	ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
 	return 0;
+out_err:
+	ext4_std_error(inode->i_sb, err);
+	return err;
 }
 
 /**
@@ -4259,7 +4192,7 @@
 	ext4_fsblk_t nr;		    /* Current block # */
 	__le32 *p;			    /* Pointer into inode/ind
 					       for current block */
-	int err;
+	int err = 0;
 
 	if (this_bh) {				/* For indirect block */
 		BUFFER_TRACE(this_bh, "get_write_access");
@@ -4281,9 +4214,10 @@
 			} else if (nr == block_to_free + count) {
 				count++;
 			} else {
-				if (ext4_clear_blocks(handle, inode, this_bh,
-						      block_to_free, count,
-						      block_to_free_p, p))
+				err = ext4_clear_blocks(handle, inode, this_bh,
+						        block_to_free, count,
+						        block_to_free_p, p);
+				if (err)
 					break;
 				block_to_free = nr;
 				block_to_free_p = p;
@@ -4292,9 +4226,12 @@
 		}
 	}
 
-	if (count > 0)
-		ext4_clear_blocks(handle, inode, this_bh, block_to_free,
-				  count, block_to_free_p, p);
+	if (!err && count > 0)
+		err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
+					count, block_to_free_p, p);
+	if (err < 0)
+		/* fatal error */
+		return;
 
 	if (this_bh) {
 		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4412,7 +4349,7 @@
 			 * transaction where the data blocks are
 			 * actually freed.
 			 */
-			ext4_free_blocks(handle, inode, 0, nr, 1,
+			ext4_free_blocks(handle, inode, NULL, nr, 1,
 					 EXT4_FREE_BLOCKS_METADATA|
 					 EXT4_FREE_BLOCKS_FORGET);
 
@@ -4496,6 +4433,8 @@
 	ext4_lblk_t last_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
 
+	trace_ext4_truncate_enter(inode);
+
 	if (!ext4_can_truncate(inode))
 		return;
 
@@ -4506,6 +4445,7 @@
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ext4_ext_truncate(inode);
+		trace_ext4_truncate_exit(inode);
 		return;
 	}
 
@@ -4635,6 +4575,7 @@
 		ext4_orphan_del(handle, inode);
 
 	ext4_journal_stop(handle);
+	trace_ext4_truncate_exit(inode);
 }
 
 /*
@@ -4766,6 +4707,7 @@
 		 * has in-inode xattrs, or we don't have this inode in memory.
 		 * Read the block from disk.
 		 */
+		trace_ext4_load_inode(inode);
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(READ_META, bh);
@@ -4871,7 +4813,7 @@
 		return inode;
 
 	ei = EXT4_I(inode);
-	iloc.bh = 0;
+	iloc.bh = NULL;
 
 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
 	if (ret < 0)

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a84faa1..808c554 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c

@@ -334,16 +334,22 @@
 	case FITRIM:
 	{
 		struct super_block *sb = inode->i_sb;
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
 		struct fstrim_range range;
 		int ret = 0;
 
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
+		if (!blk_queue_discard(q))
+			return -EOPNOTSUPP;
+
 		if (copy_from_user(&range, (struct fstrim_range *)arg,
 		    sizeof(range)))
 			return -EFAULT;
 
+		range.minlen = max((unsigned int)range.minlen,
+				   q->limits.discard_granularity);
 		ret = ext4_trim_fs(sb, &range);
 		if (ret < 0)
 			return ret;
@@ -421,6 +427,7 @@
 		return err;
 	}
 	case EXT4_IOC_MOVE_EXT:
+	case FITRIM:
 		break;
 	default:
 		return -ENOIOCTLCMD;

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d1fe09a..a5837a8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c

@@ -432,9 +432,10 @@
 	}
 
 	/* at order 0 we see each particular block */
-	*max = 1 << (e4b->bd_blkbits + 3);
-	if (order == 0)
+	if (order == 0) {
+		*max = 1 << (e4b->bd_blkbits + 3);
 		return EXT4_MB_BITMAP(e4b);
+	}
 
 	bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
@@ -616,7 +617,6 @@
 	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 
 	grp = ext4_get_group_info(sb, e4b->bd_group);
-	buddy = mb_find_buddy(e4b, 0, &max);
 	list_for_each(cur, &grp->bb_prealloc_list) {
 		ext4_group_t groupnr;
 		struct ext4_prealloc_space *pa;
@@ -635,7 +635,12 @@
 #define mb_check_buddy(e4b)
 #endif
 
-/* FIXME!! need more doc */
+/*
+ * Divide blocks started from @first with length @len into
+ * smaller chunks with power of 2 blocks.
+ * Clear the bits in bitmap which the blocks of the chunk(s) covered,
+ * then increase bb_counters[] for corresponded chunk size.
+ */
 static void ext4_mb_mark_free_simple(struct super_block *sb,
 				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 					struct ext4_group_info *grp)
@@ -2381,7 +2386,7 @@
 	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
 	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
 	 * So a two level scheme suffices for now. */
-	sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
+	sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
 	if (sbi->s_group_info == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
 		return -ENOMEM;
@@ -3208,7 +3213,7 @@
 	cur_distance = abs(goal_block - cpa->pa_pstart);
 	new_distance = abs(goal_block - pa->pa_pstart);
 
-	if (cur_distance < new_distance)
+	if (cur_distance <= new_distance)
 		return cpa;
 
 	/* drop the previous reference */
@@ -3907,7 +3912,8 @@
 	struct super_block *sb = ac->ac_sb;
 	ext4_group_t ngroups, i;
 
-	if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+	if (!mb_enable_debug ||
+	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return;
 
 	printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4753,7 +4759,8 @@
  * bitmap. Then issue a TRIM command on this extent and free the extent in
  * the group buddy bitmap. This is done until whole group is scanned.
  */
-ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
 		ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
 {
 	void *bitmap;
@@ -4863,10 +4870,15 @@
 			break;
 		}
 
-		if (len >= EXT4_BLOCKS_PER_GROUP(sb))
-			len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
-		else
+		/*
+		 * For all the groups except the last one, last block will
+		 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
+		 * change it for the last group in which case start +
+		 * len < EXT4_BLOCKS_PER_GROUP(sb).
+		 */
+		if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
 			last_block = first_block + len;
+		len -= last_block - first_block;
 
 		if (e4b.bd_info->bb_free >= minlen) {
 			cnt = ext4_trim_all_free(sb, &e4b, first_block,

diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322..22bd4d7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h

@@ -169,7 +169,7 @@
 	/* original request */
 	struct ext4_free_extent ac_o_ex;
 
-	/* goal request (after normalization) */
+	/* goal request (normalized ac_o_ex) */
 	struct ext4_free_extent ac_g_ex;
 
 	/* the best found extent */

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b0a126f..d1bafa5 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c

@@ -263,7 +263,7 @@
 	for (i = 0; i < max_entries; i++) {
 		if (tmp_idata[i]) {
 			extend_credit_for_blkdel(handle, inode);
-			ext4_free_blocks(handle, inode, 0,
+			ext4_free_blocks(handle, inode, NULL,
 					 le32_to_cpu(tmp_idata[i]), 1,
 					 EXT4_FREE_BLOCKS_METADATA |
 					 EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +271,7 @@
 	}
 	put_bh(bh);
 	extend_credit_for_blkdel(handle, inode);
-	ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
 			 EXT4_FREE_BLOCKS_METADATA |
 			 EXT4_FREE_BLOCKS_FORGET);
 	return 0;
@@ -302,7 +302,7 @@
 	}
 	put_bh(bh);
 	extend_credit_for_blkdel(handle, inode);
-	ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
 			 EXT4_FREE_BLOCKS_METADATA |
 			 EXT4_FREE_BLOCKS_FORGET);
 	return 0;
@@ -315,7 +315,7 @@
 	/* ei->i_data[EXT4_IND_BLOCK] */
 	if (i_data[0]) {
 		extend_credit_for_blkdel(handle, inode);
-		ext4_free_blocks(handle, inode, 0,
+		ext4_free_blocks(handle, inode, NULL,
 				le32_to_cpu(i_data[0]), 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
@@ -428,7 +428,7 @@
 	}
 	put_bh(bh);
 	extend_credit_for_blkdel(handle, inode);
-	ext4_free_blocks(handle, inode, 0, block, 1,
+	ext4_free_blocks(handle, inode, NULL, block, 1,
 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return retval;
 }

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e781b7e..67fd0b0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c

@@ -40,6 +40,7 @@
 #include "xattr.h"
 #include "acl.h"
 
+#include <trace/events/ext4.h>
 /*
  * define how far ahead to read directories while searching them.
  */
@@ -2183,6 +2184,7 @@
 	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
+	trace_ext4_unlink_enter(dir, dentry);
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	dquot_initialize(dir);
@@ -2228,6 +2230,7 @@
 end_unlink:
 	ext4_journal_stop(handle);
 	brelse(bh);
+	trace_ext4_unlink_exit(dentry, retval);
 	return retval;
 }
 
@@ -2402,6 +2405,10 @@
 		if (!new_inode && new_dir != old_dir &&
 		    EXT4_DIR_LINK_MAX(new_dir))
 			goto end_rename;
+		BUFFER_TRACE(dir_bh, "get_write_access");
+		retval = ext4_journal_get_write_access(handle, dir_bh);
+		if (retval)
+			goto end_rename;
 	}
 	if (!new_bh) {
 		retval = ext4_add_entry(handle, new_dentry, old_inode);
@@ -2409,7 +2416,9 @@
 			goto end_rename;
 	} else {
 		BUFFER_TRACE(new_bh, "get write access");
-		ext4_journal_get_write_access(handle, new_bh);
+		retval = ext4_journal_get_write_access(handle, new_bh);
+		if (retval)
+			goto end_rename;
 		new_de->inode = cpu_to_le32(old_inode->i_ino);
 		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
 					      EXT4_FEATURE_INCOMPAT_FILETYPE))
@@ -2470,8 +2479,6 @@
 	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
 	ext4_update_dx_flag(old_dir);
 	if (dir_bh) {
-		BUFFER_TRACE(dir_bh, "get_write_access");
-		ext4_journal_get_write_access(handle, dir_bh);
 		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
 						cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index e2cd90e..b6dbd05 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c

@@ -259,6 +259,11 @@
 			     bi_sector >> (inode->i_blkbits - 9));
 	}
 
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		ext4_free_io_end(io_end);
+		return;
+	}
+
 	/* Add the io_end to per-inode completed io list*/
 	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
 	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
@@ -279,9 +284,9 @@
 		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
 		bio_put(io->io_bio);
 	}
-	io->io_bio = 0;
+	io->io_bio = NULL;
 	io->io_op = 0;
-	io->io_end = 0;
+	io->io_end = NULL;
 }
 
 static int io_submit_init(struct ext4_io_submit *io,
@@ -380,8 +385,6 @@
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-	ClearPageError(page);
 
 	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
 	if (!io_page) {
@@ -392,6 +395,8 @@
 	io_page->p_page = page;
 	atomic_set(&io_page->p_count, 1);
 	get_page(page);
+	set_page_writeback(page);
+	ClearPageError(page);
 
 	for (bh = head = page_buffers(page), block_start = 0;
 	     bh != head || !block_start;

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3ecc6e4..80bbc9c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c

@@ -230,7 +230,7 @@
 	}
 
 	/* Zero out all of the reserved backup group descriptor table blocks */
-	ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+	ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
 			block, sbi->s_itb_per_group);
 	err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
 			       GFP_NOFS);
@@ -248,7 +248,7 @@
 
 	/* Zero out all of the inode table blocks */
 	block = input->inode_table;
-	ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+	ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
 			block, sbi->s_itb_per_group);
 	err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
 	if (err)
@@ -499,12 +499,12 @@
 	return err;
 
 exit_inode:
-	/* ext4_journal_release_buffer(handle, iloc.bh); */
+	/* ext4_handle_release_buffer(handle, iloc.bh); */
 	brelse(iloc.bh);
 exit_dindj:
-	/* ext4_journal_release_buffer(handle, dind); */
+	/* ext4_handle_release_buffer(handle, dind); */
 exit_sbh:
-	/* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
+	/* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
 exit_dind:
 	brelse(dind);
 exit_bh:
@@ -586,7 +586,7 @@
 			/*
 			int j;
 			for (j = 0; j < i; j++)
-				ext4_journal_release_buffer(handle, primary[j]);
+				ext4_handle_release_buffer(handle, primary[j]);
 			 */
 			goto exit_bh;
 		}

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 203f9e4..22546ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c

@@ -54,9 +54,9 @@
 
 static struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
-struct ext4_lazy_init *ext4_li_info;
-struct mutex ext4_li_mtx;
-struct ext4_features *ext4_feat;
+static struct ext4_lazy_init *ext4_li_info;
+static struct mutex ext4_li_mtx;
+static struct ext4_features *ext4_feat;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
@@ -75,6 +75,7 @@
 static int ext4_freeze(struct super_block *sb);
 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data);
+static int ext4_feature_set_ok(struct super_block *sb, int readonly);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
@@ -594,7 +595,7 @@
 
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
 	       sb->s_id, function, line, grp);
 	if (ino)
 		printk(KERN_CONT "inode %lu: ", ino);
@@ -997,13 +998,10 @@
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
 #ifdef CONFIG_EXT4_FS_XATTR
-	if (test_opt(sb, XATTR_USER) &&
-		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
+	if (test_opt(sb, XATTR_USER))
 		seq_puts(seq, ",user_xattr");
-	if (!test_opt(sb, XATTR_USER) &&
-	    (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
+	if (!test_opt(sb, XATTR_USER))
 		seq_puts(seq, ",nouser_xattr");
-	}
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
@@ -1041,8 +1039,8 @@
 	    !(def_mount_opts & EXT4_DEFM_NODELALLOC))
 		seq_puts(seq, ",nodelalloc");
 
-	if (test_opt(sb, MBLK_IO_SUBMIT))
-		seq_puts(seq, ",mblk_io_submit");
+	if (!test_opt(sb, MBLK_IO_SUBMIT))
+		seq_puts(seq, ",nomblk_io_submit");
 	if (sbi->s_stripe)
 		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
 	/*
@@ -1451,7 +1449,7 @@
 		 * Initialize args struct so we know whether arg was
 		 * found; some options take optional arguments.
 		 */
-		args[0].to = args[0].from = 0;
+		args[0].to = args[0].from = NULL;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
@@ -1771,7 +1769,7 @@
 				return 0;
 			if (option < 0 || option > (1 << 30))
 				return 0;
-			if (!is_power_of_2(option)) {
+			if (option && !is_power_of_2(option)) {
 				ext4_msg(sb, KERN_ERR,
 					 "EXT4-fs: inode_readahead_blks"
 					 " must be a power of 2");
@@ -2120,6 +2118,13 @@
 		return;
 	}
 
+	/* Check if feature set would not allow a r/w mount */
+	if (!ext4_feature_set_ok(sb, 0)) {
+		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+			 "unknown ROCOMPAT features");
+		return;
+	}
+
 	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
 		if (es->s_last_orphan)
 			jbd_debug(1, "Errors on filesystem, "
@@ -2412,7 +2417,7 @@
 	if (parse_strtoul(buf, 0x40000000, &t))
 		return -EINVAL;
 
-	if (!is_power_of_2(t))
+	if (t && !is_power_of_2(t))
 		return -EINVAL;
 
 	sbi->s_inode_readahead_blks = t;
@@ -3095,14 +3100,14 @@
 	}
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sb, NO_UID32);
+	/* xattr user namespace & acls are now defaulted on */
 #ifdef CONFIG_EXT4_FS_XATTR
-	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
-		set_opt(sb, XATTR_USER);
+	set_opt(sb, XATTR_USER);
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
-	if (def_mount_opts & EXT4_DEFM_ACL)
-		set_opt(sb, POSIX_ACL);
+	set_opt(sb, POSIX_ACL);
 #endif
+	set_opt(sb, MBLK_IO_SUBMIT);
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
 		set_opt(sb, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3516,7 +3521,7 @@
 	 * concurrency isn't really necessary.  Limit it to 1.
 	 */
 	EXT4_SB(sb)->dio_unwritten_wq =
-		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1);
+		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
 		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
 		goto failed_mount_wq;
@@ -3531,17 +3536,16 @@
 	if (IS_ERR(root)) {
 		ext4_msg(sb, KERN_ERR, "get root inode failed");
 		ret = PTR_ERR(root);
+		root = NULL;
 		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
-		iput(root);
 		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
 		goto failed_mount4;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
 		ext4_msg(sb, KERN_ERR, "get root dentry failed");
-		iput(root);
 		ret = -ENOMEM;
 		goto failed_mount4;
 	}
@@ -3657,6 +3661,8 @@
 	goto failed_mount;
 
 failed_mount4:
+	iput(root);
+	sb->s_root = NULL;
 	ext4_msg(sb, KERN_ERR, "mount failed");
 	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
 failed_mount_wq:

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fc32176..b545ca1 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c

@@ -735,7 +735,7 @@
 			int offset = (char *)s->here - bs->bh->b_data;
 
 			unlock_buffer(bs->bh);
-			jbd2_journal_release_buffer(handle, bs->bh);
+			ext4_handle_release_buffer(handle, bs->bh);
 			if (ce) {
 				mb_cache_entry_release(ce);
 				ce = NULL;
@@ -833,7 +833,7 @@
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
 getblk_failed:
-				ext4_free_blocks(handle, inode, 0, block, 1,
+				ext4_free_blocks(handle, inode, NULL, block, 1,
 						 EXT4_FREE_BLOCKS_METADATA);
 				error = -EIO;
 				goto cleanup;
commit	ae005cbed12d0b340b04b59d6f5c56e710b3895d	[log] [tgz]
author	Linus Torvalds <torvalds@linux-foundation.org>	Fri Mar 25 09:57:40 2011 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	Fri Mar 25 09:57:41 2011 -0700
tree	d464865bcc97bea05eab4eba0d10bcad4ec89b93
parent	3961cdf85b749f6bab50ad31ee97e9277e7a3b70 [diff]
parent	0ba0851714beebb800992e5105a79dc3a4c504b0 [diff]