Btrfs: extent_map and data=ordered fixes for space balancing

* Add an EXTENT_BOUNDARY state bit to keep the writepage code
from merging data extents that are in the process of being
relocated.  This allows us to do accounting for them properly.

* The balancing code relocates data extents indepdent of the underlying
inode.  The extent_map code was modified to properly account for
things moving around (invalidating extent_map caches in the inode).

* Don't take the drop_mutex in the create_subvol ioctl.  It isn't
required.

* Fix walking of the ordered extent list to avoid races with sys_unlink

* Change the lock ordering rules.  Transaction start goes outside
the drop_mutex.  This allows btrfs_commit_transaction to directly
drop the relocation trees.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 48a3dc0..4516fbf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -135,7 +135,7 @@
 
 	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
-	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	while(num_bytes > 0) {
@@ -163,7 +163,7 @@
 				break;
 			}
 			btrfs_drop_extent_cache(inode, start,
-						start + ins.offset - 1);
+						start + ins.offset - 1, 0);
 		}
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
@@ -587,7 +587,7 @@
 
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
-				ordered_extent->len - 1);
+				ordered_extent->len - 1, 0);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	ins.objectid = ordered_extent->start;
@@ -880,7 +880,7 @@
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
 	/* don't do orphan cleanup if the fs is readonly. */
-	if (root->inode->i_sb->s_flags & MS_RDONLY)
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return;
 
 	path = btrfs_alloc_path();
@@ -892,8 +892,6 @@
 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
 	key.offset = (u64)-1;
 
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, root->inode);
 
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -933,7 +931,7 @@
 		 * crossing root thing.  we store the inode number in the
 		 * offset of the orphan item.
 		 */
-		inode = btrfs_iget_locked(root->inode->i_sb,
+		inode = btrfs_iget_locked(root->fs_info->sb,
 					  found_key.offset, root);
 		if (!inode)
 			break;
@@ -965,7 +963,9 @@
 		 * do a destroy_inode
 		 */
 		if (is_bad_inode(inode)) {
+			trans = btrfs_start_transaction(root, 1);
 			btrfs_orphan_del(trans, inode);
+			btrfs_end_transaction(trans, root);
 			iput(inode);
 			continue;
 		}
@@ -988,7 +988,6 @@
 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
 
 	btrfs_free_path(path);
-	btrfs_end_transaction(trans, root);
 }
 
 void btrfs_read_locked_inode(struct inode *inode)
@@ -1343,8 +1342,7 @@
 	u64 mask = root->sectorsize - 1;
 
 	if (root->ref_cows)
-		btrfs_drop_extent_cache(inode,
-					new_size & (~mask), (u64)-1);
+		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -1677,7 +1675,7 @@
 						       hole_start, 0, 0,
 						       hole_size, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
-						(u64)-1);
+						(u64)-1, 0);
 			btrfs_check_file(root, inode);
 		}
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -1843,6 +1841,24 @@
 		args->root == BTRFS_I(inode)->root);
 }
 
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    struct btrfs_root *root, int wait)
+{
+	struct inode *inode;
+	struct btrfs_iget_args args;
+	args.ino = objectid;
+	args.root = root;
+
+	if (wait) {
+		inode = ilookup5(s, objectid, btrfs_find_actor,
+				 (void *)&args);
+	} else {
+		inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
+					(void *)&args);
+	}
+	return inode;
+}
+
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root)
 {
@@ -3266,7 +3282,7 @@
 			btrfs_put_ordered_extent(ordered);
 		}
 	}
-	btrfs_drop_extent_cache(inode, 0, (u64)-1);
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
@@ -3412,16 +3428,22 @@
 {
 	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
+	struct inode *inode;
 	unsigned long flags;
 
 	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	while(!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
-		atomic_inc(&binode->vfs_inode.i_count);
+		inode = igrab(&binode->vfs_inode);
+		if (!inode)
+			list_del_init(&binode->delalloc_inodes);
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
-		filemap_write_and_wait(binode->vfs_inode.i_mapping);
-		iput(&binode->vfs_inode);
+		if (inode) {
+			filemap_write_and_wait(inode->i_mapping);
+			iput(inode);
+		}
+		cond_resched();
 		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	}
 	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);