Btrfs: don't read leaf blocks containing only checksums during truncate

Checksum items take up a significant portion of the metadata for large files.
It is possible to avoid reading them during truncates by checking the keys in
the higher level nodes.

If the lowest key in a given leaf is a checksum item, and that leaf is
followed by another leaf whose lowest key is also a checksum item from the
same file, we know we can safely delete the leaf without reading it.

For a 32GB file on a 6-drive raid0 array, Btrfs needs 8s to delete
the file with a cold cache.  It is read-bound during the run.

With this change, Btrfs is able to delete the file in 0.5s.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ff3261f..2eab464 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1388,7 +1388,7 @@
 	struct btrfs_key prealloc_block;
 
 	lowest_level = p->lowest_level;
-	WARN_ON(lowest_level && ins_len);
+	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(cow && root == root->fs_info->extent_root &&
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
@@ -3187,6 +3187,36 @@
 }
 
 /*
+ * a helper function to delete the leaf pointed to by path->slots[1] and
+ * path->nodes[1].  bytenr is the leaf's block pointer; callers already
+ * know it, so passing it down is faster than re-reading the node.
+ *
+ * This deletes the pointer in path->nodes[1] and frees the leaf
+ * block extent.  zero is returned if it all worked out, < 0 otherwise.
+ * If deleting the pointer fails, the extent is not freed.
+ *
+ * The path must have already been setup for deleting the leaf, including
+ * all the proper balancing.  path->nodes[1] must be locked.
+ */
+noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 bytenr)
+{
+	int ret;
+	u64 root_gen = btrfs_header_generation(path->nodes[1]);
+
+	ret = del_ptr(trans, root, path, 1, path->slots[1]);
+	if (ret)
+		return ret;
+
+	ret = btrfs_free_extent(trans, root, bytenr,
+				btrfs_level_size(root, 0),
+				path->nodes[1]->start,
+				btrfs_header_owner(path->nodes[1]),
+				root_gen, 0, 0, 1);
+	return ret;
+}
+/*
  * delete the item at the leaf level in path.  If that empties
  * the leaf, remove it from the tree
  */
@@ -3251,17 +3281,8 @@
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
-			u64 root_gen = btrfs_header_generation(path->nodes[1]);
-			wret = del_ptr(trans, root, path, 1, path->slots[1]);
-			if (wret)
-				ret = wret;
-			wret = btrfs_free_extent(trans, root,
-					 leaf->start, leaf->len,
-					 path->nodes[1]->start,
-					 btrfs_header_owner(path->nodes[1]),
-					 root_gen, 0, 0, 1);
-			if (wret)
-				ret = wret;
+			ret = btrfs_del_leaf(trans, root, path, leaf->start);
+			BUG_ON(ret);
 		}
 	} else {
 		int used = leaf_space_used(leaf, 0, nritems);
@@ -3296,24 +3317,10 @@
 			}
 
 			if (btrfs_header_nritems(leaf) == 0) {
-				u64 root_gen;
-				u64 bytenr = leaf->start;
-				u32 blocksize = leaf->len;
-
-				root_gen = btrfs_header_generation(
-							   path->nodes[1]);
-
-				wret = del_ptr(trans, root, path, 1, slot);
-				if (wret)
-					ret = wret;
-
+				path->slots[1] = slot;
+				ret = btrfs_del_leaf(trans, root, path, leaf->start);
+				BUG_ON(ret);
 				free_extent_buffer(leaf);
-				wret = btrfs_free_extent(trans, root, bytenr,
-					     blocksize, path->nodes[1]->start,
-					     btrfs_header_owner(path->nodes[1]),
-					     root_gen, 0, 0, 1);
-				if (wret)
-					ret = wret;
 			} else {
 				/* if we're still in the path, make sure
 				 * we're dirty.  Otherwise, one of the
@@ -3418,8 +3425,8 @@
 		level = btrfs_header_level(cur);
 		sret = bin_search(cur, min_key, level, &slot);
 
-		/* at level = 0, we're done, setup the path and exit */
-		if (level == 0) {
+		/* at the lowest level, we're done, setup the path and exit */
+		if (level == path->lowest_level) {
 			if (slot >= nritems)
 				goto find_next_key;
 			ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ded1643..94e0cdf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1649,7 +1649,9 @@
 void btrfs_init_path(struct btrfs_path *p);
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int slot, int nr);
-
+int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 bytenr);
 static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f3abecc..e5c9261 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1390,6 +1390,154 @@
 }
 
 /*
+ * when truncating bytes in a file, it is possible to avoid reading
+ * the leaves that contain only checksum items.  This can be the
+ * majority of the IO required to delete a large file, but it must
+ * be done carefully.
+ *
+ * The keys in the level just above the leaves are checked to make sure
+ * the lowest key in a given leaf is a csum key, and starts at an offset
+ * at or after new_size.
+ *
+ * Then the key for the next leaf is checked to make sure it also has
+ * a checksum item for the same file.  If it does, we know our target leaf
+ * contains only checksum items, and can be safely freed without reading it.
+ *
+ * This is just an optimization targeted at large files.  It may do
+ * nothing.  It will return 0 unless things went badly.  The path is
+ * always released and reset to lowest_level 0 before returning.
+ */
+static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct btrfs_path *path,
+				     struct inode *inode, u64 new_size)
+{
+	struct btrfs_key key;
+	int ret;
+	int nritems;
+	struct btrfs_key found_key;
+	struct btrfs_key other_key;
+
+	path->lowest_level = 1;
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_CSUM_ITEM_KEY;
+	key.offset = new_size;
+again:
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+	if (path->nodes[1] == NULL)
+		goto out;
+
+	nritems = btrfs_header_nritems(path->nodes[1]);
+	if (!nritems)
+		goto out;
+
+	if (path->slots[1] >= nritems)
+		goto next_node;
+
+	/* only read the key once we know the slot is valid in this node */
+	btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
+
+	/* did we find a key greater than anything we want to delete? */
+	if (found_key.objectid > inode->i_ino ||
+	   (found_key.objectid == inode->i_ino && found_key.type > key.type))
+		goto out;
+
+	/* we check the next key in the node to make sure the leaf contains
+	 * only checksum items.  This comparison doesn't work if our
+	 * leaf is the last one in the node
+	 */
+	if (path->slots[1] + 1 >= nritems) {
+next_node:
+		/* search forward from the last key in the node, this
+		 * will bring us into the next node in the tree
+		 */
+		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
+
+		/* unlikely, but we inc below, so check to be safe */
+		if (found_key.offset == (u64)-1)
+			goto out;
+
+		/* search_forward needs a path with locks held, do the
+		 * search again for the original key.  It is possible
+		 * this will race with a balance and return a path that
+		 * we could modify, but this drop is just an optimization
+		 * and is allowed to miss some leaves.
+		 */
+		btrfs_release_path(root, path);
+		found_key.offset++;
+
+		/* setup a max key for search_forward */
+		other_key.offset = (u64)-1;
+		other_key.type = key.type;
+		other_key.objectid = key.objectid;
+
+		path->keep_locks = 1;
+		ret = btrfs_search_forward(root, &found_key, &other_key,
+					   path, 0, 0);
+		path->keep_locks = 0;
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		key.offset = found_key.offset;
+		btrfs_release_path(root, path);
+		cond_resched();
+		goto again;
+	}
+
+	/* we know there's one more slot after us in the tree,
+	 * read that key so we can verify it is also a checksum item
+	 */
+	btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
+
+	if (found_key.objectid < inode->i_ino)
+		goto next_key;
+
+	if (found_key.type != key.type || found_key.offset < new_size)
+		goto next_key;
+
+	/*
+	 * if the key for the next leaf isn't a csum key from this objectid,
+	 * we can't be sure there aren't good items inside this leaf.
+	 * Bail out
+	 */
+	if (other_key.objectid != inode->i_ino || other_key.type != key.type)
+		goto out;
+
+	/*
+	 * it is safe to delete this leaf, it contains only
+	 * csum items from this inode at an offset >= new_size
+	 */
+	ret = btrfs_del_leaf(trans, root, path,
+			     btrfs_node_blockptr(path->nodes[1],
+						 path->slots[1]));
+	BUG_ON(ret);
+
+next_key:
+	btrfs_release_path(root, path);
+
+	if (other_key.objectid == inode->i_ino &&
+	    other_key.type == key.type && other_key.offset > key.offset) {
+		key.offset = other_key.offset;
+		cond_resched();
+		goto again;
+	}
+	ret = 0;
+out:
+	/* fixup any changes we've made to the path */
+	path->lowest_level = 0;
+	path->keep_locks = 0;
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
  * any higher than new_size
@@ -1436,6 +1584,10 @@
 	key.type = (u8)-1;
 
 	btrfs_init_path(path);
+
+	ret = drop_csum_leaves(trans, root, path, inode, new_size);
+	BUG_ON(ret);
+
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {