xfs: use per-filesystem radix trees for dquot lookup

Replace the global hash tables for looking up in-memory dquot structures
with per-filesystem radix trees to allow scaling to a large number of
in-memory dquot structures.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fec1a3d..49456e5 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -43,7 +43,7 @@
  * Lock order:
  *
  * ip->i_lock
- *   qh->qh_lock
+ *   qi->qi_tree_lock
  *     qi->qi_dqlist_lock
  *       dquot->q_qlock (xfs_dqlock() and friends)
  *         dquot->q_flush (xfs_dqflock() and friends)
@@ -602,60 +602,6 @@
 }
 
 /*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
-	xfs_mount_t		*mp,
-	xfs_dqid_t		id,
-	xfs_dqhash_t		*qh,
-	xfs_dquot_t		**O_dqpp)
-{
-	xfs_dquot_t		*dqp;
-
-	ASSERT(mutex_is_locked(&qh->qh_lock));
-
-	/*
-	 * Traverse the hashchain looking for a match
-	 */
-	list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
-		/*
-		 * We already have the hashlock. We don't need the
-		 * dqlock to look at the id field of the dquot, since the
-		 * id can't be modified without the hashlock anyway.
-		 */
-		if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp)
-			continue;
-
-		trace_xfs_dqlookup_found(dqp);
-
-		xfs_dqlock(dqp);
-		if (dqp->dq_flags & XFS_DQ_FREEING) {
-			*O_dqpp = NULL;
-			xfs_dqunlock(dqp);
-			return -1;
-		}
-
-		dqp->q_nrefs++;
-
-		/*
-		 * move the dquot to the front of the hashchain
-		 */
-		list_move(&dqp->q_hashlist, &qh->qh_list);
-		trace_xfs_dqlookup_done(dqp);
-		*O_dqpp = dqp;
-		return 0;
-	}
-
-	*O_dqpp = NULL;
-	return 1;
-}
-
-/*
  * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
  * a locked dquot, doing an allocation (if requested) as needed.
  * When both an inode and an id are given, the inode's id takes precedence.
@@ -672,10 +618,10 @@
 	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
 	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
 {
-	xfs_dquot_t	*dqp, *dqp1;
-	xfs_dqhash_t	*h;
-	uint		version;
-	int		error;
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+	struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+	struct xfs_dquot	*dqp;
+	int			error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
@@ -683,7 +629,6 @@
 	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
 		return (ESRCH);
 	}
-	h = XFS_DQ_HASH(mp, id, type);
 
 #ifdef DEBUG
 	if (xfs_do_dqerror) {
@@ -704,34 +649,28 @@
 #endif
 
 restart:
-	mutex_lock(&h->qh_lock);
+	mutex_lock(&qi->qi_tree_lock);
+	dqp = radix_tree_lookup(tree, id);
+	if (dqp) {
+		xfs_dqlock(dqp);
+		if (dqp->dq_flags & XFS_DQ_FREEING) {
+			xfs_dqunlock(dqp);
+			mutex_unlock(&qi->qi_tree_lock);
+			trace_xfs_dqget_freeing(dqp);
+			delay(1);
+			goto restart;
+		}
 
-	/*
-	 * Look in the cache (hashtable).
-	 * The chain is kept locked during lookup.
-	 */
-	switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) {
-	case -1:
-		XFS_STATS_INC(xs_qm_dquot_dups);
-		mutex_unlock(&h->qh_lock);
-		delay(1);
-		goto restart;
-	case 0:
+		dqp->q_nrefs++;
+		mutex_unlock(&qi->qi_tree_lock);
+
+		trace_xfs_dqget_hit(dqp);
 		XFS_STATS_INC(xs_qm_dqcachehits);
-		/*
-		 * The dquot was found, moved to the front of the chain,
-		 * taken off the freelist if it was on it, and locked
-		 * at this point. Just unlock the hashchain and return.
-		 */
-		ASSERT(*O_dqpp);
-		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-		mutex_unlock(&h->qh_lock);
-		trace_xfs_dqget_hit(*O_dqpp);
-		return 0;	/* success */
-	default:
-		XFS_STATS_INC(xs_qm_dqcachemisses);
-		break;
+		*O_dqpp = dqp;
+		return 0;
 	}
+	mutex_unlock(&qi->qi_tree_lock);
+	XFS_STATS_INC(xs_qm_dqcachemisses);
 
 	/*
 	 * Dquot cache miss. We don't want to keep the inode lock across
@@ -742,12 +681,6 @@
 	 */
 	if (ip)
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * Save the hashchain version stamp, and unlock the chain, so that
-	 * we don't keep the lock across a disk read
-	 */
-	version = h->qh_version;
-	mutex_unlock(&h->qh_lock);
 
 	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
 
@@ -757,15 +690,14 @@
 	if (error)
 		return error;
 
-	/*
-	 * Dquot lock comes after hashlock in the lock ordering
-	 */
 	if (ip) {
 		/*
 		 * A dquot could be attached to this inode by now, since
 		 * we had dropped the ilock.
 		 */
 		if (xfs_this_quota_on(mp, type)) {
+			struct xfs_dquot	*dqp1;
+
 			dqp1 = xfs_inode_dquot(ip, type);
 			if (dqp1) {
 				xfs_qm_dqdestroy(dqp);
@@ -780,51 +712,27 @@
 		}
 	}
 
-	/*
-	 * Hashlock comes after ilock in lock order
-	 */
-	mutex_lock(&h->qh_lock);
-	if (version != h->qh_version) {
-		xfs_dquot_t *tmpdqp;
-		/*
-		 * Now, see if somebody else put the dquot in the
-		 * hashtable before us. This can happen because we didn't
-		 * keep the hashchain lock. We don't have to worry about
-		 * lock order between the two dquots here since dqp isn't
-		 * on any findable lists yet.
-		 */
-		switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) {
-		case 0:
-		case -1:
-			/*
-			 * Duplicate found, either in cache or on its way out.
-			 * Just throw away the new dquot and start over.
-			 */
-			if (tmpdqp)
-				xfs_qm_dqput(tmpdqp);
-			mutex_unlock(&h->qh_lock);
-			xfs_qm_dqdestroy(dqp);
-			XFS_STATS_INC(xs_qm_dquot_dups);
-			goto restart;
-		default:
-			break;
-		}
-	}
+	mutex_lock(&qi->qi_tree_lock);
+	error = -radix_tree_insert(tree, id, dqp);
+	if (unlikely(error)) {
+		WARN_ON(error != EEXIST);
 
-	/*
-	 * Put the dquot at the beginning of the hash-chain and mp's list
-	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
-	 */
-	ASSERT(mutex_is_locked(&h->qh_lock));
-	dqp->q_hash = h;
-	list_add(&dqp->q_hashlist, &h->qh_list);
-	h->qh_version++;
+		/*
+		 * Duplicate found. Just throw away the new dquot and start
+		 * over.
+		 */
+		mutex_unlock(&qi->qi_tree_lock);
+		trace_xfs_dqget_dup(dqp);
+		xfs_qm_dqdestroy(dqp);
+		XFS_STATS_INC(xs_qm_dquot_dups);
+		goto restart;
+	}
 
 	/*
 	 * Attach this dquot to this filesystem's list of all dquots,
 	 * kept inside the mount structure in m_quotainfo field
 	 */
-	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
+	mutex_lock(&qi->qi_dqlist_lock);
 
 	/*
 	 * We return a locked dquot to the caller, with a reference taken
@@ -832,10 +740,11 @@
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
 
-	list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
-	mp->m_quotainfo->qi_dquots++;
-	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-	mutex_unlock(&h->qh_lock);
+	list_add(&dqp->q_mplist, &qi->qi_dqlist);
+	qi->qi_dquots++;
+	mutex_unlock(&qi->qi_dqlist_lock);
+	mutex_unlock(&qi->qi_tree_lock);
+
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);
@@ -1117,7 +1026,6 @@
 	struct xfs_dquot	*dqp)
 {
 	struct xfs_mount	*mp = dqp->q_mount;
-	struct xfs_dqhash	*qh = dqp->q_hash;
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
 
 	xfs_dqlock(dqp);
@@ -1164,10 +1072,10 @@
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
 
-	mutex_lock(&qh->qh_lock);
-	list_del_init(&dqp->q_hashlist);
-	qh->qh_version++;
-	mutex_unlock(&qh->qh_lock);
+	mutex_lock(&qi->qi_tree_lock);
+	radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+			  be32_to_cpu(dqp->q_core.d_id));
+	mutex_unlock(&qi->qi_tree_lock);
 
 	mutex_lock(&qi->qi_dqlist_lock);
 	list_del_init(&dqp->q_mplist);