mmc: block: replace __blk_end_request() with blk_end_request()

To complete any block request, the MMC block driver currently calls:
	spin_lock_irq(queue)
	__blk_end_request()
	spin_unlock_irq(queue)

However, analyzing the sources of latency in the kernel with ftrace shows
that __blk_end_request() may at times take up to 6.5ms with the spinlock
held and interrupts disabled.

__blk_end_request() calls a couple of functions, and the ftrace output
shows that blk_update_bidi_request() alone takes almost 6ms.
There are two functions for ending the current request: __blk_end_request()
and blk_end_request(). Both do the same thing, except that
blk_end_request() does not hold the queue spinlock while calling
blk_update_bidi_request().
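
Roughly (a paraphrased sketch of these block layer helpers, with the bidi
wrappers inlined; not the exact source), the difference is only in who
holds q->queue_lock around the update step:

	/* __blk_end_request(): the caller must already hold q->queue_lock
	 * with IRQs disabled, so blk_update_bidi_request() runs entirely
	 * under the lock. */
	bool __blk_end_request(struct request *rq, int error,
			       unsigned int nr_bytes)
	{
		if (blk_update_bidi_request(rq, error, nr_bytes, 0))
			return true;
		blk_finish_request(rq, error);
		return false;
	}

	/* blk_end_request(): called without the lock; the potentially long
	 * blk_update_bidi_request() runs lock-free, and q->queue_lock is
	 * taken only around blk_finish_request(). */
	bool blk_end_request(struct request *rq, int error,
			     unsigned int nr_bytes)
	{
		unsigned long flags;

		if (blk_update_bidi_request(rq, error, nr_bytes, 0))
			return true;
		spin_lock_irqsave(rq->q->queue_lock, flags);
		blk_finish_request(rq, error);
		spin_unlock_irqrestore(rq->q->queue_lock, flags);
		return false;
	}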

This patch replaces all __blk_end_request() calls with
blk_end_request() and __blk_end_request_all() calls with
blk_end_request_all().

Testing done: 20 processes doing concurrent reads/writes on an SD card and
eMMC. The test ran for almost a day on a multicore system with no errors
observed.

This change is not meant to improve MMC throughput; it is about being fair
to other threads/interrupts in the system. Holding the spinlock with
interrupts disabled for a long duration prevents other threads/interrupts
from running at all.

A slight performance degradation at the file system level can actually be
expected, since we no longer hold the spinlock during
blk_update_bidi_request(), which means the mmcqd thread may be preempted
by a higher-priority thread or by any interrupt in the system.

These are performance numbers (100MB file write) with eMMC running in DDR
mode:

Without this patch:
	Name of the Test   Value   Unit
	LMDD Read Test     53.79   MBPS
	LMDD Write Test    18.86   MBPS
	IOZONE  Read Test  51.65   MBPS
	IOZONE  Write Test 24.36   MBPS

With this patch:
	Name of the Test    Value  Unit
	LMDD Read Test      52.94  MBPS
	LMDD Write Test     16.70  MBPS
	IOZONE  Read Test   52.08  MBPS
	IOZONE  Write Test  23.29  MBPS

Read numbers are fine. Write numbers are slightly down (especially LMDD
write), possibly because write requests normally have a large transfer
size, which increases the chance that while mmcqd is executing
blk_update_bidi_request() it gets interrupted by an interrupt or a
higher-priority thread.

CRs-Fixed: 343776
Change-Id: I37aab86a217177d328d6893983d5507a5149c253
Signed-off-by: Subhash Jadavani <subhashj@codeaurora.org>
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 990faeb..5e0d669 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -918,9 +918,7 @@
 		goto retry;
 	if (!err)
 		mmc_blk_reset_success(md, type);
-	spin_lock_irq(&md->lock);
-	__blk_end_request(req, err, blk_rq_bytes(req));
-	spin_unlock_irq(&md->lock);
+	blk_end_request(req, err, blk_rq_bytes(req));
 
 	return err ? 0 : 1;
 }
@@ -989,9 +987,7 @@
 	if (!err)
 		mmc_blk_reset_success(md, type);
 out:
-	spin_lock_irq(&md->lock);
-	__blk_end_request(req, err, blk_rq_bytes(req));
-	spin_unlock_irq(&md->lock);
+	blk_end_request(req, err, blk_rq_bytes(req));
 
 	return err ? 0 : 1;
 }
@@ -1030,9 +1026,7 @@
 					     __func__);
 
 out:
-	spin_lock_irq(&md->lock);
-	__blk_end_request(req, err, blk_rq_bytes(req));
-	spin_unlock_irq(&md->lock);
+	blk_end_request(req, err, blk_rq_bytes(req));
 
 	return err ? 0 : 1;
 }
@@ -1047,9 +1041,7 @@
 	if (ret)
 		ret = -EIO;
 
-	spin_lock_irq(&md->lock);
-	__blk_end_request_all(req, ret);
-	spin_unlock_irq(&md->lock);
+	blk_end_request_all(req, ret);
 
 	return ret ? 0 : 1;
 }
@@ -1672,15 +1664,11 @@
 
 		blocks = mmc_sd_num_wr_blocks(card);
 		if (blocks != (u32)-1) {
-			spin_lock_irq(&md->lock);
-			ret = __blk_end_request(req, 0, blocks << 9);
-			spin_unlock_irq(&md->lock);
+			ret = blk_end_request(req, 0, blocks << 9);
 		}
 	} else {
 		if (mq_rq->packed_cmd == MMC_PACKED_NONE) {
-			spin_lock_irq(&md->lock);
-			ret = __blk_end_request(req, 0, brq->data.bytes_xfered);
-			spin_unlock_irq(&md->lock);
+			ret = blk_end_request(req, 0, brq->data.bytes_xfered);
 		}
 	}
 	return ret;
@@ -1689,7 +1677,6 @@
 static int mmc_blk_end_packed_req(struct mmc_queue *mq,
 				  struct mmc_queue_req *mq_rq)
 {
-	struct mmc_blk_data *md = mq->data;
 	struct request *prq;
 	int idx = mq_rq->packed_fail_idx, i = 0;
 	int ret = 0;
@@ -1709,9 +1696,7 @@
 			return ret;
 		}
 		list_del_init(&prq->queuelist);
-		spin_lock_irq(&md->lock);
-		__blk_end_request(prq, 0, blk_rq_bytes(prq));
-		spin_unlock_irq(&md->lock);
+		blk_end_request(prq, 0, blk_rq_bytes(prq));
 		i++;
 	}
 
@@ -1777,10 +1762,8 @@
 				ret = mmc_blk_end_packed_req(mq, mq_rq);
 				break;
 			} else {
-				spin_lock_irq(&md->lock);
-				ret = __blk_end_request(req, 0,
+				ret = blk_end_request(req, 0,
 						brq->data.bytes_xfered);
-				spin_unlock_irq(&md->lock);
 			}
 
 			/*
@@ -1833,10 +1816,8 @@
 			 * time, so we only reach here after trying to
 			 * read a single sector.
 			 */
-			spin_lock_irq(&md->lock);
-			ret = __blk_end_request(req, -EIO,
+			ret = blk_end_request(req, -EIO,
 						brq->data.blksz);
-			spin_unlock_irq(&md->lock);
 			if (!ret)
 				goto start_new_req;
 			break;
@@ -1866,20 +1847,16 @@
 
  cmd_abort:
 	if (mq_rq->packed_cmd == MMC_PACKED_NONE) {
-		spin_lock_irq(&md->lock);
 		if (mmc_card_removed(card))
 			req->cmd_flags |= REQ_QUIET;
 		while (ret)
-			ret = __blk_end_request(req, -EIO,
+			ret = blk_end_request(req, -EIO,
 					blk_rq_cur_bytes(req));
-		spin_unlock_irq(&md->lock);
 	} else {
 		while (!list_empty(&mq_rq->packed_list)) {
 			prq = list_entry_rq(mq_rq->packed_list.next);
 			list_del_init(&prq->queuelist);
-			spin_lock_irq(&md->lock);
-			__blk_end_request(prq, -EIO, blk_rq_bytes(prq));
-			spin_unlock_irq(&md->lock);
+			blk_end_request(prq, -EIO, blk_rq_bytes(prq));
 		}
 		mmc_blk_clear_packed(mq_rq);
 	}
@@ -1932,9 +1909,7 @@
 	ret = mmc_blk_part_switch(card, md);
 	if (ret) {
 		if (req) {
-			spin_lock_irq(&md->lock);
-			__blk_end_request_all(req, -EIO);
-			spin_unlock_irq(&md->lock);
+			blk_end_request_all(req, -EIO);
 		}
 		ret = 0;
 		goto out;