mmc: msm_sdcc: Add correct DMA limitations and some cleanup

1) dma_map_sg() should be called before sg_dma_address()
is passed to dmov. This makes sure that buffer adresses
are bounced if present in HIGH_MEM region.

2) The maximum length of data transfer that DMA
can handle in box mode while operating with SDCC
would be 4095 (rows(64K-1)*fifo_size(64)) KiB.
It is likely that upper layers can send more
data than this as suggested by host->max_seg_size.
The data mover hangs if such requests are passed
to driver. Fix this by limiting box mode command
to handle only requests upto 4095KiB and iterate
over many such commands for larger requests.

3) Increase NR_SG to 128 for optimal performance.

4) Clean up hardcoded values to proper macros in .h file

Change-Id: I06d53396a639e2108f87022a2eef6822dda88305
Signed-off-by: Sujit Reddy Thumma <sthumma@codeaurora.org>
diff --git a/drivers/mmc/host/msm_sdcc.c b/drivers/mmc/host/msm_sdcc.c
index 1af828b..fa91b53 100644
--- a/drivers/mmc/host/msm_sdcc.c
+++ b/drivers/mmc/host/msm_sdcc.c
@@ -68,8 +68,6 @@
 #define SPS_SDCC_CONSUMER_PIPE_INDEX	2
 #define SPS_CONS_PERIPHERAL		0
 #define SPS_PROD_PERIPHERAL		1
-/* 16 KB */
-#define SPS_MAX_DESC_SIZE		(16 * 1024)
 
 #if defined(CONFIG_DEBUG_FS)
 static void msmsdcc_dbg_createhost(struct msmsdcc_host *);
@@ -134,6 +132,42 @@
 		      u32 c);
 static inline void msmsdcc_delay(struct msmsdcc_host *host);
 
+static inline unsigned short msmsdcc_get_nr_sg(struct msmsdcc_host *host)
+{
+	unsigned short ret = NR_SG;
+
+	if (host->is_sps_mode) {
+		if (NR_SG > MAX_NR_SG_SPS)
+			ret = MAX_NR_SG_SPS;
+	} else { /* DMA or PIO mode */
+		if (NR_SG > MAX_NR_SG_DMA_PIO)
+			ret = MAX_NR_SG_DMA_PIO;
+	}
+
+	return ret;
+}
+
+static inline unsigned int msmsdcc_get_max_seg_size(struct msmsdcc_host *host)
+{
+	unsigned int max_seg_size;
+
+	/*
+	 * SPS BAM has limitation of max. number of descriptors.
+	 * max. # of descriptors = SPS_MAX_DESCS
+	 * each descriptor can point to SPS_MAX_DESC_SIZE (16KB)
+	 * So (nr_sg * max_seg_size) should be limited to the
+	 * max. size that all of the descriptors can point to.
+	 * i.e., (nr_sg * max_seg_size) = (SPS_MAX_DESCS * SPS_MAX_DESC_SIZE).
+	 */
+	if (host->is_sps_mode) {
+		max_seg_size = (SPS_MAX_DESCS * SPS_MAX_DESC_SIZE) /
+			msmsdcc_get_nr_sg(host);
+	} else { /* DMA or PIO mode */
+		max_seg_size = MMC_MAX_REQ_SIZE;
+	}
+
+	return max_seg_size;
+}
 
 #ifdef CONFIG_MMC_MSM_SPS_SUPPORT
 static int msmsdcc_sps_reset_ep(struct msmsdcc_host *host,
@@ -717,8 +751,9 @@
 	dmov_box *box;
 	uint32_t rows;
 	unsigned int n;
-	int i;
+	int i, err = 0, box_cmd_cnt = 0;
 	struct scatterlist *sg = data->sg;
+	unsigned int len, offset;
 
 	if ((host->dma.channel == -1) || (host->dma.crci == -1))
 		return -ENOENT;
@@ -728,7 +763,8 @@
 	host->dma.sg = data->sg;
 	host->dma.num_ents = data->sg_len;
 
-	BUG_ON(host->dma.num_ents > NR_SG); /* Prevent memory corruption */
+	/* Prevent memory corruption */
+	BUG_ON(host->dma.num_ents > msmsdcc_get_nr_sg(host));
 
 	nc = host->dma.nc;
 
@@ -737,59 +773,8 @@
 	else
 		host->dma.dir = DMA_TO_DEVICE;
 
-	/* host->curr.user_pages = (data->flags & MMC_DATA_USERPAGE); */
-	host->curr.user_pages = 0;
-	box = &nc->cmd[0];
-	for (i = 0; i < host->dma.num_ents; i++) {
-		box->cmd = CMD_MODE_BOX;
-
-		/* Initialize sg dma address */
-		sg->dma_address = pfn_to_dma(mmc_dev(host->mmc),
-					      page_to_pfn(sg_page(sg)))
-					      + sg->offset;
-
-		if (i == (host->dma.num_ents - 1))
-			box->cmd |= CMD_LC;
-		rows = (sg_dma_len(sg) % MCI_FIFOSIZE) ?
-			(sg_dma_len(sg) / MCI_FIFOSIZE) + 1 :
-			(sg_dma_len(sg) / MCI_FIFOSIZE) ;
-
-		if (data->flags & MMC_DATA_READ) {
-			box->src_row_addr = msmsdcc_fifo_addr(host);
-			box->dst_row_addr = sg_dma_address(sg);
-
-			box->src_dst_len = (MCI_FIFOSIZE << 16) |
-					   (MCI_FIFOSIZE);
-			box->row_offset = MCI_FIFOSIZE;
-
-			box->num_rows = rows * ((1 << 16) + 1);
-			box->cmd |= CMD_SRC_CRCI(host->dma.crci);
-		} else {
-			box->src_row_addr = sg_dma_address(sg);
-			box->dst_row_addr = msmsdcc_fifo_addr(host);
-
-			box->src_dst_len = (MCI_FIFOSIZE << 16) |
-					   (MCI_FIFOSIZE);
-			box->row_offset = (MCI_FIFOSIZE << 16);
-
-			box->num_rows = rows * ((1 << 16) + 1);
-			box->cmd |= CMD_DST_CRCI(host->dma.crci);
-		}
-		box++;
-		sg++;
-	}
-
-	/* location of command block must be 64 bit aligned */
-	BUG_ON(host->dma.cmd_busaddr & 0x07);
-
-	nc->cmdptr = (host->dma.cmd_busaddr >> 3) | CMD_PTR_LP;
-	host->dma.hdr.cmdptr = DMOV_CMD_PTR_LIST |
-			       DMOV_CMD_ADDR(host->dma.cmdptr_busaddr);
-	host->dma.hdr.complete_func = msmsdcc_dma_complete_func;
-
 	n = dma_map_sg(mmc_dev(host->mmc), host->dma.sg,
 			host->dma.num_ents, host->dma.dir);
-	/* dsb inside dma_map_sg will write nc out to mem as well */
 
 	if (n != host->dma.num_ents) {
 		pr_err("%s: Unable to map in all sg elements\n",
@@ -799,7 +784,79 @@
 		return -ENOMEM;
 	}
 
-	return 0;
+	/* host->curr.user_pages = (data->flags & MMC_DATA_USERPAGE); */
+	host->curr.user_pages = 0;
+	box = &nc->cmd[0];
+	for (i = 0; i < host->dma.num_ents; i++) {
+		len = sg_dma_len(sg);
+		offset = 0;
+
+		do {
+			/* Check if we can do DMA */
+			if (!len || (box_cmd_cnt >= MMC_MAX_DMA_CMDS)) {
+				err = -ENOTSUPP;
+				goto unmap;
+			}
+
+			box->cmd = CMD_MODE_BOX;
+
+			if (len >= MMC_MAX_DMA_BOX_LENGTH) {
+				len = MMC_MAX_DMA_BOX_LENGTH;
+				len -= len % data->blksz;
+			}
+			rows = (len % MCI_FIFOSIZE) ?
+				(len / MCI_FIFOSIZE) + 1 :
+				(len / MCI_FIFOSIZE);
+
+			if (data->flags & MMC_DATA_READ) {
+				box->src_row_addr = msmsdcc_fifo_addr(host);
+				box->dst_row_addr = sg_dma_address(sg) + offset;
+				box->src_dst_len = (MCI_FIFOSIZE << 16) |
+						(MCI_FIFOSIZE);
+				box->row_offset = MCI_FIFOSIZE;
+				box->num_rows = rows * ((1 << 16) + 1);
+				box->cmd |= CMD_SRC_CRCI(host->dma.crci);
+			} else {
+				box->src_row_addr = sg_dma_address(sg) + offset;
+				box->dst_row_addr = msmsdcc_fifo_addr(host);
+				box->src_dst_len = (MCI_FIFOSIZE << 16) |
+						(MCI_FIFOSIZE);
+				box->row_offset = (MCI_FIFOSIZE << 16);
+				box->num_rows = rows * ((1 << 16) + 1);
+				box->cmd |= CMD_DST_CRCI(host->dma.crci);
+			}
+
+			offset += len;
+			len = sg_dma_len(sg) - offset;
+			box++;
+			box_cmd_cnt++;
+		} while (len);
+		sg++;
+	}
+	/* Mark last command */
+	box--;
+	box->cmd |= CMD_LC;
+
+	/* location of command block must be 64 bit aligned */
+	BUG_ON(host->dma.cmd_busaddr & 0x07);
+
+	nc->cmdptr = (host->dma.cmd_busaddr >> 3) | CMD_PTR_LP;
+	host->dma.hdr.cmdptr = DMOV_CMD_PTR_LIST |
+			       DMOV_CMD_ADDR(host->dma.cmdptr_busaddr);
+	host->dma.hdr.complete_func = msmsdcc_dma_complete_func;
+
+	/* Flush all data to memory before starting dma */
+	mb();
+
+unmap:
+	if (err) {
+		dma_unmap_sg(mmc_dev(host->mmc), host->dma.sg,
+				host->dma.num_ents, host->dma.dir);
+		pr_err("%s: cannot do DMA, fall back to PIO mode err=%d\n",
+				mmc_hostname(host->mmc), err);
+	}
+
+	return err;
 }
 
 #ifdef CONFIG_MMC_MSM_SPS_SUPPORT
@@ -825,7 +882,8 @@
 	struct scatterlist *sg = data->sg;
 	struct sps_pipe *sps_pipe_handle;
 
-	BUG_ON(data->sg_len > NR_SG); /* Prevent memory corruption */
+	/* Prevent memory corruption */
+	BUG_ON(data->sg_len > msmsdcc_get_nr_sg(host));
 
 	host->sps.sg = data->sg;
 	host->sps.num_ents = data->sg_len;
@@ -3028,17 +3086,10 @@
 	 * transfer. It's ignored for BAM-to-System mode transfer.
 	 */
 	sps_config->event_thresh = 0x10;
-	/*
-	 * Max. no of scatter/gather buffers that can
-	 * be passed by block layer = 32 (NR_SG).
-	 * Each BAM descritor needs 64 bits (8 bytes).
-	 * One BAM descriptor is required per buffer transfer.
-	 * So we would require total 256 (32 * 8) bytes of descriptor FIFO.
-	 * But due to HW limitation we need to allocate atleast one extra
-	 * descriptor memory (256 bytes + 8 bytes). But in order to be
-	 * in power of 2, we are allocating 512 bytes of memory.
-	 */
-	sps_config->desc.size = 512;
+
+	/* Allocate maximum descriptor fifo size */
+	sps_config->desc.size = SPS_MAX_DESC_FIFO_SIZE -
+		(SPS_MAX_DESC_FIFO_SIZE % SPS_MAX_DESC_LENGTH);
 	sps_config->desc.base = dma_alloc_coherent(mmc_dev(host->mmc),
 						sps_config->desc.size,
 						&sps_config->desc.phys_base,
@@ -3921,12 +3972,12 @@
 	if (plat->is_sdio_al_client)
 		mmc->pm_flags |= MMC_PM_IGNORE_PM_NOTIFY;
 
-	mmc->max_segs = NR_SG;
-	mmc->max_blk_size = 4096;	/* MCI_DATA_CTL BLOCKSIZE up to 4096 */
-	mmc->max_blk_count = 65535;
+	mmc->max_segs = msmsdcc_get_nr_sg(host);
+	mmc->max_blk_size = MMC_MAX_BLK_SIZE;
+	mmc->max_blk_count = MMC_MAX_BLK_CNT;
 
-	mmc->max_req_size = 33554432;	/* MCI_DATA_LENGTH is 25 bits */
-	mmc->max_seg_size = mmc->max_req_size;
+	mmc->max_req_size = MMC_MAX_REQ_SIZE;
+	mmc->max_seg_size = msmsdcc_get_max_seg_size(host);
 
 	writel_relaxed(0, host->base + MMCIMASK0);
 	writel_relaxed(MCI_CLEAR_STATIC_MASK, host->base + MMCICLEAR);
diff --git a/drivers/mmc/host/msm_sdcc.h b/drivers/mmc/host/msm_sdcc.h
index 580da82..ca3eed8 100644
--- a/drivers/mmc/host/msm_sdcc.h
+++ b/drivers/mmc/host/msm_sdcc.h
@@ -207,7 +207,7 @@
 
 #define MCI_FIFOHALFSIZE (MCI_FIFOSIZE / 2)
 
-#define NR_SG		32
+#define NR_SG		128
 
 #define MSM_MMC_IDLE_TIMEOUT	5000 /* msecs */
 
@@ -217,10 +217,46 @@
  */
 #define MSM_MMC_REQ_TIMEOUT	10000 /* msecs */
 
+/*
+ * Controller HW limitations
+ */
+#define MCI_DATALENGTH_BITS	25
+#define MMC_MAX_REQ_SIZE	((1 << MCI_DATALENGTH_BITS) - 1)
+/* MCI_DATA_CTL BLOCKSIZE up to 4096 */
+#define MMC_MAX_BLK_SIZE	4096
+#define MMC_MIN_BLK_SIZE	512
+#define MMC_MAX_BLK_CNT		(MMC_MAX_REQ_SIZE / MMC_MIN_BLK_SIZE)
+
+/* 64KiB */
+#define MAX_SG_SIZE		(64 * 1024)
+#define MAX_NR_SG_DMA_PIO	(MMC_MAX_REQ_SIZE / MAX_SG_SIZE)
+
+/*
+ * BAM limitations
+ */
+/* upto 16 bits (64K - 1) */
+#define SPS_MAX_DESC_FIFO_SIZE	65535
+/* 16KiB */
+#define SPS_MAX_DESC_SIZE	(16 * 1024)
+/* Each descriptor is of length 8 bytes */
+#define SPS_MAX_DESC_LENGTH	8
+#define SPS_MAX_DESCS		(SPS_MAX_DESC_FIFO_SIZE / SPS_MAX_DESC_LENGTH)
+#define SPS_MAX_SG_DESCS	(MAX_SG_SIZE / SPS_MAX_DESC_SIZE)
+#define MAX_NR_SG_SPS		(SPS_MAX_DESCS / SPS_MAX_SG_DESCS)
+
+/*
+ * DMA limitations
+ */
+/* upto 16 bits (64K - 1) */
+#define MMC_MAX_DMA_ROWS (64 * 1024 - 1)
+#define MMC_MAX_DMA_BOX_LENGTH (MMC_MAX_DMA_ROWS * MCI_FIFOSIZE)
+#define MMC_MAX_DMA_CMDS (MAX_NR_SG_DMA_PIO * (MMC_MAX_REQ_SIZE / \
+		MMC_MAX_DMA_BOX_LENGTH))
+
 struct clk;
 
 struct msmsdcc_nc_dmadata {
-	dmov_box	cmd[NR_SG];
+	dmov_box	cmd[MMC_MAX_DMA_CMDS];
 	uint32_t	cmdptr;
 };