msm: kgsl: On pagefault stall GPU so that it hangs

Prevent the iommu fault handler from clearing the stalled status
of the GPU on a page fault. This will cause a GPU hang and produce
a snapshot that will help in fault analysis.
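The mechanism, roughly: the fault handler records the fault and
returns -EBUSY instead of 0, so the IOMMU leaves the faulting
transaction stalled until the RESUME register is written in
kgsl_iommu_stop(). A minimal sketch of the idea (the handler name,
its exact signature and the lookup helper below are illustrative
only, not the driver's code):

	static int example_pagefault_handler(struct iommu_domain *domain,
					     struct device *dev,
					     unsigned long addr, int flags)
	{
		/* hypothetical lookup of the kgsl_mmu that owns this device */
		struct kgsl_mmu *mmu = example_get_mmu(dev);

		/* remember the fault so recovery skips replaying bad commands */
		mmu->fault = 1;

		/*
		 * Returning -EBUSY leaves the transaction stalled; the GPU
		 * hangs and a snapshot can be taken. kgsl_iommu_stop()
		 * later writes RESUME to release the stall.
		 */
		return -EBUSY;
	}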

Change-Id: I9dcab83a098a988f86a0c03c46b0dbe6624de937
Signed-off-by: Shubhraprakash Das <sadas@codeaurora.org>
Signed-off-by: Rajeev Kulkarni <krajeev@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index f394422..6a8bde8 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -1464,6 +1464,7 @@
 		ret = -ENOMEM;
 		goto done;
 	}
+	rec_data->fault = device->mmu.fault;
 
 done:
 	if (ret) {
@@ -1526,9 +1527,9 @@
 			goto done;
 	}
 
-	/* Do not try the bad caommands if recovery has failed bad commands
-	 * once already */
-	if (!try_bad_commands)
+	/* Do not try the bad commands if recovery has failed bad commands
+	 * once already or if hang is due to a fault */
+	if (!try_bad_commands || rec_data->fault)
 		rec_data->bad_rb_size = 0;
 
 	if (rec_data->bad_rb_size) {
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 66402fe..75c52e2 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -138,6 +138,7 @@
  * bad_rb_size - Number of valid dwords in bad_rb_buffer
  * @last_valid_ctx_id - The last context from which commands were placed in
  * ringbuffer before the GPU hung
+ * @fault - Indicates whether the hang was caused by a pagefault
  */
 struct adreno_recovery_data {
 	unsigned int ib1;
@@ -148,6 +149,7 @@
 	unsigned int *bad_rb_buffer;
 	unsigned int bad_rb_size;
 	unsigned int last_valid_ctx_id;
+	int fault;
 };
 
 extern struct adreno_gpudev adreno_a2xx_gpudev;
diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c
index 87e8746..5eabc4d 100644
--- a/drivers/gpu/msm/kgsl_iommu.c
+++ b/drivers/gpu/msm/kgsl_iommu.c
@@ -34,6 +34,7 @@
 	{ 0x14, 0x0003FFFF, 14 },		/* TTBR1 */
 	{ 0x20, 0, 0 },				/* FSR */
 	{ 0x800, 0, 0 },			/* TLBIALL */
+	{ 0x820, 0, 0 },			/* RESUME */
 };
 
 static struct kgsl_iommu_register_list kgsl_iommuv2_reg[KGSL_IOMMU_REG_MAX] = {
@@ -41,7 +42,8 @@
 	{ 0x20, 0x00FFFFFF, 14 },		/* TTBR0 */
 	{ 0x28, 0x00FFFFFF, 14 },		/* TTBR1 */
 	{ 0x58, 0, 0 },				/* FSR */
-	{ 0x618, 0, 0 }				/* TLBIALL */
+	{ 0x618, 0, 0 },			/* TLBIALL */
+	{ 0x008, 0, 0 }				/* RESUME */
 };
 
 static int get_iommu_unit(struct device *dev, struct kgsl_mmu **mmu_out,
@@ -124,9 +126,19 @@
 	KGSL_MEM_CRIT(iommu_dev->kgsldev, "context = %d FSR = %X\n",
 		iommu_dev->ctx_id, fsr);
 
+	mmu->fault = 1;
+	iommu_dev->fault = 1;
+
 	trace_kgsl_mmu_pagefault(iommu_dev->kgsldev, addr,
 			kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase), 0);
 
+	/*
+	 * We do not want the h/w to resume fetching data from an iommu
+	 * unit that has faulted; stalling the GPU and triggering a
+	 * snapshot is better for debugging. To stall the transaction,
+	 * return -EBUSY.
+	 */
+	ret = -EBUSY;
 done:
 	return ret;
 }
@@ -859,12 +871,13 @@
 	 */
 	for (i = 0; i < iommu->unit_count; i++) {
 		struct kgsl_iommu_unit *iommu_unit = &iommu->iommu_units[i];
-		for (j = 0; j < iommu_unit->dev_count; j++)
+		for (j = 0; j < iommu_unit->dev_count; j++) {
 			iommu_unit->dev[j].pt_lsb = KGSL_IOMMMU_PT_LSB(iommu,
 						KGSL_IOMMU_GET_CTX_REG(iommu,
 						iommu_unit,
 						iommu_unit->dev[j].ctx_id,
 						TTBR0));
+		}
 	}
 
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
@@ -945,6 +958,7 @@
 static void kgsl_iommu_stop(struct kgsl_mmu *mmu)
 {
 	struct kgsl_iommu *iommu = mmu->priv;
+	int i, j;
 	/*
 	 *  stop device mmu
 	 *
@@ -957,8 +971,25 @@
 		mmu->hwpagetable = NULL;
 
 		mmu->flags &= ~KGSL_FLAGS_STARTED;
-	}
 
+		if (mmu->fault) {
+			for (i = 0; i < iommu->unit_count; i++) {
+				struct kgsl_iommu_unit *iommu_unit =
+					&iommu->iommu_units[i];
+				for (j = 0; j < iommu_unit->dev_count; j++) {
+					if (iommu_unit->dev[j].fault) {
+						kgsl_iommu_enable_clk(mmu, j);
+						KGSL_IOMMU_SET_CTX_REG(iommu,
+						iommu_unit,
+						iommu_unit->dev[j].ctx_id,
+						RESUME, 1);
+						iommu_unit->dev[j].fault = 0;
+					}
+				}
+			}
+			mmu->fault = 0;
+		}
+	}
 	/* switch off MMU clocks and cancel any events it has queued */
 	iommu->clk_event_queued = false;
 	kgsl_cancel_events(mmu->device, mmu);
diff --git a/drivers/gpu/msm/kgsl_iommu.h b/drivers/gpu/msm/kgsl_iommu.h
index eafba7b..661b4f0 100644
--- a/drivers/gpu/msm/kgsl_iommu.h
+++ b/drivers/gpu/msm/kgsl_iommu.h
@@ -25,6 +25,7 @@
 	KGSL_IOMMU_CTX_TTBR1,
 	KGSL_IOMMU_CTX_FSR,
 	KGSL_IOMMU_CTX_TLBIALL,
+	KGSL_IOMMU_CTX_RESUME,
 	KGSL_IOMMU_REG_MAX
 };
 
@@ -77,6 +78,8 @@
  * @ctx_id: This iommu units context id. It can be either 0 or 1
  * @clk_enabled: If set indicates that iommu clocks of this iommu context
  * are on, else the clocks are off
+ * @fault: Flag that, when set, indicates that this iommu device has caused
+ * a page fault
  */
 struct kgsl_iommu_device {
 	struct device *dev;
@@ -85,6 +88,7 @@
 	enum kgsl_iommu_context_id ctx_id;
 	bool clk_enabled;
 	struct kgsl_device *kgsldev;
+	int fault;
 };
 
 /*
diff --git a/drivers/gpu/msm/kgsl_mmu.h b/drivers/gpu/msm/kgsl_mmu.h
index 234629b..b8b9149 100644
--- a/drivers/gpu/msm/kgsl_mmu.h
+++ b/drivers/gpu/msm/kgsl_mmu.h
@@ -175,6 +175,7 @@
 	struct kgsl_pagetable  *hwpagetable;
 	const struct kgsl_mmu_ops *mmu_ops;
 	void *priv;
+	int fault;
 };
 
 #include "kgsl_gpummu.h"