msm: kgsl: Faster GPU hang detection

Check global GPU status registers in intervals of 2 seconds,
if the registers did not change after 2 seconds trigger a hang.
When there is an actual GPU hang and GPU is stalled, this reduces
the current hang detection time from 10 seconds to 2 seconds.
Faster GPU hang detection reduces the overall time to recover
from GPU hang.

Change-Id: If432ccacc1b77b4fc7f08b756886bd3ae5edf04f
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index ab47d40..bc687f5 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -111,6 +111,21 @@
 	.ib_check_level = 0,
 };
 
+/* This set of registers are used for Hang detection
+ * If the values of these registers are same after
+ * KGSL_TIMEOUT_PART time, GPU hang is reported in
+ * kernel log.
+ */
+unsigned int hang_detect_regs[] = {
+	A3XX_RBBM_STATUS,
+	REG_CP_RB_RPTR,
+	REG_CP_IB1_BASE,
+	REG_CP_IB1_BUFSZ,
+	REG_CP_IB2_BASE,
+	REG_CP_IB2_BUFSZ,
+};
+
+const unsigned int hang_detect_regs_count = ARRAY_SIZE(hang_detect_regs);
 
 /*
  * This is the master list of all GPU cores that are supported by this
@@ -748,6 +763,10 @@
 		kgsl_mh_start(device);
 	}
 
+	/* Assign correct RBBM status register to hang detect regs
+	 */
+	hang_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
+
 	status = kgsl_mmu_start(device);
 	if (status)
 		goto error_clk_off;
@@ -1114,7 +1133,10 @@
 	unsigned long wait_time_part;
 	unsigned int msecs;
 	unsigned int msecs_first;
-	unsigned int msecs_part;
+	unsigned int msecs_part = KGSL_TIMEOUT_PART;
+	unsigned int prev_reg_val[hang_detect_regs_count];
+
+	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
 	kgsl_cffdump_regpoll(device->id,
 		adreno_dev->gpudev->reg_rbbm_status << 2,
@@ -1126,7 +1148,6 @@
 	if (rb->flags & KGSL_FLAGS_STARTED) {
 		msecs = adreno_dev->wait_timeout;
 		msecs_first = (msecs <= 100) ? ((msecs + 4) / 5) : 100;
-		msecs_part = (msecs - msecs_first + 3) / 4;
 		wait_time = jiffies + wait_timeout;
 		wait_time_part = jiffies + msecs_to_jiffies(msecs_first);
 		adreno_poke(device);
@@ -1135,6 +1156,8 @@
 				adreno_poke(device);
 				wait_time_part = jiffies +
 					msecs_to_jiffies(msecs_part);
+				if ((adreno_hang_detect(device, prev_reg_val)))
+					goto err;
 			}
 			GSL_RB_GET_READPTR(rb, &rb->rptr);
 			if (time_after(jiffies, wait_time)) {
@@ -1147,6 +1170,7 @@
 
 	/* now, wait for the GPU to finish its operations */
 	wait_time = jiffies + wait_timeout;
+	wait_time_part = jiffies + msecs_to_jiffies(msecs_part);
 	while (time_before(jiffies, wait_time)) {
 		adreno_regread(device, adreno_dev->gpudev->reg_rbbm_status,
 			&rbbm_status);
@@ -1157,6 +1181,16 @@
 			if (!(rbbm_status & 0x80000000))
 				return 0;
 		}
+
+		/* Dont wait for timeout, detect hang faster.
+		 */
+		if (time_after(jiffies, wait_time_part)) {
+				wait_time_part = jiffies +
+					msecs_to_jiffies(msecs_part);
+				if ((adreno_hang_detect(device, prev_reg_val)))
+					goto err;
+		}
+
 	}
 
 err:
@@ -1422,6 +1456,32 @@
 	__ret;								\
 })
 
+
+
+unsigned int adreno_hang_detect(struct kgsl_device *device,
+						unsigned int *prev_reg_val)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	unsigned int curr_reg_val[hang_detect_regs_count];
+	unsigned int hang_detected = 1;
+	unsigned int i;
+
+	if (!adreno_dev->fast_hang_detect)
+		return 0;
+
+	for (i = 0; i < hang_detect_regs_count; i++) {
+		adreno_regread(device, hang_detect_regs[i],
+					   &curr_reg_val[i]);
+		if (curr_reg_val[i] != prev_reg_val[i]) {
+			prev_reg_val[i] = curr_reg_val[i];
+			hang_detected = 0;
+		}
+	}
+
+	return hang_detected;
+}
+
+
 /* MUST be called with the device mutex held */
 static int adreno_waittimestamp(struct kgsl_device *device,
 				struct kgsl_context *context,
@@ -1433,16 +1493,20 @@
 	static uint io_cnt;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-	int retries;
+	int retries = 0;
 	unsigned int msecs_first;
-	unsigned int msecs_part;
+	unsigned int msecs_part = KGSL_TIMEOUT_PART;
 	unsigned int ts_issued;
 	unsigned int context_id = _get_context_id(context);
+	unsigned int time_elapsed = 0;
+	unsigned int prev_reg_val[hang_detect_regs_count];
+
+	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
 	ts_issued = adreno_dev->ringbuffer.timestamp[context_id];
 
 	/* Don't wait forever, set a max value for now */
-	if (msecs == -1)
+	if (msecs == KGSL_TIMEOUT_DEFAULT)
 		msecs = adreno_dev->wait_timeout;
 
 	if (timestamp_cmp(timestamp, ts_issued) > 0) {
@@ -1458,8 +1522,7 @@
 	 * been updated properly.
 	 */
 	msecs_first = (msecs <= 100) ? ((msecs + 4) / 5) : 100;
-	msecs_part = (msecs - msecs_first + 3) / 4;
-	for (retries = 0; retries < 5; retries++) {
+	do {
 		/*
 		 * If the context ID is invalid, we are in a race with
 		 * the context being destroyed by userspace so bail.
@@ -1484,6 +1547,11 @@
 		if (io_cnt <
 		    pwr->pwrlevels[pwr->active_pwrlevel].io_fraction)
 			io = 0;
+
+		if ((retries > 0) &&
+			(adreno_hang_detect(device, prev_reg_val)))
+			goto hang_dump;
+
 		mutex_unlock(&device->mutex);
 		/* We need to make sure that the process is
 		 * placed in wait-q before its condition is called
@@ -1505,7 +1573,14 @@
 			goto done;
 		}
 		/*this wait timed out*/
-	}
+
+		time_elapsed = time_elapsed +
+				(retries ? msecs_part : msecs_first);
+		retries++;
+
+	} while (time_elapsed < msecs);
+
+hang_dump:
 	status = -ETIMEDOUT;
 	KGSL_DRV_ERR(device,
 		     "Device hang detected while waiting for timestamp: "
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 88ed895..e1d1eb9 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -82,6 +82,7 @@
 	unsigned int pix_shader_start;
 	unsigned int instruction_size;
 	unsigned int ib_check_level;
+	unsigned int fast_hang_detect;
 };
 
 struct adreno_gpudev {
@@ -124,6 +125,10 @@
 extern const unsigned int a3xx_registers[];
 extern const unsigned int a3xx_registers_count;
 
+extern unsigned int hang_detect_regs[];
+extern const unsigned int hang_detect_regs_count;
+
+
 int adreno_idle(struct kgsl_device *device, unsigned int timeout);
 void adreno_regread(struct kgsl_device *device, unsigned int offsetwords,
 				unsigned int *value);
@@ -146,6 +151,9 @@
 
 int adreno_dump_and_recover(struct kgsl_device *device);
 
+unsigned int adreno_hang_detect(struct kgsl_device *device,
+						unsigned int *prev_reg_val);
+
 static inline int adreno_is_a200(struct adreno_device *adreno_dev)
 {
 	return (adreno_dev->gpurev == ADRENO_REV_A200);
diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c
index 822cf14..e3c9a18 100644
--- a/drivers/gpu/msm/adreno_debugfs.c
+++ b/drivers/gpu/msm/adreno_debugfs.c
@@ -117,6 +117,11 @@
 	debugfs_create_u32("ib_check", 0644, device->d_debugfs,
 			   &adreno_dev->ib_check_level);
 
+	/* By Default enable fast hang detection */
+	adreno_dev->fast_hang_detect = 1;
+	debugfs_create_u32("fast_hang_detect", 0644, device->d_debugfs,
+			   &adreno_dev->fast_hang_detect);
+
 	/* Create post mortem control files */
 
 	pm_d_debugfs = debugfs_create_dir("postmortem", device->d_debugfs);
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 73b4513..afcceee 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -56,6 +56,11 @@
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(rb->device);
 	unsigned long wait_timeout = msecs_to_jiffies(adreno_dev->wait_timeout);
 	unsigned long wait_time;
+	unsigned long wait_time_part;
+	unsigned int msecs_part = KGSL_TIMEOUT_PART;
+	unsigned int prev_reg_val[hang_detect_regs_count];
+
+	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
 	/* if wptr ahead, fill the remaining with NOPs */
 	if (wptr_ahead) {
@@ -83,6 +88,7 @@
 	}
 
 	wait_time = jiffies + wait_timeout;
+	wait_time_part = jiffies + msecs_to_jiffies(msecs_part);
 	/* wait for space in ringbuffer */
 	while (1) {
 		GSL_RB_GET_READPTR(rb, &rb->rptr);
@@ -92,16 +98,34 @@
 		if (freecmds == 0 || freecmds > numcmds)
 			break;
 
+		/* Dont wait for timeout, detect hang faster.
+		 */
+		if (time_after(jiffies, wait_time_part)) {
+			wait_time_part = jiffies +
+				msecs_to_jiffies(msecs_part);
+			if ((adreno_hang_detect(rb->device,
+						prev_reg_val))){
+				KGSL_DRV_ERR(rb->device,
+				"Hang detected while waiting for freespace in"
+				"ringbuffer rptr: 0x%x, wptr: 0x%x\n",
+				rb->rptr, rb->wptr);
+				goto err;
+			}
+		}
+
 		if (time_after(jiffies, wait_time)) {
 			KGSL_DRV_ERR(rb->device,
 			"Timed out while waiting for freespace in ringbuffer "
 			"rptr: 0x%x, wptr: 0x%x\n", rb->rptr, rb->wptr);
-			if (!adreno_dump_and_recover(rb->device))
+			goto err;
+		}
+
+err:
+		if (!adreno_dump_and_recover(rb->device))
 				wait_time = jiffies + wait_timeout;
 			else
 				/* GPU is hung and we cannot recover */
 				BUG();
-		}
 	}
 }
 
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index d464cf1..75b688b 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -25,6 +25,7 @@
 
 #define KGSL_TIMEOUT_NONE       0
 #define KGSL_TIMEOUT_DEFAULT    0xFFFFFFFF
+#define KGSL_TIMEOUT_PART       2000 /* 2 sec */
 
 #define FIRST_TIMEOUT (HZ / 2)