msm: kgsl: Detect long running IBs When expired global timestamp is not progressing, check if an IB is talking more than 2 seconds with no update in GPU read pointer, IB1 base IB1 size, IB2 base, IB2 size and detect it as a long running IB. Invalidate the context of the IB and do not attempt fault tolerance on this context. Change-Id: I7e9780f5aed73a5c8807069aa506bec16a1576b4 Signed-off-by: Tarun Karra <tkarra@codeaurora.org>

commit: 696f89e62e65feb45195bcb86fff02f5643edff1 [log] [tgz]
author: Tarun Karra <tkarra@codeaurora.org> Sun Jan 27 21:31:40 2013 -0800
committer: Rajeev Kulkarni <krajeev@codeaurora.org> Thu Feb 14 12:58:52 2013 -0800
tree: d96ba92724fe933ab9c37e5ab294387f496346be
parent: 5bea9c675a87f3d35247f7e2fe3a999606207409 [diff]
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 893a6ed..678a1b3 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c

@@ -119,14 +119,21 @@
  * If the values of these registers are same after
  * KGSL_TIMEOUT_PART time, GPU hang is reported in
  * kernel log.
+ * *****ALERT******ALERT********ALERT*************
+ * Order of registers below is important, registers
+ * from LONG_IB_DETECT_REG_INDEX_START to
+ * LONG_IB_DETECT_REG_INDEX_END are used in long ib detection.
  */
-unsigned int hang_detect_regs[] = {
+#define LONG_IB_DETECT_REG_INDEX_START 1
+#define LONG_IB_DETECT_REG_INDEX_END 5
+
+unsigned int ft_detect_regs[] = {
 	A3XX_RBBM_STATUS,
-	REG_CP_RB_RPTR,
+	REG_CP_RB_RPTR,   /* LONG_IB_DETECT_REG_INDEX_START */
 	REG_CP_IB1_BASE,
 	REG_CP_IB1_BUFSZ,
 	REG_CP_IB2_BASE,
-	REG_CP_IB2_BUFSZ,
+	REG_CP_IB2_BUFSZ, /* LONG_IB_DETECT_REG_INDEX_END */
 	0,
 	0,
 	0,
@@ -135,7 +142,7 @@
 	0
 };
 
-const unsigned int hang_detect_regs_count = ARRAY_SIZE(hang_detect_regs);
+const unsigned int ft_detect_regs_count = ARRAY_SIZE(ft_detect_regs);
 
 /*
  * This is the master list of all GPU cores that are supported by this
@@ -1254,16 +1261,16 @@
 
 	/* Assign correct RBBM status register to hang detect regs
 	 */
-	hang_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
+	ft_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
 
 	/* Add A3XX specific registers for hang detection */
 	if (adreno_is_a3xx(adreno_dev)) {
-		hang_detect_regs[6] = A3XX_RBBM_PERFCTR_SP_7_LO;
-		hang_detect_regs[7] = A3XX_RBBM_PERFCTR_SP_7_HI;
-		hang_detect_regs[8] = A3XX_RBBM_PERFCTR_SP_6_LO;
-		hang_detect_regs[9] = A3XX_RBBM_PERFCTR_SP_6_HI;
-		hang_detect_regs[10] = A3XX_RBBM_PERFCTR_SP_5_LO;
-		hang_detect_regs[11] = A3XX_RBBM_PERFCTR_SP_5_HI;
+		ft_detect_regs[6] = A3XX_RBBM_PERFCTR_SP_7_LO;
+		ft_detect_regs[7] = A3XX_RBBM_PERFCTR_SP_7_HI;
+		ft_detect_regs[8] = A3XX_RBBM_PERFCTR_SP_6_LO;
+		ft_detect_regs[9] = A3XX_RBBM_PERFCTR_SP_6_HI;
+		ft_detect_regs[10] = A3XX_RBBM_PERFCTR_SP_5_LO;
+		ft_detect_regs[11] = A3XX_RBBM_PERFCTR_SP_5_HI;
 	}
 
 	status = kgsl_mmu_start(device);
@@ -1639,9 +1646,35 @@
 }
 
 static int
+_adreno_check_long_ib(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	unsigned int curr_global_ts = 0;
+
+	/* check if the global ts is still the same */
+	kgsl_sharedmem_readl(&device->memstore,
+			&curr_global_ts,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+			eoptimestamp));
+
+	/* Mark long ib as handled */
+	adreno_dev->long_ib = 0;
+
+	if (curr_global_ts == adreno_dev->long_ib_ts) {
+		KGSL_FT_ERR(device,
+			"IB ran too long, invalidate ctxt\n");
+		return 1;
+	} else {
+		/* Do nothing GPU has gone ahead */
+		KGSL_FT_INFO(device, "false long ib detection return\n");
+		return 0;
+	}
+}
+
+static int
 _adreno_ft_restart_device(struct kgsl_device *device,
-					   struct kgsl_context *context,
-					   struct adreno_ft_data *ft_data)
+		   struct kgsl_context *context,
+		   struct adreno_ft_data *ft_data)
 {
 
 	struct adreno_context *adreno_context = context->devctxt;
@@ -1672,6 +1705,37 @@
 	return 0;
 }
 
+static inline void
+_adreno_debug_ft_info(struct kgsl_device *device,
+			struct adreno_ft_data *ft_data)
+{
+
+	/*
+	 * Dumping rb is a very useful tool to debug FT.
+	 * It will tell us if we are extracting the rb correctly
+	 * NOP'ing the right IB, skipping the EOF correctly etc.
+	 */
+	if (device->ft_log >= 7)  {
+
+		/* Print fault tolerance data here */
+		KGSL_FT_INFO(device, "Temp RB buffer size 0x%X\n",
+			ft_data->rb_size);
+		adreno_dump_rb(device, ft_data->rb_buffer,
+			ft_data->rb_size<<2, 0, ft_data->rb_size);
+
+		KGSL_FT_INFO(device, "Bad RB buffer size 0x%X\n",
+			ft_data->bad_rb_size);
+		adreno_dump_rb(device, ft_data->bad_rb_buffer,
+			ft_data->bad_rb_size<<2, 0, ft_data->bad_rb_size);
+
+		KGSL_FT_INFO(device, "Good RB buffer size 0x%X\n",
+			ft_data->good_rb_size);
+		adreno_dump_rb(device, ft_data->good_rb_buffer,
+			ft_data->good_rb_size<<2, 0, ft_data->good_rb_size);
+
+	}
+}
+
 static int
 _adreno_ft_resubmit_rb(struct kgsl_device *device,
 			struct adreno_ringbuffer *rb,
@@ -1681,6 +1745,8 @@
 {
 	unsigned int ret = 0;
 
+	_adreno_debug_ft_info(device, ft_data);
+
 	if (_adreno_ft_restart_device(device, context, ft_data))
 		return 1;
 
@@ -1727,10 +1793,21 @@
 	 * hang */
 	adreno_ringbuffer_extract(rb, ft_data);
 
+	/* Check if we detected a long running IB,
+	 * if true do not attempt replay of bad cmds */
+	if (adreno_dev->long_ib) {
+		if (_adreno_check_long_ib(device)) {
+			_adreno_debug_ft_info(device, ft_data);
+			goto play_good_cmds;
+		} else {
+			adreno_context->flags &= ~CTXT_FLAGS_GPU_HANG;
+			return 0;
+		}
+	}
+
 	/* Do not try the bad commands if  hang is due to a fault */
 	if (ft_data->fault) {
 		KGSL_FT_ERR(device, "Page fault no FT for bad context\n");
-
 		goto play_good_cmds;
 	}
 
@@ -1922,6 +1999,7 @@
 {
 	int result = -ETIMEDOUT;
 	struct adreno_ft_data ft_data;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
 	if (device->state == KGSL_STATE_HUNG)
 		goto done;
@@ -1938,22 +2016,32 @@
 
 		/* Get the fault tolerance data as soon as hang is detected */
 		result = adreno_setup_ft_data(device, &ft_data);
-		/*
-		 * Trigger an automatic dump of the state to
-		 * the console
-		 */
-		kgsl_postmortem_dump(device, 0);
 
 		/*
-		 * Make a GPU snapshot.  For now, do it after the PM dump so we
-		 * can at least be sure the PM dump will work as it always has
+		 * If long ib is detected, do not attempt postmortem or
+		 * snapshot, if GPU is still executing commands
+		 * we will get errors
 		 */
-		kgsl_device_snapshot(device, 1);
+		if (!adreno_dev->long_ib) {
+			/*
+			 * Trigger an automatic dump of the state to
+			 * the console
+			 */
+			kgsl_postmortem_dump(device, 0);
+
+			/*
+			* Make a GPU snapshot.  For now, do it after the
+			* PM dump so we can at least be sure the PM dump
+			* will work as it always has
+			*/
+			kgsl_device_snapshot(device, 1);
+		}
 
 		if (!result) {
 			result = adreno_ft(device, &ft_data);
 			adreno_destroy_ft_data(&ft_data);
 		}
+
 		if (result) {
 			kgsl_pwrctrl_set_state(device, KGSL_STATE_HUNG);
 		} else {
@@ -2132,7 +2220,7 @@
 	do {
 		if (time_after(jiffies, wait)) {
 			/* Check to see if the core is hung */
-			if (adreno_hang_detect(device, regs))
+			if (adreno_ft_detect(device, regs))
 				return -ETIMEDOUT;
 
 			wait = jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART);
@@ -2156,7 +2244,7 @@
 	unsigned int rbbm_status;
 	unsigned long wait_time;
 	unsigned long wait_time_part;
-	unsigned int prev_reg_val[hang_detect_regs_count];
+	unsigned int prev_reg_val[ft_detect_regs_count];
 
 	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
@@ -2189,7 +2277,7 @@
 		if (time_after(jiffies, wait_time_part)) {
 				wait_time_part = jiffies +
 					msecs_to_jiffies(KGSL_TIMEOUT_PART);
-				if ((adreno_hang_detect(device, prev_reg_val)))
+				if ((adreno_ft_detect(device, prev_reg_val)))
 					goto err;
 		}
 
@@ -2535,17 +2623,26 @@
 
 
 
-unsigned int adreno_hang_detect(struct kgsl_device *device,
+unsigned int adreno_ft_detect(struct kgsl_device *device,
 						unsigned int *prev_reg_val)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int curr_reg_val[hang_detect_regs_count];
-	unsigned int hang_detected = 1;
+	unsigned int curr_reg_val[ft_detect_regs_count];
+	unsigned int fast_hang_detected = 1;
+	unsigned int long_ib_detected = 1;
 	unsigned int i;
 	static unsigned long next_hang_detect_time;
+	static unsigned int prev_global_ts;
+	unsigned int curr_global_ts = 0;
+	unsigned int curr_context_id = 0;
+	static struct adreno_context *curr_context;
+	static struct kgsl_context *context;
 
 	if (!adreno_dev->fast_hang_detect)
-		return 0;
+		fast_hang_detected = 0;
+
+	if (!adreno_dev->long_ib_detect)
+		long_ib_detected = 0;
 
 	if (is_adreno_rbbm_status_idle(device)) {
 
@@ -2578,20 +2675,123 @@
 		next_hang_detect_time = (jiffies +
 			msecs_to_jiffies(KGSL_TIMEOUT_PART-1));
 
-	for (i = 0; i < hang_detect_regs_count; i++) {
-
-		if (hang_detect_regs[i] == 0)
+	/* Read the current Hang detect reg values here */
+	for (i = 0; i < ft_detect_regs_count; i++) {
+		if (ft_detect_regs[i] == 0)
 			continue;
-
-		adreno_regread(device, hang_detect_regs[i],
-					   &curr_reg_val[i]);
-		if (curr_reg_val[i] != prev_reg_val[i]) {
-			prev_reg_val[i] = curr_reg_val[i];
-			hang_detected = 0;
-		}
+		adreno_regread(device, ft_detect_regs[i],
+			&curr_reg_val[i]);
 	}
 
-	return hang_detected;
+	/* Read the current global timestamp here */
+	kgsl_sharedmem_readl(&device->memstore,
+			&curr_global_ts,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+			eoptimestamp));
+
+	mb();
+
+	if (curr_global_ts == prev_global_ts) {
+
+		/* Get the current context here */
+		if (context == NULL) {
+			kgsl_sharedmem_readl(&device->memstore,
+				&curr_context_id,
+				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+				current_context));
+			context = idr_find(&device->context_idr,
+				curr_context_id);
+			if (context != NULL) {
+				curr_context = context->devctxt;
+				curr_context->ib_gpu_time_used = 0;
+			} else {
+				KGSL_DRV_ERR(device,
+					"Fault tolerance no context found\n");
+			}
+		}
+
+		mb();
+
+		if (curr_context != NULL) {
+
+			curr_context->ib_gpu_time_used += KGSL_TIMEOUT_PART;
+			KGSL_FT_INFO(device,
+			"Proc %s used GPU Time %d ms on timestamp 0x%X\n",
+			curr_context->pid_name, curr_context->ib_gpu_time_used,
+			curr_global_ts+1);
+
+			for (i = 0; i < ft_detect_regs_count; i++) {
+				if (curr_reg_val[i] != prev_reg_val[i]) {
+					fast_hang_detected = 0;
+
+					/* Check for long IB here */
+					if ((i >=
+						LONG_IB_DETECT_REG_INDEX_START)
+						&&
+						(i <=
+						LONG_IB_DETECT_REG_INDEX_END))
+						long_ib_detected = 0;
+				}
+			}
+
+			if (fast_hang_detected) {
+				KGSL_FT_ERR(device,
+					"Proc %s, ctxt_id %d ts %d triggered fault tolerance"
+					" on global ts %d\n",
+					curr_context->pid_name, curr_context->id
+					, (kgsl_readtimestamp(device, context,
+					KGSL_TIMESTAMP_RETIRED)+1),
+					curr_global_ts+1);
+				return 1;
+			}
+
+			if (long_ib_detected) {
+				curr_context->ib_gpu_time_used +=
+					KGSL_TIMEOUT_PART;
+				if (curr_context->ib_gpu_time_used >
+					KGSL_TIMEOUT_LONG_IB_DETECTION) {
+					if (adreno_dev->long_ib_ts !=
+						curr_global_ts) {
+						KGSL_FT_ERR(device,
+						"Proc %s, ctxt_id %d ts %d"
+						"used GPU for %d ms long ib "
+						"detected on global ts %d\n",
+						curr_context->pid_name,
+						curr_context->id,
+						(kgsl_readtimestamp(device,
+						context,
+						KGSL_TIMESTAMP_RETIRED)+1),
+						curr_context->ib_gpu_time_used,
+						curr_global_ts+1);
+						adreno_dev->long_ib = 1;
+						adreno_dev->long_ib_ts =
+								curr_global_ts;
+						curr_context->ib_gpu_time_used =
+								0;
+						return 1;
+					}
+				}
+			}
+		} else {
+			KGSL_FT_ERR(device,
+				"Last context unknown id:%d\n",
+				curr_context_id);
+		}
+	} else {
+		/* GPU is moving forward */
+		prev_global_ts = curr_global_ts;
+		context = NULL;
+		curr_context = NULL;
+		adreno_dev->long_ib = 0;
+		adreno_dev->long_ib_ts = 0;
+	}
+
+
+	/* If hangs are not detected copy the current reg values
+	 * to previous values and return no hang */
+	for (i = 0; i < ft_detect_regs_count; i++)
+			prev_reg_val[i] = curr_reg_val[i];
+	return 0;
 }
 
 /**
@@ -2620,7 +2820,7 @@
 	adreno_regread(device, REG_CP_RB_RPTR, &rptr);
 	mb();
 
-	KGSL_DRV_ERR(device,
+	KGSL_DRV_WARN(device,
 		     "Device hang detected while waiting for timestamp: "
 		     "<%d:0x%x>, last submitted timestamp: <%d:0x%x>, "
 		     "retired timestamp: <%d:0x%x>, wptr: 0x%x, rptr: 0x%x\n",
@@ -2684,7 +2884,7 @@
 	struct adreno_context *adreno_ctx = context ? context->devctxt : NULL;
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
 	unsigned int context_id = _get_context_id(context);
-	unsigned int prev_reg_val[hang_detect_regs_count];
+	unsigned int prev_reg_val[ft_detect_regs_count];
 	unsigned int time_elapsed = 0;
 	unsigned int wait;
 	int ts_compare = 1;
@@ -2743,7 +2943,7 @@
 		}
 
 		/* Check to see if the GPU is hung */
-		if (adreno_hang_detect(device, prev_reg_val)) {
+		if (adreno_ft_detect(device, prev_reg_val)) {
 			ret = adreno_handle_hang(device, context, timestamp);
 			break;
 		}

diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 23fce2f..417a041 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h

@@ -103,6 +103,9 @@
 	unsigned int ib_check_level;
 	unsigned int fast_hang_detect;
 	unsigned int ft_policy;
+	unsigned int long_ib_detect;
+	unsigned int long_ib;
+	unsigned int long_ib_ts;
 	unsigned int gpulist_index;
 	struct ocmem_buf *ocmem_hdl;
 	unsigned int ocmem_base;
@@ -201,8 +204,8 @@
 extern const unsigned int a330_registers[];
 extern const unsigned int a330_registers_count;
 
-extern unsigned int hang_detect_regs[];
-extern const unsigned int hang_detect_regs_count;
+extern unsigned int ft_detect_regs[];
+extern const unsigned int ft_detect_regs_count;
 
 
 int adreno_idle(struct kgsl_device *device);
@@ -232,7 +235,7 @@
 void adreno_dump_rb(struct kgsl_device *device, const void *buf,
 			 size_t len, int start, int size);
 
-unsigned int adreno_hang_detect(struct kgsl_device *device,
+unsigned int adreno_ft_detect(struct kgsl_device *device,
 						unsigned int *prev_reg_val);
 
 static inline int adreno_is_a200(struct adreno_device *adreno_dev)

diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c
index 1f96f8c..79eb0c3 100644
--- a/drivers/gpu/msm/adreno_debugfs.c
+++ b/drivers/gpu/msm/adreno_debugfs.c

@@ -75,5 +75,8 @@
 	adreno_dev->ft_policy = FT_REPLAY_BAD_CTXT_CMDS;
 	debugfs_create_u32("fault_tolerance_policy", 0644, device->d_debugfs,
 			   &adreno_dev->ft_policy);
-
+	/* By default enable long IB detection */
+	adreno_dev->long_ib_detect = 1;
+	debugfs_create_u32("long_ib_detect", 0644, device->d_debugfs,
+			   &adreno_dev->long_ib_detect);
 }

diff --git a/drivers/gpu/msm/adreno_drawctxt.h b/drivers/gpu/msm/adreno_drawctxt.h
index aa11cb7..1581964 100644
--- a/drivers/gpu/msm/adreno_drawctxt.h
+++ b/drivers/gpu/msm/adreno_drawctxt.h

@@ -89,6 +89,7 @@
 	pid_t pid;
 	char pid_name[TASK_COMM_LEN];
 	unsigned int id;
+	unsigned int ib_gpu_time_used;
 	uint32_t flags;
 	struct kgsl_pagetable *pagetable;
 	struct kgsl_memdesc gpustate;

diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 6fda86d..d3e5dea 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c

@@ -64,7 +64,7 @@
 	unsigned long wait_time;
 	unsigned long wait_timeout = msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
 	unsigned long wait_time_part;
-	unsigned int prev_reg_val[hang_detect_regs_count];
+	unsigned int prev_reg_val[ft_detect_regs_count];
 
 	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
@@ -109,7 +109,7 @@
 		if (time_after(jiffies, wait_time_part)) {
 			wait_time_part = jiffies +
 				msecs_to_jiffies(KGSL_TIMEOUT_PART);
-			if ((adreno_hang_detect(rb->device,
+			if ((adreno_ft_detect(rb->device,
 						prev_reg_val))){
 				KGSL_DRV_ERR(rb->device,
 				"Hang detected while waiting for freespace in"
@@ -982,17 +982,17 @@
 	drawctxt = context->devctxt;
 
 	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG) {
-		KGSL_CTXT_ERR(device, "Context %p failed fault tolerance"
+		KGSL_CTXT_ERR(device, "proc %s failed fault tolerance"
 			" will not accept commands for context %d\n",
-			drawctxt, drawctxt->id);
+			drawctxt->pid_name, drawctxt->id);
 		return -EDEADLK;
 	}
 
 	if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
 		KGSL_CTXT_ERR(device,
-			"Context %p triggered fault tolerance"
+			"proc %s triggered fault tolerance"
 			" skipping commands for context till EOF %d\n",
-			drawctxt, drawctxt->id);
+			drawctxt->pid_name, drawctxt->id);
 		if (flags & KGSL_CMD_FLAGS_EOF)
 			drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
 		numibs = 0;

diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index 805e54b..8787eea 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h

@@ -24,9 +24,10 @@
 #include "kgsl_pwrscale.h"
 #include <linux/sync.h>
 
-#define KGSL_TIMEOUT_NONE       0
-#define KGSL_TIMEOUT_DEFAULT    0xFFFFFFFF
-#define KGSL_TIMEOUT_PART       50 /* 50 msec */
+#define KGSL_TIMEOUT_NONE           0
+#define KGSL_TIMEOUT_DEFAULT        0xFFFFFFFF
+#define KGSL_TIMEOUT_PART           50 /* 50 msec */
+#define KGSL_TIMEOUT_LONG_IB_DETECTION  2000 /* 2 sec*/
 
 #define FIRST_TIMEOUT (HZ / 2)
commit	696f89e62e65feb45195bcb86fff02f5643edff1	[log] [tgz]
author	Tarun Karra <tkarra@codeaurora.org>	Sun Jan 27 21:31:40 2013 -0800
committer	Rajeev Kulkarni <krajeev@codeaurora.org>	Thu Feb 14 12:58:52 2013 -0800
tree	d96ba92724fe933ab9c37e5ab294387f496346be
parent	5bea9c675a87f3d35247f7e2fe3a999606207409 [diff]