msm: kgsl: Recovery policy change

The recovery algorithm is changed to:
step 1: retry the same commands that hung the GPU
step 2: if step 1 fails, nop just the IB that hung the GPU
        and retry
step 3: if step 2 fails, skip commands in the current context
        till end of frame and retry
step 4: if step 3 fails, mark the context as bad and execute
        the remaining commands from good contexts.

Previously we returned -EDEADLK when recovery succeeded, which is
the same error code returned when the context is not recoverable.
With the new policy, if recovery succeeds we return -EAGAIN so that
userspace can treat a recovered context differently from one that
is not recoverable. If recovery fails, we mark the context as bad
and return -EDEADLK.

Change-Id: I9fa3c40801964186866b6002e62f19cf6aa41361
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 5573bdf..6a8bffb 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -711,6 +711,11 @@
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0);
 	}
 
+	if (flags & KGSL_CMD_FLAGS_EOF) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_nop_packet(1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_END_OF_FRAME_IDENTIFIER);
+	}
+
 	adreno_ringbuffer_submit(rb);
 
 	return timestamp;
@@ -977,12 +982,22 @@
 	drawctxt = context->devctxt;
 
 	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG) {
-		KGSL_CTXT_WARN(device, "Context %p caused a gpu hang.."
+		KGSL_CTXT_ERR(device, "Context %p caused a gpu hang.."
 			" will not accept commands for context %d\n",
 			drawctxt, drawctxt->id);
 		return -EDEADLK;
 	}
 
+	if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
+		KGSL_CTXT_ERR(device,
+			"Context %p caused a gpu hang.."
+			" skipping commands for context till EOF %d\n",
+			drawctxt, drawctxt->id);
+		if (flags & KGSL_CMD_FLAGS_EOF)
+			drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
+		numibs = 0;
+	}
+
 	cmds = link = kzalloc(sizeof(unsigned int) * (numibs * 3 + 4),
 				GFP_KERNEL);
 	if (!link) {
@@ -1032,7 +1047,7 @@
 
 	*timestamp = adreno_ringbuffer_addcmds(&adreno_dev->ringbuffer,
 					drawctxt,
-					0,
+					(flags & KGSL_CMD_FLAGS_EOF),
 					&link[0], (cmds - link), *timestamp);
 
 	KGSL_CMD_INFO(device, "ctxt %d g %08x numibs %d ts %d\n",
@@ -1048,13 +1063,13 @@
 	 */
 	adreno_idle(device);
 #endif
+
 	/* If context hung and recovered then return error so that the
 	 * application may handle it */
 	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_RECOVERED)
-		return -EDEADLK;
+		return -EAGAIN;
 	else
 		return 0;
-
 }
 
 static void _turn_preamble_on_for_ib_seq(struct adreno_ringbuffer *rb,
@@ -1106,20 +1121,21 @@
 {
 	struct kgsl_device *device = rb->device;
 	unsigned int rb_rptr = rec_data->start_of_replay_cmds;
+	unsigned int good_rb_idx = 0, bad_rb_idx = 0, temp_rb_idx = 0;
+	unsigned int last_good_cmd_end_idx = 0, last_bad_cmd_end_idx = 0;
+	unsigned int cmd_start_idx = 0;
+	unsigned int val1 = 0;
+	int copy_rb_contents = 0;
+	unsigned int temp_rb_rptr;
+	struct kgsl_context *k_ctxt;
+	struct adreno_context *a_ctxt;
+	unsigned int size = rb->buffer_desc.size;
 	unsigned int *temp_rb_buffer = rec_data->rb_buffer;
 	int *rb_size = &rec_data->rb_size;
 	unsigned int *bad_rb_buffer = rec_data->bad_rb_buffer;
 	int *bad_rb_size = &rec_data->bad_rb_size;
-
-	unsigned int good_rb_idx = 0, cmd_start_idx = 0;
-	unsigned int val1 = 0;
-	struct kgsl_context *k_ctxt;
-	struct adreno_context *a_ctxt;
-	unsigned int bad_rb_idx = 0;
-	int copy_rb_contents = 0;
-	unsigned int temp_rb_rptr;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int good_cmd_start_idx = 0;
+	unsigned int *good_rb_buffer = rec_data->good_rb_buffer;
+	int *good_rb_size = &rec_data->good_rb_size;
 
 	/*
 	 * If the start index from where commands need to be copied is invalid
@@ -1144,9 +1160,11 @@
 		if (KGSL_CMD_IDENTIFIER == val1) {
 			/* Start is the NOP dword that comes before
 			 * KGSL_CMD_IDENTIFIER */
-			cmd_start_idx = bad_rb_idx - 1;
-			if (copy_rb_contents)
-				good_cmd_start_idx = good_rb_idx - 1;
+			cmd_start_idx = temp_rb_idx - 1;
+			if ((copy_rb_contents) && (good_rb_idx))
+				last_good_cmd_end_idx = good_rb_idx - 1;
+			if ((!copy_rb_contents) && (bad_rb_idx))
+				last_bad_cmd_end_idx = bad_rb_idx - 1;
 		}
 
 		/* check for context switch indicator */
@@ -1172,33 +1190,48 @@
 				!(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) ||
 				!k_ctxt)) {
 				for (temp_idx = cmd_start_idx;
-					temp_idx < bad_rb_idx;
+					temp_idx < temp_rb_idx;
 					temp_idx++)
-					temp_rb_buffer[good_rb_idx++] =
-						bad_rb_buffer[temp_idx];
+					good_rb_buffer[good_rb_idx++] =
+						temp_rb_buffer[temp_idx];
 				rec_data->last_valid_ctx_id = val2;
 				copy_rb_contents = 1;
+				/* remove the good commands from bad buffer */
+				bad_rb_idx = last_bad_cmd_end_idx;
 			} else if (copy_rb_contents && k_ctxt &&
 				(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) {
-				/* If we are changing to bad context then remove
-				 * the dwords we copied for this sequence from
-				 * the good buffer */
-				good_rb_idx = good_cmd_start_idx;
+
+				/* If we are changing back to a bad context
+				 * from good ctxt and were not copying commands
+				 * to bad ctxt then copy over commands to
+				 * the bad context */
+				for (temp_idx = cmd_start_idx;
+					temp_idx < temp_rb_idx;
+					temp_idx++)
+					bad_rb_buffer[bad_rb_idx++] =
+						temp_rb_buffer[temp_idx];
+				/* If we are changing to bad context then
+				 * remove the dwords we copied for this
+				 * sequence from the good buffer */
+				good_rb_idx = last_good_cmd_end_idx;
 				copy_rb_contents = 0;
 			}
 			}
 		}
 
 		if (copy_rb_contents)
-			temp_rb_buffer[good_rb_idx++] = val1;
-		/* Copy both good and bad commands for replay to the bad
-		 * buffer */
-		bad_rb_buffer[bad_rb_idx++] = val1;
+			good_rb_buffer[good_rb_idx++] = val1;
+		else
+			bad_rb_buffer[bad_rb_idx++] = val1;
+
+		/* Copy both good and bad commands to temp buffer */
+		temp_rb_buffer[temp_rb_idx++] = val1;
 
 		rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, size);
 	}
-	*rb_size = good_rb_idx;
+	*good_rb_size = good_rb_idx;
 	*bad_rb_size = bad_rb_idx;
+	*rb_size = temp_rb_idx;
 }
 
 void