msm: kgsl: Recovery policy change

The recovery algorithm is changed to:
step 1: retry the same commands that hung the GPU
step 2: if step 1 fails, nop just the IB that hung the GPU
        and retry
step 3: if step 2 fails, skip commands in the current context
        till end of frame and retry
step 4: if step 3 fails, mark the context as bad and execute
        the remaining commands from good contexts.

Previously we returned -EDEADLK when recovery succeeded, which is
the same error code returned when the context is not recoverable.
With the new policy, if recovery succeeds we return -EAGAIN so that
userspace can treat a recovered context differently from one that
is not recoverable. If recovery fails, we mark the context as bad
and return -EDEADLK.

Change-Id: I9fa3c40801964186866b6002e62f19cf6aa41361
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 514c86c..03c82bf 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -34,12 +34,15 @@
 #define KGSL_CMD_FLAGS_NONE             0x00000000
 #define KGSL_CMD_FLAGS_PMODE		0x00000001
 #define KGSL_CMD_FLAGS_INTERNAL_ISSUE	0x00000002
+#define KGSL_CMD_FLAGS_EOF	        0x00000100
 
 /* Command identifiers */
 #define KGSL_CONTEXT_TO_MEM_IDENTIFIER	0x2EADBEEF
 #define KGSL_CMD_IDENTIFIER		0x2EEDFACE
 #define KGSL_START_OF_IB_IDENTIFIER	0x2EADEABE
 #define KGSL_END_OF_IB_IDENTIFIER	0x2ABEDEAD
+#define KGSL_END_OF_FRAME_IDENTIFIER	0x2E0F2E0F
+#define KGSL_NOP_IB_IDENTIFIER	        0x20F20F20
 
 #ifdef CONFIG_MSM_SCM
 #define ADRENO_DEFAULT_PWRSCALE_POLICY  (&kgsl_pwrscale_policy_tz)
@@ -99,6 +102,7 @@
 	unsigned int instruction_size;
 	unsigned int ib_check_level;
 	unsigned int fast_hang_detect;
+	unsigned int ft_policy;
 	unsigned int gpulist_index;
 	struct ocmem_buf *ocmem_hdl;
 	unsigned int ocmem_base;
@@ -139,8 +143,12 @@
  * @rb_size - Number of valid dwords in rb_buffer
  * @bad_rb_buffer - Buffer that holds commands from the hanging context
  * bad_rb_size - Number of valid dwords in bad_rb_buffer
+ * @good_rb_buffer - Buffer that holds commands from good contexts
+ * @good_rb_size - Number of valid dwords in good_rb_buffer
  * @last_valid_ctx_id - The last context from which commands were placed in
  * ringbuffer before the GPU hung
+ * @step - Current recovery step being executed
+ * @err_code - Recovery error code
  * @fault - Indicates whether the hang was caused due to a pagefault
  * @start_of_replay_cmds - Offset in ringbuffer from where commands can be
  * replayed during recovery
@@ -155,12 +163,25 @@
 	unsigned int rb_size;
 	unsigned int *bad_rb_buffer;
 	unsigned int bad_rb_size;
+	unsigned int *good_rb_buffer;
+	unsigned int good_rb_size;
 	unsigned int last_valid_ctx_id;
+	unsigned int step;
+	unsigned int err_code;
 	int fault;
 	unsigned int start_of_replay_cmds;
 	unsigned int replay_for_snapshot;
 };
 
+enum ft_steps {	/* recovery steps, in the order tried (inferred from names) */
+	FT_REPLAY_BAD_CTXT_CMDS = 0,	/* retry the commands that hung the GPU */
+	FT_NOT_IB_BAD_CTXT_CMDS,	/* presumably: nop the hung IB and retry */
+	FT_SKIP_EOF_BAD_CTXT_CMDS,	/* skip to end of frame and retry */
+	FT_FAIL_BAD_CTXT_CMDS,	/* give up: mark the context as bad */
+	FT_PLAY_GOOD_CTXT_CMDS	/* run remaining cmds from good contexts */
+};
+
+
 extern struct adreno_gpudev adreno_a2xx_gpudev;
 extern struct adreno_gpudev adreno_a3xx_gpudev;