msm: kgsl: Recovery policy change

The recovery algorithm is changed to the following steps:
step 1: retry the same commands that hung the GPU
step 2: if step 1 fails, nop just the IB that hung the GPU and retry
step 3: if step 2 fails, skip commands in the current context till end of frame and retry
step 4: if step 3 fails, mark the context as bad and execute the remaining commands from good contexts.

Previously we returned -EDEADLK when recovery succeeded, which is the same error code that is returned when the context is not recoverable. With the new policy, if recovery succeeds we return -EAGAIN so that userspace treats a recovered context differently from the ones that are not recoverable. If recovery fails, we mark the context as bad and return -EDEADLK.

Change-Id: I9fa3c40801964186866b6002e62f19cf6aa41361
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>

commit: deeecc0ae202c5a294d1b73398bb9c0db5ee5b5a [log] [tgz]
author: Tarun Karra <tkarra@codeaurora.org> Mon Jan 21 23:42:17 2013 -0800
committer: Rajeev Kulkarni <krajeev@codeaurora.org> Thu Feb 14 12:57:26 2013 -0800
tree: 0f0ee603f36fb059652d058aacb26e441bf673bf
parent: 460cc76bb535ac612d9da3ed1fb755837ef67063 [diff] [blame]
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 514c86c..03c82bf 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h

@@ -34,12 +34,15 @@
 #define KGSL_CMD_FLAGS_NONE             0x00000000
 #define KGSL_CMD_FLAGS_PMODE		0x00000001
 #define KGSL_CMD_FLAGS_INTERNAL_ISSUE	0x00000002
+#define KGSL_CMD_FLAGS_EOF	        0x00000100
 
 /* Command identifiers */
 #define KGSL_CONTEXT_TO_MEM_IDENTIFIER	0x2EADBEEF
 #define KGSL_CMD_IDENTIFIER		0x2EEDFACE
 #define KGSL_START_OF_IB_IDENTIFIER	0x2EADEABE
 #define KGSL_END_OF_IB_IDENTIFIER	0x2ABEDEAD
+#define KGSL_END_OF_FRAME_IDENTIFIER	0x2E0F2E0F
+#define KGSL_NOP_IB_IDENTIFIER	        0x20F20F20
 
 #ifdef CONFIG_MSM_SCM
 #define ADRENO_DEFAULT_PWRSCALE_POLICY  (&kgsl_pwrscale_policy_tz)
@@ -99,6 +102,7 @@
 	unsigned int instruction_size;
 	unsigned int ib_check_level;
 	unsigned int fast_hang_detect;
+	unsigned int ft_policy;
 	unsigned int gpulist_index;
 	struct ocmem_buf *ocmem_hdl;
 	unsigned int ocmem_base;
@@ -139,8 +143,12 @@
  * @rb_size - Number of valid dwords in rb_buffer
  * @bad_rb_buffer - Buffer that holds commands from the hanging context
  * bad_rb_size - Number of valid dwords in bad_rb_buffer
+ * @good_rb_buffer - Buffer that holds commands from good contexts
+ * good_rb_size - Number of valid dwords in good_rb_buffer
  * @last_valid_ctx_id - The last context from which commands were placed in
  * ringbuffer before the GPU hung
+ * @step - Current recovery step being executed
+ * @err_code - Recovery error code
  * @fault - Indicates whether the hang was caused due to a pagefault
  * @start_of_replay_cmds - Offset in ringbuffer from where commands can be
  * replayed during recovery
@@ -155,12 +163,25 @@
 	unsigned int rb_size;
 	unsigned int *bad_rb_buffer;
 	unsigned int bad_rb_size;
+	unsigned int *good_rb_buffer;
+	unsigned int good_rb_size;
 	unsigned int last_valid_ctx_id;
+	unsigned int step;
+	unsigned int err_code;
 	int fault;
 	unsigned int start_of_replay_cmds;
 	unsigned int replay_for_snapshot;
 };
 
+enum ft_steps {
+	FT_REPLAY_BAD_CTXT_CMDS = 0,
+	FT_NOT_IB_BAD_CTXT_CMDS,
+	FT_SKIP_EOF_BAD_CTXT_CMDS,
+	FT_FAIL_BAD_CTXT_CMDS,
+	FT_PLAY_GOOD_CTXT_CMDS
+};
+
+
 extern struct adreno_gpudev adreno_a2xx_gpudev;
 extern struct adreno_gpudev adreno_a3xx_gpudev;
commit	deeecc0ae202c5a294d1b73398bb9c0db5ee5b5a	[log] [tgz]
author	Tarun Karra <tkarra@codeaurora.org>	Mon Jan 21 23:42:17 2013 -0800
committer	Rajeev Kulkarni <krajeev@codeaurora.org>	Thu Feb 14 12:57:26 2013 -0800
tree	0f0ee603f36fb059652d058aacb26e441bf673bf
parent	460cc76bb535ac612d9da3ed1fb755837ef67063 [diff] [blame]