msm: kgsl: Recovery policy change
The recovery algorithm is changed to the following steps:
step 1: retry the same commands that hung the GPU
step 2: if step 1 fails, NOP just the IB that hung the GPU
and retry
step 3: if step 2 fails, skip commands in the current context
till end of frame and retry
step 4: if step 3 fails, mark the context as bad and execute
the remaining commands from good contexts.
Previously we returned -EDEADLK when recovery succeeded, which is
the same error code returned when the context is not recoverable.
With the new policy, if recovery succeeds we return -EAGAIN so that
userspace can treat a recovered context differently from ones that
are not recoverable. If recovery fails, we mark the context as bad
and return -EDEADLK.
Change-Id: I9fa3c40801964186866b6002e62f19cf6aa41361
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 514c86c..03c82bf 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -34,12 +34,15 @@
#define KGSL_CMD_FLAGS_NONE 0x00000000
#define KGSL_CMD_FLAGS_PMODE 0x00000001
#define KGSL_CMD_FLAGS_INTERNAL_ISSUE 0x00000002
+#define KGSL_CMD_FLAGS_EOF 0x00000100
/* Command identifiers */
#define KGSL_CONTEXT_TO_MEM_IDENTIFIER 0x2EADBEEF
#define KGSL_CMD_IDENTIFIER 0x2EEDFACE
#define KGSL_START_OF_IB_IDENTIFIER 0x2EADEABE
#define KGSL_END_OF_IB_IDENTIFIER 0x2ABEDEAD
+#define KGSL_END_OF_FRAME_IDENTIFIER 0x2E0F2E0F
+#define KGSL_NOP_IB_IDENTIFIER 0x20F20F20
#ifdef CONFIG_MSM_SCM
#define ADRENO_DEFAULT_PWRSCALE_POLICY (&kgsl_pwrscale_policy_tz)
@@ -99,6 +102,7 @@
unsigned int instruction_size;
unsigned int ib_check_level;
unsigned int fast_hang_detect;
+ unsigned int ft_policy;
unsigned int gpulist_index;
struct ocmem_buf *ocmem_hdl;
unsigned int ocmem_base;
@@ -139,8 +143,12 @@
* @rb_size - Number of valid dwords in rb_buffer
* @bad_rb_buffer - Buffer that holds commands from the hanging context
* bad_rb_size - Number of valid dwords in bad_rb_buffer
+ * @good_rb_buffer - Buffer that holds commands from good contexts
+ * @good_rb_size - Number of valid dwords in good_rb_buffer
* @last_valid_ctx_id - The last context from which commands were placed in
* ringbuffer before the GPU hung
+ * @step - Current recovery step being executed
+ * @err_code - Recovery error code
* @fault - Indicates whether the hang was caused due to a pagefault
* @start_of_replay_cmds - Offset in ringbuffer from where commands can be
* replayed during recovery
@@ -155,12 +163,25 @@
unsigned int rb_size;
unsigned int *bad_rb_buffer;
unsigned int bad_rb_size;
+ unsigned int *good_rb_buffer;
+ unsigned int good_rb_size;
unsigned int last_valid_ctx_id;
+ unsigned int step;
+ unsigned int err_code;
int fault;
unsigned int start_of_replay_cmds;
unsigned int replay_for_snapshot;
};
+enum ft_steps { /* recovery steps; presumably attempted in this order */
+ FT_REPLAY_BAD_CTXT_CMDS = 0, /* retry the commands that hung the GPU */
+ FT_NOT_IB_BAD_CTXT_CMDS, /* presumably: NOP the hanging IB, then retry */
+ FT_SKIP_EOF_BAD_CTXT_CMDS, /* skip bad context cmds till end of frame */
+ FT_FAIL_BAD_CTXT_CMDS, /* presumably: mark the context as bad */
+ FT_PLAY_GOOD_CTXT_CMDS /* execute remaining cmds from good contexts */
+};
+
+
extern struct adreno_gpudev adreno_a2xx_gpudev;
extern struct adreno_gpudev adreno_a3xx_gpudev;