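msm: kgsl: Rename recovery to fault tolerance
Rename the GPU hang recovery code to fault tolerance (FT) and update the
affected functions, structures, states, flags and comments accordingly.
Move the related log messages to the new KGSL_FT_* log macros, whose
level is controlled through the new log_level_ft debugfs file.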
Change-Id: I5f249806026ac514c4aff7da45c3a4e8cc2f8c34
Signed-off-by: Tarun Karra <tkarra@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 88756c0..893a6ed 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -1199,7 +1199,7 @@
int status = -EINVAL;
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
- if (KGSL_STATE_DUMP_AND_RECOVER != device->state)
+ if (KGSL_STATE_DUMP_AND_FT != device->state)
kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
/* Power up the device */
@@ -1284,9 +1284,9 @@
status = adreno_ringbuffer_start(&adreno_dev->ringbuffer, init_ram);
if (status == 0) {
- /* While recovery is on we do not want timer to
+ /* While fault tolerance is on we do not want timer to
* fire and attempt to change any device state */
- if (KGSL_STATE_DUMP_AND_RECOVER != device->state)
+ if (KGSL_STATE_DUMP_AND_FT != device->state)
mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
return 0;
}
@@ -1325,26 +1325,26 @@
}
static void adreno_mark_context_status(struct kgsl_device *device,
- int recovery_status)
+ int ft_status)
{
struct kgsl_context *context;
int next = 0;
/*
* Set the reset status of all contexts to
* INNOCENT_CONTEXT_RESET_EXT except for the bad context
- * since thats the guilty party, if recovery failed then
+ * since that's the guilty party, if fault tolerance failed then
* mark all as guilty
*/
while ((context = idr_get_next(&device->context_idr, &next))) {
struct adreno_context *adreno_context = context->devctxt;
- if (recovery_status) {
+ if (ft_status) {
context->reset_status =
KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
} else if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT !=
context->reset_status) {
if (adreno_context->flags & (CTXT_FLAGS_GPU_HANG |
- CTXT_FLAGS_GPU_HANG_RECOVERED))
+ CTXT_FLAGS_GPU_HANG_FT))
context->reset_status =
KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
else
@@ -1379,11 +1379,11 @@
}
}
-static void adreno_destroy_recovery_data(struct adreno_recovery_data *rec_data)
+static void adreno_destroy_ft_data(struct adreno_ft_data *ft_data)
{
- vfree(rec_data->rb_buffer);
- vfree(rec_data->bad_rb_buffer);
- vfree(rec_data->good_rb_buffer);
+ vfree(ft_data->rb_buffer);
+ vfree(ft_data->bad_rb_buffer);
+ vfree(ft_data->good_rb_buffer);
}
static int _find_start_of_cmd_seq(struct adreno_ringbuffer *rb,
@@ -1479,13 +1479,13 @@
status = _find_start_of_cmd_seq(rb, &temp_rb_rptr, false);
if (!status) {
*rb_rptr = temp_rb_rptr;
- KGSL_DRV_ERR(rb->device,
+ KGSL_FT_INFO(rb->device,
"Offset of cmd sequence after eop timestamp: 0x%x\n",
temp_rb_rptr / sizeof(unsigned int));
}
}
if (status)
- KGSL_DRV_ERR(rb->device,
+ KGSL_FT_ERR(rb->device,
"Failed to find the command sequence after eop timestamp\n");
return status;
}
@@ -1512,7 +1512,8 @@
/* go till start of command sequence */
status = _find_start_of_cmd_seq(rb,
&temp_rb_rptr, false);
- KGSL_DRV_INFO(rb->device,
+
+ KGSL_FT_INFO(rb->device,
"Found the hanging IB at offset 0x%x\n",
temp_rb_rptr / sizeof(unsigned int));
break;
@@ -1526,7 +1527,7 @@
* can point to the context switch */
if (val[i] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
if (ctx_switch) {
- KGSL_DRV_ERR(rb->device,
+ KGSL_FT_ERR(rb->device,
"Context switch encountered before bad "
"IB found\n");
break;
@@ -1544,8 +1545,8 @@
return status;
}
-static int adreno_setup_recovery_data(struct kgsl_device *device,
- struct adreno_recovery_data *rec_data)
+static int adreno_setup_ft_data(struct kgsl_device *device,
+ struct adreno_ft_data *ft_data)
{
int ret = 0;
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
@@ -1554,104 +1555,105 @@
struct adreno_context *adreno_context;
unsigned int rb_rptr = rb->wptr * sizeof(unsigned int);
- memset(rec_data, 0, sizeof(*rec_data));
- rec_data->start_of_replay_cmds = 0xFFFFFFFF;
- rec_data->replay_for_snapshot = 0xFFFFFFFF;
+ memset(ft_data, 0, sizeof(*ft_data));
+ ft_data->start_of_replay_cmds = 0xFFFFFFFF;
+ ft_data->replay_for_snapshot = 0xFFFFFFFF;
- adreno_regread(device, REG_CP_IB1_BASE, &rec_data->ib1);
+ adreno_regread(device, REG_CP_IB1_BASE, &ft_data->ib1);
- kgsl_sharedmem_readl(&device->memstore, &rec_data->context_id,
+ kgsl_sharedmem_readl(&device->memstore, &ft_data->context_id,
KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
current_context));
kgsl_sharedmem_readl(&device->memstore,
- &rec_data->global_eop,
+ &ft_data->global_eop,
KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
eoptimestamp));
- rec_data->rb_buffer = vmalloc(rb->buffer_desc.size);
- if (!rec_data->rb_buffer) {
+ ft_data->rb_buffer = vmalloc(rb->buffer_desc.size);
+ if (!ft_data->rb_buffer) {
KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
rb->buffer_desc.size);
return -ENOMEM;
}
- rec_data->bad_rb_buffer = vmalloc(rb->buffer_desc.size);
- if (!rec_data->bad_rb_buffer) {
+ ft_data->bad_rb_buffer = vmalloc(rb->buffer_desc.size);
+ if (!ft_data->bad_rb_buffer) {
KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
rb->buffer_desc.size);
ret = -ENOMEM;
goto done;
}
- rec_data->good_rb_buffer = vmalloc(rb->buffer_desc.size);
- if (!rec_data->good_rb_buffer) {
+ ft_data->good_rb_buffer = vmalloc(rb->buffer_desc.size);
+ if (!ft_data->good_rb_buffer) {
KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
rb->buffer_desc.size);
ret = -ENOMEM;
goto done;
}
- rec_data->fault = device->mmu.fault;
- rec_data->step = adreno_dev->ft_policy;
+ ft_data->fault = device->mmu.fault;
+ ft_data->step = adreno_dev->ft_policy;
/* find the start of bad command sequence in rb */
- context = idr_find(&device->context_idr, rec_data->context_id);
+ context = idr_find(&device->context_idr, ft_data->context_id);
/* Look for the command stream that is right after the global eop */
if (!context) {
/*
- * If there is no context then recovery does not need to
+ * If there is no context then fault tolerance does not need to
* replay anything, just reset GPU and thats it
*/
goto done;
}
ret = _find_cmd_seq_after_eop_ts(rb, &rb_rptr,
- rec_data->global_eop + 1, false);
+ ft_data->global_eop + 1, false);
if (ret)
goto done;
- rec_data->start_of_replay_cmds = rb_rptr;
+ ft_data->start_of_replay_cmds = rb_rptr;
adreno_context = context->devctxt;
if (adreno_context->flags & CTXT_FLAGS_PREAMBLE) {
- if (rec_data->ib1) {
+ if (ft_data->ib1) {
ret = _find_hanging_ib_sequence(rb,
- &rb_rptr, rec_data->ib1);
+ &rb_rptr, ft_data->ib1);
if (ret) {
- KGSL_DRV_ERR(device,
+ KGSL_FT_ERR(device,
"Start not found for replay IB sequence\n");
ret = 0;
goto done;
}
- rec_data->start_of_replay_cmds = rb_rptr;
- rec_data->replay_for_snapshot = rb_rptr;
+ ft_data->start_of_replay_cmds = rb_rptr;
+ ft_data->replay_for_snapshot = rb_rptr;
}
}
done:
if (ret) {
- vfree(rec_data->rb_buffer);
- vfree(rec_data->bad_rb_buffer);
- vfree(rec_data->good_rb_buffer);
+ vfree(ft_data->rb_buffer);
+ vfree(ft_data->bad_rb_buffer);
+ vfree(ft_data->good_rb_buffer);
}
return ret;
}
static int
-_adreno_restart_device(struct kgsl_device *device,
- struct kgsl_context *context)
+_adreno_ft_restart_device(struct kgsl_device *device,
+ struct kgsl_context *context,
+ struct adreno_ft_data *ft_data)
{
struct adreno_context *adreno_context = context->devctxt;
/* restart device */
if (adreno_stop(device)) {
- KGSL_DRV_ERR(device, "Device stop failed in recovery\n");
+ KGSL_FT_ERR(device, "Device stop failed\n");
return 1;
}
if (adreno_start(device, true)) {
- KGSL_DRV_ERR(device, "Device start failed in recovery\n");
+ KGSL_FT_ERR(device, "Device start failed\n");
return 1;
}
@@ -1671,15 +1673,15 @@
}
static int
-_adreno_recovery_resubmit(struct kgsl_device *device,
+_adreno_ft_resubmit_rb(struct kgsl_device *device,
struct adreno_ringbuffer *rb,
struct kgsl_context *context,
- struct adreno_recovery_data *rec_data,
+ struct adreno_ft_data *ft_data,
unsigned int *buff, unsigned int size)
{
unsigned int ret = 0;
- if (_adreno_restart_device(device, context))
+ if (_adreno_ft_restart_device(device, context, ft_data))
return 1;
if (size) {
@@ -1695,8 +1697,8 @@
static int
-_adreno_recover_hang(struct kgsl_device *device,
- struct adreno_recovery_data *rec_data)
+_adreno_ft(struct kgsl_device *device,
+ struct adreno_ft_data *ft_data)
{
int ret = 0, i;
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
@@ -1705,10 +1707,10 @@
struct adreno_context *adreno_context = NULL;
struct adreno_context *last_active_ctx = adreno_dev->drawctxt_active;
- context = idr_find(&device->context_idr, rec_data->context_id);
+ context = idr_find(&device->context_idr, ft_data->context_id);
if (context == NULL) {
- KGSL_DRV_ERR(device, "Last context unknown id:%d\n",
- rec_data->context_id);
+ KGSL_FT_CRIT(device, "Last context unknown id:%d\n",
+ ft_data->context_id);
} else {
adreno_context = context->devctxt;
adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
@@ -1717,119 +1719,124 @@
* detected a hang for it
*/
context->wait_on_invalid_ts = false;
+
+ KGSL_FT_INFO(device, "Context found\n");
}
/* Extract valid contents from rb which can still be executed after
* hang */
- adreno_ringbuffer_extract(rb, rec_data);
+ adreno_ringbuffer_extract(rb, ft_data);
/* Do not try the bad commands if hang is due to a fault */
- if (rec_data->fault)
+ if (ft_data->fault) {
+ KGSL_FT_ERR(device, "Page fault no FT for bad context\n");
+
goto play_good_cmds;
+ }
- if (rec_data->step == FT_REPLAY_BAD_CTXT_CMDS) {
+ if (ft_data->step == FT_REPLAY_BAD_CTXT_CMDS) {
- ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
- rec_data->bad_rb_buffer, rec_data->bad_rb_size);
+ ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+ ft_data->bad_rb_buffer, ft_data->bad_rb_size);
if (ret)
- rec_data->step = FT_NOT_IB_BAD_CTXT_CMDS;
+ KGSL_FT_INFO(device, "Replay unsuccessful\n");
else
goto play_good_cmds;
}
- if (rec_data->step == FT_NOT_IB_BAD_CTXT_CMDS) {
+ if (ft_data->step == FT_NOP_IB_BAD_CTXT_CMDS) {
- for (i = 0; i < rec_data->bad_rb_size; i++) {
- if ((rec_data->bad_rb_buffer[i] ==
- CP_HDR_INDIRECT_BUFFER_PFD) &&
- (rec_data->bad_rb_buffer[i+1] ==
- rec_data->ib1)) {
+ for (i = 0; i < ft_data->bad_rb_size; i++) {
+ if ((ft_data->bad_rb_buffer[i] ==
+ CP_HDR_INDIRECT_BUFFER_PFD) &&
+ (ft_data->bad_rb_buffer[i+1] == ft_data->ib1)) {
- rec_data->bad_rb_buffer[i] = cp_nop_packet(2);
- rec_data->bad_rb_buffer[i+1] =
+ ft_data->bad_rb_buffer[i] = cp_nop_packet(2);
+ ft_data->bad_rb_buffer[i+1] =
KGSL_NOP_IB_IDENTIFIER;
- rec_data->bad_rb_buffer[i+2] =
+ ft_data->bad_rb_buffer[i+2] =
KGSL_NOP_IB_IDENTIFIER;
break;
}
}
- if ((i == (rec_data->bad_rb_size)) || (!rec_data->ib1)) {
- KGSL_DRV_ERR(device, "Bad IB to NOP not found\n");
- rec_data->step = FT_FAIL_BAD_CTXT_CMDS;
+ if ((i == (ft_data->bad_rb_size)) || (!ft_data->ib1)) {
+ KGSL_FT_ERR(device, "Bad IB to NOP not found\n");
+ ft_data->step = FT_FAIL_BAD_CTXT_CMDS;
goto play_good_cmds;
}
- ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
- rec_data->bad_rb_buffer, rec_data->bad_rb_size);
+ ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+ ft_data->bad_rb_buffer, ft_data->bad_rb_size);
if (ret) {
- KGSL_DRV_ERR(device, "NOP faulty IB unsuccessful\n");
- rec_data->step = FT_SKIP_EOF_BAD_CTXT_CMDS;
+ KGSL_FT_INFO(device, "NOP faulty IB unsuccessful\n");
+ ft_data->step = FT_SKIP_EOF_BAD_CTXT_CMDS;
} else
goto play_good_cmds;
}
- if (rec_data->step == FT_SKIP_EOF_BAD_CTXT_CMDS) {
+ if (ft_data->step == FT_SKIP_EOF_BAD_CTXT_CMDS) {
- for (i = 0; i < rec_data->bad_rb_size; i++) {
- if (rec_data->bad_rb_buffer[i] ==
- KGSL_END_OF_FRAME_IDENTIFIER) {
- rec_data->bad_rb_buffer[0] = cp_nop_packet(i);
+ for (i = 0; i < ft_data->bad_rb_size; i++) {
+ if (ft_data->bad_rb_buffer[i] ==
+ KGSL_END_OF_FRAME_IDENTIFIER) {
+ ft_data->bad_rb_buffer[0] = cp_nop_packet(i);
break;
}
}
/* EOF not found in RB, discard till EOF in
next IB submission */
- if (i == rec_data->bad_rb_size) {
+ if (i == ft_data->bad_rb_size) {
adreno_context->flags |= CTXT_FLAGS_SKIP_EOF;
- rec_data->bad_rb_buffer[0] = cp_nop_packet(i);
+ ft_data->bad_rb_buffer[0] = cp_nop_packet(i);
}
- ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
- rec_data->bad_rb_buffer, rec_data->bad_rb_size);
+ ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+ ft_data->bad_rb_buffer, ft_data->bad_rb_size);
if (ret) {
- KGSL_DRV_ERR(device, "Skip EOF unsuccessful\n");
- rec_data->step = FT_FAIL_BAD_CTXT_CMDS;
+ KGSL_FT_INFO(device, "Skip EOF unsuccessful\n");
+ ft_data->step = FT_FAIL_BAD_CTXT_CMDS;
} else
goto play_good_cmds;
}
play_good_cmds:
- if (rec_data->step == FT_FAIL_BAD_CTXT_CMDS)
- KGSL_DRV_ERR(device, "Bad context commands failed\n");
+ if (ft_data->step == FT_FAIL_BAD_CTXT_CMDS)
+ KGSL_FT_ERR(device, "Bad context commands failed\n");
else {
+ KGSL_FT_INFO(device, "Bad context commands success\n");
if (adreno_context) {
adreno_context->flags = (adreno_context->flags &
- ~CTXT_FLAGS_GPU_HANG) | CTXT_FLAGS_GPU_HANG_RECOVERED;
+ ~CTXT_FLAGS_GPU_HANG) | CTXT_FLAGS_GPU_HANG_FT;
}
adreno_dev->drawctxt_active = last_active_ctx;
}
- ret = _adreno_recovery_resubmit(device, rb, context, rec_data,
- rec_data->good_rb_buffer, rec_data->good_rb_size);
+ ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
+ ft_data->good_rb_buffer, ft_data->good_rb_size);
if (ret) {
/* If we fail here we can try to invalidate another
* context and try fault tolerance again */
ret = -EAGAIN;
- KGSL_DRV_ERR(device, "Playing good commands unsuccessful\n");
+ KGSL_FT_ERR(device, "Playing good commands unsuccessful\n");
goto done;
- }
-
+ } else
+ KGSL_FT_INFO(device, "Playing good commands successful\n");
/* ringbuffer now has data from the last valid context id,
* so restore the active_ctx to the last valid context */
- if (rec_data->last_valid_ctx_id) {
+ if (ft_data->last_valid_ctx_id) {
struct kgsl_context *last_ctx =
idr_find(&device->context_idr,
- rec_data->last_valid_ctx_id);
+ ft_data->last_valid_ctx_id);
if (last_ctx)
adreno_dev->drawctxt_active = last_ctx->devctxt;
}
@@ -1842,40 +1849,42 @@
}
static int
-adreno_recover_hang(struct kgsl_device *device,
- struct adreno_recovery_data *rec_data)
+adreno_ft(struct kgsl_device *device,
+ struct adreno_ft_data *ft_data)
{
int ret = 0;
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
unsigned int timestamp;
- KGSL_DRV_ERR(device,
- "Starting recovery from 3D GPU hang. Recovery parameters: IB1: 0x%X, "
+ KGSL_FT_INFO(device,
+ "Start Parameters: IB1: 0x%X, "
"Bad context_id: %u, global_eop: 0x%x\n",
- rec_data->ib1, rec_data->context_id, rec_data->global_eop);
+ ft_data->ib1, ft_data->context_id, ft_data->global_eop);
timestamp = rb->timestamp[KGSL_MEMSTORE_GLOBAL];
- KGSL_DRV_ERR(device, "Last issued global timestamp: %x\n", timestamp);
+ KGSL_FT_INFO(device, "Last issued global timestamp: %x\n", timestamp);
/* We may need to replay commands multiple times based on whether
* multiple contexts hang the GPU */
while (true) {
- ret = _adreno_recover_hang(device, rec_data);
+ ret = _adreno_ft(device, ft_data);
+
+ KGSL_FT_CRIT(device, "POLICY: 0x%X\n", ft_data->step);
if (-EAGAIN == ret) {
- /* setup new recovery parameters and retry, this
+ /* setup new fault tolerance parameters and retry, this
* means more than 1 contexts are causing hang */
- adreno_destroy_recovery_data(rec_data);
- ret = adreno_setup_recovery_data(device, rec_data);
+ adreno_destroy_ft_data(ft_data);
+ ret = adreno_setup_ft_data(device, ft_data);
if (ret)
goto done;
- KGSL_DRV_ERR(device,
- "Retry recovery from 3D GPU hang. Recovery parameters: "
+ KGSL_FT_INFO(device,
+ "Retry. Parameters: "
"IB1: 0x%X, Bad context_id: %u, global_eop: 0x%x\n",
- rec_data->ib1, rec_data->context_id,
- rec_data->global_eop);
+ ft_data->ib1, ft_data->context_id,
+ ft_data->global_eop);
} else {
break;
}
@@ -1884,7 +1893,7 @@
if (ret)
goto done;
- /* Restore correct states after recovery */
+ /* Restore correct states after fault tolerance */
if (adreno_dev->drawctxt_active)
device->mmu.hwpagetable =
adreno_dev->drawctxt_active->pagetable;
@@ -1903,34 +1912,32 @@
done:
adreno_set_max_ts_for_bad_ctxs(device);
adreno_mark_context_status(device, ret);
- if (!ret)
- KGSL_DRV_ERR(device, "Recovery succeeded\n");
- else
- KGSL_DRV_ERR(device, "Recovery failed\n");
+ if (ret)
+ KGSL_FT_ERR(device, "Fault Tolerance failed\n");
return ret;
}
int
-adreno_dump_and_recover(struct kgsl_device *device)
+adreno_dump_and_exec_ft(struct kgsl_device *device)
{
int result = -ETIMEDOUT;
- struct adreno_recovery_data rec_data;
+ struct adreno_ft_data ft_data;
if (device->state == KGSL_STATE_HUNG)
goto done;
- if (device->state == KGSL_STATE_DUMP_AND_RECOVER) {
+ if (device->state == KGSL_STATE_DUMP_AND_FT) {
mutex_unlock(&device->mutex);
- wait_for_completion(&device->recovery_gate);
+ wait_for_completion(&device->ft_gate);
mutex_lock(&device->mutex);
if (device->state != KGSL_STATE_HUNG)
result = 0;
} else {
- kgsl_pwrctrl_set_state(device, KGSL_STATE_DUMP_AND_RECOVER);
- INIT_COMPLETION(device->recovery_gate);
+ kgsl_pwrctrl_set_state(device, KGSL_STATE_DUMP_AND_FT);
+ INIT_COMPLETION(device->ft_gate);
/* Detected a hang */
- /* Get the recovery data as soon as hang is detected */
- result = adreno_setup_recovery_data(device, &rec_data);
+ /* Get the fault tolerance data as soon as hang is detected */
+ result = adreno_setup_ft_data(device, &ft_data);
/*
* Trigger an automatic dump of the state to
* the console
@@ -1944,8 +1951,8 @@
kgsl_device_snapshot(device, 1);
if (!result) {
- result = adreno_recover_hang(device, &rec_data);
- adreno_destroy_recovery_data(&rec_data);
+ result = adreno_ft(device, &ft_data);
+ adreno_destroy_ft_data(&ft_data);
}
if (result) {
kgsl_pwrctrl_set_state(device, KGSL_STATE_HUNG);
@@ -1953,12 +1960,12 @@
kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
}
- complete_all(&device->recovery_gate);
+ complete_all(&device->ft_gate);
}
done:
return result;
}
-EXPORT_SYMBOL(adreno_dump_and_recover);
+EXPORT_SYMBOL(adreno_dump_and_exec_ft);
static int adreno_getproperty(struct kgsl_device *device,
enum kgsl_property_type type,
@@ -2190,9 +2197,9 @@
err:
KGSL_DRV_ERR(device, "spun too long waiting for RB to idle\n");
- if (KGSL_STATE_DUMP_AND_RECOVER != device->state &&
- !adreno_dump_and_recover(device)) {
- wait_time = jiffies + msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
+ if (KGSL_STATE_DUMP_AND_FT != device->state &&
+ !adreno_dump_and_exec_ft(device)) {
+ wait_time = jiffies + ADRENO_IDLE_TIMEOUT;
goto retry;
}
return -ETIMEDOUT;
@@ -2593,7 +2600,8 @@
* @context - pointer to the active KGSL context
* @timestamp - the timestamp that the process was waiting for
*
- * Process a possible GPU hang and try to recover from it cleanly
+ * Process a possible GPU hang and try to recover from it cleanly
+ * using fault tolerance
*/
static int adreno_handle_hang(struct kgsl_device *device,
struct kgsl_context *context, unsigned int timestamp)
@@ -2621,8 +2629,8 @@
KGSL_TIMESTAMP_RETIRED),
adreno_dev->ringbuffer.wptr, rptr);
- /* Return 0 after a successful recovery */
- if (!adreno_dump_and_recover(device))
+ /* Return 0 if fault tolerance succeeded */
+ if (!adreno_dump_and_exec_ft(device))
return 0;
return -ETIMEDOUT;
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 03c82bf..23fce2f 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -134,8 +134,8 @@
};
/*
- * struct adreno_recovery_data - Structure that contains all information to
- * perform gpu recovery from hangs
+ * struct adreno_ft_data - Structure that contains all information to
+ * perform gpu fault tolerance
* @ib1 - IB1 that the GPU was executing when hang happened
* @context_id - Context which caused the hang
* @global_eop - eoptimestamp at time of hang
@@ -147,15 +147,15 @@
* good_rb_size - Number of valid dwords in good_rb_buffer
* @last_valid_ctx_id - The last context from which commands were placed in
* ringbuffer before the GPU hung
- * @step - Current recovery step being executed
- * @err_code - Recovery error code
+ * @step - Current fault tolerance step being executed
+ * @err_code - Fault tolerance error code
* @fault - Indicates whether the hang was caused due to a pagefault
* @start_of_replay_cmds - Offset in ringbuffer from where commands can be
- * replayed during recovery
+ * replayed during fault tolerance
* @replay_for_snapshot - Offset in ringbuffer where IB's can be saved for
* replaying with snapshot
*/
-struct adreno_recovery_data {
+struct adreno_ft_data {
unsigned int ib1;
unsigned int context_id;
unsigned int global_eop;
@@ -167,7 +167,6 @@
unsigned int good_rb_size;
unsigned int last_valid_ctx_id;
unsigned int step;
- unsigned int err_code;
int fault;
unsigned int start_of_replay_cmds;
unsigned int replay_for_snapshot;
@@ -175,13 +174,12 @@
enum ft_steps {
FT_REPLAY_BAD_CTXT_CMDS = 0,
- FT_NOT_IB_BAD_CTXT_CMDS,
+ FT_NOP_IB_BAD_CTXT_CMDS,
FT_SKIP_EOF_BAD_CTXT_CMDS,
FT_FAIL_BAD_CTXT_CMDS,
FT_PLAY_GOOD_CTXT_CMDS
};
-
extern struct adreno_gpudev adreno_a2xx_gpudev;
extern struct adreno_gpudev adreno_a3xx_gpudev;
@@ -229,7 +227,10 @@
void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain,
int hang);
-int adreno_dump_and_recover(struct kgsl_device *device);
+int adreno_dump_and_exec_ft(struct kgsl_device *device);
+
+void adreno_dump_rb(struct kgsl_device *device, const void *buf,
+ size_t len, int start, int size);
unsigned int adreno_hang_detect(struct kgsl_device *device,
unsigned int *prev_reg_val);
diff --git a/drivers/gpu/msm/adreno_a3xx.c b/drivers/gpu/msm/adreno_a3xx.c
index 6276a59..0778ccb 100644
--- a/drivers/gpu/msm/adreno_a3xx.c
+++ b/drivers/gpu/msm/adreno_a3xx.c
@@ -70,7 +70,7 @@
const unsigned int a3xx_registers_count = ARRAY_SIZE(a3xx_registers) / 2;
/* Removed the following HLSQ register ranges from being read during
- * recovery since reading the registers may cause the device to hang:
+ * fault tolerance since reading the registers may cause the device to hang:
*/
const unsigned int a3xx_hlsq_registers[] = {
0x0e00, 0x0e05, 0x0e0c, 0x0e0c, 0x0e22, 0x0e23,
diff --git a/drivers/gpu/msm/adreno_drawctxt.h b/drivers/gpu/msm/adreno_drawctxt.h
index 133f92f..2e8a0c1 100644
--- a/drivers/gpu/msm/adreno_drawctxt.h
+++ b/drivers/gpu/msm/adreno_drawctxt.h
@@ -44,8 +44,8 @@
#define CTXT_FLAGS_TRASHSTATE BIT(10)
/* per context timestamps enabled */
#define CTXT_FLAGS_PER_CONTEXT_TS BIT(11)
-/* Context has caused a GPU hang and recovered properly */
-#define CTXT_FLAGS_GPU_HANG_RECOVERED BIT(12)
+/* Context has caused a GPU hang and fault tolerance was successful */
+#define CTXT_FLAGS_GPU_HANG_FT BIT(12)
/* Context is being destroyed so dont save it */
#define CTXT_FLAGS_BEING_DESTROYED BIT(13)
/* User mode generated timestamps enabled */
diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c
index 2367bb9..164b607 100644
--- a/drivers/gpu/msm/adreno_postmortem.c
+++ b/drivers/gpu/msm/adreno_postmortem.c
@@ -268,7 +268,7 @@
#endif
}
-static void adreno_dump_rb(struct kgsl_device *device, const void *buf,
+void adreno_dump_rb(struct kgsl_device *device, const void *buf,
size_t len, int start, int size)
{
const uint32_t *ptr = buf;
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 6a8bffb..6fda86d 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -129,7 +129,7 @@
continue;
err:
- if (!adreno_dump_and_recover(rb->device)) {
+ if (!adreno_dump_and_exec_ft(rb->device)) {
if (context && context->flags & CTXT_FLAGS_GPU_HANG) {
KGSL_CTXT_WARN(rb->device,
"Context %p caused a gpu hang. Will not accept commands for context %d\n",
@@ -138,7 +138,7 @@
}
wait_time = jiffies + wait_timeout;
} else {
- /* GPU is hung and we cannot recover */
+ /* GPU is hung and fault tolerance failed */
BUG();
}
}
@@ -572,7 +572,7 @@
if (adreno_is_a3xx(adreno_dev))
total_sizedwords += 7;
- total_sizedwords += 2; /* scratchpad ts for recovery */
+ total_sizedwords += 2; /* scratchpad ts for fault tolerance */
if (context && context->flags & CTXT_FLAGS_PER_CONTEXT_TS &&
!(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
total_sizedwords += 3; /* sop timestamp */
@@ -580,7 +580,7 @@
total_sizedwords += 3; /* global timestamp without cache
* flush for non-zero context */
} else {
- total_sizedwords += 4; /* global timestamp for recovery*/
+ total_sizedwords += 4; /* global timestamp for fault tolerance*/
}
ringcmds = adreno_ringbuffer_allocspace(rb, context, total_sizedwords);
@@ -632,7 +632,7 @@
}
timestamp = rb->timestamp[context_id];
- /* scratchpad ts for recovery */
+ /* scratchpad ts for fault tolerance */
GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_type0_packet(REG_CP_TIMESTAMP, 1));
GSL_RB_WRITE(ringcmds, rcmd_gpu, rb->timestamp[KGSL_MEMSTORE_GLOBAL]);
@@ -982,7 +982,7 @@
drawctxt = context->devctxt;
if (drawctxt->flags & CTXT_FLAGS_GPU_HANG) {
- KGSL_CTXT_ERR(device, "Context %p caused a gpu hang.."
+ KGSL_CTXT_ERR(device, "Context %p failed fault tolerance"
" will not accept commands for context %d\n",
drawctxt, drawctxt->id);
return -EDEADLK;
@@ -990,7 +990,7 @@
if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
KGSL_CTXT_ERR(device,
- "Context %p caused a gpu hang.."
+ "Context %p triggered fault tolerance"
" skipping commands for context till EOF %d\n",
drawctxt, drawctxt->id);
if (flags & KGSL_CMD_FLAGS_EOF)
@@ -1064,11 +1064,14 @@
adreno_idle(device);
#endif
- /* If context hung and recovered then return error so that the
- * application may handle it */
- if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_RECOVERED)
- return -EAGAIN;
- else
+ /*
+ * If context hung and recovered then return error so that the
+ * application may handle it
+ */
+ if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_FT) {
+ drawctxt->flags &= ~CTXT_FLAGS_GPU_HANG_FT;
+ return -EPROTO;
+ } else
return 0;
}
@@ -1094,7 +1097,7 @@
kgsl_sharedmem_writel(&rb->buffer_desc,
temp_rb_rptr, cp_nop_packet(1));
}
- KGSL_DRV_ERR(rb->device,
+ KGSL_FT_INFO(rb->device,
"Turned preamble on at offset 0x%x\n",
temp_rb_rptr / 4);
break;
@@ -1117,10 +1120,10 @@
}
void adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
- struct adreno_recovery_data *rec_data)
+ struct adreno_ft_data *ft_data)
{
struct kgsl_device *device = rb->device;
- unsigned int rb_rptr = rec_data->start_of_replay_cmds;
+ unsigned int rb_rptr = ft_data->start_of_replay_cmds;
unsigned int good_rb_idx = 0, bad_rb_idx = 0, temp_rb_idx = 0;
unsigned int last_good_cmd_end_idx = 0, last_bad_cmd_end_idx = 0;
unsigned int cmd_start_idx = 0;
@@ -1130,21 +1133,21 @@
struct kgsl_context *k_ctxt;
struct adreno_context *a_ctxt;
unsigned int size = rb->buffer_desc.size;
- unsigned int *temp_rb_buffer = rec_data->rb_buffer;
- int *rb_size = &rec_data->rb_size;
- unsigned int *bad_rb_buffer = rec_data->bad_rb_buffer;
- int *bad_rb_size = &rec_data->bad_rb_size;
- unsigned int *good_rb_buffer = rec_data->good_rb_buffer;
- int *good_rb_size = &rec_data->good_rb_size;
+ unsigned int *temp_rb_buffer = ft_data->rb_buffer;
+ int *rb_size = &ft_data->rb_size;
+ unsigned int *bad_rb_buffer = ft_data->bad_rb_buffer;
+ int *bad_rb_size = &ft_data->bad_rb_size;
+ unsigned int *good_rb_buffer = ft_data->good_rb_buffer;
+ int *good_rb_size = &ft_data->good_rb_size;
/*
* If the start index from where commands need to be copied is invalid
* then no need to save off any commands
*/
- if (0xFFFFFFFF == rec_data->start_of_replay_cmds)
+ if (0xFFFFFFFF == ft_data->start_of_replay_cmds)
return;
- k_ctxt = idr_find(&device->context_idr, rec_data->context_id);
+ k_ctxt = idr_find(&device->context_idr, ft_data->context_id);
if (k_ctxt) {
a_ctxt = k_ctxt->devctxt;
if (a_ctxt->flags & CTXT_FLAGS_PREAMBLE)
@@ -1194,7 +1197,7 @@
temp_idx++)
good_rb_buffer[good_rb_idx++] =
temp_rb_buffer[temp_idx];
- rec_data->last_valid_ctx_id = val2;
+ ft_data->last_valid_ctx_id = val2;
copy_rb_contents = 1;
/* remove the good commands from bad buffer */
bad_rb_idx = last_bad_cmd_end_idx;
diff --git a/drivers/gpu/msm/adreno_ringbuffer.h b/drivers/gpu/msm/adreno_ringbuffer.h
index ebbeb65..d65b91f 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.h
+++ b/drivers/gpu/msm/adreno_ringbuffer.h
@@ -27,7 +27,7 @@
struct kgsl_device;
struct kgsl_device_private;
-struct adreno_recovery_data;
+struct adreno_ft_data;
#define GSL_RB_MEMPTRS_SCRATCH_COUNT 8
struct kgsl_rbmemptrs {
@@ -120,7 +120,7 @@
void kgsl_cp_intrcallback(struct kgsl_device *device);
void adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
- struct adreno_recovery_data *rec_data);
+ struct adreno_ft_data *ft_data);
void
adreno_ringbuffer_restore(struct adreno_ringbuffer *rb, unsigned int *rb_buff,
diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c
index c61da62..1d32302 100644
--- a/drivers/gpu/msm/kgsl.c
+++ b/drivers/gpu/msm/kgsl.c
@@ -2631,7 +2631,7 @@
/* On a manual trigger, turn on the interrupts and put
the clocks to sleep. They will recover themselves
on the next event. For a hang, leave things as they
- are until recovery kicks in. */
+ are until fault tolerance kicks in. */
if (manual) {
kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
diff --git a/drivers/gpu/msm/kgsl_debugfs.c b/drivers/gpu/msm/kgsl_debugfs.c
index d4721bb..991570b 100644
--- a/drivers/gpu/msm/kgsl_debugfs.c
+++ b/drivers/gpu/msm/kgsl_debugfs.c
@@ -105,6 +105,7 @@
KGSL_DEBUGFS_LOG(ctxt_log);
KGSL_DEBUGFS_LOG(mem_log);
KGSL_DEBUGFS_LOG(pwr_log);
+KGSL_DEBUGFS_LOG(ft_log);
void kgsl_device_debugfs_init(struct kgsl_device *device)
{
@@ -120,6 +121,7 @@
device->drv_log = KGSL_LOG_LEVEL_DEFAULT;
device->mem_log = KGSL_LOG_LEVEL_DEFAULT;
device->pwr_log = KGSL_LOG_LEVEL_DEFAULT;
+ device->ft_log = KGSL_LOG_LEVEL_DEFAULT;
debugfs_create_file("log_level_cmd", 0644, device->d_debugfs, device,
&cmd_log_fops);
@@ -131,6 +133,8 @@
&mem_log_fops);
debugfs_create_file("log_level_pwr", 0644, device->d_debugfs, device,
&pwr_log_fops);
+ debugfs_create_file("log_level_ft", 0644, device->d_debugfs, device,
+ &ft_log_fops);
/* Create postmortem dump control files */
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index 322ad08..805e54b 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -46,7 +46,7 @@
#define KGSL_STATE_SLEEP 0x00000008
#define KGSL_STATE_SUSPEND 0x00000010
#define KGSL_STATE_HUNG 0x00000020
-#define KGSL_STATE_DUMP_AND_RECOVER 0x00000040
+#define KGSL_STATE_DUMP_AND_FT 0x00000040
#define KGSL_STATE_SLUMBER 0x00000080
#define KGSL_GRAPHICS_MEMORY_LOW_WATERMARK 0x1000000
@@ -169,7 +169,7 @@
wait_queue_head_t wait_queue;
struct workqueue_struct *work_queue;
struct device *parentdev;
- struct completion recovery_gate;
+ struct completion ft_gate;
struct dentry *d_debugfs;
struct idr context_idr;
struct early_suspend display_off;
@@ -195,6 +195,7 @@
int drv_log;
int mem_log;
int pwr_log;
+ int ft_log;
struct kgsl_pwrscale pwrscale;
struct kobject pwrscale_kobj;
struct pm_qos_request pm_qos_req_dma;
@@ -212,7 +213,7 @@
#define KGSL_DEVICE_COMMON_INIT(_dev) \
.hwaccess_gate = COMPLETION_INITIALIZER((_dev).hwaccess_gate),\
.suspend_gate = COMPLETION_INITIALIZER((_dev).suspend_gate),\
- .recovery_gate = COMPLETION_INITIALIZER((_dev).recovery_gate),\
+ .ft_gate = COMPLETION_INITIALIZER((_dev).ft_gate),\
.ts_notifier_list = ATOMIC_NOTIFIER_INIT((_dev).ts_notifier_list),\
.idle_check_ws = __WORK_INITIALIZER((_dev).idle_check_ws,\
kgsl_idle_check),\
diff --git a/drivers/gpu/msm/kgsl_log.h b/drivers/gpu/msm/kgsl_log.h
index 81a35e0..83d14f7 100644
--- a/drivers/gpu/msm/kgsl_log.h
+++ b/drivers/gpu/msm/kgsl_log.h
@@ -103,6 +103,15 @@
#define KGSL_PWR_CRIT(_dev, fmt, args...) \
KGSL_LOG_CRIT(_dev->dev, _dev->pwr_log, fmt, ##args)
+#define KGSL_FT_INFO(_dev, fmt, args...) \
+KGSL_LOG_INFO(_dev->dev, _dev->ft_log, fmt, ##args)
+#define KGSL_FT_WARN(_dev, fmt, args...) \
+KGSL_LOG_WARN(_dev->dev, _dev->ft_log, fmt, ##args)
+#define KGSL_FT_ERR(_dev, fmt, args...) \
+KGSL_LOG_ERR(_dev->dev, _dev->ft_log, fmt, ##args)
+#define KGSL_FT_CRIT(_dev, fmt, args...) \
+KGSL_LOG_CRIT(_dev->dev, _dev->ft_log, fmt, ##args)
+
/* Core error messages - these are for core KGSL functions that have
no device associated with them (such as memory) */
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index 10737c9..d489119 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -1024,7 +1024,7 @@
}
}
} else if (device->state & (KGSL_STATE_HUNG |
- KGSL_STATE_DUMP_AND_RECOVER)) {
+ KGSL_STATE_DUMP_AND_FT)) {
kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
}
@@ -1063,7 +1063,7 @@
break;
case KGSL_STATE_INIT:
case KGSL_STATE_HUNG:
- case KGSL_STATE_DUMP_AND_RECOVER:
+ case KGSL_STATE_DUMP_AND_FT:
if (test_bit(KGSL_PWRFLAGS_CLK_ON,
&device->pwrctrl.power_flags))
break;
@@ -1087,9 +1087,9 @@
mutex_unlock(&device->mutex);
wait_for_completion(&device->hwaccess_gate);
mutex_lock(&device->mutex);
- } else if (device->state == KGSL_STATE_DUMP_AND_RECOVER) {
+ } else if (device->state == KGSL_STATE_DUMP_AND_FT) {
mutex_unlock(&device->mutex);
- wait_for_completion(&device->recovery_gate);
+ wait_for_completion(&device->ft_gate);
mutex_lock(&device->mutex);
} else if (device->state == KGSL_STATE_SLUMBER)
kgsl_pwrctrl_wake(device);
@@ -1312,7 +1312,7 @@
return "SUSPEND";
case KGSL_STATE_HUNG:
return "HUNG";
- case KGSL_STATE_DUMP_AND_RECOVER:
+ case KGSL_STATE_DUMP_AND_FT:
return "DNR";
case KGSL_STATE_SLUMBER:
return "SLUMBER";
diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c
index 1adcf55..d0fd54c 100644
--- a/drivers/gpu/msm/kgsl_snapshot.c
+++ b/drivers/gpu/msm/kgsl_snapshot.c
@@ -539,7 +539,7 @@
/* Freeze the snapshot on a hang until it gets read */
device->snapshot_frozen = (hang) ? 1 : 0;
- /* log buffer info to aid in ramdump recovery */
+ /* log buffer info to aid in recovering the snapshot from a ramdump */
KGSL_DRV_ERR(device, "snapshot created at va %p pa %lx size %d\n",
device->snapshot, __pa(device->snapshot),
device->snapshot_size);