msm: kgsl: Introduce recovery data structure for hang recovery

Create a new data structure that holds all the information required
for recovering from hangs. This data structure is initialized as soon
as a hang is detected because, if the GPU is under a DoS attack, the
recovery data may have changed by the time recovery actually starts.

Change-Id: I862787267c582b9ca95bb191fb3503ceba1faa8c
Signed-off-by: Shubhraprakash Das <sadas@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index c0c7adb..7a6fc24 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -797,65 +797,59 @@
 }
 
 static int
-adreno_recover_hang(struct kgsl_device *device)
+adreno_recover_hang(struct kgsl_device *device,
+			struct adreno_recovery_data *rec_data)
 {
 	int ret;
-	unsigned int *rb_buffer;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 	unsigned int timestamp;
-	unsigned int num_rb_contents;
 	unsigned int reftimestamp;
 	unsigned int enable_ts;
 	unsigned int soptimestamp;
 	unsigned int eoptimestamp;
-	unsigned int context_id;
 	struct kgsl_context *context;
 	struct adreno_context *adreno_context;
 	int next = 0;
 
-	KGSL_DRV_ERR(device, "Starting recovery from 3D GPU hang....\n");
-	rb_buffer = vmalloc(rb->buffer_desc.size);
-	if (!rb_buffer) {
-		KGSL_MEM_ERR(device,
-			"Failed to allocate memory for recovery: %x\n",
-			rb->buffer_desc.size);
-		return -ENOMEM;
-	}
+	KGSL_DRV_ERR(device,
+	"Starting recovery from 3D GPU hang. Recovery parameters: IB1: 0x%X, "
+	"Bad context_id: %u, global_eop: 0x%x\n", rec_data->ib1,
+	rec_data->context_id, rec_data->global_eop);
+
 	/* Extract valid contents from rb which can stil be executed after
 	 * hang */
-	ret = adreno_ringbuffer_extract(rb, rb_buffer, &num_rb_contents);
+	ret = adreno_ringbuffer_extract(rb, rec_data);
 	if (ret)
 		goto done;
-	kgsl_sharedmem_readl(&device->memstore, &context_id,
-				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-					current_context));
-	context = idr_find(&device->context_idr, context_id);
+
+	context = idr_find(&device->context_idr, rec_data->context_id);
 	if (context == NULL) {
 		KGSL_DRV_ERR(device, "Last context unknown id:%d\n",
-				context_id);
-		context_id = KGSL_MEMSTORE_GLOBAL;
+				rec_data->context_id);
+		rec_data->context_id = KGSL_MEMSTORE_GLOBAL;
 	}
 
 	timestamp = rb->timestamp[KGSL_MEMSTORE_GLOBAL];
 	KGSL_DRV_ERR(device, "Last issued global timestamp: %x\n", timestamp);
 
 	kgsl_sharedmem_readl(&device->memstore, &reftimestamp,
-				KGSL_MEMSTORE_OFFSET(context_id,
+				KGSL_MEMSTORE_OFFSET(rec_data->context_id,
 					ref_wait_ts));
 	kgsl_sharedmem_readl(&device->memstore, &enable_ts,
-				KGSL_MEMSTORE_OFFSET(context_id,
+				KGSL_MEMSTORE_OFFSET(rec_data->context_id,
 					ts_cmp_enable));
 	kgsl_sharedmem_readl(&device->memstore, &soptimestamp,
-				KGSL_MEMSTORE_OFFSET(context_id,
+				KGSL_MEMSTORE_OFFSET(rec_data->context_id,
 					soptimestamp));
 	kgsl_sharedmem_readl(&device->memstore, &eoptimestamp,
-				KGSL_MEMSTORE_OFFSET(context_id,
+				KGSL_MEMSTORE_OFFSET(rec_data->context_id,
 					eoptimestamp));
 	/* Make sure memory is synchronized before restarting the GPU */
 	mb();
 	KGSL_CTXT_ERR(device,
-		"Context id that caused a GPU hang: %d\n", context_id);
+		"Context id that caused a GPU hang: %d\n",
+		rec_data->context_id);
 	/* restart device */
 	ret = adreno_stop(device);
 	if (ret)
@@ -866,19 +860,19 @@
 	KGSL_DRV_ERR(device, "Device has been restarted after hang\n");
 	/* Restore timestamp states */
 	kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(context_id, soptimestamp),
-			soptimestamp);
+			KGSL_MEMSTORE_OFFSET(rec_data->context_id,
+			soptimestamp), soptimestamp);
 	kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(context_id, eoptimestamp),
-			eoptimestamp);
+			KGSL_MEMSTORE_OFFSET(rec_data->context_id,
+			eoptimestamp), eoptimestamp);
 
-	if (num_rb_contents) {
+	if (rec_data->rb_size) {
 		kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(context_id, ref_wait_ts),
-			reftimestamp);
+			KGSL_MEMSTORE_OFFSET(rec_data->context_id,
+			ref_wait_ts), reftimestamp);
 		kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(context_id, ts_cmp_enable),
-			enable_ts);
+			KGSL_MEMSTORE_OFFSET(rec_data->context_id,
+			ts_cmp_enable), enable_ts);
 	}
 	/* Make sure all writes are posted before the GPU reads them */
 	wmb();
@@ -900,7 +894,7 @@
 	while ((context = idr_get_next(&device->context_idr, &next))) {
 		if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT !=
 			context->reset_status) {
-			if (context->id != context_id)
+			if (context->id != rec_data->context_id)
 				context->reset_status =
 				KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT;
 			else
@@ -911,18 +905,71 @@
 	}
 
 	/* Restore valid commands in ringbuffer */
-	adreno_ringbuffer_restore(rb, rb_buffer, num_rb_contents);
+	adreno_ringbuffer_restore(rb, rec_data->rb_buffer, rec_data->rb_size);
 	rb->timestamp[KGSL_MEMSTORE_GLOBAL] = timestamp;
 	/* wait for idle */
 	ret = adreno_idle(device, KGSL_TIMEOUT_DEFAULT);
 done:
-	vfree(rb_buffer);
+	return ret;
+}
+
+static void adreno_destroy_recovery_data(struct adreno_recovery_data *rec_data)
+{
+	vfree(rec_data->rb_buffer);	/* commands from good contexts */
+	vfree(rec_data->bad_rb_buffer);	/* commands from the hanging context */
+}
+
+/*
+ * Capture the hang state (IB1 base, current context id, global eoptimestamp)
+ * and allocate the two ringbuffer-sized buffers. Returns 0 or -ENOMEM; on
+ * failure nothing stays allocated and both buffer pointers are NULL.
+ */
+static int adreno_setup_recovery_data(struct kgsl_device *device,
+					struct adreno_recovery_data *rec_data)
+{
+	int ret = 0;
+	unsigned int ib1_sz, ib2_sz;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
+
+	memset(rec_data, 0, sizeof(*rec_data));
+
+	/* The IB1 base is only read when IB1 or IB2 has a non-zero size */
+	adreno_regread(device, REG_CP_IB1_BUFSZ, &ib1_sz);
+	adreno_regread(device, REG_CP_IB2_BUFSZ, &ib2_sz);
+	if (ib1_sz || ib2_sz)
+		adreno_regread(device, REG_CP_IB1_BASE, &rec_data->ib1);
+
+	kgsl_sharedmem_readl(&device->memstore, &rec_data->context_id,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+			current_context));
+	kgsl_sharedmem_readl(&device->memstore, &rec_data->global_eop,
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+			eoptimestamp));
+
+	rec_data->rb_buffer = vmalloc(rb->buffer_desc.size);
+	if (!rec_data->rb_buffer) {
+		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
+				rb->buffer_desc.size);
+		return -ENOMEM;
+	}
+
+	rec_data->bad_rb_buffer = vmalloc(rb->buffer_desc.size);
+	if (!rec_data->bad_rb_buffer) {
+		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
+				rb->buffer_desc.size);
+		/* NULL the pointer so a later destroy cannot double free */
+		vfree(rec_data->rb_buffer);
+		rec_data->rb_buffer = NULL;
+		ret = -ENOMEM;
+	}
 	return ret;
 }
 
 int adreno_dump_and_recover(struct kgsl_device *device)
 {
 	int result = -ETIMEDOUT;
+	struct adreno_recovery_data rec_data;
 
 	if (device->state == KGSL_STATE_HUNG)
 		goto done;
@@ -937,7 +984,8 @@
 		INIT_COMPLETION(device->recovery_gate);
 		/* Detected a hang */
 
-
+		/* Get the recovery data as soon as hang is detected */
+		result = adreno_setup_recovery_data(device, &rec_data);
 		/*
 		 * Trigger an automatic dump of the state to
 		 * the console
@@ -950,7 +998,8 @@
 		 */
 		kgsl_device_snapshot(device, 1);
 
-		result = adreno_recover_hang(device);
+		result = adreno_recover_hang(device, &rec_data);
+		adreno_destroy_recovery_data(&rec_data);
 		if (result) {
 			kgsl_pwrctrl_set_state(device, KGSL_STATE_HUNG);
 		} else {
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 3cc47ba..57f4859 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -114,6 +114,30 @@
 	unsigned int (*busy_cycles)(struct adreno_device *);
 };
 
+/*
+ * struct adreno_recovery_data - Structure that contains all information to
+ * perform gpu recovery from hangs
+ * @ib1 - IB1 that the GPU was executing when hang happened
+ * @context_id - Context which caused the hang
+ * @global_eop - eoptimestamp at time of hang
+ * @rb_buffer - Buffer that holds the commands from good contexts
+ * @rb_size - Number of valid dwords in rb_buffer
+ * @bad_rb_buffer - Buffer that holds commands from the hanging context
+ * @bad_rb_size - Number of valid dwords in bad_rb_buffer
+ * @last_valid_ctx_id - The last context from which commands were placed in
+ * ringbuffer before the GPU hung
+ */
+struct adreno_recovery_data {
+	unsigned int ib1;
+	unsigned int context_id;
+	unsigned int global_eop;
+	unsigned int *rb_buffer;
+	unsigned int rb_size;
+	unsigned int *bad_rb_buffer;
+	unsigned int bad_rb_size;
+	unsigned int last_valid_ctx_id;
+};
+
 extern struct adreno_gpudev adreno_a2xx_gpudev;
 extern struct adreno_gpudev adreno_a3xx_gpudev;
 
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 9d550ac..ea92c9b 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -947,8 +947,7 @@
 }
 
 int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
-				unsigned int *temp_rb_buffer,
-				int *rb_size)
+				struct adreno_recovery_data *rec_data)
 {
 	struct kgsl_device *device = rb->device;
 	unsigned int rb_rptr;
@@ -961,6 +960,7 @@
 	unsigned int copy_rb_contents = 0;
 	struct kgsl_context *context;
 	unsigned int context_id;
+	unsigned int *temp_rb_buffer = rec_data->rb_buffer;
 
 	GSL_RB_GET_READPTR(rb, &rb->rptr);
 
@@ -1121,7 +1121,7 @@
 			temp_rb_buffer[temp_idx++] = value;
 	}
 
-	*rb_size = temp_idx;
+	rec_data->rb_size = temp_idx;
 	return 0;
 }
 
diff --git a/drivers/gpu/msm/adreno_ringbuffer.h b/drivers/gpu/msm/adreno_ringbuffer.h
index 6429f46..38882e0 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.h
+++ b/drivers/gpu/msm/adreno_ringbuffer.h
@@ -27,6 +27,7 @@
 
 struct kgsl_device;
 struct kgsl_device_private;
+struct adreno_recovery_data;
 
 #define GSL_RB_MEMPTRS_SCRATCH_COUNT	 8
 struct kgsl_rbmemptrs {
@@ -114,8 +115,7 @@
 void kgsl_cp_intrcallback(struct kgsl_device *device);
 
 int adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
-				unsigned int *temp_rb_buffer,
-				int *rb_size);
+				struct adreno_recovery_data *rec_data);
 
 void
 adreno_ringbuffer_restore(struct adreno_ringbuffer *rb, unsigned int *rb_buff,