msm: kgsl: Switch pagetables in stream for IOMMU

Switch IOMMU pagetables by using GPU commands instead of the CPU.
This is better for performance because the synchronization required
to switch pagetables in-stream with GPU commands is less expensive
than the synchronization required when switching pagetables with
the CPU.
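
For reference, the command stream built by adreno_iommu_setstate()
reduces to roughly the following sequence per IOMMU unit (a sketch of
the intent, not the exact packet encoding):

  CP_MEM_WRITE     TTBR0      <- new pagetable base address
  CP_WAIT_FOR_IDLE
  CP_WAIT_REG_MEM  TTBR0      == new value  (read-back barrier)
  CP_MEM_WRITE     CONTEXTIDR <- new ASID
  CP_WAIT_FOR_IDLE
  CP_WAIT_REG_MEM  CONTEXTIDR == new ASID
  CP_INVALIDATE_STATE          (once, after all units)
  CP_MEM_WRITE     TLBIASID   <- ASID  (TLB flush by ASID, if requested)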

Change-Id: I00e676bbf04a4c9fa944e896a367ba837d3606f5
Signed-off-by: Shubhraprakash Das <sadas@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 1432ccf..2f503ae 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -21,6 +21,7 @@
 #include "kgsl_pwrscale.h"
 #include "kgsl_cffdump.h"
 #include "kgsl_sharedmem.h"
+#include "kgsl_iommu.h"
 
 #include "adreno.h"
 #include "adreno_pm4types.h"
@@ -243,7 +244,144 @@
 	return result;
 }
 
-static void adreno_setstate(struct kgsl_device *device,
+static void adreno_iommu_setstate(struct kgsl_device *device,
+					uint32_t flags)
+{
+	unsigned int pt_val, reg_pt_val;
+	unsigned int link[200];
+	unsigned int *cmds = &link[0];
+	int sizedwords = 0;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct kgsl_memdesc **reg_map_desc;
+	void *reg_map_array;
+	int num_iommu_units, i;
+
+	if (!adreno_dev->drawctxt_active)
+		return kgsl_mmu_device_setstate(&device->mmu, flags);
+	num_iommu_units = kgsl_mmu_get_reg_map_desc(&device->mmu,
+							&reg_map_array);
+	reg_map_desc = reg_map_array;
+
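+	/*
+	 * The GPU itself will be writing the IOMMU registers, so the
+	 * IOMMU clocks must stay on until those commands complete
+	 */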
+	if (kgsl_mmu_enable_clk(&device->mmu,
+				KGSL_IOMMU_CONTEXT_USER))
+		goto done;
+
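+	/*
+	 * Temporarily lift the MH MPU limit (a225) or switch to the user
+	 * context bank so that the GPU can write the IOMMU registers;
+	 * this is undone again at the end of the function
+	 */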
+	if (adreno_is_a225(adreno_dev))
+		cmds += adreno_add_change_mh_phys_limit_cmds(cmds, 0xFFFFF000,
+					device->mmu.setstate_memory.gpuaddr +
+					KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+	else
+		cmds += adreno_add_bank_change_cmds(cmds,
+					KGSL_IOMMU_CONTEXT_USER,
+					device->mmu.setstate_memory.gpuaddr +
+					KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+
+	if (flags & KGSL_MMUFLAGS_PTUPDATE) {
+		pt_val = kgsl_mmu_pt_get_base_addr(device->mmu.hwpagetable);
+		/*
+		 * We need to perform the following operations for all
+		 * IOMMU units
+		 */
+		for (i = 0; i < num_iommu_units; i++) {
+			reg_pt_val = (pt_val &
+				(KGSL_IOMMU_TTBR0_PA_MASK <<
+				KGSL_IOMMU_TTBR0_PA_SHIFT)) +
+				kgsl_mmu_get_pt_lsb(&device->mmu, i,
+					KGSL_IOMMU_CONTEXT_USER);
+			/*
+			 * Set the address of the new pagetable by writing to
+			 * the IOMMU TTBR0 register
+			 */
+			*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
+			*cmds++ = reg_map_desc[i]->gpuaddr +
+				(KGSL_IOMMU_CONTEXT_USER <<
+				KGSL_IOMMU_CTX_SHIFT) + KGSL_IOMMU_TTBR0;
+			*cmds++ = reg_pt_val;
+			*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+			*cmds++ = 0x00000000;
+
+			/*
+			 * Read back the TTBR0 register as a barrier to ensure
+			 * the above writes have completed
+			 */
+			cmds += adreno_add_read_cmds(device, cmds,
+				reg_map_desc[i]->gpuaddr +
+				(KGSL_IOMMU_CONTEXT_USER <<
+				KGSL_IOMMU_CTX_SHIFT) + KGSL_IOMMU_TTBR0,
+				reg_pt_val,
+				device->mmu.setstate_memory.gpuaddr +
+				KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+
+			/* Set the ASID */
+			*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
+			*cmds++ = reg_map_desc[i]->gpuaddr +
+				(KGSL_IOMMU_CONTEXT_USER <<
+				KGSL_IOMMU_CTX_SHIFT) + KGSL_IOMMU_CONTEXTIDR;
+			*cmds++ = kgsl_mmu_get_hwpagetable_asid(&device->mmu);
+			*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+			*cmds++ = 0x00000000;
+
+			/* Read back the ASID to ensure the above write completes */
+			cmds += adreno_add_read_cmds(device, cmds,
+				reg_map_desc[i]->gpuaddr +
+				(KGSL_IOMMU_CONTEXT_USER <<
+				KGSL_IOMMU_CTX_SHIFT) + KGSL_IOMMU_CONTEXTIDR,
+				kgsl_mmu_get_hwpagetable_asid(&device->mmu),
+				device->mmu.setstate_memory.gpuaddr +
+				KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+		}
+		/* invalidate all base pointers */
+		*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
+		*cmds++ = 0x7fff;
+
+		if (flags & KGSL_MMUFLAGS_TLBFLUSH)
+			cmds += __adreno_add_idle_indirect_cmds(cmds,
+				device->mmu.setstate_memory.gpuaddr +
+				KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+	}
+	if (flags & KGSL_MMUFLAGS_TLBFLUSH) {
+		/*
+		 * Flush the TLB by ASID; there is no need to flush the
+		 * entire TLB
+		 */
+		for (i = 0; i < num_iommu_units; i++) {
+			*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
+			*cmds++ = (reg_map_desc[i]->gpuaddr +
+				(KGSL_IOMMU_CONTEXT_USER <<
+				KGSL_IOMMU_CTX_SHIFT) +
+				KGSL_IOMMU_CTX_TLBIASID);
+			*cmds++ = kgsl_mmu_get_hwpagetable_asid(&device->mmu);
+			cmds += adreno_add_read_cmds(device, cmds,
+				reg_map_desc[i]->gpuaddr +
+				(KGSL_IOMMU_CONTEXT_USER <<
+				KGSL_IOMMU_CTX_SHIFT) +
+				KGSL_IOMMU_CONTEXTIDR,
+				kgsl_mmu_get_hwpagetable_asid(&device->mmu),
+				device->mmu.setstate_memory.gpuaddr +
+				KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+		}
+	}
+
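+	/* Restore the MPU limit / context bank changed above */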
+	if (adreno_is_a225(adreno_dev))
+		cmds += adreno_add_change_mh_phys_limit_cmds(cmds,
+			reg_map_desc[num_iommu_units - 1]->gpuaddr - PAGE_SIZE,
+			device->mmu.setstate_memory.gpuaddr +
+			KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+	else
+		cmds += adreno_add_bank_change_cmds(cmds,
+			KGSL_IOMMU_CONTEXT_PRIV,
+			device->mmu.setstate_memory.gpuaddr +
+			KGSL_IOMMU_SETSTATE_NOP_OFFSET);
+
+	sizedwords += (cmds - &link[0]);
+	if (sizedwords)
+		adreno_ringbuffer_issuecmds(device,
+			KGSL_CMD_FLAGS_PMODE, &link[0], sizedwords);
+done:
+	if (num_iommu_units)
+		kfree(reg_map_array);
+}
+
+static void adreno_gpummu_setstate(struct kgsl_device *device,
 					uint32_t flags)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
@@ -253,16 +391,6 @@
 	unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */
 
 	/*
-	 * A3XX doesn't support the fast path (the registers don't even exist)
-	 * so just bail out early
-	 */
-
-	if (adreno_is_a3xx(adreno_dev)) {
-		kgsl_mmu_device_setstate(&device->mmu, flags);
-		return;
-	}
-
-	/*
 	 * If possible, then set the state via the command stream to avoid
 	 * a CPU idle.  Otherwise, use the default setstate which uses register
 	 * writes. For CFF dump we must idle and use the registers so that it is
@@ -348,6 +476,16 @@
 	}
 }
 
+static void adreno_setstate(struct kgsl_device *device,
+			uint32_t flags)
+{
+	/* Call the MMU-specific handler */
+	if (KGSL_MMU_TYPE_GPU == kgsl_mmu_get_mmutype())
+		return adreno_gpummu_setstate(device, flags);
+	else if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
+		return adreno_iommu_setstate(device, flags);
+}
+
 static unsigned int
 a3xx_getchipid(struct kgsl_device *device)
 {
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index a7ea20c..4ce56a4 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -16,6 +16,7 @@
 #include "kgsl_device.h"
 #include "adreno_drawctxt.h"
 #include "adreno_ringbuffer.h"
+#include "kgsl_iommu.h"
 
 #define DEVICE_3D_NAME "kgsl-3d"
 #define DEVICE_3D0_NAME "kgsl-3d0"
@@ -223,4 +224,70 @@
 	return (ilog2(size) - 5) << 29;
 }
 
+static inline int __adreno_add_idle_indirect_cmds(unsigned int *cmds,
+						unsigned int nop_gpuaddr)
+{
+	/*
+	 * Adding an indirect buffer ensures that the prefetch stalls until
+	 * the commands in the indirect buffer have completed. We need to
+	 * stall the prefetch with a nop indirect buffer when updating
+	 * pagetables because it provides more stable synchronization.
+	 */
+	*cmds++ = CP_HDR_INDIRECT_BUFFER_PFD;
+	*cmds++ = nop_gpuaddr;
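+	/* size of the nop indirect buffer, in dwords */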
+	*cmds++ = 2;
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0x00000000;
+	return 5;
+}
+
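+/*
+ * adreno_add_change_mh_phys_limit_cmds - Add commands that move the MH
+ * MPU end address to new_phys_limit, idling the pipe before and after
+ * the register write
+ */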
+static inline int adreno_add_change_mh_phys_limit_cmds(unsigned int *cmds,
+						unsigned int new_phys_limit,
+						unsigned int nop_gpuaddr)
+{
+	unsigned int *start = cmds;
+
+	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
+	*cmds++ = cp_type0_packet(MH_MMU_MPU_END, 1);
+	*cmds++ = new_phys_limit;
+	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
+	return cmds - start;
+}
+
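+/*
+ * adreno_add_bank_change_cmds - Add commands that write
+ * REG_CP_STATE_DEBUG_INDEX to change the context bank, idling the pipe
+ * before and after the write
+ */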
+static inline int adreno_add_bank_change_cmds(unsigned int *cmds,
+					int cur_ctx_bank,
+					unsigned int nop_gpuaddr)
+{
+	unsigned int *start = cmds;
+
+	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
+	*cmds++ = cp_type0_packet(REG_CP_STATE_DEBUG_INDEX, 1);
+	*cmds++ = (cur_ctx_bank ? 0 : 0x20);
+	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
+	return cmds - start;
+}
+
+/*
+ * adreno_add_read_cmds - Add pm4 packets that wait on a memory location
+ * @device - Pointer to the device structure
+ * @cmds - Pointer to memory where the commands need to be added
+ * @addr - GPU address of the memory to read
+ * @val - Value to wait for; the GPU will wait until the data at addr
+ * becomes equal to this value
+ * @nop_gpuaddr - GPU address of the nop indirect buffer used to idle
+ */
+static inline int adreno_add_read_cmds(struct kgsl_device *device,
+				unsigned int *cmds, unsigned int addr,
+				unsigned int val, unsigned int nop_gpuaddr)
+{
+	unsigned int *start = cmds;
+
+	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
+	/* MEM SPACE = memory, FUNCTION = equals */
+	*cmds++ = 0x13;
+	*cmds++ = addr;
+	*cmds++ = val;
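+	/* comparison mask and poll interval */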
+	*cmds++ = 0xFFFFFFFF;
+	*cmds++ = 0xFFFFFFFF;
+	cmds += __adreno_add_idle_indirect_cmds(cmds, nop_gpuaddr);
+	return cmds - start;
+}
+
 #endif /*__ADRENO_H */
diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c
index eb936f8..ae846da 100644
--- a/drivers/gpu/msm/adreno_a2xx.c
+++ b/drivers/gpu/msm/adreno_a2xx.c
@@ -1848,8 +1848,13 @@
 
 	/* NQ and External Memory Swap */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
-	/* Protected mode error checking */
-	GSL_RB_WRITE(cmds, cmds_gpu, GSL_RB_PROTECTED_MODE_CONTROL);
+	/* Protected mode error checking
+	 * If the IOMMU is used then protected mode needs to be turned off
+	 * to enable context bank switching */
+	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
+		GSL_RB_WRITE(cmds, cmds_gpu, 0);
+	else
+		GSL_RB_WRITE(cmds, cmds_gpu, GSL_RB_PROTECTED_MODE_CONTROL);
 	/* Disable header dumping and Header dump address */
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 	/* Header dump size */
diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c
index fdd4aa5..fb65565 100644
--- a/drivers/gpu/msm/kgsl_iommu.c
+++ b/drivers/gpu/msm/kgsl_iommu.c
@@ -595,7 +595,9 @@
 		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000001);
 		kgsl_regwrite(mmu->device, MH_MMU_MPU_END,
 			mh->mpu_base +
-			iommu->iommu_units[0].reg_map.gpuaddr - PAGE_SIZE);
+			iommu->iommu_units[iommu->unit_count - 1]
+				.reg_map.gpuaddr - PAGE_SIZE);
 	} else {
 		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000000);
 	}
diff --git a/drivers/gpu/msm/kgsl_mmu.c b/drivers/gpu/msm/kgsl_mmu.c
index 663ba0f..b882807 100644
--- a/drivers/gpu/msm/kgsl_mmu.c
+++ b/drivers/gpu/msm/kgsl_mmu.c
@@ -545,8 +545,7 @@
 	struct kgsl_device *device = mmu->device;
 	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_type)
 		return;
-	else if (device->ftbl->setstate && (KGSL_MMU_TYPE_IOMMU !=
-						kgsl_mmu_type))
+	else if (device->ftbl->setstate)
 		device->ftbl->setstate(device, flags);
 	else if (mmu->mmu_ops->mmu_device_setstate)
 		mmu->mmu_ops->mmu_device_setstate(mmu, flags);
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index 3a29d71..2aaefba 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -706,6 +706,7 @@
 		}
 		kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
 		kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP);
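+		/* Turn off IOMMU clocks left on by in-stream setstate */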
+		kgsl_mmu_disable_clk(&device->mmu);
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP);
 		if (device->idle_wakelock.name)
 			wake_unlock(&device->idle_wakelock);
@@ -749,6 +750,7 @@
 				gpu_freq);
 		_sleep_accounting(device);
 		kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP);
+		kgsl_mmu_disable_clk(&device->mmu);
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_SLEEP);
 		wake_unlock(&device->idle_wakelock);
 		pm_qos_update_request(&device->pm_qos_req_dma,
@@ -888,6 +890,7 @@
 	/* Order pwrrail/clk sequence based upon platform */
 	kgsl_pwrctrl_axi(device, KGSL_PWRFLAGS_OFF);
 	kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_SLEEP);
+	kgsl_mmu_disable_clk(&device->mmu);
 	kgsl_pwrctrl_pwrrail(device, KGSL_PWRFLAGS_OFF);
 }
 EXPORT_SYMBOL(kgsl_pwrctrl_disable);