gpu: host1x: Add channel support

Add support for host1x client modules, and host1x channels to submit
work to the clients.

Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: Thierry Reding <thierry.reding@avionic-design.de>
Tested-by: Thierry Reding <thierry.reding@avionic-design.de>
Tested-by: Erik Faye-Lund <kusmabite@gmail.com>
Signed-off-by: Thierry Reding <thierry.reding@avionic-design.de>
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
new file mode 100644
index 0000000..4eb22ef
--- /dev/null
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -0,0 +1,324 @@
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
+
+#include "cdma.h"
+#include "channel.h"
+#include "dev.h"
+#include "debug.h"
+
+/*
+ * Put the restart at the end of pushbuffer memor
+ */
+static void push_buffer_init(struct push_buffer *pb)
+{
+	*(pb->mapped + (pb->size_bytes >> 2)) = host1x_opcode_restart(0);
+}
+
+/*
+ * Increment timedout buffer's syncpt via CPU.
+ */
+static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
+				u32 syncpt_incrs, u32 syncval, u32 nr_slots)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct push_buffer *pb = &cdma->push_buffer;
+	u32 i;
+
+	for (i = 0; i < syncpt_incrs; i++)
+		host1x_syncpt_cpu_incr(cdma->timeout.syncpt);
+
+	/* after CPU incr, ensure shadow is up to date */
+	host1x_syncpt_load(cdma->timeout.syncpt);
+
+	/* NOP all the PB slots */
+	while (nr_slots--) {
+		u32 *p = (u32 *)((u32)pb->mapped + getptr);
+		*(p++) = HOST1X_OPCODE_NOP;
+		*(p++) = HOST1X_OPCODE_NOP;
+		dev_dbg(host1x->dev, "%s: NOP at 0x%x\n", __func__,
+			pb->phys + getptr);
+		getptr = (getptr + 8) & (pb->size_bytes - 1);
+	}
+	wmb();
+}
+
+/*
+ * Start channel DMA
+ */
+static void cdma_start(struct host1x_cdma *cdma)
+{
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	if (cdma->running)
+		return;
+
+	cdma->last_pos = cdma->push_buffer.pos;
+
+	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
+			 HOST1X_CHANNEL_DMACTRL);
+
+	/* set base, put and end pointer */
+	host1x_ch_writel(ch, cdma->push_buffer.phys, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
+	host1x_ch_writel(ch, cdma->push_buffer.phys +
+			 cdma->push_buffer.size_bytes + 4,
+			 HOST1X_CHANNEL_DMAEND);
+
+	/* reset GET */
+	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
+			 HOST1X_CHANNEL_DMACTRL_DMAGETRST |
+			 HOST1X_CHANNEL_DMACTRL_DMAINITGET,
+			 HOST1X_CHANNEL_DMACTRL);
+
+	/* start the command DMA */
+	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMACTRL);
+
+	cdma->running = true;
+}
+
+/*
+ * Similar to cdma_start(), but rather than starting from an idle
+ * state (where DMA GET is set to DMA PUT), on a timeout we restore
+ * DMA GET from an explicit value (so DMA may again be pending).
+ */
+static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	if (cdma->running)
+		return;
+
+	cdma->last_pos = cdma->push_buffer.pos;
+
+	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
+			 HOST1X_CHANNEL_DMACTRL);
+
+	/* set base, end pointer (all of memory) */
+	host1x_ch_writel(ch, cdma->push_buffer.phys, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, cdma->push_buffer.phys +
+			 cdma->push_buffer.size_bytes,
+			 HOST1X_CHANNEL_DMAEND);
+
+	/* set GET, by loading the value in PUT (then reset GET) */
+	host1x_ch_writel(ch, getptr, HOST1X_CHANNEL_DMAPUT);
+	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP |
+			 HOST1X_CHANNEL_DMACTRL_DMAGETRST |
+			 HOST1X_CHANNEL_DMACTRL_DMAINITGET,
+			 HOST1X_CHANNEL_DMACTRL);
+
+	dev_dbg(host1x->dev,
+		"%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n", __func__,
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET),
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT),
+		cdma->last_pos);
+
+	/* deassert GET reset and set PUT */
+	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
+			 HOST1X_CHANNEL_DMACTRL);
+	host1x_ch_writel(ch, cdma->push_buffer.pos, HOST1X_CHANNEL_DMAPUT);
+
+	/* start the command DMA */
+	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMACTRL);
+
+	cdma->running = true;
+}
+
+/*
+ * Kick channel DMA into action by writing its PUT offset (if it has changed)
+ */
+static void cdma_flush(struct host1x_cdma *cdma)
+{
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	if (cdma->push_buffer.pos != cdma->last_pos) {
+		host1x_ch_writel(ch, cdma->push_buffer.pos,
+				 HOST1X_CHANNEL_DMAPUT);
+		cdma->last_pos = cdma->push_buffer.pos;
+	}
+}
+
+static void cdma_stop(struct host1x_cdma *cdma)
+{
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	mutex_lock(&cdma->lock);
+	if (cdma->running) {
+		host1x_cdma_wait_locked(cdma, CDMA_EVENT_SYNC_QUEUE_EMPTY);
+		host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
+				 HOST1X_CHANNEL_DMACTRL);
+		cdma->running = false;
+	}
+	mutex_unlock(&cdma->lock);
+}
+
+/*
+ * Stops both channel's command processor and CDMA immediately.
+ * Also, tears down the channel and resets corresponding module.
+ */
+static void cdma_freeze(struct host1x_cdma *cdma)
+{
+	struct host1x *host = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 cmdproc_stop;
+
+	if (cdma->torndown && !cdma->running) {
+		dev_warn(host->dev, "Already torn down\n");
+		return;
+	}
+
+	dev_dbg(host->dev, "freezing channel (id %d)\n", ch->id);
+
+	cmdproc_stop = host1x_sync_readl(host, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop |= BIT(ch->id);
+	host1x_sync_writel(host, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	dev_dbg(host->dev, "%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n",
+		__func__, host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET),
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT),
+		cdma->last_pos);
+
+	host1x_ch_writel(ch, HOST1X_CHANNEL_DMACTRL_DMASTOP,
+			 HOST1X_CHANNEL_DMACTRL);
+
+	host1x_sync_writel(host, BIT(ch->id), HOST1X_SYNC_CH_TEARDOWN);
+
+	cdma->running = false;
+	cdma->torndown = true;
+}
+
+static void cdma_resume(struct host1x_cdma *cdma, u32 getptr)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 cmdproc_stop;
+
+	dev_dbg(host1x->dev,
+		"resuming channel (id %d, DMAGET restart = 0x%x)\n",
+		ch->id, getptr);
+
+	cmdproc_stop = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop &= ~(BIT(ch->id));
+	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	cdma->torndown = false;
+	cdma_timeout_restart(cdma, getptr);
+}
+
+/*
+ * If this timeout fires, it indicates the current sync_queue entry has
+ * exceeded its TTL and the userctx should be timed out and remaining
+ * submits already issued cleaned up (future submits return an error).
+ */
+static void cdma_timeout_handler(struct work_struct *work)
+{
+	struct host1x_cdma *cdma;
+	struct host1x *host1x;
+	struct host1x_channel *ch;
+
+	u32 syncpt_val;
+
+	u32 prev_cmdproc, cmdproc_stop;
+
+	cdma = container_of(to_delayed_work(work), struct host1x_cdma,
+			    timeout.wq);
+	host1x = cdma_to_host1x(cdma);
+	ch = cdma_to_channel(cdma);
+
+	mutex_lock(&cdma->lock);
+
+	if (!cdma->timeout.client) {
+		dev_dbg(host1x->dev,
+			"cdma_timeout: expired, but has no clientid\n");
+		mutex_unlock(&cdma->lock);
+		return;
+	}
+
+	/* stop processing to get a clean snapshot */
+	prev_cmdproc = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop = prev_cmdproc | BIT(ch->id);
+	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	dev_dbg(host1x->dev, "cdma_timeout: cmdproc was 0x%x is 0x%x\n",
+		prev_cmdproc, cmdproc_stop);
+
+	syncpt_val = host1x_syncpt_load(cdma->timeout.syncpt);
+
+	/* has buffer actually completed? */
+	if ((s32)(syncpt_val - cdma->timeout.syncpt_val) >= 0) {
+		dev_dbg(host1x->dev,
+			"cdma_timeout: expired, but buffer had completed\n");
+		/* restore */
+		cmdproc_stop = prev_cmdproc & ~(BIT(ch->id));
+		host1x_sync_writel(host1x, cmdproc_stop,
+				   HOST1X_SYNC_CMDPROC_STOP);
+		mutex_unlock(&cdma->lock);
+		return;
+	}
+
+	dev_warn(host1x->dev, "%s: timeout: %d (%s), HW thresh %d, done %d\n",
+		__func__, cdma->timeout.syncpt->id, cdma->timeout.syncpt->name,
+		syncpt_val, cdma->timeout.syncpt_val);
+
+	/* stop HW, resetting channel/module */
+	host1x_hw_cdma_freeze(host1x, cdma);
+
+	host1x_cdma_update_sync_queue(cdma, ch->dev);
+	mutex_unlock(&cdma->lock);
+}
+
+/*
+ * Init timeout resources
+ */
+static int cdma_timeout_init(struct host1x_cdma *cdma, u32 syncpt_id)
+{
+	INIT_DELAYED_WORK(&cdma->timeout.wq, cdma_timeout_handler);
+	cdma->timeout.initialized = true;
+
+	return 0;
+}
+
+/*
+ * Clean up timeout resources
+ */
+static void cdma_timeout_destroy(struct host1x_cdma *cdma)
+{
+	if (cdma->timeout.initialized)
+		cancel_delayed_work(&cdma->timeout.wq);
+	cdma->timeout.initialized = false;
+}
+
+static const struct host1x_cdma_ops host1x_cdma_ops = {
+	.start = cdma_start,
+	.stop = cdma_stop,
+	.flush = cdma_flush,
+
+	.timeout_init = cdma_timeout_init,
+	.timeout_destroy = cdma_timeout_destroy,
+	.freeze = cdma_freeze,
+	.resume = cdma_resume,
+	.timeout_cpu_incr = cdma_timeout_cpu_incr,
+};
+
+static const struct host1x_pushbuffer_ops host1x_pushbuffer_ops = {
+	.init = push_buffer_init,
+};
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
new file mode 100644
index 0000000..5137a56
--- /dev/null
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -0,0 +1,143 @@
+/*
+ * Tegra host1x Channel
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/slab.h>
+#include <trace/events/host1x.h>
+
+#include "host1x.h"
+#include "host1x_bo.h"
+#include "channel.h"
+#include "dev.h"
+#include "intr.h"
+#include "job.h"
+
+#define HOST1X_CHANNEL_SIZE 16384
+#define TRACE_MAX_LENGTH 128U
+
+static void submit_gathers(struct host1x_job *job)
+{
+	struct host1x_cdma *cdma = &job->channel->cdma;
+	unsigned int i;
+
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		u32 op1 = host1x_opcode_gather(g->words);
+		u32 op2 = g->base + g->offset;
+		host1x_cdma_push(cdma, op1, op2);
+	}
+}
+
+static int channel_submit(struct host1x_job *job)
+{
+	struct host1x_channel *ch = job->channel;
+	struct host1x_syncpt *sp;
+	u32 user_syncpt_incrs = job->syncpt_incrs;
+	u32 prev_max = 0;
+	u32 syncval;
+	int err;
+	struct host1x_waitlist *completed_waiter = NULL;
+	struct host1x *host = dev_get_drvdata(ch->dev->parent);
+
+	sp = host->syncpt + job->syncpt_id;
+	trace_host1x_channel_submit(dev_name(ch->dev),
+				    job->num_gathers, job->num_relocs,
+				    job->num_waitchk, job->syncpt_id,
+				    job->syncpt_incrs);
+
+	/* before error checks, return current max */
+	prev_max = job->syncpt_end = host1x_syncpt_read_max(sp);
+
+	/* get submit lock */
+	err = mutex_lock_interruptible(&ch->submitlock);
+	if (err)
+		goto error;
+
+	completed_waiter = kzalloc(sizeof(*completed_waiter), GFP_KERNEL);
+	if (!completed_waiter) {
+		mutex_unlock(&ch->submitlock);
+		err = -ENOMEM;
+		goto error;
+	}
+
+	/* begin a CDMA submit */
+	err = host1x_cdma_begin(&ch->cdma, job);
+	if (err) {
+		mutex_unlock(&ch->submitlock);
+		goto error;
+	}
+
+	if (job->serialize) {
+		/*
+		 * Force serialization by inserting a host wait for the
+		 * previous job to finish before this one can commence.
+		 */
+		host1x_cdma_push(&ch->cdma,
+				 host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
+					host1x_uclass_wait_syncpt_r(), 1),
+				 host1x_class_host_wait_syncpt(job->syncpt_id,
+					host1x_syncpt_read_max(sp)));
+	}
+
+	syncval = host1x_syncpt_incr_max(sp, user_syncpt_incrs);
+
+	job->syncpt_end = syncval;
+
+	/* add a setclass for modules that require it */
+	if (job->class)
+		host1x_cdma_push(&ch->cdma,
+				 host1x_opcode_setclass(job->class, 0, 0),
+				 HOST1X_OPCODE_NOP);
+
+	submit_gathers(job);
+
+	/* end CDMA submit & stash pinned hMems into sync queue */
+	host1x_cdma_end(&ch->cdma, job);
+
+	trace_host1x_channel_submitted(dev_name(ch->dev), prev_max, syncval);
+
+	/* schedule a submit complete interrupt */
+	err = host1x_intr_add_action(host, job->syncpt_id, syncval,
+				     HOST1X_INTR_ACTION_SUBMIT_COMPLETE, ch,
+				     completed_waiter, NULL);
+	completed_waiter = NULL;
+	WARN(err, "Failed to set submit complete interrupt");
+
+	mutex_unlock(&ch->submitlock);
+
+	return 0;
+
+error:
+	kfree(completed_waiter);
+	return err;
+}
+
+static int host1x_channel_init(struct host1x_channel *ch, struct host1x *dev,
+			       unsigned int index)
+{
+	ch->id = index;
+	mutex_init(&ch->reflock);
+	mutex_init(&ch->submitlock);
+
+	ch->regs = dev->regs + index * HOST1X_CHANNEL_SIZE;
+	return 0;
+}
+
+static const struct host1x_channel_ops host1x_channel_ops = {
+	.init = host1x_channel_init,
+	.submit = channel_submit,
+};
diff --git a/drivers/gpu/host1x/hw/host1x01.c b/drivers/gpu/host1x/hw/host1x01.c
index f5c35fa..013ff38 100644
--- a/drivers/gpu/host1x/hw/host1x01.c
+++ b/drivers/gpu/host1x/hw/host1x01.c
@@ -21,6 +21,8 @@
 #include "hw/host1x01_hardware.h"
 
 /* include code */
+#include "hw/cdma_hw.c"
+#include "hw/channel_hw.c"
 #include "hw/intr_hw.c"
 #include "hw/syncpt_hw.c"
 
@@ -28,6 +30,9 @@
 
 int host1x01_init(struct host1x *host)
 {
+	host->channel_op = &host1x_channel_ops;
+	host->cdma_op = &host1x_cdma_ops;
+	host->cdma_pb_op = &host1x_pushbuffer_ops;
 	host->syncpt_op = &host1x_syncpt_ops;
 	host->intr_op = &host1x_intr_ops;
 
diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
index 8cecbee..5f0fb86 100644
--- a/drivers/gpu/host1x/hw/host1x01_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
@@ -22,6 +22,122 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 
+#include "hw_host1x01_channel.h"
 #include "hw_host1x01_sync.h"
+#include "hw_host1x01_uclass.h"
+
+static inline u32 host1x_class_host_wait_syncpt(
+	unsigned indx, unsigned threshold)
+{
+	return host1x_uclass_wait_syncpt_indx_f(indx)
+		| host1x_uclass_wait_syncpt_thresh_f(threshold);
+}
+
+static inline u32 host1x_class_host_load_syncpt_base(
+	unsigned indx, unsigned threshold)
+{
+	return host1x_uclass_load_syncpt_base_base_indx_f(indx)
+		| host1x_uclass_load_syncpt_base_value_f(threshold);
+}
+
+static inline u32 host1x_class_host_wait_syncpt_base(
+	unsigned indx, unsigned base_indx, unsigned offset)
+{
+	return host1x_uclass_wait_syncpt_base_indx_f(indx)
+		| host1x_uclass_wait_syncpt_base_base_indx_f(base_indx)
+		| host1x_uclass_wait_syncpt_base_offset_f(offset);
+}
+
+static inline u32 host1x_class_host_incr_syncpt_base(
+	unsigned base_indx, unsigned offset)
+{
+	return host1x_uclass_incr_syncpt_base_base_indx_f(base_indx)
+		| host1x_uclass_incr_syncpt_base_offset_f(offset);
+}
+
+static inline u32 host1x_class_host_incr_syncpt(
+	unsigned cond, unsigned indx)
+{
+	return host1x_uclass_incr_syncpt_cond_f(cond)
+		| host1x_uclass_incr_syncpt_indx_f(indx);
+}
+
+static inline u32 host1x_class_host_indoff_reg_write(
+	unsigned mod_id, unsigned offset, bool auto_inc)
+{
+	u32 v = host1x_uclass_indoff_indbe_f(0xf)
+		| host1x_uclass_indoff_indmodid_f(mod_id)
+		| host1x_uclass_indoff_indroffset_f(offset);
+	if (auto_inc)
+		v |= host1x_uclass_indoff_autoinc_f(1);
+	return v;
+}
+
+static inline u32 host1x_class_host_indoff_reg_read(
+	unsigned mod_id, unsigned offset, bool auto_inc)
+{
+	u32 v = host1x_uclass_indoff_indmodid_f(mod_id)
+		| host1x_uclass_indoff_indroffset_f(offset)
+		| host1x_uclass_indoff_rwn_read_v();
+	if (auto_inc)
+		v |= host1x_uclass_indoff_autoinc_f(1);
+	return v;
+}
+
+
+/* cdma opcodes */
+static inline u32 host1x_opcode_setclass(
+	unsigned class_id, unsigned offset, unsigned mask)
+{
+	return (0 << 28) | (offset << 16) | (class_id << 6) | mask;
+}
+
+static inline u32 host1x_opcode_incr(unsigned offset, unsigned count)
+{
+	return (1 << 28) | (offset << 16) | count;
+}
+
+static inline u32 host1x_opcode_nonincr(unsigned offset, unsigned count)
+{
+	return (2 << 28) | (offset << 16) | count;
+}
+
+static inline u32 host1x_opcode_mask(unsigned offset, unsigned mask)
+{
+	return (3 << 28) | (offset << 16) | mask;
+}
+
+static inline u32 host1x_opcode_imm(unsigned offset, unsigned value)
+{
+	return (4 << 28) | (offset << 16) | value;
+}
+
+static inline u32 host1x_opcode_imm_incr_syncpt(unsigned cond, unsigned indx)
+{
+	return host1x_opcode_imm(host1x_uclass_incr_syncpt_r(),
+		host1x_class_host_incr_syncpt(cond, indx));
+}
+
+static inline u32 host1x_opcode_restart(unsigned address)
+{
+	return (5 << 28) | (address >> 4);
+}
+
+static inline u32 host1x_opcode_gather(unsigned count)
+{
+	return (6 << 28) | count;
+}
+
+static inline u32 host1x_opcode_gather_nonincr(unsigned offset,	unsigned count)
+{
+	return (6 << 28) | (offset << 16) | BIT(15) | count;
+}
+
+static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
+{
+	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
+}
+
+#define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
 
 #endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_channel.h b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
new file mode 100644
index 0000000..9ba1332
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+  * Function naming determines intended use:
+  *
+  *     <x>_r(void) : Returns the offset for register <x>.
+  *
+  *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+  *
+  *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+  *
+  *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+  *         and masked to place it at field <y> of register <x>.  This value
+  *         can be |'d with others to produce a full register value for
+  *         register <x>.
+  *
+  *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+  *         value can be ~'d and then &'d to clear the value of field <y> for
+  *         register <x>.
+  *
+  *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+  *         to place it at field <y> of register <x>.  This value can be |'d
+  *         with others to produce a full register value for <x>.
+  *
+  *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+  *         <x> value 'r' after being shifted to place its LSB at bit 0.
+  *         This value is suitable for direct comparison with other unshifted
+  *         values appropriate for use in field <y> of register <x>.
+  *
+  *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+  *         field <y> of register <x>.  This value is suitable for direct
+  *         comparison with unshifted values appropriate for use in field <y>
+  *         of register <x>.
+  */
+
+#ifndef __hw_host1x_channel_host1x_h__
+#define __hw_host1x_channel_host1x_h__
+
+static inline u32 host1x_channel_dmastart_r(void)
+{
+	return 0x14;
+}
+#define HOST1X_CHANNEL_DMASTART \
+	host1x_channel_dmastart_r()
+static inline u32 host1x_channel_dmaput_r(void)
+{
+	return 0x18;
+}
+#define HOST1X_CHANNEL_DMAPUT \
+	host1x_channel_dmaput_r()
+static inline u32 host1x_channel_dmaget_r(void)
+{
+	return 0x1c;
+}
+#define HOST1X_CHANNEL_DMAGET \
+	host1x_channel_dmaget_r()
+static inline u32 host1x_channel_dmaend_r(void)
+{
+	return 0x20;
+}
+#define HOST1X_CHANNEL_DMAEND \
+	host1x_channel_dmaend_r()
+static inline u32 host1x_channel_dmactrl_r(void)
+{
+	return 0x24;
+}
+#define HOST1X_CHANNEL_DMACTRL \
+	host1x_channel_dmactrl_r()
+static inline u32 host1x_channel_dmactrl_dmastop(void)
+{
+	return 1 << 0;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMASTOP \
+	host1x_channel_dmactrl_dmastop()
+static inline u32 host1x_channel_dmactrl_dmagetrst(void)
+{
+	return 1 << 1;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMAGETRST \
+	host1x_channel_dmactrl_dmagetrst()
+static inline u32 host1x_channel_dmactrl_dmainitget(void)
+{
+	return 1 << 2;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMAINITGET \
+	host1x_channel_dmactrl_dmainitget()
+#endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_sync.h b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
index eea0bb0..8f2a246 100644
--- a/drivers/gpu/host1x/hw/hw_host1x01_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
@@ -77,6 +77,18 @@
 }
 #define HOST1X_SYNC_SYNCPT_THRESH_INT_ENABLE_CPU0(id) \
 	host1x_sync_syncpt_thresh_int_enable_cpu0_r(id)
+static inline u32 host1x_sync_cmdproc_stop_r(void)
+{
+	return 0xac;
+}
+#define HOST1X_SYNC_CMDPROC_STOP \
+	host1x_sync_cmdproc_stop_r()
+static inline u32 host1x_sync_ch_teardown_r(void)
+{
+	return 0xb0;
+}
+#define HOST1X_SYNC_CH_TEARDOWN \
+	host1x_sync_ch_teardown_r()
 static inline u32 host1x_sync_usec_clk_r(void)
 {
 	return 0x1a4;
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_uclass.h b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
new file mode 100644
index 0000000..7af6609
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+  * Function naming determines intended use:
+  *
+  *     <x>_r(void) : Returns the offset for register <x>.
+  *
+  *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+  *
+  *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+  *
+  *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+  *         and masked to place it at field <y> of register <x>.  This value
+  *         can be |'d with others to produce a full register value for
+  *         register <x>.
+  *
+  *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+  *         value can be ~'d and then &'d to clear the value of field <y> for
+  *         register <x>.
+  *
+  *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+  *         to place it at field <y> of register <x>.  This value can be |'d
+  *         with others to produce a full register value for <x>.
+  *
+  *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+  *         <x> value 'r' after being shifted to place its LSB at bit 0.
+  *         This value is suitable for direct comparison with other unshifted
+  *         values appropriate for use in field <y> of register <x>.
+  *
+  *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+  *         field <y> of register <x>.  This value is suitable for direct
+  *         comparison with unshifted values appropriate for use in field <y>
+  *         of register <x>.
+  */
+
+#ifndef __hw_host1x_uclass_host1x_h__
+#define __hw_host1x_uclass_host1x_h__
+
+static inline u32 host1x_uclass_incr_syncpt_r(void)
+{
+	return 0x0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT \
+	host1x_uclass_incr_syncpt_r()
+static inline u32 host1x_uclass_incr_syncpt_cond_f(u32 v)
+{
+	return (v & 0xff) << 8;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_COND_F(v) \
+	host1x_uclass_incr_syncpt_cond_f(v)
+static inline u32 host1x_uclass_incr_syncpt_indx_f(u32 v)
+{
+	return (v & 0xff) << 0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_INDX_F(v) \
+	host1x_uclass_incr_syncpt_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_r(void)
+{
+	return 0x8;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT \
+	host1x_uclass_wait_syncpt_r()
+static inline u32 host1x_uclass_wait_syncpt_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_INDX_F(v) \
+	host1x_uclass_wait_syncpt_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_thresh_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_THRESH_F(v) \
+	host1x_uclass_wait_syncpt_thresh_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_INDX_F(v) \
+	host1x_uclass_wait_syncpt_base_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 16;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_wait_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_offset_f(u32 v)
+{
+	return (v & 0xffff) << 0;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_OFFSET_F(v) \
+	host1x_uclass_wait_syncpt_base_offset_f(v)
+static inline u32 host1x_uclass_load_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_load_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_load_syncpt_base_value_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_BASE_VALUE_F(v) \
+	host1x_uclass_load_syncpt_base_value_f(v)
+static inline u32 host1x_uclass_incr_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_incr_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_incr_syncpt_base_offset_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_BASE_OFFSET_F(v) \
+	host1x_uclass_incr_syncpt_base_offset_f(v)
+static inline u32 host1x_uclass_indoff_r(void)
+{
+	return 0x2d;
+}
+#define HOST1X_UCLASS_INDOFF \
+	host1x_uclass_indoff_r()
+static inline u32 host1x_uclass_indoff_indbe_f(u32 v)
+{
+	return (v & 0xf) << 28;
+}
+#define HOST1X_UCLASS_INDOFF_INDBE_F(v) \
+	host1x_uclass_indoff_indbe_f(v)
+static inline u32 host1x_uclass_indoff_autoinc_f(u32 v)
+{
+	return (v & 0x1) << 27;
+}
+#define HOST1X_UCLASS_INDOFF_AUTOINC_F(v) \
+	host1x_uclass_indoff_autoinc_f(v)
+static inline u32 host1x_uclass_indoff_indmodid_f(u32 v)
+{
+	return (v & 0xff) << 18;
+}
+#define HOST1X_UCLASS_INDOFF_INDMODID_F(v) \
+	host1x_uclass_indoff_indmodid_f(v)
+static inline u32 host1x_uclass_indoff_indroffset_f(u32 v)
+{
+	return (v & 0xffff) << 2;
+}
+#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
+	host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_indoff_rwn_read_v(void)
+{
+	return 1;
+}
+#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
+	host1x_uclass_indoff_indroffset_f(v)
+#endif
diff --git a/drivers/gpu/host1x/hw/syncpt_hw.c b/drivers/gpu/host1x/hw/syncpt_hw.c
index 885b257..2c1f4af 100644
--- a/drivers/gpu/host1x/hw/syncpt_hw.c
+++ b/drivers/gpu/host1x/hw/syncpt_hw.c
@@ -93,10 +93,21 @@
 	wmb();
 }
 
+/* remove a wait pointed to by patch_addr */
+static int syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr)
+{
+	u32 override = host1x_class_host_wait_syncpt(
+		HOST1X_SYNCPT_RESERVED, 0);
+
+	*((u32 *)patch_addr) = override;
+	return 0;
+}
+
 static const struct host1x_syncpt_ops host1x_syncpt_ops = {
 	.restore = syncpt_restore,
 	.restore_wait_base = syncpt_restore_wait_base,
 	.load_wait_base = syncpt_read_wait_base,
 	.load = syncpt_load,
 	.cpu_incr = syncpt_cpu_incr,
+	.patch_wait = syncpt_patch_wait,
 };