Merge branch 'master' into sh/hw-breakpoints

Conflict between the FPU thread flag migration and the debug
thread flag addition.

Conflicts:
	arch/sh/include/asm/thread_info.h
	arch/sh/include/asm/ubc.h
	arch/sh/kernel/process_32.c
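
Note: the substance of the conflict is that mainline migrated the FPU "in use"
tracking from an atomic TIF_ flag to a thread_info status bit, while this
branch added a new debug TIF flag in the same headers. A minimal sketch of the
migrated pattern, using the names from the diffs below:

	/* before: atomic per-task flag */
	set_tsk_thread_flag(tsk, TIF_USEDFPU);

	/* after: plain status bit; non-atomic is fine because only the
	 * owning task (or code acting on its behalf) ever touches it */
	task_thread_info(tsk)->status |= TS_USEDFPU;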
diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile
index 649daad..cd43877 100644
--- a/arch/sh/kernel/Makefile
+++ b/arch/sh/kernel/Makefile
@@ -9,8 +9,12 @@
 CFLAGS_REMOVE_ftrace.o = -pg
 endif
 
-obj-y	:= debugtraps.o dumpstack.o idle.o io.o io_generic.o irq.o	\
-	   machvec.o nmi_debug.o process_$(BITS).o ptrace_$(BITS).o	\
+CFLAGS_REMOVE_return_address.o = -pg
+
+obj-y	:= debugtraps.o dma-nommu.o dumpstack.o 			\
+	   idle.o io.o io_generic.o irq.o				\
+	   irq_$(BITS).o machvec.o nmi_debug.o process_$(BITS).o 	\
+	   ptrace_$(BITS).o return_address.o				\
 	   setup.o signal_$(BITS).o sys_sh.o sys_sh$(BITS).o		\
 	   syscalls_$(BITS).o time.o topology.o traps.o			\
 	   traps_$(BITS).o unwinder.o
@@ -28,13 +32,13 @@
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_IO_TRAPPED)	+= io_trapped.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
-obj-$(CONFIG_GENERIC_GPIO)	+= gpio.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_DUMP_CODE)		+= disassemble.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o
 obj-$(CONFIG_DWARF_UNWINDER)	+= dwarf.o
+obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_callchain.o
 
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)		+= hw_breakpoint.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= localtimer.o
diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
index d218e80..08a2be7 100644
--- a/arch/sh/kernel/asm-offsets.c
+++ b/arch/sh/kernel/asm-offsets.c
@@ -34,5 +34,28 @@
 	DEFINE(PBE_NEXT, offsetof(struct pbe, next));
 	DEFINE(SWSUSP_ARCH_REGS_SIZE, sizeof(struct swsusp_arch_regs));
 #endif
+
+	DEFINE(SH_SLEEP_MODE, offsetof(struct sh_sleep_data, mode));
+	DEFINE(SH_SLEEP_SF_PRE, offsetof(struct sh_sleep_data, sf_pre));
+	DEFINE(SH_SLEEP_SF_POST, offsetof(struct sh_sleep_data, sf_post));
+	DEFINE(SH_SLEEP_RESUME, offsetof(struct sh_sleep_data, resume));
+	DEFINE(SH_SLEEP_VBR, offsetof(struct sh_sleep_data, vbr));
+	DEFINE(SH_SLEEP_SPC, offsetof(struct sh_sleep_data, spc));
+	DEFINE(SH_SLEEP_SR, offsetof(struct sh_sleep_data, sr));
+	DEFINE(SH_SLEEP_SP, offsetof(struct sh_sleep_data, sp));
+	DEFINE(SH_SLEEP_BASE_ADDR, offsetof(struct sh_sleep_data, addr));
+	DEFINE(SH_SLEEP_BASE_DATA, offsetof(struct sh_sleep_data, data));
+	DEFINE(SH_SLEEP_REG_STBCR, offsetof(struct sh_sleep_regs, stbcr));
+	DEFINE(SH_SLEEP_REG_BAR, offsetof(struct sh_sleep_regs, bar));
+	DEFINE(SH_SLEEP_REG_PTEH, offsetof(struct sh_sleep_regs, pteh));
+	DEFINE(SH_SLEEP_REG_PTEL, offsetof(struct sh_sleep_regs, ptel));
+	DEFINE(SH_SLEEP_REG_TTB, offsetof(struct sh_sleep_regs, ttb));
+	DEFINE(SH_SLEEP_REG_TEA, offsetof(struct sh_sleep_regs, tea));
+	DEFINE(SH_SLEEP_REG_MMUCR, offsetof(struct sh_sleep_regs, mmucr));
+	DEFINE(SH_SLEEP_REG_PTEA, offsetof(struct sh_sleep_regs, ptea));
+	DEFINE(SH_SLEEP_REG_PASCR, offsetof(struct sh_sleep_regs, pascr));
+	DEFINE(SH_SLEEP_REG_IRMCR, offsetof(struct sh_sleep_regs, irmcr));
+	DEFINE(SH_SLEEP_REG_CCR, offsetof(struct sh_sleep_regs, ccr));
+	DEFINE(SH_SLEEP_REG_RAMCR, offsetof(struct sh_sleep_regs, ramcr));
 	return 0;
 }
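
Note: these offsets imply the following structures (a sketch reconstructed
from the DEFINEs above; the exact field order inside the structs is assumed):

	struct sh_sleep_regs {
		unsigned long stbcr;
		unsigned long bar;
		/* MMU */
		unsigned long pteh, ptel, ttb, tea, mmucr, ptea, pascr, irmcr;
		/* cache */
		unsigned long ccr, ramcr;
	};

	struct sh_sleep_data {
		unsigned long mode;			/* SH_SLEEP_MODE */
		unsigned long sf_pre, sf_post;		/* board snippet entry points */
		unsigned long resume;			/* common resume code */
		unsigned long vbr, spc, sr, sp;		/* saved CPU state */
		struct sh_sleep_regs addr;		/* SH_SLEEP_BASE_ADDR */
		struct sh_sleep_regs data;		/* SH_SLEEP_BASE_DATA */
	};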
diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile
index 3d6b931..d97c803 100644
--- a/arch/sh/kernel/cpu/Makefile
+++ b/arch/sh/kernel/cpu/Makefile
@@ -15,7 +15,6 @@
 
 # Common interfaces.
 
-obj-$(CONFIG_UBC_WAKEUP)	+= ubc.o
 obj-$(CONFIG_SH_ADC)		+= adc.o
 obj-$(CONFIG_SH_CLK_CPG)	+= clock-cpg.o
 
diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c
index e932ebe..89b4b76 100644
--- a/arch/sh/kernel/cpu/init.c
+++ b/arch/sh/kernel/cpu/init.c
@@ -75,16 +75,11 @@
 	/*
 	 * Future proofing.
 	 *
-	 * Disable support for slottable sleep instruction
-	 * and non-nop instructions in the rte delay slot.
+	 * Disable support for slottable sleep instruction, non-nop
+	 * instructions in the rte delay slot, and associative writes to
+	 * the memory-mapped cache array.
 	 */
-	expmask &= ~(EXPMASK_RTEDS | EXPMASK_BRDSSLP);
-
-	/*
-	 * Enable associative writes to the memory-mapped cache array
-	 * until the cache flush ops have been rewritten.
-	 */
-	expmask |= EXPMASK_MMCAW;
+	expmask &= ~(EXPMASK_RTEDS | EXPMASK_BRDSSLP | EXPMASK_MMCAW);
 
 	__raw_writel(expmask, EXPMASK);
 	ctrl_barrier();
@@ -311,12 +306,12 @@
 	if (fpu_disabled) {
 		printk("FPU Disabled\n");
 		current_cpu_data.flags &= ~CPU_HAS_FPU;
-		disable_fpu();
 	}
 
 	/* FPU initialization */
+	disable_fpu();
 	if ((current_cpu_data.flags & CPU_HAS_FPU)) {
-		clear_thread_flag(TIF_USEDFPU);
+		current_thread_info()->status &= ~TS_USEDFPU;
 		clear_used_math();
 	}
 
@@ -338,17 +333,6 @@
 	}
 #endif
 
-	/*
-	 * Some brain-damaged loaders decided it would be a good idea to put
-	 * the UBC to sleep. This causes some issues when it comes to things
-	 * like PTRACE_SINGLESTEP or doing hardware watchpoints in GDB.  So ..
-	 * we wake it up and hope that all is well.
-	 */
-#ifdef CONFIG_SUPERH32
-	if (raw_smp_processor_id() == 0)
-		ubc_wakeup();
-#endif
-
 	speculative_execution_init();
 	expmask_init();
 }
diff --git a/arch/sh/kernel/cpu/sh2a/fpu.c b/arch/sh/kernel/cpu/sh2a/fpu.c
index 6df2fb9..d395ce5 100644
--- a/arch/sh/kernel/cpu/sh2a/fpu.c
+++ b/arch/sh/kernel/cpu/sh2a/fpu.c
@@ -25,14 +25,12 @@
 
 /*
  * Save FPU registers onto task structure.
- * Assume called with FPU enabled (SR.FD=0).
  */
 void
-save_fpu(struct task_struct *tsk, struct pt_regs *regs)
+save_fpu(struct task_struct *tsk)
 {
 	unsigned long dummy;
 
-	clear_tsk_thread_flag(tsk, TIF_USEDFPU);
 	enable_fpu();
 	asm volatile("sts.l	fpul, @-%0\n\t"
 		     "sts.l	fpscr, @-%0\n\t"
@@ -60,7 +58,6 @@
 		     : "memory");
 
 	disable_fpu();
-	release_fpu(regs);
 }
 
 static void
@@ -598,31 +595,31 @@
 	struct task_struct *tsk = current;
 	TRAP_HANDLER_DECL;
 
-	save_fpu(tsk, regs);
+	__unlazy_fpu(tsk, regs);
 	if (ieee_fpe_handler(regs)) {
 		tsk->thread.fpu.hard.fpscr &=
 			~(FPSCR_CAUSE_MASK | FPSCR_FLAG_MASK);
 		grab_fpu(regs);
 		restore_fpu(tsk);
-		set_tsk_thread_flag(tsk, TIF_USEDFPU);
+		task_thread_info(tsk)->status |= TS_USEDFPU;
 		return;
 	}
 
 	force_sig(SIGFPE, tsk);
 }
 
-BUILD_TRAP_HANDLER(fpu_state_restore)
+void fpu_state_restore(struct pt_regs *regs)
 {
 	struct task_struct *tsk = current;
-	TRAP_HANDLER_DECL;
 
 	grab_fpu(regs);
-	if (!user_mode(regs)) {
+	if (unlikely(!user_mode(regs))) {
 		printk(KERN_ERR "BUG: FPU is used in kernel mode.\n");
+		BUG();
 		return;
 	}
 
-	if (used_math()) {
+	if (likely(used_math())) {
 		/* Using the FPU again.  */
 		restore_fpu(tsk);
 	} else	{
@@ -630,5 +627,13 @@
 		fpu_init();
 		set_used_math();
 	}
-	set_tsk_thread_flag(tsk, TIF_USEDFPU);
+	task_thread_info(tsk)->status |= TS_USEDFPU;
+	tsk->fpu_counter++;
+}
+
+BUILD_TRAP_HANDLER(fpu_state_restore)
+{
+	TRAP_HANDLER_DECL;
+
+	fpu_state_restore(regs);
 }
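
Note: __unlazy_fpu() itself is outside the hunk context here; a sketch of what
the new call site assumes, given the save_fpu()/release_fpu() split above (the
real helper lives in the asm headers):

	static inline void __unlazy_fpu(struct task_struct *tsk,
					struct pt_regs *regs)
	{
		if (task_thread_info(tsk)->status & TS_USEDFPU) {
			task_thread_info(tsk)->status &= ~TS_USEDFPU;
			save_fpu(tsk);
			release_fpu(regs);
		} else
			tsk->fpu_counter = 0;
	}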
diff --git a/arch/sh/kernel/cpu/sh3/entry.S b/arch/sh/kernel/cpu/sh3/entry.S
index bb407ef..3f7e2a2 100644
--- a/arch/sh/kernel/cpu/sh3/entry.S
+++ b/arch/sh/kernel/cpu/sh3/entry.S
@@ -297,41 +297,8 @@
 !
 	.balign 	256,0,256
 general_exception:
-#ifndef CONFIG_CPU_SUBTYPE_SHX3
 	bra	handle_exception
 	 sts	pr, k3		! save original pr value in k3
-#else
-	mov.l	1f, k4
-	mov.l	@k4, k4
-
-	! Is EXPEVT larger than 0x800?
-	mov	#0x8, k0
-	shll8	k0
-	cmp/hs	k0, k4
-	bf	0f
-
-	! then add 0x580 (k2 is 0xd80 or 0xda0)
-	mov	#0x58, k0
-	shll2	k0
-	shll2	k0
-	add	k0, k4
-0:
-	! Setup stack and save DSP context (k0 contains original r15 on return)
-	bsr	prepare_stack
-	 nop
-
-	! Save registers / Switch to bank 0
-	mov		k4, k2		! keep vector in k2
-	mov.l	1f, k4		! SR bits to clear in k4
-	bsr	save_regs	! needs original pr value in k3
-	 nop
-
-	bra	handle_exception_special
-	 nop
-
-	.align	2
-1:	.long	EXPEVT
-#endif
 
 ! prepare_stack()
 ! - roll back gRB
diff --git a/arch/sh/kernel/cpu/sh4/Makefile b/arch/sh/kernel/cpu/sh4/Makefile
index 203b183..3a1dbc7 100644
--- a/arch/sh/kernel/cpu/sh4/Makefile
+++ b/arch/sh/kernel/cpu/sh4/Makefile
@@ -9,6 +9,11 @@
 obj-$(CONFIG_SH_FPU)			+= fpu.o softfloat.o
 obj-$(CONFIG_SH_STORE_QUEUES)		+= sq.o
 
+# Perf events
+perf-$(CONFIG_CPU_SUBTYPE_SH7750)	:= perf_event.o
+perf-$(CONFIG_CPU_SUBTYPE_SH7750S)	:= perf_event.o
+perf-$(CONFIG_CPU_SUBTYPE_SH7091)	:= perf_event.o
+
 # CPU subtype setup
 obj-$(CONFIG_CPU_SUBTYPE_SH7750)	+= setup-sh7750.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7750R)	+= setup-sh7750.o
@@ -27,4 +32,5 @@
 # Additional clocks by subtype
 clock-$(CONFIG_CPU_SUBTYPE_SH4_202)	+= clock-sh4-202.o
 
-obj-y	+= $(clock-y)
+obj-y					+= $(clock-y)
+obj-$(CONFIG_PERF_EVENTS)		+= $(perf-y)
diff --git a/arch/sh/kernel/cpu/sh4/fpu.c b/arch/sh/kernel/cpu/sh4/fpu.c
index e3ea541..e97857a 100644
--- a/arch/sh/kernel/cpu/sh4/fpu.c
+++ b/arch/sh/kernel/cpu/sh4/fpu.c
@@ -41,13 +41,11 @@
 
 /*
  * Save FPU registers onto task structure.
- * Assume called with FPU enabled (SR.FD=0).
  */
-void save_fpu(struct task_struct *tsk, struct pt_regs *regs)
+void save_fpu(struct task_struct *tsk)
 {
 	unsigned long dummy;
 
-	clear_tsk_thread_flag(tsk, TIF_USEDFPU);
 	enable_fpu();
 	asm volatile ("sts.l	fpul, @-%0\n\t"
 		      "sts.l	fpscr, @-%0\n\t"
@@ -92,7 +90,6 @@
 		      :"memory");
 
 	disable_fpu();
-	release_fpu(regs);
 }
 
 static void restore_fpu(struct task_struct *tsk)
@@ -285,7 +282,6 @@
 		/* fcnvsd */
 		struct task_struct *tsk = current;
 
-		save_fpu(tsk, regs);
 		if ((tsk->thread.fpu.hard.fpscr & FPSCR_CAUSE_ERROR))
 			/* FPU error */
 			denormal_to_double(&tsk->thread.fpu.hard,
@@ -462,7 +458,7 @@
 	struct task_struct *tsk = current;
 	TRAP_HANDLER_DECL;
 
-	save_fpu(tsk, regs);
+	__unlazy_fpu(tsk, regs);
 	fpu_exception_flags = 0;
 	if (ieee_fpe_handler(regs)) {
 		tsk->thread.fpu.hard.fpscr &=
@@ -473,7 +469,7 @@
 		tsk->thread.fpu.hard.fpscr |= (fpu_exception_flags >> 10);
 		grab_fpu(regs);
 		restore_fpu(tsk);
-		set_tsk_thread_flag(tsk, TIF_USEDFPU);
+		task_thread_info(tsk)->status |= TS_USEDFPU;
 		if ((((tsk->thread.fpu.hard.fpscr & FPSCR_ENABLE_MASK) >> 7) &
 		     (fpu_exception_flags >> 2)) == 0) {
 			return;
@@ -483,18 +479,18 @@
 	force_sig(SIGFPE, tsk);
 }
 
-BUILD_TRAP_HANDLER(fpu_state_restore)
+void fpu_state_restore(struct pt_regs *regs)
 {
 	struct task_struct *tsk = current;
-	TRAP_HANDLER_DECL;
 
 	grab_fpu(regs);
-	if (!user_mode(regs)) {
+	if (unlikely(!user_mode(regs))) {
 		printk(KERN_ERR "BUG: FPU is used in kernel mode.\n");
+		BUG();
 		return;
 	}
 
-	if (used_math()) {
+	if (likely(used_math())) {
 		/* Using the FPU again.  */
 		restore_fpu(tsk);
 	} else {
@@ -502,5 +498,13 @@
 		fpu_init();
 		set_used_math();
 	}
-	set_tsk_thread_flag(tsk, TIF_USEDFPU);
+	task_thread_info(tsk)->status |= TS_USEDFPU;
+	tsk->fpu_counter++;
+}
+
+BUILD_TRAP_HANDLER(fpu_state_restore)
+{
+	TRAP_HANDLER_DECL;
+
+	fpu_state_restore(regs);
 }
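
Note: the new tsk->fpu_counter increment feeds an eager-restore heuristic at
context switch time; the expected consumer looks roughly like this (a sketch
following the convention other architectures use, with the threshold assumed):

	/*
	 * If the task used the FPU on several consecutive timeslices,
	 * preload its FPU state at switch time instead of taking yet
	 * another FPU-disabled fault.
	 */
	if (next->fpu_counter > 5)
		fpu_state_restore(task_pt_regs(next));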
diff --git a/arch/sh/kernel/cpu/sh4/perf_event.c b/arch/sh/kernel/cpu/sh4/perf_event.c
new file mode 100644
index 0000000..7f9ecc9
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4/perf_event.c
@@ -0,0 +1,253 @@
+/*
+ * Performance events support for SH7750-style performance counters
+ *
+ *  Copyright (C) 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/perf_event.h>
+#include <asm/processor.h>
+
+#define PM_CR_BASE	0xff000084	/* 16-bit */
+#define PM_CTR_BASE	0xff100004	/* 32-bit */
+
+#define PMCR(n)		(PM_CR_BASE + ((n) * 0x04))
+#define PMCTRH(n)	(PM_CTR_BASE + 0x00 + ((n) * 0x08))
+#define PMCTRL(n)	(PM_CTR_BASE + 0x04 + ((n) * 0x08))
+
+#define PMCR_PMM_MASK	0x0000003f
+
+#define PMCR_CLKF	0x00000100
+#define PMCR_PMCLR	0x00002000
+#define PMCR_PMST	0x00004000
+#define PMCR_PMEN	0x00008000
+
+static struct sh_pmu sh7750_pmu;
+
+/*
+ * There are a number of events supported by each counter (33 in total).
+ * Since we have 2 counters, each counter will take the event code as it
+ * corresponds to the PMCR PMM setting. Each counter can be configured
+ * independently.
+ *
+ *	Event Code	Description
+ *	----------	-----------
+ *
+ *	0x01		Operand read access
+ *	0x02		Operand write access
+ *	0x03		UTLB miss
+ *	0x04		Operand cache read miss
+ *	0x05		Operand cache write miss
+ *	0x06		Instruction fetch (w/ cache)
+ *	0x07		Instruction TLB miss
+ *	0x08		Instruction cache miss
+ *	0x09		All operand accesses
+ *	0x0a		All instruction accesses
+ *	0x0b		OC RAM operand access
+ *	0x0d		On-chip I/O space access
+ *	0x0e		Operand access (r/w)
+ *	0x0f		Operand cache miss (r/w)
+ *	0x10		Branch instruction
+ *	0x11		Branch taken
+ *	0x12		BSR/BSRF/JSR
+ *	0x13		Instruction execution
+ *	0x14		Instruction execution in parallel
+ *	0x15		FPU Instruction execution
+ *	0x16		Interrupt
+ *	0x17		NMI
+ *	0x18		trapa instruction execution
+ *	0x19		UBCA match
+ *	0x1a		UBCB match
+ *	0x21		Instruction cache fill
+ *	0x22		Operand cache fill
+ *	0x23		Elapsed time
+ *	0x24		Pipeline freeze by I-cache miss
+ *	0x25		Pipeline freeze by D-cache miss
+ *	0x27		Pipeline freeze by branch instruction
+ *	0x28		Pipeline freeze by CPU register
+ *	0x29		Pipeline freeze by FPU
+ */
+
+static const int sh7750_general_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x0023,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x000a,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0006,	/* I-cache */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x0008,	/* I-cache */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x0010,
+	[PERF_COUNT_HW_BRANCH_MISSES]		= -1,
+	[PERF_COUNT_HW_BUS_CYCLES]		= -1,
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+static const int sh7750_cache_events
+			[PERF_COUNT_HW_CACHE_MAX]
+			[PERF_COUNT_HW_CACHE_OP_MAX]
+			[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	[ C(L1D) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0001,
+			[ C(RESULT_MISS)   ] = 0x0004,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0002,
+			[ C(RESULT_MISS)   ] = 0x0005,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(L1I) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0006,
+			[ C(RESULT_MISS)   ] = 0x0008,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(LL) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(DTLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0x0003,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(ITLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0x0007,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+
+	[ C(BPU) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+};
+
+static int sh7750_event_map(int event)
+{
+	return sh7750_general_events[event];
+}
+
+static u64 sh7750_pmu_read(int idx)
+{
+	return (u64)((u64)(__raw_readl(PMCTRH(idx)) & 0xffff) << 32) |
+			   __raw_readl(PMCTRL(idx));
+}
+
+static void sh7750_pmu_disable(struct hw_perf_event *hwc, int idx)
+{
+	unsigned int tmp;
+
+	tmp = __raw_readw(PMCR(idx));
+	tmp &= ~(PMCR_PMM_MASK | PMCR_PMEN);
+	__raw_writew(tmp, PMCR(idx));
+}
+
+static void sh7750_pmu_enable(struct hw_perf_event *hwc, int idx)
+{
+	__raw_writew(__raw_readw(PMCR(idx)) | PMCR_PMCLR, PMCR(idx));
+	__raw_writew(hwc->config | PMCR_PMEN | PMCR_PMST, PMCR(idx));
+}
+
+static void sh7750_pmu_disable_all(void)
+{
+	int i;
+
+	for (i = 0; i < sh7750_pmu.num_events; i++)
+		__raw_writew(__raw_readw(PMCR(i)) & ~PMCR_PMEN, PMCR(i));
+}
+
+static void sh7750_pmu_enable_all(void)
+{
+	int i;
+
+	for (i = 0; i < sh7750_pmu.num_events; i++)
+		__raw_writew(__raw_readw(PMCR(i)) | PMCR_PMEN, PMCR(i));
+}
+
+static struct sh_pmu sh7750_pmu = {
+	.name		= "SH7750",
+	.num_events	= 2,
+	.event_map	= sh7750_event_map,
+	.max_events	= ARRAY_SIZE(sh7750_general_events),
+	.raw_event_mask	= PMCR_PMM_MASK,
+	.cache_events	= &sh7750_cache_events,
+	.read		= sh7750_pmu_read,
+	.disable	= sh7750_pmu_disable,
+	.enable		= sh7750_pmu_enable,
+	.disable_all	= sh7750_pmu_disable_all,
+	.enable_all	= sh7750_pmu_enable_all,
+};
+
+static int __init sh7750_pmu_init(void)
+{
+	/*
+	 * Make sure this CPU actually has perf counters.
+	 */
+	if (!(boot_cpu_data.flags & CPU_HAS_PERF_COUNTER)) {
+		pr_notice("HW perf events unsupported, software events only.\n");
+		return -ENODEV;
+	}
+
+	return register_sh_pmu(&sh7750_pmu);
+}
+arch_initcall(sh7750_pmu_init);
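
Note: once registered, the counters become reachable through the generic perf
syscall; a hedged userspace sketch programming raw event 0x23 (elapsed time,
per the SH7750 table above) — there is no glibc wrapper for the syscall:

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int perf_open_raw(unsigned long long pmm_code)
	{
		struct perf_event_attr attr = {
			.type	= PERF_TYPE_RAW,
			.size	= sizeof(attr),
			.config	= pmm_code,	/* e.g. 0x23, masked by PMCR_PMM_MASK */
		};

		/* read(2) on the returned fd yields the 64-bit count */
		return syscall(__NR_perf_event_open, &attr,
			       0 /* pid: self */, -1 /* any cpu */,
			       -1 /* no group */, 0);
	}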
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 490d5dc..33bab47 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -44,3 +44,4 @@
 obj-y				+= $(clock-y)
 obj-$(CONFIG_SMP)		+= $(smp-y)
 obj-$(CONFIG_GENERIC_GPIO)	+= $(pinmux-y)
+obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
index dfe9192..9db7438 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
@@ -152,7 +152,7 @@
 	SH_CLK_DIV6("fsia_clk", &div3_clk, FCLKACR, 0),
 	SH_CLK_DIV6("fsib_clk", &div3_clk, FCLKBCR, 0),
 	SH_CLK_DIV6("irda_clk", &div3_clk, IRDACLKCR, 0),
-	SH_CLK_DIV6("spu_clk", &div3_clk, SPUCLKCR, 0),
+	SH_CLK_DIV6("spu_clk", &div3_clk, SPUCLKCR, CLK_ENABLE_ON_INIT),
 };
 
 #define R_CLK (&r_clk)
diff --git a/arch/sh/kernel/cpu/sh4a/perf_event.c b/arch/sh/kernel/cpu/sh4a/perf_event.c
new file mode 100644
index 0000000..eddc219
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/perf_event.c
@@ -0,0 +1,269 @@
+/*
+ * Performance events support for SH-4A performance counters
+ *
+ *  Copyright (C) 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/perf_event.h>
+#include <asm/processor.h>
+
+#define PPC_CCBR(idx)	(0xff200800 + (sizeof(u32) * idx))
+#define PPC_PMCTR(idx)	(0xfc100000 + (sizeof(u32) * idx))
+
+#define CCBR_CIT_MASK	(0x7ff << 6)
+#define CCBR_DUC	(1 << 3)
+#define CCBR_CMDS	(1 << 1)
+#define CCBR_PPCE	(1 << 0)
+
+#define PPC_PMCAT	0xfc100080
+
+#define PMCAT_OVF3	(1 << 27)
+#define PMCAT_CNN3	(1 << 26)
+#define PMCAT_CLR3	(1 << 25)
+#define PMCAT_OVF2	(1 << 19)
+#define PMCAT_CLR2	(1 << 17)
+#define PMCAT_OVF1	(1 << 11)
+#define PMCAT_CNN1	(1 << 10)
+#define PMCAT_CLR1	(1 << 9)
+#define PMCAT_OVF0	(1 << 3)
+#define PMCAT_CLR0	(1 << 1)
+
+static struct sh_pmu sh4a_pmu;
+
+/*
+ * Supported raw event codes:
+ *
+ *	Event Code	Description
+ *	----------	-----------
+ *
+ *	0x0000		number of elapsed cycles
+ *	0x0200		number of elapsed cycles in privileged mode
+ *	0x0280		number of elapsed cycles while SR.BL is asserted
+ *	0x0202		instruction execution
+ *	0x0203		instruction execution in parallel
+ *	0x0204		number of unconditional branches
+ *	0x0208		number of exceptions
+ *	0x0209		number of interrupts
+ *	0x0220		UTLB miss caused by instruction fetch
+ *	0x0222		UTLB miss caused by operand access
+ *	0x02a0		number of ITLB misses
+ *	0x0028		number of accesses to instruction memories
+ *	0x0029		number of accesses to instruction cache
+ *	0x002a		instruction cache miss
+ *	0x022e		number of accesses to instruction X/Y memory
+ *	0x0030		number of reads to operand memories
+ *	0x0038		number of writes to operand memories
+ *	0x0031		number of operand cache read accesses
+ *	0x0039		number of operand cache write accesses
+ *	0x0032		operand cache read miss
+ *	0x003a		operand cache write miss
+ *	0x0236		number of reads to operand X/Y memory
+ *	0x023e		number of writes to operand X/Y memory
+ *	0x0237		number of reads to operand U memory
+ *	0x023f		number of writes to operand U memory
+ *	0x0337		number of U memory read buffer misses
+ *	0x02b4		number of wait cycles due to operand read access
+ *	0x02bc		number of wait cycles due to operand write access
+ *	0x0033		number of wait cycles due to operand cache read miss
+ *	0x003b		number of wait cycles due to operand cache write miss
+ */
+
+/*
+ * Special reserved bits used by hardware emulators, read values will
+ * vary, but writes must always be 0.
+ */
+#define PMCAT_EMU_CLR_MASK	((1 << 24) | (1 << 16) | (1 << 8) | (1 << 0))
+
+static const int sh4a_general_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x0000,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x0202,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0029,	/* I-cache */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x002a,	/* I-cache */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x0204,
+	[PERF_COUNT_HW_BRANCH_MISSES]		= -1,
+	[PERF_COUNT_HW_BUS_CYCLES]		= -1,
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+static const int sh4a_cache_events
+			[PERF_COUNT_HW_CACHE_MAX]
+			[PERF_COUNT_HW_CACHE_OP_MAX]
+			[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	[ C(L1D) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0031,
+			[ C(RESULT_MISS)   ] = 0x0032,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0039,
+			[ C(RESULT_MISS)   ] = 0x003a,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(L1I) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0029,
+			[ C(RESULT_MISS)   ] = 0x002a,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(LL) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0030,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0038,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(DTLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x0222,
+			[ C(RESULT_MISS)   ] = 0x0220,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+
+	[ C(ITLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0x02a0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+
+	[ C(BPU) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+};
+
+static int sh4a_event_map(int event)
+{
+	return sh4a_general_events[event];
+}
+
+static u64 sh4a_pmu_read(int idx)
+{
+	return __raw_readl(PPC_PMCTR(idx));
+}
+
+static void sh4a_pmu_disable(struct hw_perf_event *hwc, int idx)
+{
+	unsigned int tmp;
+
+	tmp = __raw_readl(PPC_CCBR(idx));
+	tmp &= ~(CCBR_CIT_MASK | CCBR_DUC);
+	__raw_writel(tmp, PPC_CCBR(idx));
+}
+
+static void sh4a_pmu_enable(struct hw_perf_event *hwc, int idx)
+{
+	unsigned int tmp;
+
+	tmp = __raw_readl(PPC_PMCAT);
+	tmp &= ~PMCAT_EMU_CLR_MASK;
+	tmp |= idx ? PMCAT_CLR1 : PMCAT_CLR0;
+	__raw_writel(tmp, PPC_PMCAT);
+
+	tmp = __raw_readl(PPC_CCBR(idx));
+	tmp |= (hwc->config << 6) | CCBR_CMDS | CCBR_PPCE;
+	__raw_writel(tmp, PPC_CCBR(idx));
+
+	__raw_writel(__raw_readl(PPC_CCBR(idx)) | CCBR_DUC, PPC_CCBR(idx));
+}
+
+static void sh4a_pmu_disable_all(void)
+{
+	int i;
+
+	for (i = 0; i < sh4a_pmu.num_events; i++)
+		__raw_writel(__raw_readl(PPC_CCBR(i)) & ~CCBR_DUC, PPC_CCBR(i));
+}
+
+static void sh4a_pmu_enable_all(void)
+{
+	int i;
+
+	for (i = 0; i < sh4a_pmu.num_events; i++)
+		__raw_writel(__raw_readl(PPC_CCBR(i)) | CCBR_DUC, PPC_CCBR(i));
+}
+
+static struct sh_pmu sh4a_pmu = {
+	.name		= "SH-4A",
+	.num_events	= 2,
+	.event_map	= sh4a_event_map,
+	.max_events	= ARRAY_SIZE(sh4a_general_events),
+	.raw_event_mask	= 0x3ff,
+	.cache_events	= &sh4a_cache_events,
+	.read		= sh4a_pmu_read,
+	.disable	= sh4a_pmu_disable,
+	.enable		= sh4a_pmu_enable,
+	.disable_all	= sh4a_pmu_disable_all,
+	.enable_all	= sh4a_pmu_enable_all,
+};
+
+static int __init sh4a_pmu_init(void)
+{
+	/*
+	 * Make sure this CPU actually has perf counters.
+	 */
+	if (!(boot_cpu_data.flags & CPU_HAS_PERF_COUNTER)) {
+		pr_notice("HW perf events unsupported, software events only.\n");
+		return -ENODEV;
+	}
+
+	return register_sh_pmu(&sh4a_pmu);
+}
+arch_initcall(sh4a_pmu_init);
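
Note: both perf_event.c files fill in the same struct sh_pmu contract consumed
by the common arch/sh/kernel/perf_event.c layer added in the Makefile hunk at
the top; roughly (a sketch of the expected semantics, not a definitive header):

	/*
	 * event_map()	generic PERF_COUNT_HW_* -> raw event code,
	 *		-1 if the event is unsupported
	 * read()	current raw value of counter idx
	 * enable()	program hwc->config into counter idx and start it
	 * disable()	stop counter idx
	 * enable_all()/disable_all()
	 *		toggle every counter at once around PMU-wide updates
	 */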
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index f3851fd..845e89c 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -20,6 +20,8 @@
 #include <linux/uio_driver.h>
 #include <linux/sh_timer.h>
 #include <linux/io.h>
+#include <linux/notifier.h>
+#include <asm/suspend.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 #include <cpu/sh7724.h>
@@ -202,7 +204,7 @@
 	[0] = {
 		.name	= "VEU3F0",
 		.start	= 0xfe920000,
-		.end	= 0xfe9200cb - 1,
+		.end	= 0xfe9200cb,
 		.flags	= IORESOURCE_MEM,
 	},
 	[1] = {
@@ -234,7 +236,7 @@
 	[0] = {
 		.name	= "VEU3F1",
 		.start	= 0xfe924000,
-		.end	= 0xfe9240cb - 1,
+		.end	= 0xfe9240cb,
 		.flags	= IORESOURCE_MEM,
 	},
 	[1] = {
@@ -523,6 +525,70 @@
 	},
 };
 
+/* SPU2DSP0 */
+static struct uio_info spu0_platform_data = {
+	.name = "SPU2DSP0",
+	.version = "0",
+	.irq = 86,
+};
+
+static struct resource spu0_resources[] = {
+	[0] = {
+		.name	= "SPU2DSP0",
+		.start	= 0xFE200000,
+		.end	= 0xFE2FFFFF,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* placeholder for contiguous memory */
+	},
+};
+
+static struct platform_device spu0_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &spu0_platform_data,
+	},
+	.resource	= spu0_resources,
+	.num_resources	= ARRAY_SIZE(spu0_resources),
+	.archdata = {
+		.hwblk_id = HWBLK_SPU,
+	},
+};
+
+/* SPU2DSP1 */
+static struct uio_info spu1_platform_data = {
+	.name = "SPU2DSP1",
+	.version = "0",
+	.irq = 87,
+};
+
+static struct resource spu1_resources[] = {
+	[0] = {
+		.name	= "SPU2DSP1",
+		.start	= 0xFE300000,
+		.end	= 0xFE3FFFFF,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* placeholder for contiguous memory */
+	},
+};
+
+static struct platform_device spu1_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &spu1_platform_data,
+	},
+	.resource	= spu1_resources,
+	.num_resources	= ARRAY_SIZE(spu1_resources),
+	.archdata = {
+		.hwblk_id = HWBLK_SPU,
+	},
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&cmt_device,
 	&tmu0_device,
@@ -539,6 +605,8 @@
 	&veu0_device,
 	&veu1_device,
 	&jpu_device,
+	&spu0_device,
+	&spu1_device,
 };
 
 static int __init sh7724_devices_setup(void)
@@ -547,6 +615,8 @@
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
 	platform_resource_setup_memory(&jpu_device,  "jpu",  2 << 20);
+	platform_resource_setup_memory(&spu0_device, "spu0", 2 << 20);
+	platform_resource_setup_memory(&spu1_device, "spu1", 2 << 20);
 
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
@@ -827,3 +897,193 @@
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct {
+	/* BSC */
+	unsigned long mmselr;
+	unsigned long cs0bcr;
+	unsigned long cs4bcr;
+	unsigned long cs5abcr;
+	unsigned long cs5bbcr;
+	unsigned long cs6abcr;
+	unsigned long cs6bbcr;
+	unsigned long cs4wcr;
+	unsigned long cs5awcr;
+	unsigned long cs5bwcr;
+	unsigned long cs6awcr;
+	unsigned long cs6bwcr;
+	/* INTC */
+	unsigned short ipra;
+	unsigned short iprb;
+	unsigned short iprc;
+	unsigned short iprd;
+	unsigned short ipre;
+	unsigned short iprf;
+	unsigned short iprg;
+	unsigned short iprh;
+	unsigned short ipri;
+	unsigned short iprj;
+	unsigned short iprk;
+	unsigned short iprl;
+	unsigned char imr0;
+	unsigned char imr1;
+	unsigned char imr2;
+	unsigned char imr3;
+	unsigned char imr4;
+	unsigned char imr5;
+	unsigned char imr6;
+	unsigned char imr7;
+	unsigned char imr8;
+	unsigned char imr9;
+	unsigned char imr10;
+	unsigned char imr11;
+	unsigned char imr12;
+	/* RWDT */
+	unsigned short rwtcnt;
+	unsigned short rwtcsr;
+	/* CPG */
+	unsigned long irdaclk;
+	unsigned long spuclk;
+} sh7724_rstandby_state;
+
+static int sh7724_pre_sleep_notifier_call(struct notifier_block *nb,
+					  unsigned long flags, void *unused)
+{
+	if (!(flags & SUSP_SH_RSTANDBY))
+		return NOTIFY_DONE;
+
+	/* BCR */
+	sh7724_rstandby_state.mmselr = __raw_readl(0xff800020); /* MMSELR */
+	sh7724_rstandby_state.mmselr |= 0xa5a50000;
+	sh7724_rstandby_state.cs0bcr = __raw_readl(0xfec10004); /* CS0BCR */
+	sh7724_rstandby_state.cs4bcr = __raw_readl(0xfec10010); /* CS4BCR */
+	sh7724_rstandby_state.cs5abcr = __raw_readl(0xfec10014); /* CS5ABCR */
+	sh7724_rstandby_state.cs5bbcr = __raw_readl(0xfec10018); /* CS5BBCR */
+	sh7724_rstandby_state.cs6abcr = __raw_readl(0xfec1001c); /* CS6ABCR */
+	sh7724_rstandby_state.cs6bbcr = __raw_readl(0xfec10020); /* CS6BBCR */
+	sh7724_rstandby_state.cs4wcr = __raw_readl(0xfec10030); /* CS4WCR */
+	sh7724_rstandby_state.cs5awcr = __raw_readl(0xfec10034); /* CS5AWCR */
+	sh7724_rstandby_state.cs5bwcr = __raw_readl(0xfec10038); /* CS5BWCR */
+	sh7724_rstandby_state.cs6awcr = __raw_readl(0xfec1003c); /* CS6AWCR */
+	sh7724_rstandby_state.cs6bwcr = __raw_readl(0xfec10040); /* CS6BWCR */
+
+	/* INTC */
+	sh7724_rstandby_state.ipra = __raw_readw(0xa4080000); /* IPRA */
+	sh7724_rstandby_state.iprb = __raw_readw(0xa4080004); /* IPRB */
+	sh7724_rstandby_state.iprc = __raw_readw(0xa4080008); /* IPRC */
+	sh7724_rstandby_state.iprd = __raw_readw(0xa408000c); /* IPRD */
+	sh7724_rstandby_state.ipre = __raw_readw(0xa4080010); /* IPRE */
+	sh7724_rstandby_state.iprf = __raw_readw(0xa4080014); /* IPRF */
+	sh7724_rstandby_state.iprg = __raw_readw(0xa4080018); /* IPRG */
+	sh7724_rstandby_state.iprh = __raw_readw(0xa408001c); /* IPRH */
+	sh7724_rstandby_state.ipri = __raw_readw(0xa4080020); /* IPRI */
+	sh7724_rstandby_state.iprj = __raw_readw(0xa4080024); /* IPRJ */
+	sh7724_rstandby_state.iprk = __raw_readw(0xa4080028); /* IPRK */
+	sh7724_rstandby_state.iprl = __raw_readw(0xa408002c); /* IPRL */
+	sh7724_rstandby_state.imr0 = __raw_readb(0xa4080080); /* IMR0 */
+	sh7724_rstandby_state.imr1 = __raw_readb(0xa4080084); /* IMR1 */
+	sh7724_rstandby_state.imr2 = __raw_readb(0xa4080088); /* IMR2 */
+	sh7724_rstandby_state.imr3 = __raw_readb(0xa408008c); /* IMR3 */
+	sh7724_rstandby_state.imr4 = __raw_readb(0xa4080090); /* IMR4 */
+	sh7724_rstandby_state.imr5 = __raw_readb(0xa4080094); /* IMR5 */
+	sh7724_rstandby_state.imr6 = __raw_readb(0xa4080098); /* IMR6 */
+	sh7724_rstandby_state.imr7 = __raw_readb(0xa408009c); /* IMR7 */
+	sh7724_rstandby_state.imr8 = __raw_readb(0xa40800a0); /* IMR8 */
+	sh7724_rstandby_state.imr9 = __raw_readb(0xa40800a4); /* IMR9 */
+	sh7724_rstandby_state.imr10 = __raw_readb(0xa40800a8); /* IMR10 */
+	sh7724_rstandby_state.imr11 = __raw_readb(0xa40800ac); /* IMR11 */
+	sh7724_rstandby_state.imr12 = __raw_readb(0xa40800b0); /* IMR12 */
+
+	/* RWDT */
+	sh7724_rstandby_state.rwtcnt = __raw_readb(0xa4520000); /* RWTCNT */
+	sh7724_rstandby_state.rwtcnt |= 0x5a00;
+	sh7724_rstandby_state.rwtcsr = __raw_readb(0xa4520004); /* RWTCSR */
+	sh7724_rstandby_state.rwtcsr |= 0xa500;
+	__raw_writew(sh7724_rstandby_state.rwtcsr & 0x07, 0xa4520004);
+
+	/* CPG */
+	sh7724_rstandby_state.irdaclk = __raw_readl(0xa4150018); /* IRDACLKCR */
+	sh7724_rstandby_state.spuclk = __raw_readl(0xa415003c); /* SPUCLKCR */
+
+	return NOTIFY_DONE;
+}
+
+static int sh7724_post_sleep_notifier_call(struct notifier_block *nb,
+					   unsigned long flags, void *unused)
+{
+	if (!(flags & SUSP_SH_RSTANDBY))
+		return NOTIFY_DONE;
+
+	/* BCR */
+	__raw_writel(sh7724_rstandby_state.mmselr, 0xff800020); /* MMSELR */
+	__raw_writel(sh7724_rstandby_state.cs0bcr, 0xfec10004); /* CS0BCR */
+	__raw_writel(sh7724_rstandby_state.cs4bcr, 0xfec10010); /* CS4BCR */
+	__raw_writel(sh7724_rstandby_state.cs5abcr, 0xfec10014); /* CS5ABCR */
+	__raw_writel(sh7724_rstandby_state.cs5bbcr, 0xfec10018); /* CS5BBCR */
+	__raw_writel(sh7724_rstandby_state.cs6abcr, 0xfec1001c); /* CS6ABCR */
+	__raw_writel(sh7724_rstandby_state.cs6bbcr, 0xfec10020); /* CS6BBCR */
+	__raw_writel(sh7724_rstandby_state.cs4wcr, 0xfec10030); /* CS4WCR */
+	__raw_writel(sh7724_rstandby_state.cs5awcr, 0xfec10034); /* CS5AWCR */
+	__raw_writel(sh7724_rstandby_state.cs5bwcr, 0xfec10038); /* CS5BWCR */
+	__raw_writel(sh7724_rstandby_state.cs6awcr, 0xfec1003c); /* CS6AWCR */
+	__raw_writel(sh7724_rstandby_state.cs6bwcr, 0xfec10040); /* CS6BWCR */
+
+	/* INTC */
+	__raw_writew(sh7724_rstandby_state.ipra, 0xa4080000); /* IPRA */
+	__raw_writew(sh7724_rstandby_state.iprb, 0xa4080004); /* IPRB */
+	__raw_writew(sh7724_rstandby_state.iprc, 0xa4080008); /* IPRC */
+	__raw_writew(sh7724_rstandby_state.iprd, 0xa408000c); /* IPRD */
+	__raw_writew(sh7724_rstandby_state.ipre, 0xa4080010); /* IPRE */
+	__raw_writew(sh7724_rstandby_state.iprf, 0xa4080014); /* IPRF */
+	__raw_writew(sh7724_rstandby_state.iprg, 0xa4080018); /* IPRG */
+	__raw_writew(sh7724_rstandby_state.iprh, 0xa408001c); /* IPRH */
+	__raw_writew(sh7724_rstandby_state.ipri, 0xa4080020); /* IPRI */
+	__raw_writew(sh7724_rstandby_state.iprj, 0xa4080024); /* IPRJ */
+	__raw_writew(sh7724_rstandby_state.iprk, 0xa4080028); /* IPRK */
+	__raw_writew(sh7724_rstandby_state.iprl, 0xa408002c); /* IPRL */
+	__raw_writeb(sh7724_rstandby_state.imr0, 0xa4080080); /* IMR0 */
+	__raw_writeb(sh7724_rstandby_state.imr1, 0xa4080084); /* IMR1 */
+	__raw_writeb(sh7724_rstandby_state.imr2, 0xa4080088); /* IMR2 */
+	__raw_writeb(sh7724_rstandby_state.imr3, 0xa408008c); /* IMR3 */
+	__raw_writeb(sh7724_rstandby_state.imr4, 0xa4080090); /* IMR4 */
+	__raw_writeb(sh7724_rstandby_state.imr5, 0xa4080094); /* IMR5 */
+	__raw_writeb(sh7724_rstandby_state.imr6, 0xa4080098); /* IMR6 */
+	__raw_writeb(sh7724_rstandby_state.imr7, 0xa408009c); /* IMR7 */
+	__raw_writeb(sh7724_rstandby_state.imr8, 0xa40800a0); /* IMR8 */
+	__raw_writeb(sh7724_rstandby_state.imr9, 0xa40800a4); /* IMR9 */
+	__raw_writeb(sh7724_rstandby_state.imr10, 0xa40800a8); /* IMR10 */
+	__raw_writeb(sh7724_rstandby_state.imr11, 0xa40800ac); /* IMR11 */
+	__raw_writeb(sh7724_rstandby_state.imr12, 0xa40800b0); /* IMR12 */
+
+	/* RWDT */
+	__raw_writew(sh7724_rstandby_state.rwtcnt, 0xa4520000); /* RWTCNT */
+	__raw_writew(sh7724_rstandby_state.rwtcsr, 0xa4520004); /* RWTCSR */
+
+	/* CPG */
+	__raw_writel(sh7724_rstandby_state.irdaclk, 0xa4150018); /* IRDACLKCR */
+	__raw_writel(sh7724_rstandby_state.spuclk, 0xa415003c); /* SPUCLKCR */
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block sh7724_pre_sleep_notifier = {
+	.notifier_call = sh7724_pre_sleep_notifier_call,
+	.priority = SH_MOBILE_PRE(SH_MOBILE_SLEEP_CPU),
+};
+
+static struct notifier_block sh7724_post_sleep_notifier = {
+	.notifier_call = sh7724_post_sleep_notifier_call,
+	.priority = SH_MOBILE_POST(SH_MOBILE_SLEEP_CPU),
+};
+
+static int __init sh7724_sleep_setup(void)
+{
+	atomic_notifier_chain_register(&sh_mobile_pre_sleep_notifier_list,
+				       &sh7724_pre_sleep_notifier);
+
+	atomic_notifier_chain_register(&sh_mobile_post_sleep_notifier_list,
+				       &sh7724_post_sleep_notifier);
+	return 0;
+}
+arch_initcall(sh7724_sleep_setup);
+
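Note: the 0xa5a50000 / 0x5a00 / 0xa500 ORs in the save path above are assumed
to be write-enable keys: MMSELR and the RWDT registers ignore writes unless
the upper bits carry a fixed magic pattern, so folding the key into the saved
value lets the restore path write it back verbatim. Sketch of the convention:

	/* assumed keying: RWTCNT takes 0x5a in the upper byte,
	 * RWTCSR takes 0xa5 (SH-Mobile watchdog write protection) */
	__raw_writew(0x5a00 | counter_value, 0xa4520000);	/* RWTCNT */
	__raw_writew(0xa500 | control_value, 0xa4520004);	/* RWTCSR */
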
diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
index e848443..c7ba916 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
@@ -15,6 +15,15 @@
 #include <linux/sh_timer.h>
 #include <asm/mmzone.h>
 
+/*
+ * This intentionally only registers SCIF ports 0, 1, and 3. SCIF 2
+ * INTEVT values overlap with the FPU EXPEVT ones, requiring special
+ * demuxing in the exception dispatch path.
+ *
+ * As this overlap is something that never should have made it into
+ * silicon in the first place, we just refuse to deal with the port at
+ * all rather than adding infrastructure to hack around it.
+ */
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffc30000,
@@ -27,11 +36,6 @@
 		.type		= PORT_SCIF,
 		.irqs		= { 44, 45, 47, 46 },
 	}, {
-		.mapbase	= 0xffc50000,
-		.flags		= UPF_BOOT_AUTOCONF,
-		.type		= PORT_SCIF,
-		.irqs		= { 48, 49, 51, 50 },
-	}, {
 		.mapbase	= 0xffc60000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
@@ -268,7 +272,11 @@
 	UNUSED = 0,
 
 	/* interrupt sources */
-	IRL, IRQ0, IRQ1, IRQ2, IRQ3,
+	IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+	IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+	IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+	IRL_HHLL, IRL_HHLH, IRL_HHHL,
+	IRQ0, IRQ1, IRQ2, IRQ3,
 	HUDII,
 	TMU0, TMU1, TMU2, TMU3, TMU4, TMU5,
 	PCII0, PCII1, PCII2, PCII3, PCII4,
@@ -291,7 +299,7 @@
 	INTICI4, INTICI5, INTICI6, INTICI7,
 
 	/* interrupt groups */
-	PCII56789, SCIF0, SCIF1, SCIF2, SCIF3,
+	IRL, PCII56789, SCIF0, SCIF1, SCIF2, SCIF3,
 	DMAC0, DMAC1,
 };
 
@@ -309,8 +317,6 @@
 	INTC_VECT(SCIF0_BRI, 0x740), INTC_VECT(SCIF0_TXI, 0x760),
 	INTC_VECT(SCIF1_ERI, 0x780), INTC_VECT(SCIF1_RXI, 0x7a0),
 	INTC_VECT(SCIF1_BRI, 0x7c0), INTC_VECT(SCIF1_TXI, 0x7e0),
-	INTC_VECT(SCIF2_ERI, 0x800), INTC_VECT(SCIF2_RXI, 0x820),
-	INTC_VECT(SCIF2_BRI, 0x840), INTC_VECT(SCIF2_TXI, 0x860),
 	INTC_VECT(SCIF3_ERI, 0x880), INTC_VECT(SCIF3_RXI, 0x8a0),
 	INTC_VECT(SCIF3_BRI, 0x8c0), INTC_VECT(SCIF3_TXI, 0x8e0),
 	INTC_VECT(DMAC0_DMINT0, 0x900), INTC_VECT(DMAC0_DMINT1, 0x920),
@@ -344,10 +350,13 @@
 };
 
 static struct intc_group groups[] __initdata = {
+	INTC_GROUP(IRL, IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+		   IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+		   IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+		   IRL_HHLL, IRL_HHLH, IRL_HHHL),
 	INTC_GROUP(PCII56789, PCII5, PCII6, PCII7, PCII8, PCII9),
 	INTC_GROUP(SCIF0, SCIF0_ERI, SCIF0_RXI, SCIF0_BRI, SCIF0_TXI),
 	INTC_GROUP(SCIF1, SCIF1_ERI, SCIF1_RXI, SCIF1_BRI, SCIF1_TXI),
-	INTC_GROUP(SCIF2, SCIF2_ERI, SCIF2_RXI, SCIF2_BRI, SCIF2_TXI),
 	INTC_GROUP(SCIF3, SCIF3_ERI, SCIF3_RXI, SCIF3_BRI, SCIF3_TXI),
 	INTC_GROUP(DMAC0, DMAC0_DMINT0, DMAC0_DMINT1, DMAC0_DMINT2,
 		   DMAC0_DMINT3, DMAC0_DMINT4, DMAC0_DMINT5, DMAC0_DMAE),
@@ -419,14 +428,14 @@
 
 /* External interrupt pins in IRL mode */
 static struct intc_vect vectors_irl[] __initdata = {
-	INTC_VECT(IRL, 0x200), INTC_VECT(IRL, 0x220),
-	INTC_VECT(IRL, 0x240), INTC_VECT(IRL, 0x260),
-	INTC_VECT(IRL, 0x280), INTC_VECT(IRL, 0x2a0),
-	INTC_VECT(IRL, 0x2c0), INTC_VECT(IRL, 0x2e0),
-	INTC_VECT(IRL, 0x300), INTC_VECT(IRL, 0x320),
-	INTC_VECT(IRL, 0x340), INTC_VECT(IRL, 0x360),
-	INTC_VECT(IRL, 0x380), INTC_VECT(IRL, 0x3a0),
-	INTC_VECT(IRL, 0x3c0),
+	INTC_VECT(IRL_LLLL, 0x200), INTC_VECT(IRL_LLLH, 0x220),
+	INTC_VECT(IRL_LLHL, 0x240), INTC_VECT(IRL_LLHH, 0x260),
+	INTC_VECT(IRL_LHLL, 0x280), INTC_VECT(IRL_LHLH, 0x2a0),
+	INTC_VECT(IRL_LHHL, 0x2c0), INTC_VECT(IRL_LHHH, 0x2e0),
+	INTC_VECT(IRL_HLLL, 0x300), INTC_VECT(IRL_HLLH, 0x320),
+	INTC_VECT(IRL_HLHL, 0x340), INTC_VECT(IRL_HLHH, 0x360),
+	INTC_VECT(IRL_HHLL, 0x380), INTC_VECT(IRL_HHLH, 0x3a0),
+	INTC_VECT(IRL_HHHL, 0x3c0),
 };
 
 static DECLARE_INTC_DESC(intc_desc_irl, "shx3-irl", vectors_irl, groups,
diff --git a/arch/sh/kernel/cpu/sh4a/smp-shx3.c b/arch/sh/kernel/cpu/sh4a/smp-shx3.c
index 185ec39..5863e0c 100644
--- a/arch/sh/kernel/cpu/sh4a/smp-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/smp-shx3.c
@@ -14,6 +14,13 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 
+#define STBCR_REG(phys_id) (0xfe400004 | (phys_id << 12))
+#define RESET_REG(phys_id) (0xfe400008 | (phys_id << 12))
+
+#define STBCR_MSTP	0x00000001
+#define STBCR_RESET	0x00000002
+#define STBCR_LTSLP	0x80000000
+
 static irqreturn_t ipi_interrupt_handler(int irq, void *arg)
 {
 	unsigned int message = (unsigned int)(long)arg;
@@ -21,9 +28,9 @@
 	unsigned int offs = 4 * cpu;
 	unsigned int x;
 
-	x = ctrl_inl(0xfe410070 + offs); /* C0INITICI..CnINTICI */
+	x = __raw_readl(0xfe410070 + offs); /* C0INITICI..CnINTICI */
 	x &= (1 << (message << 2));
-	ctrl_outl(x, 0xfe410080 + offs); /* C0INTICICLR..CnINTICICLR */
+	__raw_writel(x, 0xfe410080 + offs); /* C0INTICICLR..CnINTICICLR */
 
 	smp_message_recv(message);
 
@@ -37,6 +44,9 @@
 
 	init_cpu_possible(cpumask_of(cpu));
 
+	/* Enable light sleep for the boot CPU */
+	__raw_writel(__raw_readl(STBCR_REG(cpu)) | STBCR_LTSLP, STBCR_REG(cpu));
+
 	__cpu_number_map[0] = 0;
 	__cpu_logical_map[0] = 0;
 
@@ -66,32 +76,23 @@
 			    "IPI", (void *)(long)i);
 }
 
-#define STBCR_REG(phys_id) (0xfe400004 | (phys_id << 12))
-#define RESET_REG(phys_id) (0xfe400008 | (phys_id << 12))
-
-#define STBCR_MSTP	0x00000001
-#define STBCR_RESET	0x00000002
-#define STBCR_LTSLP	0x80000000
-
-#define STBCR_AP_VAL	(STBCR_RESET | STBCR_LTSLP)
-
 void plat_start_cpu(unsigned int cpu, unsigned long entry_point)
 {
-	ctrl_outl(entry_point, RESET_REG(cpu));
+	__raw_writel(entry_point, RESET_REG(cpu));
 
-	if (!(ctrl_inl(STBCR_REG(cpu)) & STBCR_MSTP))
-		ctrl_outl(STBCR_MSTP, STBCR_REG(cpu));
+	if (!(__raw_readl(STBCR_REG(cpu)) & STBCR_MSTP))
+		__raw_writel(STBCR_MSTP, STBCR_REG(cpu));
 
-	while (!(ctrl_inl(STBCR_REG(cpu)) & STBCR_MSTP))
+	while (!(__raw_readl(STBCR_REG(cpu)) & STBCR_MSTP))
 		cpu_relax();
 
 	/* Start up secondary processor by sending a reset */
-	ctrl_outl(STBCR_AP_VAL, STBCR_REG(cpu));
+	__raw_writel(STBCR_RESET | STBCR_LTSLP, STBCR_REG(cpu));
 }
 
 int plat_smp_processor_id(void)
 {
-	return ctrl_inl(0xff000048); /* CPIDR */
+	return __raw_readl(0xff000048); /* CPIDR */
 }
 
 void plat_send_ipi(unsigned int cpu, unsigned int message)
@@ -100,5 +101,5 @@
 
 	BUG_ON(cpu >= 4);
 
-	ctrl_outl(1 << (message << 2), addr); /* C0INTICI..CnINTICI */
+	__raw_writel(1 << (message << 2), addr); /* C0INTICI..CnINTICI */
 }
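
Note: the ctrl_inl()/ctrl_outl() -> __raw_readl()/__raw_writel() conversion is
mechanical (the legacy ctrl_* accessors were being phased out on sh). The
recurring idiom is a plain read-modify-write on a control register, e.g.:

	/* hypothetical helper for the STBCR RMW pattern above */
	static inline void stbcr_or(unsigned int cpu, u32 bits)
	{
		__raw_writel(__raw_readl(STBCR_REG(cpu)) | bits, STBCR_REG(cpu));
	}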
diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S
index b0aacf6..8f13f73 100644
--- a/arch/sh/kernel/cpu/sh5/entry.S
+++ b/arch/sh/kernel/cpu/sh5/entry.S
@@ -933,7 +933,7 @@
 
 	pta	restore_all, tr1
 
-	movi	(_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK), r8
+	movi	_TIF_SIGPENDING, r8
 	and	r8, r7, r8
 	pta	work_notifysig, tr0
 	bne	r8, ZERO, tr0
diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c
index 1c504bd..83972aa 100644
--- a/arch/sh/kernel/cpu/shmobile/cpuidle.c
+++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c
@@ -87,25 +87,31 @@
 
 	dev->safe_state = state;
 
-	state = &dev->states[i++];
-	snprintf(state->name, CPUIDLE_NAME_LEN, "C1");
-	strncpy(state->desc, "SuperH Sleep Mode [SF]", CPUIDLE_DESC_LEN);
-	state->exit_latency = 100;
-	state->target_residency = 1 * 2;
-	state->power_usage = 1;
-	state->flags = 0;
-	state->flags |= CPUIDLE_FLAG_TIME_VALID;
-	state->enter = cpuidle_sleep_enter;
+	if (sh_mobile_sleep_supported & SUSP_SH_SF) {
+		state = &dev->states[i++];
+		snprintf(state->name, CPUIDLE_NAME_LEN, "C1");
+		strncpy(state->desc, "SuperH Sleep Mode [SF]",
+			CPUIDLE_DESC_LEN);
+		state->exit_latency = 100;
+		state->target_residency = 1 * 2;
+		state->power_usage = 1;
+		state->flags = 0;
+		state->flags |= CPUIDLE_FLAG_TIME_VALID;
+		state->enter = cpuidle_sleep_enter;
+	}
 
-	state = &dev->states[i++];
-	snprintf(state->name, CPUIDLE_NAME_LEN, "C2");
-	strncpy(state->desc, "SuperH Mobile Standby Mode [SF]", CPUIDLE_DESC_LEN);
-	state->exit_latency = 2300;
-	state->target_residency = 1 * 2;
-	state->power_usage = 1;
-	state->flags = 0;
-	state->flags |= CPUIDLE_FLAG_TIME_VALID;
-	state->enter = cpuidle_sleep_enter;
+	if (sh_mobile_sleep_supported & SUSP_SH_STANDBY) {
+		state = &dev->states[i++];
+		snprintf(state->name, CPUIDLE_NAME_LEN, "C2");
+		strncpy(state->desc, "SuperH Mobile Standby Mode [SF]",
+			CPUIDLE_DESC_LEN);
+		state->exit_latency = 2300;
+		state->target_residency = 1 * 2;
+		state->power_usage = 1;
+		state->flags = 0;
+		state->flags |= CPUIDLE_FLAG_TIME_VALID;
+		state->enter = cpuidle_sleep_enter;
+	}
 
 	dev->state_count = i;
 
diff --git a/arch/sh/kernel/cpu/shmobile/pm.c b/arch/sh/kernel/cpu/shmobile/pm.c
index ee3c2aa..ca029a4 100644
--- a/arch/sh/kernel/cpu/shmobile/pm.c
+++ b/arch/sh/kernel/cpu/shmobile/pm.c
@@ -15,6 +15,13 @@
 #include <linux/suspend.h>
 #include <asm/suspend.h>
 #include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Notifier lists for pre/post sleep notification
+ */
+ATOMIC_NOTIFIER_HEAD(sh_mobile_pre_sleep_notifier_list);
+ATOMIC_NOTIFIER_HEAD(sh_mobile_post_sleep_notifier_list);
 
 /*
  * Sleep modes available on SuperH Mobile:
@@ -26,30 +33,105 @@
 #define SUSP_MODE_SLEEP		(SUSP_SH_SLEEP)
 #define SUSP_MODE_SLEEP_SF	(SUSP_SH_SLEEP | SUSP_SH_SF)
 #define SUSP_MODE_STANDBY_SF	(SUSP_SH_STANDBY | SUSP_SH_SF)
+#define SUSP_MODE_RSTANDBY	(SUSP_SH_RSTANDBY | SUSP_SH_MMU | SUSP_SH_SF)
+ /*
+  * U-standby mode is unsupported since it needs bootloader hacks
+  */
 
-/*
- * The following modes are not there yet:
- *
- * R-standby mode is unsupported, but will be added in the future
- * U-standby mode is low priority since it needs bootloader hacks
- */
-
-#define ILRAM_BASE 0xe5200000
-
-extern const unsigned char sh_mobile_standby[];
-extern const unsigned int sh_mobile_standby_size;
+#ifdef CONFIG_CPU_SUBTYPE_SH7724
+#define RAM_BASE 0xfd800000 /* RSMEM */
+#else
+#define RAM_BASE 0xe5200000 /* ILRAM */
+#endif
 
 void sh_mobile_call_standby(unsigned long mode)
 {
-	void *onchip_mem = (void *)ILRAM_BASE;
-	void (*standby_onchip_mem)(unsigned long, unsigned long) = onchip_mem;
+	void *onchip_mem = (void *)RAM_BASE;
+	struct sh_sleep_data *sdp = onchip_mem;
+	void (*standby_onchip_mem)(unsigned long, unsigned long);
+
+	/* code located directly after data structure */
+	standby_onchip_mem = (void *)(sdp + 1);
+
+	atomic_notifier_call_chain(&sh_mobile_pre_sleep_notifier_list,
+				   mode, NULL);
+
+	/* flush the caches if MMU flag is set */
+	if (mode & SUSP_SH_MMU)
+		flush_cache_all();
 
 	/* Let assembly snippet in on-chip memory handle the rest */
-	standby_onchip_mem(mode, ILRAM_BASE);
+	standby_onchip_mem(mode, RAM_BASE);
+
+	atomic_notifier_call_chain(&sh_mobile_post_sleep_notifier_list,
+				   mode, NULL);
+}
+
+extern char sh_mobile_sleep_enter_start;
+extern char sh_mobile_sleep_enter_end;
+
+extern char sh_mobile_sleep_resume_start;
+extern char sh_mobile_sleep_resume_end;
+
+unsigned long sh_mobile_sleep_supported = SUSP_SH_SLEEP;
+
+void sh_mobile_register_self_refresh(unsigned long flags,
+				     void *pre_start, void *pre_end,
+				     void *post_start, void *post_end)
+{
+	void *onchip_mem = (void *)RAM_BASE;
+	void *vp;
+	struct sh_sleep_data *sdp;
+	int n;
+
+	/* part 0: data area */
+	sdp = onchip_mem;
+	sdp->addr.stbcr = 0xa4150020; /* STBCR */
+	sdp->addr.bar = 0xa4150040; /* BAR */
+	sdp->addr.pteh = 0xff000000; /* PTEH */
+	sdp->addr.ptel = 0xff000004; /* PTEL */
+	sdp->addr.ttb = 0xff000008; /* TTB */
+	sdp->addr.tea = 0xff00000c; /* TEA */
+	sdp->addr.mmucr = 0xff000010; /* MMUCR */
+	sdp->addr.ptea = 0xff000034; /* PTEA */
+	sdp->addr.pascr = 0xff000070; /* PASCR */
+	sdp->addr.irmcr = 0xff000078; /* IRMCR */
+	sdp->addr.ccr = 0xff00001c; /* CCR */
+	sdp->addr.ramcr = 0xff000074; /* RAMCR */
+	vp = sdp + 1;
+
+	/* part 1: common code to enter sleep mode */
+	n = &sh_mobile_sleep_enter_end - &sh_mobile_sleep_enter_start;
+	memcpy(vp, &sh_mobile_sleep_enter_start, n);
+	vp += roundup(n, 4);
+
+	/* part 2: board specific code to enter self-refresh mode */
+	n = pre_end - pre_start;
+	memcpy(vp, pre_start, n);
+	sdp->sf_pre = (unsigned long)vp;
+	vp += roundup(n, 4);
+
+	/* part 3: board specific code to resume from self-refresh mode */
+	n = post_end - post_start;
+	memcpy(vp, post_start, n);
+	sdp->sf_post = (unsigned long)vp;
+	vp += roundup(n, 4);
+
+	/* part 4: common code to resume from sleep mode */
+	WARN_ON(vp > (onchip_mem + 0x600));
+	vp = onchip_mem + 0x600; /* located at interrupt vector */
+	n = &sh_mobile_sleep_resume_end - &sh_mobile_sleep_resume_start;
+	memcpy(vp, &sh_mobile_sleep_resume_start, n);
+	sdp->resume = (unsigned long)vp;
+
+	sh_mobile_sleep_supported |= flags;
 }
 
 static int sh_pm_enter(suspend_state_t state)
 {
+	if (!(sh_mobile_sleep_supported & SUSP_MODE_STANDBY_SF))
+		return -ENXIO;
+
 	local_irq_disable();
 	set_bl_bit();
 	sh_mobile_call_standby(SUSP_MODE_STANDBY_SF);
@@ -65,13 +147,6 @@
 
 static int __init sh_pm_init(void)
 {
-	void *onchip_mem = (void *)ILRAM_BASE;
-
-	/* Copy the assembly snippet to the otherwise ununsed ILRAM */
-	memcpy(onchip_mem, sh_mobile_standby, sh_mobile_standby_size);
-	wmb();
-	ctrl_barrier();
-
 	suspend_set_ops(&sh_pm_ops);
 	sh_mobile_setup_cpuidle();
 	return 0;
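
Note: sh_mobile_register_self_refresh() assembles the on-chip RAM image in
five parts; the resulting layout, per the code above:

	/*
	 * RAM_BASE + 0x000:  struct sh_sleep_data (mode flags, register
	 *                    addresses, register/CPU state save area)
	 * directly after:    common sleep-entry code
	 *                    board snippet: enter self-refresh -> sdp->sf_pre
	 *                    board snippet: leave self-refresh -> sdp->sf_post
	 * RAM_BASE + 0x600:  common resume code -> sdp->resume, placed at the
	 *                    interrupt vector offset so that R-standby wakeup
	 *                    through BAR lands on it
	 */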
diff --git a/arch/sh/kernel/cpu/shmobile/pm_runtime.c b/arch/sh/kernel/cpu/shmobile/pm_runtime.c
index 7c615b1..6dcb816 100644
--- a/arch/sh/kernel/cpu/shmobile/pm_runtime.c
+++ b/arch/sh/kernel/cpu/shmobile/pm_runtime.c
@@ -45,12 +45,14 @@
 
 	dev_dbg(d, "__platform_pm_runtime_resume() [%d]\n", hwblk);
 
-	if (d->driver && d->driver->pm && d->driver->pm->runtime_resume) {
+	if (d->driver) {
 		hwblk_enable(hwblk_info, hwblk);
 		ret = 0;
 
 		if (test_bit(PDEV_ARCHDATA_FLAG_SUSP, &ad->flags)) {
-			ret = d->driver->pm->runtime_resume(d);
+			if (d->driver->pm && d->driver->pm->runtime_resume)
+				ret = d->driver->pm->runtime_resume(d);
+
 			if (!ret)
 				clear_bit(PDEV_ARCHDATA_FLAG_SUSP, &ad->flags);
 			else
@@ -73,12 +75,15 @@
 
 	dev_dbg(d, "__platform_pm_runtime_suspend() [%d]\n", hwblk);
 
-	if (d->driver && d->driver->pm && d->driver->pm->runtime_suspend) {
+	if (d->driver) {
 		BUG_ON(!test_bit(PDEV_ARCHDATA_FLAG_IDLE, &ad->flags));
+		ret = 0;
 
-		hwblk_enable(hwblk_info, hwblk);
-		ret = d->driver->pm->runtime_suspend(d);
-		hwblk_disable(hwblk_info, hwblk);
+		if (d->driver->pm && d->driver->pm->runtime_suspend) {
+			hwblk_enable(hwblk_info, hwblk);
+			ret = d->driver->pm->runtime_suspend(d);
+			hwblk_disable(hwblk_info, hwblk);
+		}
 
 		if (!ret) {
 			set_bit(PDEV_ARCHDATA_FLAG_SUSP, &ad->flags);
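
Note: behavioral change — the hwblk clock/power state is now managed for any
bound driver, with the driver's dev_pm_ops runtime callbacks optional on top
of that. A driver opting in still supplies the usual ops (hypothetical names):

	static const struct dev_pm_ops foo_pm_ops = {
		.runtime_suspend = foo_runtime_suspend,	/* save device context */
		.runtime_resume	 = foo_runtime_resume,	/* restore device context */
	};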
diff --git a/arch/sh/kernel/cpu/shmobile/sleep.S b/arch/sh/kernel/cpu/shmobile/sleep.S
index a439e6c..e9dd7fa 100644
--- a/arch/sh/kernel/cpu/shmobile/sleep.S
+++ b/arch/sh/kernel/cpu/shmobile/sleep.S
@@ -20,79 +20,103 @@
  * Kernel mode register usage, see entry.S:
  *	k0	scratch
  *	k1	scratch
- *	k4	scratch
  */
 #define k0	r0
 #define k1	r1
-#define k4	r4
 
-/* manage self-refresh and enter standby mode.
+/* manage self-refresh and enter standby mode. must be self-contained.
  * this code will be copied to on-chip memory and executed from there.
  */
+	.balign 4
+ENTRY(sh_mobile_sleep_enter_start)
 
-	.balign 	4096,0,4096
-ENTRY(sh_mobile_standby)
+	/* save mode flags */
+	mov.l	r4, @(SH_SLEEP_MODE, r5)
 
 	/* save original vbr */
-	stc	vbr, r1
-	mova	saved_vbr, r0
-	mov.l	r1, @r0
+	stc	vbr, r0
+	mov.l	r0, @(SH_SLEEP_VBR, r5)
 
 	/* point vbr to our on-chip memory page */
 	ldc	r5, vbr
 
 	/* save return address */
-	mova	saved_spc, r0
-	sts	pr, r5
-	mov.l	r5, @r0
+	sts	pr, r0
+	mov.l	r0, @(SH_SLEEP_SPC, r5)
 
 	/* save sr */
-	mova	saved_sr, r0
-	stc	sr, r5
-	mov.l	r5, @r0
+	stc	sr, r0
+	mov.l	r0, @(SH_SLEEP_SR, r5)
 
-	/* save mode flags */
-	mova	saved_mode, r0
-	mov.l	r4, @r0
+	/* save sp */
+	mov.l	r15, @(SH_SLEEP_SP, r5)
 
-	/* put mode flags in r0 */
-	mov	r4, r0
+	/* save stbcr */
+	bsr     save_register
+	 mov    #SH_SLEEP_REG_STBCR, r0
 
+	/* save mmu and cache context if needed */
+	mov.l	@(SH_SLEEP_MODE, r5), r0
+	tst	#SUSP_SH_MMU, r0
+	bt	skip_mmu_save_disable
+
+       /* save mmu state */
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_PTEH, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_PTEL, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_TTB, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_TEA, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_MMUCR, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_PTEA, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_PASCR, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_IRMCR, r0
+
+	/* invalidate TLBs and disable the MMU */
+	bsr	get_register
+	 mov	#SH_SLEEP_REG_MMUCR, r0
+	mov	#4, r1
+	mov.l	r1, @r0
+	icbi	@r0
+
+	/* save cache registers and disable caches */
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_CCR, r0
+
+	bsr	save_register
+	 mov	#SH_SLEEP_REG_RAMCR, r0
+
+	bsr	get_register
+	 mov	#SH_SLEEP_REG_CCR, r0
+	mov	#0, r1
+	mov.l	r1, @r0
+	icbi	@r0
+
+skip_mmu_save_disable:
+	/* call self-refresh entering code if needed */
+	mov.l	@(SH_SLEEP_MODE, r5), r0
 	tst	#SUSP_SH_SF, r0
 	bt	skip_set_sf
-#ifdef CONFIG_CPU_SUBTYPE_SH7724
-	/* DBSC: put memory in self-refresh mode */
-	mov.l	dben_reg, r4
-	mov.l	dben_data0, r1
-	mov.l	r1, @r4
 
-	mov.l	dbrfpdn0_reg, r4
-	mov.l	dbrfpdn0_data0, r1
-	mov.l	r1, @r4
-
-	mov.l	dbcmdcnt_reg, r4
-	mov.l	dbcmdcnt_data0, r1
-	mov.l	r1, @r4
-
-	mov.l	dbcmdcnt_reg, r4
-	mov.l	dbcmdcnt_data1, r1
-	mov.l	r1, @r4
-
-	mov.l	dbrfpdn0_reg, r4
-	mov.l	dbrfpdn0_data1, r1
-	mov.l	r1, @r4
-#else
-	/* SBSC: disable power down and put in self-refresh mode */
-	mov.l	1f, r4
-	mov.l	2f, r1
-	mov.l	@r4, r2
-	or	r1, r2
-	mov.l   3f, r3
-	and	r3, r2
-	mov.l	r2, @r4
-#endif
+	mov.l	@(SH_SLEEP_SF_PRE, r5), r0
+	jsr	@r0
+	 nop
 
 skip_set_sf:
+	mov.l	@(SH_SLEEP_MODE, r5), r0
 	tst	#SUSP_SH_STANDBY, r0
 	bt	test_rstandby
 
@@ -104,6 +128,12 @@
 	tst	#SUSP_SH_RSTANDBY, r0
 	bt	test_ustandby
 
+	/* setup BAR register */
+	bsr	get_register
+	 mov	#SH_SLEEP_REG_BAR, r0
+	mov.l	@(SH_SLEEP_RESUME, r5), r1
+	mov.l	r1, @r0
+
 	/* set mode to "r-standby mode" */
 	bra	do_sleep
 	 mov	#0x20, r1
@@ -123,124 +153,136 @@
 
 do_sleep:
 	/* setup and enter selected standby mode */
-	mov.l	5f, r4
-	mov.l	r1, @r4
+	bsr     get_register
+	 mov    #SH_SLEEP_REG_STBCR, r0
+	mov.l	r1, @r0
 again:
 	sleep
 	bra	again
 	 nop
 
-restore_jump_vbr:
+save_register:
+	add	#SH_SLEEP_BASE_ADDR, r0
+	mov.l	@(r0, r5), r1
+	add	#-SH_SLEEP_BASE_ADDR, r0
+	mov.l	@r1, r1
+	add	#SH_SLEEP_BASE_DATA, r0
+	mov.l	r1, @(r0, r5)
+	add	#-SH_SLEEP_BASE_DATA, r0
+	rts
+	 nop
+
+get_register:
+	add	#SH_SLEEP_BASE_ADDR, r0
+	mov.l	@(r0, r5), r0
+	rts
+	 nop
+ENTRY(sh_mobile_sleep_enter_end)
+
+	.balign 4
+ENTRY(sh_mobile_sleep_resume_start)
+
+	/* figure out start address */
+	bsr	0f
+	 nop
+0:
+	sts	pr, k1
+	mov.l	1f, k0
+	and	k0, k1
+
+	/* store pointer to data area in VBR */
+	ldc	k1, vbr
+
+	/* setup sr with saved sr */
+	mov.l	@(SH_SLEEP_SR, k1), k0
+	ldc	k0, sr
+
+	/* now: user register set! */
+	stc	vbr, r5
+
 	/* setup spc with return address to c code */
-	mov.l	saved_spc, k0
-	ldc	k0, spc
+	mov.l	@(SH_SLEEP_SPC, r5), r0
+	ldc	r0, spc
 
 	/* restore vbr */
-	mov.l	saved_vbr, k0
-	ldc	k0, vbr
+	mov.l	@(SH_SLEEP_VBR, r5), r0
+	ldc	r0, vbr
 
 	/* setup ssr with saved sr */
-	mov.l	saved_sr, k0
-	ldc	k0, ssr
+	mov.l	@(SH_SLEEP_SR, r5), r0
+	ldc	r0, ssr
 
-	/* get mode flags */
-	mov.l	saved_mode, k0
+	/* restore sp */
+	mov.l   @(SH_SLEEP_SP, r5), r15
 
-done_sleep:
-	/* reset standby mode to sleep mode */
-	mov.l	5f, k4
-	mov	#0x00, k1
-	mov.l	k1, @k4
+	/* restore sleep mode register */
+	bsr     restore_register
+	 mov    #SH_SLEEP_REG_STBCR, r0
 
-	tst	#SUSP_SH_SF, k0
+	/* call self-refresh resume code if needed */
+	mov.l	@(SH_SLEEP_MODE, r5), r0
+	tst	#SUSP_SH_SF, r0
 	bt	skip_restore_sf
 
-#ifdef CONFIG_CPU_SUBTYPE_SH7724
-	/* DBSC: put memory in auto-refresh mode */
-	mov.l	dbrfpdn0_reg, k4
-	mov.l	dbrfpdn0_data0, k1
-	mov.l	k1, @k4
+	mov.l	@(SH_SLEEP_SF_POST, r5), r0
+	jsr	@r0
+	 nop
 
-	nop /* sleep 140 ns */
-	nop
-	nop
-	nop
-
-	mov.l	dbcmdcnt_reg, k4
-	mov.l	dbcmdcnt_data0, k1
-	mov.l	k1, @k4
-
-	mov.l	dbcmdcnt_reg, k4
-	mov.l	dbcmdcnt_data1, k1
-	mov.l	k1, @k4
-
-	mov.l	dben_reg, k4
-	mov.l	dben_data1, k1
-	mov.l	k1, @k4
-
-	mov.l	dbrfpdn0_reg, k4
-	mov.l	dbrfpdn0_data2, k1
-	mov.l	k1, @k4
-#else
-	/* SBSC: set auto-refresh mode */
-	mov.l	1f, k4
-	mov.l	@k4, k0
-	mov.l   4f, k1
-	and	k1, k0
-	mov.l	k0, @k4
-	mov.l	6f, k4
-	mov.l	8f, k0
-	mov.l	@k4, k1
-	mov	#-1, k4
-	add	k4, k1
-	or	k1, k0
-	mov.l	7f, k1
-	mov.l	k0, @k1
-#endif
 skip_restore_sf:
-	/* jump to vbr vector */
-	mov.l	saved_vbr, k0
-	mov.l	offset_vbr, k4
-	add	k4, k0
-	jmp	@k0
+	/* restore mmu and cache state if needed */
+	mov.l	@(SH_SLEEP_MODE, r5), r0
+	tst	#SUSP_SH_MMU, r0
+	bt	skip_restore_mmu
+
+	/* restore mmu state */
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_PTEH, r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_PTEL, r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_TTB, r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_TEA, r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_PTEA, r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_PASCR, r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_IRMCR, r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_MMUCR, r0
+	icbi	@r0
+
+	/* restore cache settings */
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_RAMCR, r0
+	icbi	@r0
+
+	bsr	restore_register
+	 mov	#SH_SLEEP_REG_CCR, r0
+	icbi	@r0
+
+skip_restore_mmu:
+	rte
+	 nop
+
+restore_register:
+	add	#SH_SLEEP_BASE_DATA, r0
+	mov.l	@(r0, r5), r1
+	add	#-SH_SLEEP_BASE_DATA, r0
+	add	#SH_SLEEP_BASE_ADDR, r0
+	mov.l	@(r0, r5), r0
+	mov.l	r1, @r0
+	rts
 	 nop
 
 	.balign 4
-saved_mode:	.long	0
-saved_spc:	.long	0
-saved_sr:	.long	0
-saved_vbr:	.long	0
-offset_vbr:	.long	0x600
-#ifdef CONFIG_CPU_SUBTYPE_SH7724
-dben_reg:	.long	0xfd000010 /* DBEN */
-dben_data0:	.long	0
-dben_data1:	.long	1
-dbrfpdn0_reg:	.long	0xfd000040 /* DBRFPDN0 */
-dbrfpdn0_data0:	.long	0
-dbrfpdn0_data1:	.long	1
-dbrfpdn0_data2:	.long	0x00010000
-dbcmdcnt_reg:	.long	0xfd000014 /* DBCMDCNT */
-dbcmdcnt_data0:	.long	2
-dbcmdcnt_data1:	.long	4
-#else
-1:	.long	0xfe400008 /* SDCR0 */
-2:	.long	0x00000400
-3:	.long	0xffff7fff
-4:	.long	0xfffffbff
-#endif
-5:	.long	0xa4150020 /* STBCR */
-6:	.long   0xfe40001c /* RTCOR */
-7:	.long   0xfe400018 /* RTCNT */
-8:	.long   0xa55a0000
-
-
-/* interrupt vector @ 0x600 */
-	.balign 	0x400,0,0x400
-	.long	0xdeadbeef
-	.balign 	0x200,0,0x200
-	bra	restore_jump_vbr
-	 nop
-sh_mobile_standby_end:
-
-ENTRY(sh_mobile_standby_size)
-	.long sh_mobile_standby_end - sh_mobile_standby
+1:	.long	~0x7ff
+ENTRY(sh_mobile_sleep_resume_end)
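
The save_register/get_register/restore_register helpers above are table-driven: r0 carries one of the SH_SLEEP_REG_* byte offsets, and the sh_sleep_data block addressed through r5 holds parallel address and value tables at SH_SLEEP_BASE_ADDR and SH_SLEEP_BASE_DATA. A rough C model of the same idea, assuming the layout implied by the asm-offsets (the function names and casts here are illustrative, not kernel code):

static void save_one_register(struct sh_sleep_data *sd, unsigned int reg_off)
{
	/* addr table: MMIO address of the register for this slot */
	unsigned long addr = *(unsigned long *)((char *)&sd->addr + reg_off);

	/* data table: saved value for the same slot */
	*(unsigned long *)((char *)&sd->data + reg_off) =
		__raw_readl((void __iomem *)addr);
}

static void restore_one_register(struct sh_sleep_data *sd, unsigned int reg_off)
{
	unsigned long addr = *(unsigned long *)((char *)&sd->addr + reg_off);

	__raw_writel(*(unsigned long *)((char *)&sd->data + reg_off),
		     (void __iomem *)addr);
}
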
diff --git a/arch/sh/kernel/cpu/ubc.S b/arch/sh/kernel/cpu/ubc.S
deleted file mode 100644
index 8192307..0000000
--- a/arch/sh/kernel/cpu/ubc.S
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * arch/sh/kernel/cpu/ubc.S
- *
- * Set of management routines for the User Break Controller (UBC)
- *
- * Copyright (C) 2002 Paul Mundt
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- */
-#include <linux/linkage.h>
-#include <asm/ubc.h>
-
-#define STBCR2		0xffc00010
-
-ENTRY(ubc_sleep)
-	mov	#0, r0
-
-	mov.l	1f, r1		! Zero out UBC_BBRA ..
-	mov.w	r0, @r1
-
-	mov.l	2f, r1		! .. same for BBRB ..
-	mov.w	r0, @r1
-
-	mov.l	3f, r1		! .. and again for BRCR.
-	mov.w	r0, @r1
-
-	mov.w	@r1, r0		! Dummy read BRCR
-
-	mov.l	4f, r1		! Set MSTP5 in STBCR2
-	mov.b	@r1, r0
-	or	#0x01, r0
-	mov.b	r0, @r1
-
-	mov.b	@r1, r0		! Two dummy reads ..
-	mov.b	@r1, r0
-
-	rts
-	nop
-
-ENTRY(ubc_wakeup)
-	mov.l	4f, r1		! Clear MSTP5
-	mov.b	@r1, r0
-	and	#0xfe, r0
-	mov.b	r0, @r1
-
-	mov.b	@r1, r0		! Two more dummy reads ..
-	mov.b	@r1, r0
-
-	rts
-	nop
-
-1:	.long	UBC_BBRA
-2:	.long	UBC_BBRB
-3:	.long	UBC_BRCR
-4:	.long	STBCR2
-
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
new file mode 100644
index 0000000..3c55b87
--- /dev/null
+++ b/arch/sh/kernel/dma-nommu.c
@@ -0,0 +1,82 @@
+/*
+ * DMA mapping support for platforms lacking IOMMUs.
+ *
+ * Copyright (C) 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/dma-mapping.h>
+#include <linux/io.h>
+
+static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
+				 unsigned long offset, size_t size,
+				 enum dma_data_direction dir,
+				 struct dma_attrs *attrs)
+{
+	dma_addr_t addr = page_to_phys(page) + offset;
+
+	WARN_ON(size == 0);
+	dma_cache_sync(dev, page_address(page) + offset, size, dir);
+
+	return addr;
+}
+
+static int nommu_map_sg(struct device *dev, struct scatterlist *sg,
+			int nents, enum dma_data_direction dir,
+			struct dma_attrs *attrs)
+{
+	struct scatterlist *s;
+	int i;
+
+	WARN_ON(nents == 0 || sg[0].length == 0);
+
+	for_each_sg(sg, s, nents, i) {
+		BUG_ON(!sg_page(s));
+
+		dma_cache_sync(dev, sg_virt(s), s->length, dir);
+
+		s->dma_address = sg_phys(s);
+		s->dma_length = s->length;
+	}
+
+	return nents;
+}
+
+#ifdef CONFIG_DMA_NONCOHERENT
+static void nommu_sync_single(struct device *dev, dma_addr_t addr,
+			      size_t size, enum dma_data_direction dir)
+{
+	dma_cache_sync(dev, phys_to_virt(addr), size, dir);
+}
+
+static void nommu_sync_sg(struct device *dev, struct scatterlist *sg,
+			  int nelems, enum dma_data_direction dir)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nelems, i)
+		dma_cache_sync(dev, sg_virt(s), s->length, dir);
+}
+#endif
+
+struct dma_map_ops nommu_dma_ops = {
+	.alloc_coherent		= dma_generic_alloc_coherent,
+	.free_coherent		= dma_generic_free_coherent,
+	.map_page		= nommu_map_page,
+	.map_sg			= nommu_map_sg,
+#ifdef CONFIG_DMA_NONCOHERENT
+	.sync_single_for_device	= nommu_sync_single,
+	.sync_sg_for_device	= nommu_sync_sg,
+#endif
+	.is_phys		= 1,
+};
+
+void __init no_iommu_init(void)
+{
+	if (dma_ops)
+		return;
+	dma_ops = &nommu_dma_ops;
+}
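
With these ops in place, the streaming DMA API reduces to a physical-address translation plus an explicit cache sync. A hedged driver-side sketch of the call sequence that ends up in nommu_map_page() above; dev, buf, len and the function name are placeholders:

static int start_tx(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... hand "handle" to the device and start the transfer, then
	 * dma_unmap_single(dev, handle, len, DMA_TO_DEVICE) on completion ... */
	return 0;
}
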
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index d76a231..3576b70 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -20,6 +20,7 @@
 #include <linux/list.h>
 #include <linux/mempool.h>
 #include <linux/mm.h>
+#include <linux/elf.h>
 #include <linux/ftrace.h>
 #include <asm/dwarf.h>
 #include <asm/unwinder.h>
@@ -530,7 +531,18 @@
 }
 
 /**
- *	dwarf_unwind_stack - recursively unwind the stack
+ *	dwarf_free_frame - free the memory allocated for @frame
+ *	@frame: the frame to free
+ */
+void dwarf_free_frame(struct dwarf_frame *frame)
+{
+	dwarf_frame_free_regs(frame);
+	mempool_free(frame, dwarf_frame_pool);
+}
+
+/**
+ *	dwarf_unwind_stack - unwind the stack
+ *
  *	@pc: address of the function to unwind
  *	@prev: struct dwarf_frame of the previous stackframe on the callstack
  *
@@ -548,9 +560,9 @@
 	unsigned long addr;
 
 	/*
-	 * If this is the first invocation of this recursive function we
-	 * need get the contents of a physical register to get the CFA
-	 * in order to begin the virtual unwinding of the stack.
+	 * If we're starting at the top of the stack we need to get the
+	 * contents of a physical register to get the CFA in order to
+	 * begin the virtual unwinding of the stack.
 	 *
 	 * NOTE: the return address is guaranteed to be setup by the
 	 * time this function makes its first function call.
@@ -593,9 +605,8 @@
 	fde = dwarf_lookup_fde(pc);
 	if (!fde) {
 		/*
-		 * This is our normal exit path - the one that stops the
-		 * recursion. There's two reasons why we might exit
-		 * here,
+		 * This is our normal exit path. There are two reasons
+		 * why we might exit here,
 		 *
 		 *	a) pc has no associated DWARF frame info and so
 		 *	we don't know how to unwind this frame. This is
@@ -637,10 +648,10 @@
 
 		} else {
 			/*
-			 * Again, this is the first invocation of this
-			 * recurisve function. We need to physically
-			 * read the contents of a register in order to
-			 * get the Canonical Frame Address for this
+			 * Again, we're starting from the top of the
+			 * stack. We need to physically read
+			 * the contents of a register in order to get
+			 * the Canonical Frame Address for this
 			 * function.
 			 */
 			frame->cfa = dwarf_read_arch_reg(frame->cfa_register);
@@ -670,13 +681,12 @@
 	return frame;
 
 bail:
-	dwarf_frame_free_regs(frame);
-	mempool_free(frame, dwarf_frame_pool);
+	dwarf_free_frame(frame);
 	return NULL;
 }
 
 static int dwarf_parse_cie(void *entry, void *p, unsigned long len,
-			   unsigned char *end)
+			   unsigned char *end, struct module *mod)
 {
 	struct dwarf_cie *cie;
 	unsigned long flags;
@@ -772,6 +782,8 @@
 	cie->initial_instructions = p;
 	cie->instructions_end = end;
 
+	cie->mod = mod;
+
 	/* Add to list */
 	spin_lock_irqsave(&dwarf_cie_lock, flags);
 	list_add_tail(&cie->link, &dwarf_cie_list);
@@ -782,7 +794,7 @@
 
 static int dwarf_parse_fde(void *entry, u32 entry_type,
 			   void *start, unsigned long len,
-			   unsigned char *end)
+			   unsigned char *end, struct module *mod)
 {
 	struct dwarf_fde *fde;
 	struct dwarf_cie *cie;
@@ -831,6 +843,8 @@
 	fde->instructions = p;
 	fde->end = end;
 
+	fde->mod = mod;
+
 	/* Add to list. */
 	spin_lock_irqsave(&dwarf_fde_lock, flags);
 	list_add_tail(&fde->link, &dwarf_fde_list);
@@ -854,10 +868,8 @@
 	while (1) {
 		frame = dwarf_unwind_stack(return_addr, _frame);
 
-		if (_frame) {
-			dwarf_frame_free_regs(_frame);
-			mempool_free(_frame, dwarf_frame_pool);
-		}
+		if (_frame)
+			dwarf_free_frame(_frame);
 
 		_frame = frame;
 
@@ -867,6 +879,9 @@
 		return_addr = frame->return_addr;
 		ops->address(data, return_addr, 1);
 	}
+
+	if (frame)
+		dwarf_free_frame(frame);
 }
 
 static struct unwinder dwarf_unwinder = {
@@ -896,6 +911,158 @@
 }
 
 /**
+ *	dwarf_parse_section - parse DWARF section
+ *	@eh_frame_start: start address of the .eh_frame section
+ *	@eh_frame_end: end address of the .eh_frame section
+ *	@mod: the kernel module containing the .eh_frame section
+ *
+ *	Parse the information in a .eh_frame section.
+ */
+static int dwarf_parse_section(char *eh_frame_start, char *eh_frame_end,
+			       struct module *mod)
+{
+	u32 entry_type;
+	void *p, *entry;
+	int count, err = 0;
+	unsigned long len = 0;
+	unsigned int c_entries, f_entries;
+	unsigned char *end;
+
+	c_entries = 0;
+	f_entries = 0;
+	entry = eh_frame_start;
+
+	while ((char *)entry < eh_frame_end) {
+		p = entry;
+
+		count = dwarf_entry_len(p, &len);
+		if (count == 0) {
+			/*
+			 * We read a bogus length field value. There is
+			 * nothing we can do here apart from disabling
+			 * the DWARF unwinder. We can't even skip this
+			 * entry and move to the next one because 'len'
+			 * tells us where our next entry is.
+			 */
+			err = -EINVAL;
+			goto out;
+		} else
+			p += count;
+
+		/* initial length does not include itself */
+		end = p + len;
+
+		entry_type = get_unaligned((u32 *)p);
+		p += 4;
+
+		if (entry_type == DW_EH_FRAME_CIE) {
+			err = dwarf_parse_cie(entry, p, len, end, mod);
+			if (err < 0)
+				goto out;
+			else
+				c_entries++;
+		} else {
+			err = dwarf_parse_fde(entry, entry_type, p, len,
+					      end, mod);
+			if (err < 0)
+				goto out;
+			else
+				f_entries++;
+		}
+
+		entry = (char *)entry + len + 4;
+	}
+
+	printk(KERN_INFO "DWARF unwinder initialised: read %u CIEs, %u FDEs\n",
+	       c_entries, f_entries);
+
+	return 0;
+
+out:
+	return err;
+}
+
+#ifdef CONFIG_MODULES
+int module_dwarf_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+			  struct module *me)
+{
+	unsigned int i, err;
+	unsigned long start, end;
+	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+	start = end = 0;
+
+	for (i = 1; i < hdr->e_shnum; i++) {
+		/* Alloc bit cleared means "ignore it." */
+		if ((sechdrs[i].sh_flags & SHF_ALLOC)
+		    && !strcmp(secstrings+sechdrs[i].sh_name, ".eh_frame")) {
+			start = sechdrs[i].sh_addr;
+			end = start + sechdrs[i].sh_size;
+			break;
+		}
+	}
+
+	/* Did we find the .eh_frame section? */
+	if (i != hdr->e_shnum) {
+		err = dwarf_parse_section((char *)start, (char *)end, me);
+		if (err) {
+			printk(KERN_WARNING "%s: failed to parse DWARF info\n",
+			       me->name);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ *	module_dwarf_cleanup - remove FDE/CIEs associated with @mod
+ *	@mod: the module that is being unloaded
+ *
+ *	Remove any FDEs and CIEs from the global lists that came from
+ *	@mod's .eh_frame section because @mod is being unloaded.
+ */
+void module_dwarf_cleanup(struct module *mod)
+{
+	struct dwarf_fde *fde;
+	struct dwarf_cie *cie;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dwarf_cie_lock, flags);
+
+again_cie:
+	list_for_each_entry(cie, &dwarf_cie_list, link) {
+		if (cie->mod == mod)
+			break;
+	}
+
+	if (&cie->link != &dwarf_cie_list) {
+		list_del(&cie->link);
+		kfree(cie);
+		goto again_cie;
+	}
+
+	spin_unlock_irqrestore(&dwarf_cie_lock, flags);
+
+	spin_lock_irqsave(&dwarf_fde_lock, flags);
+
+again_fde:
+	list_for_each_entry(fde, &dwarf_fde_list, link) {
+		if (fde->mod == mod)
+			break;
+	}
+
+	if (&fde->link != &dwarf_fde_list) {
+		list_del(&fde->link);
+		kfree(fde);
+		goto again_fde;
+	}
+
+	spin_unlock_irqrestore(&dwarf_fde_lock, flags);
+}
+#endif /* CONFIG_MODULES */
+
+/**
  *	dwarf_unwinder_init - initialise the dwarf unwinder
  *
  *	Build the data structures describing the .dwarf_frame section to
@@ -906,19 +1073,10 @@
  */
 static int __init dwarf_unwinder_init(void)
 {
-	u32 entry_type;
-	void *p, *entry;
-	int count, err = 0;
-	unsigned long len;
-	unsigned int c_entries, f_entries;
-	unsigned char *end;
+	int err;
 	INIT_LIST_HEAD(&dwarf_cie_list);
 	INIT_LIST_HEAD(&dwarf_fde_list);
 
-	c_entries = 0;
-	f_entries = 0;
-	entry = &__start_eh_frame;
-
 	dwarf_frame_cachep = kmem_cache_create("dwarf_frames",
 			sizeof(struct dwarf_frame), 0,
 			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
@@ -937,47 +1095,9 @@
 					 mempool_free_slab,
 					 dwarf_reg_cachep);
 
-	while ((char *)entry < __stop_eh_frame) {
-		p = entry;
-
-		count = dwarf_entry_len(p, &len);
-		if (count == 0) {
-			/*
-			 * We read a bogus length field value. There is
-			 * nothing we can do here apart from disabling
-			 * the DWARF unwinder. We can't even skip this
-			 * entry and move to the next one because 'len'
-			 * tells us where our next entry is.
-			 */
-			goto out;
-		} else
-			p += count;
-
-		/* initial length does not include itself */
-		end = p + len;
-
-		entry_type = get_unaligned((u32 *)p);
-		p += 4;
-
-		if (entry_type == DW_EH_FRAME_CIE) {
-			err = dwarf_parse_cie(entry, p, len, end);
-			if (err < 0)
-				goto out;
-			else
-				c_entries++;
-		} else {
-			err = dwarf_parse_fde(entry, entry_type, p, len, end);
-			if (err < 0)
-				goto out;
-			else
-				f_entries++;
-		}
-
-		entry = (char *)entry + len + 4;
-	}
-
-	printk(KERN_INFO "DWARF unwinder initialised: read %u CIEs, %u FDEs\n",
-	       c_entries, f_entries);
+	err = dwarf_parse_section(__start_eh_frame, __stop_eh_frame, NULL);
+	if (err)
+		goto out;
 
 	err = unwinder_register(&dwarf_unwinder);
 	if (err)
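
A side note on module_dwarf_cleanup() above: it restarts each list walk from the head after every deletion because list_for_each_entry() cannot survive removal of its cursor. Under the same locks, the _safe iterator would allow a single pass; a sketch of the equivalent CIE loop, shown here to explain the behaviour rather than as a proposed change:

static void cleanup_cies(struct module *mod)
{
	struct dwarf_cie *cie, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&dwarf_cie_lock, flags);
	list_for_each_entry_safe(cie, tmp, &dwarf_cie_list, link) {
		if (cie->mod == mod) {
			list_del(&cie->link);
			kfree(cie);
		}
	}
	spin_unlock_irqrestore(&dwarf_cie_lock, flags);
}
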
diff --git a/arch/sh/kernel/entry-common.S b/arch/sh/kernel/entry-common.S
index 3eb8493..f0abd58 100644
--- a/arch/sh/kernel/entry-common.S
+++ b/arch/sh/kernel/entry-common.S
@@ -133,7 +133,7 @@
 	! r8: current_thread_info
 	! t:  result of "tst	#_TIF_NEED_RESCHED, r0"
 	bf/s	work_resched
-	 tst	#(_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK), r0
+	 tst	#_TIF_SIGPENDING, r0
 work_notifysig:
 	bt/s	__restore_all
 	 mov	r15, r4
diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c
index 2c48e26..b6f41c1 100644
--- a/arch/sh/kernel/ftrace.c
+++ b/arch/sh/kernel/ftrace.c
@@ -62,6 +62,150 @@
 	return ftrace_replaced_code;
 }
 
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUs from executing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ *    and the new code into the "code" buffer.
+ * 2) Wait for any running NMIs to finish and set a flag that says
+ *    we are modifying code; this is done in one atomic operation.
+ * 3) Write the code.
+ * 4) Clear the flag.
+ * 5) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+#define MOD_CODE_WRITE_FLAG (1 << 31)	/* set when NMI should do the write */
+static atomic_t nmi_running = ATOMIC_INIT(0);
+static int mod_code_status;		/* holds return value of text write */
+static void *mod_code_ip;		/* holds the IP to write to */
+static void *mod_code_newcode;		/* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+	int r;
+
+	r = snprintf(buf, size, "%u %u",
+		     nmi_wait_count,
+		     atomic_read(&nmi_update_count));
+	return r;
+}
+
+static void clear_mod_flag(void)
+{
+	int old = atomic_read(&nmi_running);
+
+	for (;;) {
+		int new = old & ~MOD_CODE_WRITE_FLAG;
+
+		if (old == new)
+			break;
+
+		old = atomic_cmpxchg(&nmi_running, old, new);
+	}
+}
+
+static void ftrace_mod_code(void)
+{
+	/*
+	 * Yes, more than one CPU may be writing to mod_code_status
+	 *    (and the code itself)
+	 * But if one were to fail, then they all should, and if one were
+	 * to succeed, then they all should.
+	 */
+	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+					     MCOUNT_INSN_SIZE);
+
+	/* if we fail, then kill any new writers */
+	if (mod_code_status)
+		clear_mod_flag();
+}
+
+void ftrace_nmi_enter(void)
+{
+	if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
+		smp_rmb();
+		ftrace_mod_code();
+		atomic_inc(&nmi_update_count);
+	}
+	/* Must have previous changes seen before executions */
+	smp_mb();
+}
+
+void ftrace_nmi_exit(void)
+{
+	/* Finish all executions before clearing nmi_running */
+	smp_mb();
+	atomic_dec(&nmi_running);
+}
+
+static void wait_for_nmi_and_set_mod_flag(void)
+{
+	if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
+		return;
+
+	do {
+		cpu_relax();
+	} while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
+
+	nmi_wait_count++;
+}
+
+static void wait_for_nmi(void)
+{
+	if (!atomic_read(&nmi_running))
+		return;
+
+	do {
+		cpu_relax();
+	} while (atomic_read(&nmi_running));
+
+	nmi_wait_count++;
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+	mod_code_ip = (void *)ip;
+	mod_code_newcode = new_code;
+
+	/* The buffers need to be visible before we let NMIs write them */
+	smp_mb();
+
+	wait_for_nmi_and_set_mod_flag();
+
+	/* Make sure all running NMIs have finished before we write the code */
+	smp_mb();
+
+	ftrace_mod_code();
+
+	/* Make sure the write happens before clearing the bit */
+	smp_mb();
+
+	clear_mod_flag();
+	wait_for_nmi();
+
+	return mod_code_status;
+}
+
 static int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		       unsigned char *new_code)
 {
@@ -86,7 +230,7 @@
 		return -EINVAL;
 
 	/* replace the text with the new text */
-	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+	if (do_ftrace_mod_code(ip, new_code))
 		return -EPERM;
 
 	flush_icache_range(ip, ip + MCOUNT_INSN_SIZE);
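
The NMI handshake described in the comment block above can be modelled compactly outside the kernel. A user-space C11 sketch of the same protocol, where seq_cst atomics stand in for the smp_mb() calls and do_write() is the idempotent text write; this is an illustration, not the kernel code:

#include <stdatomic.h>

#define MOD_CODE_WRITE_FLAG	(1u << 31)

static atomic_uint nmi_running;

/* Writer: raise the flag once no NMI is in flight, do the (idempotent)
 * text write, drop the flag, then wait for any late NMIs to drain. */
static void writer(void (*do_write)(void))
{
	unsigned int zero = 0;

	while (!atomic_compare_exchange_weak(&nmi_running, &zero,
					     MOD_CODE_WRITE_FLAG))
		zero = 0;		/* an NMI is still running; retry */

	do_write();

	atomic_fetch_and(&nmi_running, ~MOD_CODE_WRITE_FLAG);
	while (atomic_load(&nmi_running))
		;			/* stragglers finish their write */
}

/* NMI entry: if the flag is up, perform the same write before running
 * anything that might pass through the patch site. */
static void nmi_enter_model(void (*do_write)(void))
{
	if (atomic_fetch_add(&nmi_running, 1) & MOD_CODE_WRITE_FLAG)
		do_write();
}

static void nmi_exit_model(void)
{
	atomic_fetch_sub(&nmi_running, 1);
}
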
diff --git a/arch/sh/kernel/gpio.c b/arch/sh/kernel/gpio.c
deleted file mode 100644
index d22e5af..0000000
--- a/arch/sh/kernel/gpio.c
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- * Pinmuxed GPIO support for SuperH.
- *
- * Copyright (C) 2008 Magnus Damm
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/bitops.h>
-#include <linux/gpio.h>
-
-static int enum_in_range(pinmux_enum_t enum_id, struct pinmux_range *r)
-{
-	if (enum_id < r->begin)
-		return 0;
-
-	if (enum_id > r->end)
-		return 0;
-
-	return 1;
-}
-
-static unsigned long gpio_read_raw_reg(unsigned long reg,
-				       unsigned long reg_width)
-{
-	switch (reg_width) {
-	case 8:
-		return ctrl_inb(reg);
-	case 16:
-		return ctrl_inw(reg);
-	case 32:
-		return ctrl_inl(reg);
-	}
-
-	BUG();
-	return 0;
-}
-
-static void gpio_write_raw_reg(unsigned long reg,
-			       unsigned long reg_width,
-			       unsigned long data)
-{
-	switch (reg_width) {
-	case 8:
-		ctrl_outb(data, reg);
-		return;
-	case 16:
-		ctrl_outw(data, reg);
-		return;
-	case 32:
-		ctrl_outl(data, reg);
-		return;
-	}
-
-	BUG();
-}
-
-static void gpio_write_bit(struct pinmux_data_reg *dr,
-			   unsigned long in_pos, unsigned long value)
-{
-	unsigned long pos;
-
-	pos = dr->reg_width - (in_pos + 1);
-
-#ifdef DEBUG
-	pr_info("write_bit addr = %lx, value = %ld, pos = %ld, "
-		"r_width = %ld\n",
-		dr->reg, !!value, pos, dr->reg_width);
-#endif
-
-	if (value)
-		set_bit(pos, &dr->reg_shadow);
-	else
-		clear_bit(pos, &dr->reg_shadow);
-
-	gpio_write_raw_reg(dr->reg, dr->reg_width, dr->reg_shadow);
-}
-
-static int gpio_read_reg(unsigned long reg, unsigned long reg_width,
-			 unsigned long field_width, unsigned long in_pos)
-{
-	unsigned long data, mask, pos;
-
-	data = 0;
-	mask = (1 << field_width) - 1;
-	pos = reg_width - ((in_pos + 1) * field_width);
-
-#ifdef DEBUG
-	pr_info("read_reg: addr = %lx, pos = %ld, "
-		"r_width = %ld, f_width = %ld\n",
-		reg, pos, reg_width, field_width);
-#endif
-
-	data = gpio_read_raw_reg(reg, reg_width);
-	return (data >> pos) & mask;
-}
-
-static void gpio_write_reg(unsigned long reg, unsigned long reg_width,
-			   unsigned long field_width, unsigned long in_pos,
-			   unsigned long value)
-{
-	unsigned long mask, pos;
-
-	mask = (1 << field_width) - 1;
-	pos = reg_width - ((in_pos + 1) * field_width);
-
-#ifdef DEBUG
-	pr_info("write_reg addr = %lx, value = %ld, pos = %ld, "
-		"r_width = %ld, f_width = %ld\n",
-		reg, value, pos, reg_width, field_width);
-#endif
-
-	mask = ~(mask << pos);
-	value = value << pos;
-
-	switch (reg_width) {
-	case 8:
-		ctrl_outb((ctrl_inb(reg) & mask) | value, reg);
-		break;
-	case 16:
-		ctrl_outw((ctrl_inw(reg) & mask) | value, reg);
-		break;
-	case 32:
-		ctrl_outl((ctrl_inl(reg) & mask) | value, reg);
-		break;
-	}
-}
-
-static int setup_data_reg(struct pinmux_info *gpioc, unsigned gpio)
-{
-	struct pinmux_gpio *gpiop = &gpioc->gpios[gpio];
-	struct pinmux_data_reg *data_reg;
-	int k, n;
-
-	if (!enum_in_range(gpiop->enum_id, &gpioc->data))
-		return -1;
-
-	k = 0;
-	while (1) {
-		data_reg = gpioc->data_regs + k;
-
-		if (!data_reg->reg_width)
-			break;
-
-		for (n = 0; n < data_reg->reg_width; n++) {
-			if (data_reg->enum_ids[n] == gpiop->enum_id) {
-				gpiop->flags &= ~PINMUX_FLAG_DREG;
-				gpiop->flags |= (k << PINMUX_FLAG_DREG_SHIFT);
-				gpiop->flags &= ~PINMUX_FLAG_DBIT;
-				gpiop->flags |= (n << PINMUX_FLAG_DBIT_SHIFT);
-				return 0;
-			}
-		}
-		k++;
-	}
-
-	BUG();
-
-	return -1;
-}
-
-static void setup_data_regs(struct pinmux_info *gpioc)
-{
-	struct pinmux_data_reg *drp;
-	int k;
-
-	for (k = gpioc->first_gpio; k <= gpioc->last_gpio; k++)
-		setup_data_reg(gpioc, k);
-
-	k = 0;
-	while (1) {
-		drp = gpioc->data_regs + k;
-
-		if (!drp->reg_width)
-			break;
-
-		drp->reg_shadow = gpio_read_raw_reg(drp->reg, drp->reg_width);
-		k++;
-	}
-}
-
-static int get_data_reg(struct pinmux_info *gpioc, unsigned gpio,
-			struct pinmux_data_reg **drp, int *bitp)
-{
-	struct pinmux_gpio *gpiop = &gpioc->gpios[gpio];
-	int k, n;
-
-	if (!enum_in_range(gpiop->enum_id, &gpioc->data))
-		return -1;
-
-	k = (gpiop->flags & PINMUX_FLAG_DREG) >> PINMUX_FLAG_DREG_SHIFT;
-	n = (gpiop->flags & PINMUX_FLAG_DBIT) >> PINMUX_FLAG_DBIT_SHIFT;
-	*drp = gpioc->data_regs + k;
-	*bitp = n;
-	return 0;
-}
-
-static int get_config_reg(struct pinmux_info *gpioc, pinmux_enum_t enum_id,
-			  struct pinmux_cfg_reg **crp, int *indexp,
-			  unsigned long **cntp)
-{
-	struct pinmux_cfg_reg *config_reg;
-	unsigned long r_width, f_width;
-	int k, n;
-
-	k = 0;
-	while (1) {
-		config_reg = gpioc->cfg_regs + k;
-
-		r_width = config_reg->reg_width;
-		f_width = config_reg->field_width;
-
-		if (!r_width)
-			break;
-		for (n = 0; n < (r_width / f_width) * 1 << f_width; n++) {
-			if (config_reg->enum_ids[n] == enum_id) {
-				*crp = config_reg;
-				*indexp = n;
-				*cntp = &config_reg->cnt[n / (1 << f_width)];
-				return 0;
-			}
-		}
-		k++;
-	}
-
-	return -1;
-}
-
-static int get_gpio_enum_id(struct pinmux_info *gpioc, unsigned gpio,
-			    int pos, pinmux_enum_t *enum_idp)
-{
-	pinmux_enum_t enum_id = gpioc->gpios[gpio].enum_id;
-	pinmux_enum_t *data = gpioc->gpio_data;
-	int k;
-
-	if (!enum_in_range(enum_id, &gpioc->data)) {
-		if (!enum_in_range(enum_id, &gpioc->mark)) {
-			pr_err("non data/mark enum_id for gpio %d\n", gpio);
-			return -1;
-		}
-	}
-
-	if (pos) {
-		*enum_idp = data[pos + 1];
-		return pos + 1;
-	}
-
-	for (k = 0; k < gpioc->gpio_data_size; k++) {
-		if (data[k] == enum_id) {
-			*enum_idp = data[k + 1];
-			return k + 1;
-		}
-	}
-
-	pr_err("cannot locate data/mark enum_id for gpio %d\n", gpio);
-	return -1;
-}
-
-static void write_config_reg(struct pinmux_info *gpioc,
-			     struct pinmux_cfg_reg *crp,
-			     int index)
-{
-	unsigned long ncomb, pos, value;
-
-	ncomb = 1 << crp->field_width;
-	pos = index / ncomb;
-	value = index % ncomb;
-
-	gpio_write_reg(crp->reg, crp->reg_width, crp->field_width, pos, value);
-}
-
-static int check_config_reg(struct pinmux_info *gpioc,
-			    struct pinmux_cfg_reg *crp,
-			    int index)
-{
-	unsigned long ncomb, pos, value;
-
-	ncomb = 1 << crp->field_width;
-	pos = index / ncomb;
-	value = index % ncomb;
-
-	if (gpio_read_reg(crp->reg, crp->reg_width,
-			  crp->field_width, pos) == value)
-		return 0;
-
-	return -1;
-}
-
-enum { GPIO_CFG_DRYRUN, GPIO_CFG_REQ, GPIO_CFG_FREE };
-
-static int pinmux_config_gpio(struct pinmux_info *gpioc, unsigned gpio,
-			      int pinmux_type, int cfg_mode)
-{
-	struct pinmux_cfg_reg *cr = NULL;
-	pinmux_enum_t enum_id;
-	struct pinmux_range *range;
-	int in_range, pos, index;
-	unsigned long *cntp;
-
-	switch (pinmux_type) {
-
-	case PINMUX_TYPE_FUNCTION:
-		range = NULL;
-		break;
-
-	case PINMUX_TYPE_OUTPUT:
-		range = &gpioc->output;
-		break;
-
-	case PINMUX_TYPE_INPUT:
-		range = &gpioc->input;
-		break;
-
-	case PINMUX_TYPE_INPUT_PULLUP:
-		range = &gpioc->input_pu;
-		break;
-
-	case PINMUX_TYPE_INPUT_PULLDOWN:
-		range = &gpioc->input_pd;
-		break;
-
-	default:
-		goto out_err;
-	}
-
-	pos = 0;
-	enum_id = 0;
-	index = 0;
-	while (1) {
-		pos = get_gpio_enum_id(gpioc, gpio, pos, &enum_id);
-		if (pos <= 0)
-			goto out_err;
-
-		if (!enum_id)
-			break;
-
-		in_range = enum_in_range(enum_id, &gpioc->function);
-		if (!in_range && range) {
-			in_range = enum_in_range(enum_id, range);
-
-			if (in_range && enum_id == range->force)
-				continue;
-		}
-
-		if (!in_range)
-			continue;
-
-		if (get_config_reg(gpioc, enum_id, &cr, &index, &cntp) != 0)
-			goto out_err;
-
-		switch (cfg_mode) {
-		case GPIO_CFG_DRYRUN:
-			if (!*cntp || !check_config_reg(gpioc, cr, index))
-				continue;
-			break;
-
-		case GPIO_CFG_REQ:
-			write_config_reg(gpioc, cr, index);
-			*cntp = *cntp + 1;
-			break;
-
-		case GPIO_CFG_FREE:
-			*cntp = *cntp - 1;
-			break;
-		}
-	}
-
-	return 0;
- out_err:
-	return -1;
-}
-
-static DEFINE_SPINLOCK(gpio_lock);
-
-static struct pinmux_info *chip_to_pinmux(struct gpio_chip *chip)
-{
-	return container_of(chip, struct pinmux_info, chip);
-}
-
-static int sh_gpio_request(struct gpio_chip *chip, unsigned offset)
-{
-	struct pinmux_info *gpioc = chip_to_pinmux(chip);
-	struct pinmux_data_reg *dummy;
-	unsigned long flags;
-	int i, ret, pinmux_type;
-
-	ret = -EINVAL;
-
-	if (!gpioc)
-		goto err_out;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-
-	if ((gpioc->gpios[offset].flags & PINMUX_FLAG_TYPE) != PINMUX_TYPE_NONE)
-		goto err_unlock;
-
-	/* setup pin function here if no data is associated with pin */
-
-	if (get_data_reg(gpioc, offset, &dummy, &i) != 0)
-		pinmux_type = PINMUX_TYPE_FUNCTION;
-	else
-		pinmux_type = PINMUX_TYPE_GPIO;
-
-	if (pinmux_type == PINMUX_TYPE_FUNCTION) {
-		if (pinmux_config_gpio(gpioc, offset,
-				       pinmux_type,
-				       GPIO_CFG_DRYRUN) != 0)
-			goto err_unlock;
-
-		if (pinmux_config_gpio(gpioc, offset,
-				       pinmux_type,
-				       GPIO_CFG_REQ) != 0)
-			BUG();
-	}
-
-	gpioc->gpios[offset].flags &= ~PINMUX_FLAG_TYPE;
-	gpioc->gpios[offset].flags |= pinmux_type;
-
-	ret = 0;
- err_unlock:
-	spin_unlock_irqrestore(&gpio_lock, flags);
- err_out:
-	return ret;
-}
-
-static void sh_gpio_free(struct gpio_chip *chip, unsigned offset)
-{
-	struct pinmux_info *gpioc = chip_to_pinmux(chip);
-	unsigned long flags;
-	int pinmux_type;
-
-	if (!gpioc)
-		return;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-
-	pinmux_type = gpioc->gpios[offset].flags & PINMUX_FLAG_TYPE;
-	pinmux_config_gpio(gpioc, offset, pinmux_type, GPIO_CFG_FREE);
-	gpioc->gpios[offset].flags &= ~PINMUX_FLAG_TYPE;
-	gpioc->gpios[offset].flags |= PINMUX_TYPE_NONE;
-
-	spin_unlock_irqrestore(&gpio_lock, flags);
-}
-
-static int pinmux_direction(struct pinmux_info *gpioc,
-			    unsigned gpio, int new_pinmux_type)
-{
-	int pinmux_type;
-	int ret = -EINVAL;
-
-	if (!gpioc)
-		goto err_out;
-
-	pinmux_type = gpioc->gpios[gpio].flags & PINMUX_FLAG_TYPE;
-
-	switch (pinmux_type) {
-	case PINMUX_TYPE_GPIO:
-		break;
-	case PINMUX_TYPE_OUTPUT:
-	case PINMUX_TYPE_INPUT:
-	case PINMUX_TYPE_INPUT_PULLUP:
-	case PINMUX_TYPE_INPUT_PULLDOWN:
-		pinmux_config_gpio(gpioc, gpio, pinmux_type, GPIO_CFG_FREE);
-		break;
-	default:
-		goto err_out;
-	}
-
-	if (pinmux_config_gpio(gpioc, gpio,
-			       new_pinmux_type,
-			       GPIO_CFG_DRYRUN) != 0)
-		goto err_out;
-
-	if (pinmux_config_gpio(gpioc, gpio,
-			       new_pinmux_type,
-			       GPIO_CFG_REQ) != 0)
-		BUG();
-
-	gpioc->gpios[gpio].flags &= ~PINMUX_FLAG_TYPE;
-	gpioc->gpios[gpio].flags |= new_pinmux_type;
-
-	ret = 0;
- err_out:
-	return ret;
-}
-
-static int sh_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
-{
-	struct pinmux_info *gpioc = chip_to_pinmux(chip);
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&gpio_lock, flags);
-	ret = pinmux_direction(gpioc, offset, PINMUX_TYPE_INPUT);
-	spin_unlock_irqrestore(&gpio_lock, flags);
-
-	return ret;
-}
-
-static void sh_gpio_set_value(struct pinmux_info *gpioc,
-			     unsigned gpio, int value)
-{
-	struct pinmux_data_reg *dr = NULL;
-	int bit = 0;
-
-	if (!gpioc || get_data_reg(gpioc, gpio, &dr, &bit) != 0)
-		BUG();
-	else
-		gpio_write_bit(dr, bit, value);
-}
-
-static int sh_gpio_direction_output(struct gpio_chip *chip, unsigned offset,
-				    int value)
-{
-	struct pinmux_info *gpioc = chip_to_pinmux(chip);
-	unsigned long flags;
-	int ret;
-
-	sh_gpio_set_value(gpioc, offset, value);
-	spin_lock_irqsave(&gpio_lock, flags);
-	ret = pinmux_direction(gpioc, offset, PINMUX_TYPE_OUTPUT);
-	spin_unlock_irqrestore(&gpio_lock, flags);
-
-	return ret;
-}
-
-static int sh_gpio_get_value(struct pinmux_info *gpioc, unsigned gpio)
-{
-	struct pinmux_data_reg *dr = NULL;
-	int bit = 0;
-
-	if (!gpioc || get_data_reg(gpioc, gpio, &dr, &bit) != 0) {
-		BUG();
-		return 0;
-	}
-
-	return gpio_read_reg(dr->reg, dr->reg_width, 1, bit);
-}
-
-static int sh_gpio_get(struct gpio_chip *chip, unsigned offset)
-{
-	return sh_gpio_get_value(chip_to_pinmux(chip), offset);
-}
-
-static void sh_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	sh_gpio_set_value(chip_to_pinmux(chip), offset, value);
-}
-
-int register_pinmux(struct pinmux_info *pip)
-{
-	struct gpio_chip *chip = &pip->chip;
-
-	pr_info("sh pinmux: %s handling gpio %d -> %d\n",
-		pip->name, pip->first_gpio, pip->last_gpio);
-
-	setup_data_regs(pip);
-
-	chip->request = sh_gpio_request;
-	chip->free = sh_gpio_free;
-	chip->direction_input = sh_gpio_direction_input;
-	chip->get = sh_gpio_get;
-	chip->direction_output = sh_gpio_direction_output;
-	chip->set = sh_gpio_set;
-
-	WARN_ON(pip->first_gpio != 0); /* needs testing */
-
-	chip->label = pip->name;
-	chip->owner = THIS_MODULE;
-	chip->base = pip->first_gpio;
-	chip->ngpio = (pip->last_gpio - pip->first_gpio) + 1;
-
-	return gpiochip_add(chip);
-}
diff --git a/arch/sh/kernel/head_32.S b/arch/sh/kernel/head_32.S
index a78be74..1151ecd 100644
--- a/arch/sh/kernel/head_32.S
+++ b/arch/sh/kernel/head_32.S
@@ -33,7 +33,7 @@
 	.long	1		/* LOADER_TYPE */
 	.long	0x00000000	/* INITRD_START */
 	.long	0x00000000	/* INITRD_SIZE */
-#ifdef CONFIG_32BIT
+#if defined(CONFIG_32BIT) && defined(CONFIG_PMB_FIXED)
 	.long	0x53453f00 + 32	/* "SE?" = 32 bit */
 #else
 	.long	0x53453f00 + 29	/* "SE?" = 29 bit */
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 27ff2dc..aaff003 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -21,7 +21,7 @@
 #include <asm/atomic.h>
 
 static int hlt_counter;
-void (*pm_idle)(void);
+void (*pm_idle)(void) = NULL;
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
@@ -39,48 +39,92 @@
 }
 __setup("hlt", hlt_setup);
 
-void default_idle(void)
+static inline int hlt_works(void)
 {
-	if (!hlt_counter) {
-		clear_thread_flag(TIF_POLLING_NRFLAG);
-		smp_mb__after_clear_bit();
-		set_bl_bit();
-		stop_critical_timings();
-
-		while (!need_resched())
-			cpu_sleep();
-
-		start_critical_timings();
-		clear_bl_bit();
-		set_thread_flag(TIF_POLLING_NRFLAG);
-	} else
-		while (!need_resched())
-			cpu_relax();
+	return !hlt_counter;
 }
 
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{
+	local_irq_enable();
+	while (!need_resched())
+		cpu_relax();
+}
+
+void default_idle(void)
+{
+	if (hlt_works()) {
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
+
+		if (!need_resched()) {
+			local_irq_enable();
+			cpu_sleep();
+		} else
+			local_irq_enable();
+
+		set_thread_flag(TIF_POLLING_NRFLAG);
+	} else
+		poll_idle();
+}
+
+/*
+ * The idle thread. There's no useful work to be done, so just try to conserve
+ * power and have a low exit latency (i.e. sit in a loop waiting for somebody
+ * to say that they'd like to reschedule).
+ */
 void cpu_idle(void)
 {
+	unsigned int cpu = smp_processor_id();
+
 	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	/* endless idle loop with no priority at all */
 	while (1) {
-		void (*idle)(void) = pm_idle;
-
-		if (!idle)
-			idle = default_idle;
-
 		tick_nohz_stop_sched_tick(1);
-		while (!need_resched())
-			idle();
-		tick_nohz_restart_sched_tick();
 
+		while (!need_resched() && cpu_online(cpu)) {
+			check_pgt_cache();
+			rmb();
+
+			local_irq_disable();
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
+			pm_idle();
+			/*
+			 * Sanity check to ensure that pm_idle() returns
+			 * with IRQs enabled
+			 */
+			WARN_ON(irqs_disabled());
+			start_critical_timings();
+		}
+
+		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
-		check_pgt_cache();
 	}
 }
 
+void __cpuinit select_idle_routine(void)
+{
+	/*
+	 * If a platform has set its own idle routine, leave it alone.
+	 */
+	if (pm_idle)
+		return;
+
+	if (hlt_works())
+		pm_idle = default_idle;
+	else
+		pm_idle = poll_idle;
+}
+
 static void do_nothing(void *unused)
 {
 }
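
Because select_idle_routine() only installs a default when pm_idle is still NULL, platform code that assigns pm_idle earlier wins. A hedged sketch of a board override, hung off the machvec mv_setup() hook so the assignment happens during setup_arch(), before the idle routine is selected; all myboard_* names are invented:

static void myboard_idle(void)
{
	/* pm_idle() is entered with IRQs off and must return with them on */
	if (!need_resched()) {
		local_irq_enable();
		cpu_sleep();
	} else
		local_irq_enable();
}

static void __init myboard_setup(char **cmdline_p)
{
	pm_idle = myboard_idle;	/* select_idle_routine() leaves this alone */
}
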
diff --git a/arch/sh/kernel/io_generic.c b/arch/sh/kernel/io_generic.c
index b8fa652..e1e1dbd 100644
--- a/arch/sh/kernel/io_generic.c
+++ b/arch/sh/kernel/io_generic.c
@@ -24,7 +24,7 @@
 #define dummy_read()
 #endif
 
-unsigned long generic_io_base;
+unsigned long generic_io_base = 0;
 
 u8 generic_inb(unsigned long port)
 {
@@ -147,8 +147,10 @@
 
 void __iomem *generic_ioport_map(unsigned long addr, unsigned int size)
 {
+#ifdef P1SEG
 	if (PXSEG(addr) >= P1SEG)
 		return (void __iomem *)addr;
+#endif
 
 	return (void __iomem *)(addr + generic_io_base);
 }
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index eac7da7..e1913f2 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -37,7 +37,15 @@
  */
 static int show_other_interrupts(struct seq_file *p, int prec)
 {
+	int j;
+
+	seq_printf(p, "%*s: ", prec, "NMI");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stat[j].__nmi_count);
+	seq_printf(p, "  Non-maskable interrupts\n");
+
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
+
 	return 0;
 }
 
@@ -255,6 +263,12 @@
 {
 	plat_irq_setup();
 
+	/*
+	 * Pin any of the legacy IRQ vectors that haven't already been
+	 * grabbed by the platform
+	 */
+	reserve_irq_legacy();
+
 	/* Perform the machine specific initialisation */
 	if (sh_mv.mv_init_irq)
 		sh_mv.mv_init_irq();
diff --git a/arch/sh/kernel/irq_32.c b/arch/sh/kernel/irq_32.c
new file mode 100644
index 0000000..e33ab15
--- /dev/null
+++ b/arch/sh/kernel/irq_32.c
@@ -0,0 +1,57 @@
+/*
+ * SHcompact irqflags support
+ *
+ * Copyright (C) 2006 - 2009 Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/irqflags.h>
+#include <linux/module.h>
+
+void notrace raw_local_irq_restore(unsigned long flags)
+{
+	unsigned long __dummy0, __dummy1;
+
+	if (flags == RAW_IRQ_DISABLED) {
+		__asm__ __volatile__ (
+			"stc	sr, %0\n\t"
+			"or	#0xf0, %0\n\t"
+			"ldc	%0, sr\n\t"
+			: "=&z" (__dummy0)
+			: /* no inputs */
+			: "memory"
+		);
+	} else {
+		__asm__ __volatile__ (
+			"stc	sr, %0\n\t"
+			"and	%1, %0\n\t"
+#ifdef CONFIG_CPU_HAS_SR_RB
+			"stc	r6_bank, %1\n\t"
+			"or	%1, %0\n\t"
+#endif
+			"ldc	%0, sr\n\t"
+			: "=&r" (__dummy0), "=r" (__dummy1)
+			: "1" (~RAW_IRQ_DISABLED)
+			: "memory"
+		);
+	}
+}
+EXPORT_SYMBOL(raw_local_irq_restore);
+
+unsigned long notrace __raw_local_save_flags(void)
+{
+	unsigned long flags;
+
+	__asm__ __volatile__ (
+		"stc	sr, %0\n\t"
+		"and	#0xf0, %0\n\t"
+		: "=&z" (flags)
+		: /* no inputs */
+		: "memory"
+	);
+
+	return flags;
+}
+EXPORT_SYMBOL(__raw_local_save_flags);
diff --git a/arch/sh/kernel/irq_64.c b/arch/sh/kernel/irq_64.c
new file mode 100644
index 0000000..32365ba
--- /dev/null
+++ b/arch/sh/kernel/irq_64.c
@@ -0,0 +1,51 @@
+/*
+ * SHmedia irqflags support
+ *
+ * Copyright (C) 2006 - 2009 Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/irqflags.h>
+#include <linux/module.h>
+#include <cpu/registers.h>
+
+void notrace raw_local_irq_restore(unsigned long flags)
+{
+	unsigned long long __dummy;
+
+	if (flags == RAW_IRQ_DISABLED) {
+		__asm__ __volatile__ (
+			"getcon	" __SR ", %0\n\t"
+			"or	%0, %1, %0\n\t"
+			"putcon	%0, " __SR "\n\t"
+			: "=&r" (__dummy)
+			: "r" (RAW_IRQ_DISABLED)
+		);
+	} else {
+		__asm__ __volatile__ (
+			"getcon	" __SR ", %0\n\t"
+			"and	%0, %1, %0\n\t"
+			"putcon	%0, " __SR "\n\t"
+			: "=&r" (__dummy)
+			: "r" (~RAW_IRQ_DISABLED)
+		);
+	}
+}
+EXPORT_SYMBOL(raw_local_irq_restore);
+
+unsigned long notrace __raw_local_save_flags(void)
+{
+	unsigned long flags;
+
+	__asm__ __volatile__ (
+		"getcon	" __SR ", %0\n\t"
+		"and	%0, %1, %0"
+		: "=&r" (flags)
+		: "r" (RAW_IRQ_DISABLED)
+	);
+
+	return flags;
+}
+EXPORT_SYMBOL(__raw_local_save_flags);
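
Both variants implement the same contract: the saved flags are just the interrupt-mask state, with RAW_IRQ_DISABLED as the fully-masked value. A sketch of what the generic local_irq_save()/local_irq_restore() pair boils down to on top of these primitives; real callers should use the generic helpers rather than these directly:

static void critical_section(void)
{
	unsigned long flags;

	flags = __raw_local_save_flags();	 /* snapshot the mask state */
	raw_local_irq_restore(RAW_IRQ_DISABLED); /* mask all interrupts */

	/* ... code that must not be interrupted ... */

	raw_local_irq_restore(flags);		 /* back to previous state */
}
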
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 7ea2704..76f2802 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -46,12 +46,6 @@
  */
 int machine_kexec_prepare(struct kimage *image)
 {
-	/* older versions of kexec-tools are passing
-	 * the zImage entry point as a virtual address.
-	 */
-	if (image->start != PHYSADDR(image->start))
-		return -EINVAL; /* upgrade your kexec-tools */
-
 	return 0;
 }
 
diff --git a/arch/sh/kernel/machvec.c b/arch/sh/kernel/machvec.c
index cbce639..1652340 100644
--- a/arch/sh/kernel/machvec.c
+++ b/arch/sh/kernel/machvec.c
@@ -135,5 +135,9 @@
 	if (!sh_mv.mv_nr_irqs)
 		sh_mv.mv_nr_irqs = NR_IRQS;
 
+#ifdef P2SEG
 	__set_io_port_base(P2SEG);
+#else
+	__set_io_port_base(0);
+#endif
 }
diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c
index c2efdcd..43adddf 100644
--- a/arch/sh/kernel/module.c
+++ b/arch/sh/kernel/module.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <asm/unaligned.h>
+#include <asm/dwarf.h>
 
 void *module_alloc(unsigned long size)
 {
@@ -145,10 +146,16 @@
 		    const Elf_Shdr *sechdrs,
 		    struct module *me)
 {
-	return module_bug_finalize(hdr, sechdrs, me);
+	int ret = 0;
+
+	ret |= module_dwarf_finalize(hdr, sechdrs, me);
+	ret |= module_bug_finalize(hdr, sechdrs, me);
+
+	return ret;
 }
 
 void module_arch_cleanup(struct module *mod)
 {
 	module_bug_cleanup(mod);
+	module_dwarf_cleanup(mod);
 }
diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c
new file mode 100644
index 0000000..24ea837
--- /dev/null
+++ b/arch/sh/kernel/perf_callchain.c
@@ -0,0 +1,98 @@
+/*
+ * Performance event callchain support - SuperH architecture code
+ *
+ * Copyright (C) 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <asm/unwinder.h>
+#include <asm/ptrace.h>
+
+static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+	if (entry->nr < PERF_MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+static void callchain_warning(void *data, char *msg)
+{
+}
+
+static void
+callchain_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+}
+
+static int callchain_stack(void *data, char *name)
+{
+	return 0;
+}
+
+static void callchain_address(void *data, unsigned long addr, int reliable)
+{
+	struct perf_callchain_entry *entry = data;
+
+	if (reliable)
+		callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops callchain_ops = {
+	.warning	= callchain_warning,
+	.warning_symbol	= callchain_warning_symbol,
+	.stack		= callchain_stack,
+	.address	= callchain_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	callchain_store(entry, PERF_CONTEXT_KERNEL);
+	callchain_store(entry, regs->pc);
+
+	unwind_stack(NULL, regs, NULL, &callchain_ops, entry);
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	/*
+	 * Only the kernel side is implemented for now.
+	 */
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+}
+
+/*
+ * No need for separate IRQ and NMI entries.
+ */
+static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
+
+	entry->nr = 0;
+
+	perf_do_callchain(regs, entry);
+
+	return entry;
+}
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
new file mode 100644
index 0000000..7ff0943
--- /dev/null
+++ b/arch/sh/kernel/perf_event.c
@@ -0,0 +1,312 @@
+/*
+ * Performance event support framework for SuperH hardware counters.
+ *
+ *  Copyright (C) 2009  Paul Mundt
+ *
+ * Heavily based on the x86 and PowerPC implementations.
+ *
+ * x86:
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *
+ * ppc:
+ *  Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/perf_event.h>
+#include <asm/processor.h>
+
+struct cpu_hw_events {
+	struct perf_event	*events[MAX_HWEVENTS];
+	unsigned long		used_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
+	unsigned long		active_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
+};
+
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+static struct sh_pmu *sh_pmu __read_mostly;
+
+/* Number of perf_events counting hardware events */
+static atomic_t num_events;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Stub these out for now, do something more profound later.
+ */
+int reserve_pmc_hardware(void)
+{
+	return 0;
+}
+
+void release_pmc_hardware(void)
+{
+}
+
+static inline int sh_pmu_initialized(void)
+{
+	return !!sh_pmu;
+}
+
+/*
+ * Release the PMU if this is the last perf_event.
+ */
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+	if (!atomic_add_unless(&num_events, -1, 1)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_dec_return(&num_events) == 0)
+			release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
+static int hw_perf_cache_event(int config, int *evp)
+{
+	unsigned long type, op, result;
+	int ev;
+
+	if (!sh_pmu->cache_events)
+		return -EINVAL;
+
+	/* unpack config */
+	type = config & 0xff;
+	op = (config >> 8) & 0xff;
+	result = (config >> 16) & 0xff;
+
+	if (type >= PERF_COUNT_HW_CACHE_MAX ||
+	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ev = (*sh_pmu->cache_events)[type][op][result];
+	if (ev == 0)
+		return -EOPNOTSUPP;
+	if (ev == -1)
+		return -EINVAL;
+	*evp = ev;
+	return 0;
+}
+
+static int __hw_perf_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	int config = -1;
+	int err;
+
+	if (!sh_pmu_initialized())
+		return -ENODEV;
+
+	/*
+	 * All of the on-chip counters are "limited", in that they have
+	 * no interrupts, and are therefore unable to do sampling without
+	 * further work and timer assistance.
+	 */
+	if (hwc->sample_period)
+		return -EINVAL;
+
+	/*
+	 * See if we need to reserve the counter.
+	 *
+	 * If no events are currently in use, then we have to take a
+	 * mutex to ensure that we don't race with another task doing
+	 * reserve_pmc_hardware or release_pmc_hardware.
+	 */
+	err = 0;
+	if (!atomic_inc_not_zero(&num_events)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_events) == 0 &&
+		    reserve_pmc_hardware())
+			err = -EBUSY;
+		else
+			atomic_inc(&num_events);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+
+	if (err)
+		return err;
+
+	event->destroy = hw_perf_event_destroy;
+
+	switch (attr->type) {
+	case PERF_TYPE_RAW:
+		config = attr->config & sh_pmu->raw_event_mask;
+		break;
+	case PERF_TYPE_HW_CACHE:
+		err = hw_perf_cache_event(attr->config, &config);
+		if (err)
+			return err;
+		break;
+	case PERF_TYPE_HARDWARE:
+		if (attr->config >= sh_pmu->max_events)
+			return -EINVAL;
+
+		config = sh_pmu->event_map(attr->config);
+		break;
+	}
+
+	if (config == -1)
+		return -EINVAL;
+
+	hwc->config |= config;
+
+	return 0;
+}
+
+static void sh_perf_event_update(struct perf_event *event,
+				   struct hw_perf_event *hwc, int idx)
+{
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
+	int shift = 0;
+
+	/*
+	 * Depending on the counter configuration, they may or may not
+	 * be chained, in which case the previous counter value can be
+	 * updated underneath us if the lower-half overflows.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic counter atomically.
+	 *
+	 * As there is no interrupt associated with the overflow events,
+	 * this is the simplest approach for maintaining consistency.
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	new_raw_count = sh_pmu->read(idx);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+			     new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (counter-)time and add that to the generic counter.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count.
+	 */
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	atomic64_add(delta, &event->count);
+}
+
+static void sh_pmu_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	clear_bit(idx, cpuc->active_mask);
+	sh_pmu->disable(hwc, idx);
+
+	barrier();
+
+	sh_perf_event_update(event, &event->hw, idx);
+
+	cpuc->events[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+
+	perf_event_update_userpage(event);
+}
+
+static int sh_pmu_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	if (test_and_set_bit(idx, cpuc->used_mask)) {
+		idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
+		if (idx == sh_pmu->num_events)
+			return -EAGAIN;
+
+		set_bit(idx, cpuc->used_mask);
+		hwc->idx = idx;
+	}
+
+	sh_pmu->disable(hwc, idx);
+
+	cpuc->events[idx] = event;
+	set_bit(idx, cpuc->active_mask);
+
+	sh_pmu->enable(hwc, idx);
+
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+static void sh_pmu_read(struct perf_event *event)
+{
+	sh_perf_event_update(event, &event->hw, event->hw.idx);
+}
+
+static const struct pmu pmu = {
+	.enable		= sh_pmu_enable,
+	.disable	= sh_pmu_disable,
+	.read		= sh_pmu_read,
+};
+
+const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+	int err = __hw_perf_event_init(event);
+	if (unlikely(err)) {
+		if (event->destroy)
+			event->destroy(event);
+		return ERR_PTR(err);
+	}
+
+	return &pmu;
+}
+
+void hw_perf_event_setup(int cpu)
+{
+	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+	memset(cpuhw, 0, sizeof(struct cpu_hw_events));
+}
+
+void hw_perf_enable(void)
+{
+	if (!sh_pmu_initialized())
+		return;
+
+	sh_pmu->enable_all();
+}
+
+void hw_perf_disable(void)
+{
+	if (!sh_pmu_initialized())
+		return;
+
+	sh_pmu->disable_all();
+}
+
+int register_sh_pmu(struct sh_pmu *pmu)
+{
+	if (sh_pmu)
+		return -EBUSY;
+	sh_pmu = pmu;
+
+	pr_info("Performance Events: %s support registered\n", pmu->name);
+
+	WARN_ON(pmu->num_events > MAX_HWEVENTS);
+
+	return 0;
+}
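
A CPU-family backend hooks in by filling out a struct sh_pmu and calling register_sh_pmu() from an initcall. A hedged sketch follows; the fields shown are the ones consumed above, while the names, event codes and the MYCPU_PMCTR() register accessor are invented for illustration:

static const int mycpu_event_map_table[] = {
	[PERF_COUNT_HW_CPU_CYCLES]	= 0x23,	/* invented event code */
	[PERF_COUNT_HW_INSTRUCTIONS]	= 0x01,	/* invented event code */
};

static int mycpu_event_map(int event)
{
	return mycpu_event_map_table[event];
}

static u64 mycpu_read(int idx)
{
	return __raw_readl(MYCPU_PMCTR(idx));	/* hypothetical counter reg */
}

static struct sh_pmu mycpu_pmu = {
	.name		= "mycpu",
	.num_events	= 2,
	.event_map	= mycpu_event_map,
	.max_events	= ARRAY_SIZE(mycpu_event_map_table),
	.raw_event_mask	= 0x1ff,		/* invented */
	.read		= mycpu_read,
	/* .enable, .disable, .enable_all, .disable_all and .cache_events
	 * omitted here; a real backend must provide them */
};

static int __init mycpu_pmu_init(void)
{
	return register_sh_pmu(&mycpu_pmu);
}
arch_initcall(mycpu_pmu_init);
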
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 4a2c866..7399d78 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -133,7 +133,10 @@
 	regs.regs[5] = (unsigned long)fn;
 
 	regs.pc = (unsigned long)kernel_thread_helper;
-	regs.sr = (1 << 30);
+	regs.sr = SR_MD;
+#if defined(CONFIG_SH_FPU)
+	regs.sr |= SR_FD;
+#endif
 
 	/* Ok, create the new process.. */
 	pid = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0,
@@ -141,6 +144,7 @@
 
 	return pid;
 }
+EXPORT_SYMBOL(kernel_thread);
 
 /*
  * Free current thread data structures etc..
@@ -184,6 +188,16 @@
 
 	return fpvalid;
 }
+EXPORT_SYMBOL(dump_fpu);
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+	unlazy_fpu(tsk, task_pt_regs(tsk));
+}
 
 asmlinkage void ret_from_fork(void);
 
@@ -193,15 +207,10 @@
 {
 	struct thread_info *ti = task_thread_info(p);
 	struct pt_regs *childregs;
-	struct task_struct *tsk = current;
-
-#if defined(CONFIG_SH_FPU)
-	unlazy_fpu(tsk, regs);
-	p->thread.fpu = tsk->thread.fpu;
-	copy_to_stopped_child_used_math(p);
-#endif
 
 #if defined(CONFIG_SH_DSP)
+	struct task_struct *tsk = current;
+
 	if (is_dsp_enabled(tsk)) {
 		/* We can use the __save_dsp or just copy the struct:
 		 * __save_dsp(p);
@@ -220,6 +229,8 @@
 	} else {
 		childregs->regs[15] = (unsigned long)childregs;
 		ti->addr_limit = KERNEL_DS;
+		ti->status &= ~TS_USEDFPU;
+		p->fpu_counter = 0;
 	}
 
 	if (clone_flags & CLONE_SETTLS)
@@ -242,9 +253,13 @@
 __notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev, struct task_struct *next)
 {
-#if defined(CONFIG_SH_FPU)
+	struct thread_struct *next_t = &next->thread;
+
 	unlazy_fpu(prev, task_pt_regs(prev));
-#endif
+
+	/* we're going to use this soon, after a few expensive things */
+	if (next->fpu_counter > 5)
+		prefetch(&next_t->fpu.hard);
 
 #ifdef CONFIG_MMU
 	/*
@@ -256,6 +271,14 @@
 		     : "r" (task_thread_info(next)));
 #endif
 
+	/*
+	 * If the task has used fpu the last 5 timeslices, just do a full
+	 * restore of the math state immediately to avoid the trap; the
+	 * chances of needing FPU soon are obviously high now
+	 */
+	if (next->fpu_counter > 5)
+		fpu_state_restore(task_pt_regs(next));
+
 	return prev;
 }
 
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index 1192398..359b8a2 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -335,6 +335,7 @@
 	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0,
 		      &regs, 0, NULL, NULL);
 }
+EXPORT_SYMBOL(kernel_thread);
 
 /*
  * Free current thread data structures etc..
@@ -417,6 +418,7 @@
 	return 0; /* Task didn't use the fpu at all. */
 #endif
 }
+EXPORT_SYMBOL(dump_fpu);
 
 asmlinkage void ret_from_fork(void);
 
diff --git a/arch/sh/kernel/return_address.c b/arch/sh/kernel/return_address.c
new file mode 100644
index 0000000..df3ab58
--- /dev/null
+++ b/arch/sh/kernel/return_address.c
@@ -0,0 +1,54 @@
+/*
+ * arch/sh/kernel/return_address.c
+ *
+ * Copyright (C) 2009  Matt Fleming
+ * Copyright (C) 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <asm/dwarf.h>
+
+#ifdef CONFIG_DWARF_UNWINDER
+
+void *return_address(unsigned int depth)
+{
+	struct dwarf_frame *frame;
+	unsigned long ra;
+	int i;
+
+	for (i = 0, frame = NULL, ra = 0; i <= depth; i++) {
+		struct dwarf_frame *tmp;
+
+		tmp = dwarf_unwind_stack(ra, frame);
+
+		if (frame)
+			dwarf_free_frame(frame);
+
+		frame = tmp;
+
+		if (!frame || !frame->return_addr)
+			break;
+
+		ra = frame->return_addr;
+	}
+
+	/* Failed to unwind the stack to the specified depth. */
+	WARN_ON(i != depth + 1);
+
+	if (frame)
+		dwarf_free_frame(frame);
+
+	return (void *)ra;
+}
+
+#else
+
+void *return_address(unsigned int depth)
+{
+	return NULL;
+}
+
+#endif
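
The depth parameter selects how many frames to unwind: return_address(1) is the caller's caller, and so on. One intended consumer is ftrace's CALLER_ADDRx macros, which can be defined in terms of this helper; a sketch of that wiring (hedged: the exact macro set lives in asm/ftrace.h):

#define HAVE_ARCH_CALLER_ADDR
#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
#define CALLER_ADDR1 ((unsigned long)return_address(1))
#define CALLER_ADDR2 ((unsigned long)return_address(2))
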
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 99b4fb5..5a947a2 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -453,6 +453,10 @@
 
 	paging_init();
 
+#ifdef CONFIG_PMB_ENABLE
+	pmb_init();
+#endif
+
 #ifdef CONFIG_SMP
 	plat_smp_setup();
 #endif
diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c
index 444cce3..3896f26 100644
--- a/arch/sh/kernel/sh_ksyms_32.c
+++ b/arch/sh/kernel/sh_ksyms_32.c
@@ -1,37 +1,11 @@
 #include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/user.h>
-#include <linux/elfcore.h>
-#include <linux/sched.h>
-#include <linux/in6.h>
-#include <linux/interrupt.h>
-#include <linux/vmalloc.h>
-#include <linux/pci.h>
-#include <linux/irq.h>
-#include <asm/sections.h>
-#include <asm/processor.h>
-#include <asm/uaccess.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
 #include <asm/checksum.h>
-#include <asm/io.h>
-#include <asm/delay.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/ftrace.h>
+#include <asm/sections.h>
 
-extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
-
-/* platform dependent support */
-EXPORT_SYMBOL(dump_fpu);
-EXPORT_SYMBOL(kernel_thread);
-EXPORT_SYMBOL(strlen);
-
-/* PCI exports */
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_alloc_consistent);
-EXPORT_SYMBOL(pci_free_consistent);
-#endif
-
-/* mem exports */
 EXPORT_SYMBOL(memchr);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
@@ -40,6 +14,13 @@
 EXPORT_SYMBOL(__udelay);
 EXPORT_SYMBOL(__ndelay);
 EXPORT_SYMBOL(__const_udelay);
+EXPORT_SYMBOL(strlen);
+EXPORT_SYMBOL(csum_partial);
+EXPORT_SYMBOL(csum_partial_copy_generic);
+EXPORT_SYMBOL(copy_page);
+EXPORT_SYMBOL(__clear_user);
+EXPORT_SYMBOL(_ebss);
+EXPORT_SYMBOL(empty_zero_page);
 
 #define DECLARE_EXPORT(name)		\
 	extern void name(void);EXPORT_SYMBOL(name)
@@ -107,30 +88,6 @@
 DECLARE_EXPORT(__udivsi3_i4);
 DECLARE_EXPORT(__sdivsi3_i4i);
 DECLARE_EXPORT(__udivsi3_i4i);
-
-#if !defined(CONFIG_CACHE_OFF) && (defined(CONFIG_CPU_SH4) || \
-	defined(CONFIG_SH7705_CACHE_32KB))
-/* needed by some modules */
-EXPORT_SYMBOL(flush_cache_all);
-EXPORT_SYMBOL(flush_cache_range);
-EXPORT_SYMBOL(flush_dcache_page);
-#endif
-
 #ifdef CONFIG_MCOUNT
 DECLARE_EXPORT(mcount);
 #endif
-EXPORT_SYMBOL(csum_partial);
-EXPORT_SYMBOL(csum_partial_copy_generic);
-#ifdef CONFIG_IPV6
-EXPORT_SYMBOL(csum_ipv6_magic);
-#endif
-EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(__clear_user);
-EXPORT_SYMBOL(_ebss);
-EXPORT_SYMBOL(empty_zero_page);
-
-#ifndef CONFIG_CACHE_OFF
-EXPORT_SYMBOL(__flush_purge_region);
-EXPORT_SYMBOL(__flush_wback_region);
-EXPORT_SYMBOL(__flush_invalidate_region);
-#endif
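
With the export list trimmed to what modules actually reference, any loadable module using one of these symbols resolves it at insmod time. A minimal, self-contained consumer sketch (hypothetical demo module, not part of the patch):

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/string.h>

	static int __init export_demo_init(void)
	{
		/* strlen() resolves against EXPORT_SYMBOL(strlen) above. */
		pr_info("export_demo: len=%zu\n", strlen("superh"));
		return 0;
	}
	module_init(export_demo_init);

	MODULE_LICENSE("GPL");
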
diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c
index d008e17..45afa5c 100644
--- a/arch/sh/kernel/sh_ksyms_64.c
+++ b/arch/sh/kernel/sh_ksyms_64.c
@@ -24,16 +24,6 @@
 #include <asm/delay.h>
 #include <asm/irq.h>
 
-extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
-
-/* platform dependent support */
-EXPORT_SYMBOL(dump_fpu);
-EXPORT_SYMBOL(kernel_thread);
-
-#ifdef CONFIG_VT
-EXPORT_SYMBOL(screen_info);
-#endif
-
 EXPORT_SYMBOL(__put_user_asm_b);
 EXPORT_SYMBOL(__put_user_asm_w);
 EXPORT_SYMBOL(__put_user_asm_l);
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 3db3742..12815ce 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -67,7 +67,8 @@
 
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
-	set_thread_flag(TIF_RESTORE_SIGMASK);
+	set_restore_sigmask();
+
 	return -ERESTARTNOHAND;
 }
 
@@ -590,7 +591,7 @@
 	if (try_to_freeze())
 		goto no_signal;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
+	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
 		oldset = &current->saved_sigmask;
 	else
 		oldset = &current->blocked;
@@ -602,12 +603,13 @@
 		/* Whee!  Actually deliver the signal.  */
 		if (handle_signal(signr, &ka, &info, oldset,
 				  regs, save_r0) == 0) {
-			/* a signal was successfully delivered; the saved
+			/*
+			 * A signal was successfully delivered; the saved
 			 * sigmask will have been stored in the signal frame,
 			 * and will be restored by sigreturn, so we can simply
-			 * clear the TIF_RESTORE_SIGMASK flag */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
+			 * clear the TS_RESTORE_SIGMASK flag
+			 */
+			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 
 			tracehook_signal_handler(signr, &info, &ka, regs,
 					test_thread_flag(TIF_SINGLESTEP));
@@ -631,10 +633,12 @@
 		}
 	}
 
-	/* if there's no signal to deliver, we just put the saved sigmask
-	 * back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
+	/*
+	 * If there's no signal to deliver, we just put the saved sigmask
+	 * back.
+	 */
+	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
+		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
 	}
 }
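
The switch from TIF_RESTORE_SIGMASK to TS_RESTORE_SIGMASK moves the flag out of the atomically-updated flags word and into thread_info->status, which only the owning task touches, so plain loads and stores suffice. The set_restore_sigmask() helper used above is defined roughly like this on architectures that use the status-word scheme (sketch, not the exact sh definition):

	static inline void set_restore_sigmask(void)
	{
		struct thread_info *ti = current_thread_info();

		ti->status |= TS_RESTORE_SIGMASK;
		set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
	}
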
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 74793c8..feb3ddd 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -101,7 +101,7 @@
 	if (try_to_freeze())
 		goto no_signal;
 
-	if (test_thread_flag(TIF_RESTORE_SIGMASK))
+	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
 		oldset = &current->saved_sigmask;
 	else if (!oldset)
 		oldset = &current->blocked;
@@ -115,11 +115,9 @@
 			/*
 			 * If a signal was successfully delivered, the
 			 * saved sigmask is in its frame, and we can
-			 * clear the TIF_RESTORE_SIGMASK flag.
+			 * clear the TS_RESTORE_SIGMASK flag.
 			 */
-			if (test_thread_flag(TIF_RESTORE_SIGMASK))
-				clear_thread_flag(TIF_RESTORE_SIGMASK);
-
+			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 			tracehook_signal_handler(signr, &info, &ka, regs, 0);
 			return 1;
 		}
@@ -146,8 +144,8 @@
 	}
 
 	/* No signal to deliver -- put the saved sigmask back */
-	if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-		clear_thread_flag(TIF_RESTORE_SIGMASK);
+	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
+		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
 	}
 
@@ -176,6 +174,7 @@
 	while (1) {
 		current->state = TASK_INTERRUPTIBLE;
 		schedule();
+		set_restore_sigmask();
 		regs->pc += 4;    /* because sys_sigreturn decrements the pc */
 		if (do_signal(regs, &saveset)) {
 			/* pc now points at signal handler. Need to decrement
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 160db10..983e079 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -122,7 +122,9 @@
 	stack_start.bss_start = 0; /* don't clear bss for secondary cpus */
 	stack_start.start_kernel_fn = start_secondary;
 
-	flush_cache_all();
+	flush_icache_range((unsigned long)&stack_start,
+			   (unsigned long)&stack_start + sizeof(stack_start));
+	wmb();
 
 	plat_start_cpu(cpu, (unsigned long)_stext);
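
Why the narrower flush is enough, spelled out as an annotated sequence (commentary on the pattern, not part of the patch):

	/*
	 * Publish-then-kick pattern for a CPU that is not yet cache-coherent
	 * with us:
	 *
	 *	1. fill in the parameter block (stack_start);
	 *	2. flush_icache_range() pushes exactly that range to memory,
	 *	   which is all the secondary reads before enabling caches;
	 *	3. wmb() orders the flush against the wakeup;
	 *	4. plat_start_cpu() kicks the secondary.
	 */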
 
diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c
index 0838942..9b0b633 100644
--- a/arch/sh/kernel/topology.c
+++ b/arch/sh/kernel/topology.c
@@ -16,6 +16,32 @@
 
 static DEFINE_PER_CPU(struct cpu, cpu_devices);
 
+cpumask_t cpu_core_map[NR_CPUS];
+
+static cpumask_t cpu_coregroup_map(unsigned int cpu)
+{
+	/*
+	 * Presently all SH-X3 SMP cores are multi-core, so just keep it
+	 * simple until we have a method for determining topology.
+	 */
+	return cpu_possible_map;
+}
+
+const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+{
+	return &cpu_core_map[cpu];
+}
+
+int arch_update_cpu_topology(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		cpu_core_map[cpu] = cpu_coregroup_map(cpu);
+
+	return 0;
+}
+
 static int __init topology_init(void)
 {
 	int i, ret;
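
Consumers see the rebuilt map through cpu_coregroup_mask(). An illustrative walk over a CPU's core group (sketch only; the helper below is hypothetical):

	static void dump_core_group(unsigned int cpu)
	{
		unsigned int sibling;

		/* Every CPU the scheduler will place in this core group. */
		for_each_cpu(sibling, cpu_coregroup_mask(cpu))
			pr_debug("cpu%u: core group includes cpu%u\n",
				 cpu, sibling);
	}
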
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index a8396f3..7b03633 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -9,8 +9,8 @@
 #include <asm/unwinder.h>
 #include <asm/system.h>
 
-#ifdef CONFIG_BUG
-void handle_BUG(struct pt_regs *regs)
+#ifdef CONFIG_GENERIC_BUG
+static void handle_BUG(struct pt_regs *regs)
 {
 	const struct bug_entry *bug;
 	unsigned long bugaddr = regs->pc;
@@ -81,7 +81,7 @@
 		       SIGTRAP) == NOTIFY_STOP)
 		return;
 
-#ifdef CONFIG_BUG
+#ifdef CONFIG_GENERIC_BUG
 	if (__kernel_text_address(instruction_pointer(regs))) {
 		insn_size_t insn = *(insn_size_t *)instruction_pointer(regs);
 		if (insn == TRAPA_BUG_OPCODE)
@@ -95,9 +95,11 @@
 
 BUILD_TRAP_HANDLER(nmi)
 {
+	unsigned int cpu = smp_processor_id();
 	TRAP_HANDLER_DECL;
 
 	nmi_enter();
+	nmi_count(cpu)++;
 
 	switch (notify_die(DIE_NMI, "NMI", regs, 0, vec & 0xff, SIGINT)) {
 	case NOTIFY_OK:
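
With CONFIG_GENERIC_BUG, handle_BUG can defer to the generic bug-table lookup in lib/bug.c instead of decoding everything itself. The usual dispatch once the trap opcode has been recognized looks roughly like this (sketch; error-code details vary):

	switch (report_bug(regs->pc, regs)) {
	case BUG_TRAP_TYPE_WARN:
		/* WARN(): step past the 16-bit trapa and keep running. */
		regs->pc += 2;
		break;
	case BUG_TRAP_TYPE_BUG:
		/* BUG(): fatal, hand off to die(). */
		die("Kernel BUG", regs, 0);
		break;
	default:
		break;
	}
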
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index 7a2ee3a..3da5a12 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -25,6 +25,7 @@
 #include <linux/kexec.h>
 #include <linux/limits.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/sysfs.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -68,61 +69,49 @@
 	"signal+warn"
 };
 
-static int
-proc_alignment_read(char *page, char **start, off_t off, int count, int *eof,
-		    void *data)
+static int alignment_proc_show(struct seq_file *m, void *v)
 {
-	char *p = page;
-	int len;
-
-	p += sprintf(p, "User:\t\t%lu\n", se_user);
-	p += sprintf(p, "System:\t\t%lu\n", se_sys);
-	p += sprintf(p, "Half:\t\t%lu\n", se_half);
-	p += sprintf(p, "Word:\t\t%lu\n", se_word);
-	p += sprintf(p, "DWord:\t\t%lu\n", se_dword);
-	p += sprintf(p, "Multi:\t\t%lu\n", se_multi);
-	p += sprintf(p, "User faults:\t%i (%s)\n", se_usermode,
+	seq_printf(m, "User:\t\t%lu\n", se_user);
+	seq_printf(m, "System:\t\t%lu\n", se_sys);
+	seq_printf(m, "Half:\t\t%lu\n", se_half);
+	seq_printf(m, "Word:\t\t%lu\n", se_word);
+	seq_printf(m, "DWord:\t\t%lu\n", se_dword);
+	seq_printf(m, "Multi:\t\t%lu\n", se_multi);
+	seq_printf(m, "User faults:\t%i (%s)\n", se_usermode,
 			se_usermode_action[se_usermode]);
-	p += sprintf(p, "Kernel faults:\t%i (fixup%s)\n", se_kernmode_warn,
+	seq_printf(m, "Kernel faults:\t%i (fixup%s)\n", se_kernmode_warn,
 			se_kernmode_warn ? "+warn" : "");
-
-	len = (p - page) - off;
-	if (len < 0)
-		len = 0;
-
-	*eof = (len <= count) ? 1 : 0;
-	*start = page + off;
-
-	return len;
+	return 0;
 }
 
-static int proc_alignment_write(struct file *file, const char __user *buffer,
-				unsigned long count, void *data)
+static int alignment_proc_open(struct inode *inode, struct file *file)
 {
+	return single_open(file, alignment_proc_show, NULL);
+}
+
+static ssize_t alignment_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
+{
+	int *data = PDE(file->f_path.dentry->d_inode)->data;
 	char mode;
 
 	if (count > 0) {
 		if (get_user(mode, buffer))
 			return -EFAULT;
 		if (mode >= '0' && mode <= '5')
-			se_usermode = mode - '0';
+			*data = mode - '0';
 	}
 	return count;
 }
 
-static int proc_alignment_kern_write(struct file *file, const char __user *buffer,
-				     unsigned long count, void *data)
-{
-	char mode;
-
-	if (count > 0) {
-		if (get_user(mode, buffer))
-			return -EFAULT;
-		if (mode >= '0' && mode <= '1')
-			se_kernmode_warn = mode - '0';
-	}
-	return count;
-}
+static const struct file_operations alignment_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= alignment_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= alignment_proc_write,
+};
 #endif
 
 static void dump_mem(const char *str, unsigned long bottom, unsigned long top)
@@ -945,14 +934,9 @@
 	set_exception_table_evt(0x800, do_reserved_inst);
 	set_exception_table_evt(0x820, do_illegal_slot_inst);
 #elif defined(CONFIG_SH_FPU)
-#ifdef CONFIG_CPU_SUBTYPE_SHX3
-	set_exception_table_evt(0xd80, fpu_state_restore_trap_handler);
-	set_exception_table_evt(0xda0, fpu_state_restore_trap_handler);
-#else
 	set_exception_table_evt(0x800, fpu_state_restore_trap_handler);
 	set_exception_table_evt(0x820, fpu_state_restore_trap_handler);
 #endif
-#endif
 
 #ifdef CONFIG_CPU_SH2
 	set_exception_table_vec(TRAP_ADDRESS_ERROR, address_error_trap_handler);
@@ -1011,20 +995,16 @@
 	if (!dir)
 		return -ENOMEM;
 
-	res = create_proc_entry("alignment", S_IWUSR | S_IRUGO, dir);
+	res = proc_create_data("alignment", S_IWUSR | S_IRUGO, dir,
+			       &alignment_proc_fops, &se_usermode);
 	if (!res)
 		return -ENOMEM;
 
-	res->read_proc = proc_alignment_read;
-	res->write_proc = proc_alignment_write;
-
-        res = create_proc_entry("kernel_alignment", S_IWUSR | S_IRUGO, dir);
+	res = proc_create_data("kernel_alignment", S_IWUSR | S_IRUGO, dir,
+			       &alignment_proc_fops, &se_kernmode_warn);
         if (!res)
                 return -ENOMEM;
 
-        res->read_proc = proc_alignment_read;
-        res->write_proc = proc_alignment_kern_write;
-
 	return 0;
 }
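
The alignment-proc conversion above is an instance of the standard single_open pattern for small read-mostly proc files. A minimal, self-contained version of the same shape (hypothetical "demo" entry and counter):

	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>

	static unsigned long demo_hits;		/* hypothetical statistic */

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "hits:\t%lu\n", demo_hits);
		return 0;
	}

	static int demo_open(struct inode *inode, struct file *file)
	{
		return single_open(file, demo_show, NULL);
	}

	static const struct file_operations demo_fops = {
		.owner		= THIS_MODULE,
		.open		= demo_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	/* Registered once at init time, e.g.:
	 *	proc_create("demo", S_IRUGO, NULL, &demo_fops);
	 */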