Merge ../linux-2.6-x86

Conflicts:
	arch/x86/kernel/io_apic.c
	kernel/sched.c
	kernel/sched_stats.h
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b62a766..1cad931 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,7 @@
 
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time_$(BITS).o ioport.o ldt.o
+obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o
 obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
@@ -65,6 +65,7 @@
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4c51a2f..65d0b72 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1360,6 +1360,17 @@
 			disable_acpi();
 		}
 	}
+
+	/*
+	 * ACPI supports both logical (e.g. Hyper-Threading) and physical
+	 * processors, whereas MPS supports only physical.
+	 */
+	if (acpi_lapic && acpi_ioapic)
+		printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+		       "information\n");
+	else if (acpi_lapic)
+		printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+		       "configuration information\n");
 #endif
 	return;
 }
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e..3a26525 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -391,11 +391,7 @@
 #else
 static int power_off = 1;
 #endif
-#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int realmode_power_off = 1;
-#else
 static int realmode_power_off;
-#endif
 #ifdef CONFIG_APM_ALLOW_INTS
 static int allow_ints = 1;
 #else
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d..88ea02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
+	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -427,6 +429,8 @@
 		}
 	}
 
+	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
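
The hook added above follows the power tracer's declare-and-mark pattern: the caller keeps a struct power_trace on its stack and calls trace_power_mark() at the transition point. A minimal sketch of the same pattern at a hypothetical frequency-change site (example_set_pstate() is an invented name; struct power_trace, POWER_PSTATE and trace_power_mark() are the interfaces used by this series):

	#include <linux/ftrace.h>

	static void example_set_pstate(unsigned int state)
	{
		struct power_trace it;

		/* record the P-state transition for the power tracer */
		trace_power_mark(&it, POWER_PSTATE, state);

		/* ... program the new frequency/voltage here ... */
	}
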
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b61..816f27f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -307,12 +307,11 @@
 		set_cpu_cap(c, X86_FEATURE_P4);
 	if (c->x86 == 6)
 		set_cpu_cap(c, X86_FEATURE_P3);
+#endif
 
 	if (cpu_has_bts)
 		ptrace_bts_init_intel(c);
 
-#endif
-
 	detect_extended_topology(c);
 	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
 		/*
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index a2d1176..19a8c2c 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -7,13 +7,12 @@
  *
  * It manages:
  * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
  * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/kernel.h>
 
 
 /*
@@ -44,6 +44,33 @@
 };
 static struct ds_configuration ds_cfg;
 
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+	/* the DS context (partially) owned by this tracer */
+	struct ds_context *context;
+	/* the buffer provided on ds_request() and its size in bytes */
+	void *buffer;
+	size_t size;
+};
+
+struct bts_tracer {
+	/* the common DS part */
+	struct ds_tracer ds;
+	/* buffer overflow notification function */
+	bts_ovfl_callback_t ovfl;
+};
+
+struct pebs_tracer {
+	/* the common DS part */
+	struct ds_tracer ds;
+	/* buffer overflow notification function */
+	pebs_ovfl_callback_t ovfl;
+};
 
 /*
  * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -107,35 +134,14 @@
 	(*(unsigned long *)base) = value;
 }
 
+#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
+
 
 /*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
+ * Locking is done only for allocating BTS or PEBS resources.
  */
 static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
-/*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
- */
-static inline int ds_validate_access(struct ds_context *context,
-				     enum ds_qualifier qual)
-{
-	if (!context)
-		return -EPERM;
-
-	if (context->owner[qual] == current)
-		return 0;
-
-	return -EPERM;
-}
-
 
 /*
  * We either support (system-wide) per-cpu or per-thread allocation.
@@ -183,51 +189,13 @@
  *
  * Contexts are use-counted. They are allocated on first access and
  * deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- *   requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- *   non-existing context indicates an error. It acquires and releases
- *   the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
  */
 static DEFINE_PER_CPU(struct ds_context *, system_context);
 
 #define this_system_context per_cpu(system_context, smp_processor_id())
 
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
- */
 static inline struct ds_context *ds_get_context(struct task_struct *task)
 {
-	struct ds_context *context;
-	unsigned long irq;
-
-	spin_lock_irqsave(&ds_lock, irq);
-
-	context = (task ? task->thread.ds_ctx : this_system_context);
-	if (context)
-		context->count++;
-
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-	return context;
-}
-
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
-{
 	struct ds_context **p_context =
 		(task ? &task->thread.ds_ctx : &this_system_context);
 	struct ds_context *context = *p_context;
@@ -238,16 +206,9 @@
 		if (!context)
 			return NULL;
 
-		context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
-		if (!context->ds) {
-			kfree(context);
-			return NULL;
-		}
-
 		spin_lock_irqsave(&ds_lock, irq);
 
 		if (*p_context) {
-			kfree(context->ds);
 			kfree(context);
 
 			context = *p_context;
@@ -272,10 +233,6 @@
 	return context;
 }
 
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
 static inline void ds_put_context(struct ds_context *context)
 {
 	unsigned long irq;
@@ -296,13 +253,6 @@
 	if (!context->task || (context->task == current))
 		wrmsrl(MSR_IA32_DS_AREA, 0);
 
-	put_tracer(context->task);
-
-	/* free any leftover buffers from tracers that did not
-	 * deallocate them properly. */
-	kfree(context->buffer[ds_bts]);
-	kfree(context->buffer[ds_pebs]);
-	kfree(context->ds);
 	kfree(context);
  out:
 	spin_unlock_irqrestore(&ds_lock, irq);
@@ -312,119 +262,37 @@
 /*
  * Handle a buffer overflow
  *
- * task: the task whose buffers are overflowing;
- *       NULL for a buffer overflow on the current cpu
  * context: the ds context
  * qual: the buffer type
  */
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
-			enum ds_qualifier qual)
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
 {
-	if (!context)
-		return;
-
-	if (context->callback[qual])
-		(*context->callback[qual])(task);
-
-	/* todo: do some more overflow handling */
-}
-
-
-/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
- *
- * Returns the buffer, if successful;
- *         NULL, if out of memory or rlimit exceeded.
- *
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
- */
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
-{
-	unsigned long rlim, vm, pgsz;
-	void *buffer;
-
-	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->total_vm  + pgsz;
-	if (rlim < vm)
-		return NULL;
-
-	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->locked_vm  + pgsz;
-	if (rlim < vm)
-		return NULL;
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		return NULL;
-
-	current->mm->total_vm  += pgsz;
-	current->mm->locked_vm += pgsz;
-
-	if (pages)
-		*pages = pgsz;
-
-	return buffer;
-}
-
-static int ds_request(struct task_struct *task, void *base, size_t size,
-		      ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
-{
-	struct ds_context *context;
-	unsigned long buffer, adj;
-	const unsigned long alignment = (1 << 3);
-	unsigned long irq;
-	int error = 0;
-
-	if (!ds_cfg.sizeof_ds)
-		return -EOPNOTSUPP;
-
-	/* we require some space to do alignment adjustments below */
-	if (size < (alignment + ds_cfg.sizeof_rec[qual]))
-		return -EINVAL;
-
-	/* buffer overflow notification is not yet implemented */
-	if (ovfl)
-		return -EOPNOTSUPP;
-
-
-	context = ds_alloc_context(task);
-	if (!context)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&ds_lock, irq);
-
-	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-
-	get_tracer(task);
-
-	error = -EALREADY;
-	if (context->owner[qual] == current)
-		goto out_put_tracer;
-	error = -EPERM;
-	if (context->owner[qual] != NULL)
-		goto out_put_tracer;
-	context->owner[qual] = current;
-
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-
-	error = -ENOMEM;
-	if (!base) {
-		base = ds_allocate_buffer(size, &context->pages[qual]);
-		if (!base)
-			goto out_release;
-
-		context->buffer[qual]   = base;
+	switch (qual) {
+	case ds_bts: {
+		struct bts_tracer *tracer =
+			container_of(context->owner[qual],
+				     struct bts_tracer, ds);
+		if (tracer->ovfl)
+			tracer->ovfl(tracer);
 	}
-	error = 0;
+		break;
+	case ds_pebs: {
+		struct pebs_tracer *tracer =
+			container_of(context->owner[qual],
+				     struct pebs_tracer, ds);
+		if (tracer->ovfl)
+			tracer->ovfl(tracer);
+	}
+		break;
+	}
+}
 
-	context->callback[qual] = ovfl;
+
+static void ds_install_ds_config(struct ds_context *context,
+				 enum ds_qualifier qual,
+				 void *base, size_t size, size_t ith)
+{
+	unsigned long buffer, adj;
 
 	/* adjust the buffer address and size to meet alignment
 	 * constraints:
@@ -436,7 +304,7 @@
 	 */
 	buffer = (unsigned long)base;
 
-	adj = ALIGN(buffer, alignment) - buffer;
+	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
 	buffer += adj;
 	size   -= adj;
 
@@ -447,210 +315,289 @@
 	ds_set(context->ds, qual, ds_index, buffer);
 	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
 
-	if (ovfl) {
-		/* todo: select a suitable interrupt threshold */
-	} else
-		ds_set(context->ds, qual,
-		       ds_interrupt_threshold, buffer + size + 1);
+	/* The value for 'no threshold' is -1, which will set the
+	 * threshold outside of the buffer, just like we want it.
+	 */
+	ds_set(context->ds, qual,
+	       ds_interrupt_threshold, buffer + size - ith);
+}
 
-	/* we keep the context until ds_release */
-	return error;
+static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
+		      struct task_struct *task,
+		      void *base, size_t size, size_t th)
+{
+	struct ds_context *context;
+	unsigned long irq;
+	int error;
 
- out_release:
-	context->owner[qual] = NULL;
-	ds_put_context(context);
-	put_tracer(task);
-	return error;
+	error = -EOPNOTSUPP;
+	if (!ds_cfg.sizeof_ds)
+		goto out;
+
+	error = -EINVAL;
+	if (!base)
+		goto out;
+
+	/* we require some space to do alignment adjustments below */
+	error = -EINVAL;
+	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+		goto out;
+
+	if (th != (size_t)-1) {
+		th *= ds_cfg.sizeof_rec[qual];
+
+		error = -EINVAL;
+		if (size <= th)
+			goto out;
+	}
+
+	tracer->buffer = base;
+	tracer->size = size;
+
+	error = -ENOMEM;
+	context = ds_get_context(task);
+	if (!context)
+		goto out;
+	tracer->context = context;
+
+
+	spin_lock_irqsave(&ds_lock, irq);
+
+	error = -EPERM;
+	if (!check_tracer(task))
+		goto out_unlock;
+	get_tracer(task);
+
+	error = -EPERM;
+	if (context->owner[qual])
+		goto out_put_tracer;
+	context->owner[qual] = tracer;
+
+	spin_unlock_irqrestore(&ds_lock, irq);
+
+
+	ds_install_ds_config(context, qual, base, size, th);
+
+	return 0;
 
  out_put_tracer:
-	spin_unlock_irqrestore(&ds_lock, irq);
-	ds_put_context(context);
 	put_tracer(task);
-	return error;
-
  out_unlock:
 	spin_unlock_irqrestore(&ds_lock, irq);
 	ds_put_context(context);
-	return error;
-}
-
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
-		   ds_ovfl_callback_t ovfl)
-{
-	return ds_request(task, base, size, ovfl, ds_bts);
-}
-
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-		    ds_ovfl_callback_t ovfl)
-{
-	return ds_request(task, base, size, ovfl, ds_pebs);
-}
-
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
-{
-	struct ds_context *context;
-	int error;
-
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
-
-	kfree(context->buffer[qual]);
-	context->buffer[qual] = NULL;
-
-	current->mm->total_vm  -= context->pages[qual];
-	current->mm->locked_vm -= context->pages[qual];
-	context->pages[qual] = 0;
-	context->owner[qual] = NULL;
-
-	/*
-	 * we put the context twice:
-	 *   once for the ds_get_context
-	 *   once for the corresponding ds_request
-	 */
-	ds_put_context(context);
+	tracer->context = NULL;
  out:
-	ds_put_context(context);
 	return error;
 }
 
-int ds_release_bts(struct task_struct *task)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+				  void *base, size_t size,
+				  bts_ovfl_callback_t ovfl, size_t th)
 {
-	return ds_release(task, ds_bts);
-}
-
-int ds_release_pebs(struct task_struct *task)
-{
-	return ds_release(task, ds_pebs);
-}
-
-static int ds_get_index(struct task_struct *task, size_t *pos,
-			enum ds_qualifier qual)
-{
-	struct ds_context *context;
-	unsigned long base, index;
+	struct bts_tracer *tracer;
 	int error;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
+	/* buffer overflow notification is not yet implemented */
+	error = -EOPNOTSUPP;
+	if (ovfl)
 		goto out;
 
+	error = -ENOMEM;
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
+		goto out;
+	tracer->ovfl = ovfl;
+
+	error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
+	if (error < 0)
+		goto out_tracer;
+
+	return tracer;
+
+ out_tracer:
+	kfree(tracer);
+ out:
+	return ERR_PTR(error);
+}
+
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+				    void *base, size_t size,
+				    pebs_ovfl_callback_t ovfl, size_t th)
+{
+	struct pebs_tracer *tracer;
+	int error;
+
+	/* buffer overflow notification is not yet implemented */
+	error = -EOPNOTSUPP;
+	if (ovfl)
+		goto out;
+
+	error = -ENOMEM;
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
+		goto out;
+	tracer->ovfl = ovfl;
+
+	error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
+	if (error < 0)
+		goto out_tracer;
+
+	return tracer;
+
+ out_tracer:
+	kfree(tracer);
+ out:
+	return ERR_PTR(error);
+}
+
+static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
+{
+	BUG_ON(tracer->context->owner[qual] != tracer);
+	tracer->context->owner[qual] = NULL;
+
+	put_tracer(tracer->context->task);
+	ds_put_context(tracer->context);
+}
+
+int ds_release_bts(struct bts_tracer *tracer)
+{
+	if (!tracer)
+		return -EINVAL;
+
+	ds_release(&tracer->ds, ds_bts);
+	kfree(tracer);
+
+	return 0;
+}
+
+int ds_release_pebs(struct pebs_tracer *tracer)
+{
+	if (!tracer)
+		return -EINVAL;
+
+	ds_release(&tracer->ds, ds_pebs);
+	kfree(tracer);
+
+	return 0;
+}
+
+static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
+{
+	unsigned long base, index;
+
 	base  = ds_get(context->ds, qual, ds_buffer_base);
 	index = ds_get(context->ds, qual, ds_index);
 
-	error = ((index - base) / ds_cfg.sizeof_rec[qual]);
-	if (pos)
-		*pos = error;
- out:
-	ds_put_context(context);
-	return error;
+	return (index - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
 {
-	return ds_get_index(task, pos, ds_bts);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_index(tracer->ds.context, ds_bts);
+
+	return 0;
 }
 
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
 {
-	return ds_get_index(task, pos, ds_pebs);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_index(tracer->ds.context, ds_pebs);
+
+	return 0;
 }
 
-static int ds_get_end(struct task_struct *task, size_t *pos,
-		      enum ds_qualifier qual)
+static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
 {
-	struct ds_context *context;
-	unsigned long base, end;
-	int error;
-
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
+	unsigned long base, max;
 
 	base = ds_get(context->ds, qual, ds_buffer_base);
-	end  = ds_get(context->ds, qual, ds_absolute_maximum);
+	max  = ds_get(context->ds, qual, ds_absolute_maximum);
 
-	error = ((end - base) / ds_cfg.sizeof_rec[qual]);
-	if (pos)
-		*pos = error;
- out:
-	ds_put_context(context);
-	return error;
+	return (max - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
+int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
 {
-	return ds_get_end(task, pos, ds_bts);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_end(tracer->ds.context, ds_bts);
+
+	return 0;
 }
 
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
 {
-	return ds_get_end(task, pos, ds_pebs);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_end(tracer->ds.context, ds_pebs);
+
+	return 0;
 }
 
-static int ds_access(struct task_struct *task, size_t index,
-		     const void **record, enum ds_qualifier qual)
+static int ds_access(struct ds_context *context, enum ds_qualifier qual,
+		     size_t index, const void **record)
 {
-	struct ds_context *context;
 	unsigned long base, idx;
-	int error;
 
 	if (!record)
 		return -EINVAL;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
-
 	base = ds_get(context->ds, qual, ds_buffer_base);
 	idx = base + (index * ds_cfg.sizeof_rec[qual]);
 
-	error = -EINVAL;
 	if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
-		goto out;
+		return -EINVAL;
 
 	*record = (const void *)idx;
-	error = ds_cfg.sizeof_rec[qual];
- out:
-	ds_put_context(context);
-	return error;
+
+	return ds_cfg.sizeof_rec[qual];
 }
 
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
+int ds_access_bts(struct bts_tracer *tracer, size_t index,
+		  const void **record)
 {
-	return ds_access(task, index, record, ds_bts);
+	if (!tracer)
+		return -EINVAL;
+
+	return ds_access(tracer->ds.context, ds_bts, index, record);
 }
 
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
+int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
+		   const void **record)
 {
-	return ds_access(task, index, record, ds_pebs);
+	if (!tracer)
+		return -EINVAL;
+
+	return ds_access(tracer->ds.context, ds_pebs, index, record);
 }
 
-static int ds_write(struct task_struct *task, const void *record, size_t size,
-		    enum ds_qualifier qual, int force)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+		    const void *record, size_t size)
 {
-	struct ds_context *context;
-	int error;
+	int bytes_written = 0;
 
 	if (!record)
 		return -EINVAL;
 
-	error = -EPERM;
-	context = ds_get_context(task);
-	if (!context)
-		goto out;
-
-	if (!force) {
-		error = ds_validate_access(context, qual);
-		if (error < 0)
-			goto out;
-	}
-
-	error = 0;
 	while (size) {
 		unsigned long base, index, end, write_end, int_th;
 		unsigned long write_size, adj_write_size;
@@ -678,14 +625,14 @@
 			write_end = end;
 
 		if (write_end <= index)
-			goto out;
+			break;
 
 		write_size = min((unsigned long) size, write_end - index);
 		memcpy((void *)index, record, write_size);
 
 		record = (const char *)record + write_size;
-		size  -= write_size;
-		error += write_size;
+		size -= write_size;
+		bytes_written += write_size;
 
 		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
 		adj_write_size *= ds_cfg.sizeof_rec[qual];
@@ -700,47 +647,32 @@
 		ds_set(context->ds, qual, ds_index, index);
 
 		if (index >= int_th)
-			ds_overflow(task, context, qual);
+			ds_overflow(context, qual);
 	}
 
- out:
-	ds_put_context(context);
-	return error;
+	return bytes_written;
 }
 
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
 {
-	return ds_write(task, record, size, ds_bts, /* force = */ 0);
+	if (!tracer)
+		return -EINVAL;
+
+	return ds_write(tracer->ds.context, ds_bts, record, size);
 }
 
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
+int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
 {
-	return ds_write(task, record, size, ds_pebs, /* force = */ 0);
+	if (!tracer)
+		return -EINVAL;
+
+	return ds_write(tracer->ds.context, ds_pebs, record, size);
 }
 
-int ds_unchecked_write_bts(struct task_struct *task,
-			   const void *record, size_t size)
+static void ds_reset_or_clear(struct ds_context *context,
+			      enum ds_qualifier qual, int clear)
 {
-	return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
-
-int ds_unchecked_write_pebs(struct task_struct *task,
-			    const void *record, size_t size)
-{
-	return ds_write(task, record, size, ds_pebs, /* force = */ 1);
-}
-
-static int ds_reset_or_clear(struct task_struct *task,
-			     enum ds_qualifier qual, int clear)
-{
-	struct ds_context *context;
 	unsigned long base, end;
-	int error;
-
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
 
 	base = ds_get(context->ds, qual, ds_buffer_base);
 	end  = ds_get(context->ds, qual, ds_absolute_maximum);
@@ -749,70 +681,69 @@
 		memset((void *)base, 0, end - base);
 
 	ds_set(context->ds, qual, ds_index, base);
-
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
 }
 
-int ds_reset_bts(struct task_struct *task)
+int ds_reset_bts(struct bts_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
+
+	return 0;
 }
 
-int ds_reset_pebs(struct task_struct *task)
+int ds_reset_pebs(struct pebs_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
+
+	return 0;
 }
 
-int ds_clear_bts(struct task_struct *task)
+int ds_clear_bts(struct bts_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
+
+	return 0;
 }
 
-int ds_clear_pebs(struct task_struct *task)
+int ds_clear_pebs(struct pebs_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
+
+	return 0;
 }
 
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
 {
-	struct ds_context *context;
-	int error;
+	if (!tracer)
+		return -EINVAL;
 
 	if (!value)
 		return -EINVAL;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, ds_pebs);
-	if (error < 0)
-		goto out;
+	*value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
 
-	*value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
-
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
+	return 0;
 }
 
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
 {
-	struct ds_context *context;
-	int error;
+	if (!tracer)
+		return -EINVAL;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, ds_pebs);
-	if (error < 0)
-		goto out;
+	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
 
-	*(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
-
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
+	return 0;
 }
 
 static const struct ds_configuration ds_cfg_var = {
@@ -840,6 +771,10 @@
 ds_configure(const struct ds_configuration *cfg)
 {
 	ds_cfg = *cfg;
+
+	printk(KERN_INFO "DS available\n");
+
+	BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,17 +782,16 @@
 	switch (c->x86) {
 	case 0x6:
 		switch (c->x86_model) {
+		case 0 ... 0xC:
+			/* sorry, don't know about them */
+			break;
 		case 0xD:
 		case 0xE: /* Pentium M */
 			ds_configure(&ds_cfg_var);
 			break;
-		case 0xF: /* Core2 */
-		case 0x1C: /* Atom */
+		default: /* Core2, Atom, ... */
 			ds_configure(&ds_cfg_64);
 			break;
-		default:
-			/* sorry, don't know about them */
-			break;
 		}
 		break;
 	case 0xF:
@@ -884,6 +818,8 @@
 	 * is dying. There should not be any user of that context left
 	 * to disturb us, anymore. */
 	unsigned long leftovers = context->count;
-	while (leftovers--)
+	while (leftovers--) {
+		put_tracer(context->task);
 		ds_put_context(context);
+	}
 }
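
Taken together, the reworked ds.c API replaces the old task-keyed calls with opaque tracer handles: the caller now supplies its own buffer (ds_allocate_buffer() is gone), and ds_request_bts()/ds_request_pebs() return a struct bts_tracer */struct pebs_tracer * or an ERR_PTR() code. The threshold argument is counted in records; ds_request() converts it to bytes, and ds_install_ds_config() programs the interrupt threshold at buffer + size - th, so (size_t)-1 lands one byte past the end and never fires. A minimal usage sketch under those rules, with an invented buffer size and error handling (no in-tree user is quoted here):

	#include <linux/err.h>
	#include <linux/sched.h>
	#include <linux/slab.h>
	#include <asm/ds.h>

	static int bts_sample_current(void)
	{
		struct bts_tracer *tracer;
		size_t end, i;
		void *buf;
		int err;

		/* the caller owns the buffer now; ds.c no longer allocates it */
		buf = kzalloc(4096, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		/* overflow callbacks must still be NULL; no threshold */
		tracer = ds_request_bts(current, buf, 4096, NULL, (size_t)-1);
		if (IS_ERR(tracer)) {
			err = PTR_ERR(tracer);
			kfree(buf);
			return err;
		}

		/* ... run the code to be traced ... */

		if (!ds_get_bts_index(tracer, &end)) {
			for (i = 0; i < end; i++) {
				const void *rec;

				if (ds_access_bts(tracer, i, &rec) > 0)
					/* decode one branch record at 'rec' */;
			}
		}

		ds_release_bts(tracer);	/* also drops the context reference */
		kfree(buf);
		return 0;
	}
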
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 0000000..6b1f6f6
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#include "dumpstack.h"
+
+int panic_on_unrecovered_nmi;
+unsigned int code_bytes = 64;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+	printk(" [<%p>] %s%pS\n", (void *) address,
+			reliable ? "" : "? ", (void *) address);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+			const struct stacktrace_ops *ops,
+			struct thread_info *tinfo, int *graph)
+{
+	struct task_struct *task = tinfo->task;
+	unsigned long ret_addr;
+	int index = task->curr_ret_stack;
+
+	if (addr != (unsigned long)return_to_handler)
+		return;
+
+	if (!task->ret_stack || index < *graph)
+		return;
+
+	index -= *graph;
+	ret_addr = task->ret_stack[index].ret;
+
+	ops->address(data, ret_addr, 1);
+
+	(*graph)++;
+}
+#else
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+			const struct stacktrace_ops *ops,
+			struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+			void *p, unsigned int size, void *end)
+{
+	void *t = tinfo;
+	if (end) {
+		if (p < end && p >= (end-THREAD_SIZE))
+			return 1;
+		else
+			return 0;
+	}
+	return p > t && p < t + THREAD_SIZE - size;
+}
+
+unsigned long
+print_context_stack(struct thread_info *tinfo,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data,
+		unsigned long *end, int *graph)
+{
+	struct stack_frame *frame = (struct stack_frame *)bp;
+
+	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+		unsigned long addr;
+
+		addr = *stack;
+		if (__kernel_text_address(addr)) {
+			if ((unsigned long) stack == bp + sizeof(long)) {
+				ops->address(data, addr, 1);
+				frame = frame->next_frame;
+				bp = (unsigned long) frame;
+			} else {
+				ops->address(data, addr, bp == 0);
+			}
+			print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+		}
+		stack++;
+	}
+	return bp;
+}
+
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	printk(data);
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+	printk("%s <%s> ", (char *)data, name);
+	return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+	touch_nmi_watchdog();
+	printk(data);
+	printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
+
+void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+	printk("%sCall Trace:\n", log_lvl);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp)
+{
+	show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+	show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+	unsigned long bp = 0;
+	unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp)
+		get_bp(bp);
+#endif
+
+	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+		current->pid, current->comm, print_tainted(),
+		init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
+	show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+	int cpu;
+	unsigned long flags;
+
+	oops_enter();
+
+	/* racy, but better than risking deadlock. */
+	raw_local_irq_save(flags);
+	cpu = smp_processor_id();
+	if (!__raw_spin_trylock(&die_lock)) {
+		if (cpu == die_owner)
+			/* nested oops. should stop eventually */;
+		else
+			__raw_spin_lock(&die_lock);
+	}
+	die_nest_count++;
+	die_owner = cpu;
+	console_verbose();
+	bust_spinlocks(1);
+	return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+	if (regs && kexec_should_crash(current))
+		crash_kexec(regs);
+
+	bust_spinlocks(0);
+	die_owner = -1;
+	add_taint(TAINT_DIE);
+	die_nest_count--;
+	if (!die_nest_count)
+		/* Nest count reaches zero, release the lock. */
+		__raw_spin_unlock(&die_lock);
+	raw_local_irq_restore(flags);
+	oops_exit();
+
+	if (!signr)
+		return;
+	if (in_interrupt())
+		panic("Fatal exception in interrupt");
+	if (panic_on_oops)
+		panic("Fatal exception");
+	do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+#ifdef CONFIG_X86_32
+	unsigned short ss;
+	unsigned long sp;
+#endif
+	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+	printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+	printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk("DEBUG_PAGEALLOC");
+#endif
+	printk("\n");
+	sysfs_printk_last_file();
+	if (notify_die(DIE_OOPS, str, regs, err,
+			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+		return 1;
+
+	show_registers(regs);
+#ifdef CONFIG_X86_32
+	sp = (unsigned long) (&regs->sp);
+	savesegment(ss, ss);
+	if (user_mode(regs)) {
+		sp = regs->sp;
+		ss = regs->ss & 0xffff;
+	}
+	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+	print_symbol("%s", regs->ip);
+	printk(" SS:ESP %04x:%08lx\n", ss, sp);
+#else
+	/* Executive summary in case the oops scrolled away */
+	printk(KERN_ALERT "RIP ");
+	printk_address(regs->ip, 1);
+	printk(" RSP <%016lx>\n", regs->sp);
+#endif
+	return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+	unsigned long flags = oops_begin();
+	int sig = SIGSEGV;
+
+	if (!user_mode_vm(regs))
+		report_bug(regs->ip, regs);
+
+	if (__die(str, regs, err))
+		sig = 0;
+	oops_end(flags, regs, sig);
+}
+
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+	unsigned long flags;
+
+	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+		return;
+
+	/*
+	 * We are in trouble anyway, let's at least try
+	 * to get a message out.
+	 */
+	flags = oops_begin();
+	printk(KERN_EMERG "%s", str);
+	printk(" on CPU%d, ip %08lx, registers:\n",
+		smp_processor_id(), regs->ip);
+	show_registers(regs);
+	oops_end(flags, regs, 0);
+	if (do_panic || panic_on_oops)
+		panic("Non maskable interrupt");
+	nmi_exit();
+	local_irq_enable();
+	do_exit(SIGBUS);
+}
+
+static int __init oops_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "panic"))
+		panic_on_oops = 1;
+	return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+	return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+	code_bytes = simple_strtoul(s, NULL, 0);
+	if (code_bytes > 8192)
+		code_bytes = 8192;
+
+	return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
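
The unified walker above trusts an address only when the stack slot sits exactly one word above the saved frame pointer, i.e. when it is the return_address member of a struct stack_frame (see dumpstack.h). An illustration of the chain it follows, with the valid_stack_ptr() bounds checks omitted for brevity (walk_frame_chain() is an invented name, not kernel code):

	static void walk_frame_chain(unsigned long bp)
	{
		/* the prologue's push %ebp/%rbp leaves bp pointing here */
		struct stack_frame *frame = (struct stack_frame *)bp;

		while (frame) {
			/* what print_context_stack() reports as reliable */
			unsigned long ret = frame->return_address;

			printk_address(ret, 1);
			frame = frame->next_frame;	/* caller's saved bp */
		}
	}
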
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 0000000..da87590b
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef DUMPSTACK_H
+#define DUMPSTACK_H
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data,
+		unsigned long *end, int *graph);
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+extern int kstack_depth_to_print;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+	struct stack_frame *next_frame;
+	unsigned long return_address;
+};
+#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b361475..d593cd1 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
 
 #include <asm/stacktrace.h>
 
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-	printk(" [<%p>] %s%pS\n", (void *) address,
-			reliable ? "" : "? ", (void *) address);
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-			void *p, unsigned int size, void *end)
-{
-	void *t = tinfo;
-	if (end) {
-		if (p < end && p >= (end-THREAD_SIZE))
-			return 1;
-		else
-			return 0;
-	}
-	return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-	struct stack_frame *next_frame;
-	unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-		unsigned long *stack, unsigned long bp,
-		const struct stacktrace_ops *ops, void *data,
-		unsigned long *end)
-{
-	struct stack_frame *frame = (struct stack_frame *)bp;
-
-	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
-		unsigned long addr;
-
-		addr = *stack;
-		if (__kernel_text_address(addr)) {
-			if ((unsigned long) stack == bp + sizeof(long)) {
-				ops->address(data, addr, 1);
-				frame = frame->next_frame;
-				bp = (unsigned long) frame;
-			} else {
-				ops->address(data, addr, bp == 0);
-			}
-		}
-		stack++;
-	}
-	return bp;
-}
+#include "dumpstack.h"
 
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
 {
+	int graph = 0;
+
 	if (!task)
 		task = current;
 
@@ -107,7 +52,8 @@
 
 		context = (struct thread_info *)
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		bp = print_context_stack(context, stack, bp, ops, data, NULL);
+		bp = print_context_stack(context, stack, bp, ops,
+					 data, NULL, &graph);
 
 		stack = (unsigned long *)context->previous_esp;
 		if (!stack)
@@ -119,57 +65,7 @@
 }
 EXPORT_SYMBOL(dump_trace);
 
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	printk(data);
-	print_symbol(msg, symbol);
-	printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-	printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-	printk("%s <%s> ", (char *)data, name);
-	return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-	touch_nmi_watchdog();
-	printk(data);
-	printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-	.warning = print_trace_warning,
-	.warning_symbol = print_trace_warning_symbol,
-	.stack = print_trace_stack,
-	.address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp, char *log_lvl)
-{
-	printk("%sCall Trace:\n", log_lvl);
-	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp)
-{
-	show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *sp, unsigned long bp, char *log_lvl)
 {
@@ -196,33 +92,6 @@
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
-	show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-	unsigned long bp = 0;
-	unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-	if (!bp)
-		get_bp(bp);
-#endif
-
-	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-		current->pid, current->comm, print_tainted(),
-		init_utsname()->release,
-		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
-	show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
 
 void show_registers(struct pt_regs *regs)
 {
@@ -283,167 +152,3 @@
 	return ud2 == 0x0b0f;
 }
 
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-	unsigned long flags;
-
-	oops_enter();
-
-	if (die_owner != raw_smp_processor_id()) {
-		console_verbose();
-		raw_local_irq_save(flags);
-		__raw_spin_lock(&die_lock);
-		die_owner = smp_processor_id();
-		die_nest_count = 0;
-		bust_spinlocks(1);
-	} else {
-		raw_local_irq_save(flags);
-	}
-	die_nest_count++;
-	return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
-	bust_spinlocks(0);
-	die_owner = -1;
-	add_taint(TAINT_DIE);
-	__raw_spin_unlock(&die_lock);
-	raw_local_irq_restore(flags);
-
-	if (!regs)
-		return;
-
-	if (kexec_should_crash(current))
-		crash_kexec(regs);
-	if (in_interrupt())
-		panic("Fatal exception in interrupt");
-	if (panic_on_oops)
-		panic("Fatal exception");
-	oops_exit();
-	do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-	unsigned short ss;
-	unsigned long sp;
-
-	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-	printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-	printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	printk("DEBUG_PAGEALLOC");
-#endif
-	printk("\n");
-	sysfs_printk_last_file();
-	if (notify_die(DIE_OOPS, str, regs, err,
-			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
-		return 1;
-
-	show_registers(regs);
-	/* Executive summary in case the oops scrolled away */
-	sp = (unsigned long) (&regs->sp);
-	savesegment(ss, ss);
-	if (user_mode(regs)) {
-		sp = regs->sp;
-		ss = regs->ss & 0xffff;
-	}
-	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
-	print_symbol("%s", regs->ip);
-	printk(" SS:ESP %04x:%08lx\n", ss, sp);
-	return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
-	unsigned long flags = oops_begin();
-
-	if (die_nest_count < 3) {
-		report_bug(regs->ip, regs);
-
-		if (__die(str, regs, err))
-			regs = NULL;
-	} else {
-		printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
-	}
-
-	oops_end(flags, regs, SIGSEGV);
-}
-
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	spin_lock(&nmi_print_lock);
-	/*
-	* We are in trouble anyway, lets at least try
-	* to get a message out:
-	*/
-	bust_spinlocks(1);
-	printk(KERN_EMERG "%s", str);
-	printk(" on CPU%d, ip %08lx, registers:\n",
-		smp_processor_id(), regs->ip);
-	show_registers(regs);
-	if (do_panic)
-		panic("Non maskable interrupt");
-	console_silent();
-	spin_unlock(&nmi_print_lock);
-
-	/*
-	 * If we are in kernel we are probably nested up pretty bad
-	 * and might aswell get out now while we still can:
-	 */
-	if (!user_mode_vm(regs)) {
-		current->thread.trap_no = 2;
-		crash_kexec(regs);
-	}
-
-	bust_spinlocks(0);
-	do_exit(SIGSEGV);
-}
-
-static int __init oops_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	if (!strcmp(s, "panic"))
-		panic_on_oops = 1;
-	return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-	return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-	code_bytes = simple_strtoul(s, NULL, 0);
-	if (code_bytes > 8192)
-		code_bytes = 8192;
-
-	return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7..c302d07 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
 
 #include <asm/stacktrace.h>
 
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-	printk(" [<%p>] %s%pS\n", (void *) address,
-			reliable ? "" : "? ", (void *) address);
-}
+#include "dumpstack.h"
 
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 					unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-			void *p, unsigned int size, void *end)
-{
-	void *t = tinfo;
-	if (end) {
-		if (p < end && p >= (end-THREAD_SIZE))
-			return 1;
-		else
-			return 0;
-	}
-	return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-	struct stack_frame *next_frame;
-	unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-		unsigned long *stack, unsigned long bp,
-		const struct stacktrace_ops *ops, void *data,
-		unsigned long *end)
-{
-	struct stack_frame *frame = (struct stack_frame *)bp;
-
-	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
-		unsigned long addr;
-
-		addr = *stack;
-		if (__kernel_text_address(addr)) {
-			if ((unsigned long) stack == bp + sizeof(long)) {
-				ops->address(data, addr, 1);
-				frame = frame->next_frame;
-				bp = (unsigned long) frame;
-			} else {
-				ops->address(data, addr, bp == 0);
-			}
-		}
-		stack++;
-	}
-	return bp;
-}
-
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@
 	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
+	int graph = 0;
 
 	if (!task)
 		task = current;
@@ -206,7 +150,7 @@
 				break;
 
 			bp = print_context_stack(tinfo, stack, bp, ops,
-							data, estack_end);
+						 data, estack_end, &graph);
 			ops->stack(data, "<EOE>");
 			/*
 			 * We link to the next stack via the
@@ -225,7 +169,7 @@
 				if (ops->stack(data, "IRQ") < 0)
 					break;
 				bp = print_context_stack(tinfo, stack, bp,
-						ops, data, irqstack_end);
+					ops, data, irqstack_end, &graph);
 				/*
 				 * We link to the next stack (which would be
 				 * the process stack normally) the last
@@ -243,62 +187,12 @@
 	/*
 	 * This handles the process stack:
 	 */
-	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
 	put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
 
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	printk(data);
-	print_symbol(msg, symbol);
-	printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-	printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-	printk("%s <%s> ", (char *)data, name);
-	return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-	touch_nmi_watchdog();
-	printk(data);
-	printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-	.warning = print_trace_warning,
-	.warning_symbol = print_trace_warning_symbol,
-	.stack = print_trace_stack,
-	.address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp, char *log_lvl)
-{
-	printk("%sCall Trace:\n", log_lvl);
-	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp)
-{
-	show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *sp, unsigned long bp, char *log_lvl)
 {
@@ -342,33 +236,6 @@
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
-	show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-	unsigned long bp = 0;
-	unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-	if (!bp)
-		get_bp(bp);
-#endif
-
-	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-		current->pid, current->comm, print_tainted(),
-		init_utsname()->release,
-		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
-	show_trace(NULL, NULL, &stack, bp);
-}
-EXPORT_SYMBOL(dump_stack);
-
 void show_registers(struct pt_regs *regs)
 {
 	int i;
@@ -429,147 +296,3 @@
 	return ud2 == 0x0b0f;
 }
 
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-	int cpu;
-	unsigned long flags;
-
-	oops_enter();
-
-	/* racy, but better than risking deadlock. */
-	raw_local_irq_save(flags);
-	cpu = smp_processor_id();
-	if (!__raw_spin_trylock(&die_lock)) {
-		if (cpu == die_owner)
-			/* nested oops. should stop eventually */;
-		else
-			__raw_spin_lock(&die_lock);
-	}
-	die_nest_count++;
-	die_owner = cpu;
-	console_verbose();
-	bust_spinlocks(1);
-	return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
-	die_owner = -1;
-	bust_spinlocks(0);
-	die_nest_count--;
-	if (!die_nest_count)
-		/* Nest count reaches zero, release the lock. */
-		__raw_spin_unlock(&die_lock);
-	raw_local_irq_restore(flags);
-	if (!regs) {
-		oops_exit();
-		return;
-	}
-	if (in_interrupt())
-		panic("Fatal exception in interrupt");
-	if (panic_on_oops)
-		panic("Fatal exception");
-	oops_exit();
-	do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-	printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-	printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	printk("DEBUG_PAGEALLOC");
-#endif
-	printk("\n");
-	sysfs_printk_last_file();
-	if (notify_die(DIE_OOPS, str, regs, err,
-			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
-		return 1;
-
-	show_registers(regs);
-	add_taint(TAINT_DIE);
-	/* Executive summary in case the oops scrolled away */
-	printk(KERN_ALERT "RIP ");
-	printk_address(regs->ip, 1);
-	printk(" RSP <%016lx>\n", regs->sp);
-	if (kexec_should_crash(current))
-		crash_kexec(regs);
-	return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
-	unsigned long flags = oops_begin();
-
-	if (!user_mode(regs))
-		report_bug(regs->ip, regs);
-
-	if (__die(str, regs, err))
-		regs = NULL;
-	oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-	unsigned long flags;
-
-	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	flags = oops_begin();
-	/*
-	 * We are in trouble anyway, lets at least try
-	 * to get a message out.
-	 */
-	printk(KERN_EMERG "%s", str);
-	printk(" on CPU%d, ip %08lx, registers:\n",
-		smp_processor_id(), regs->ip);
-	show_registers(regs);
-	if (kexec_should_crash(current))
-		crash_kexec(regs);
-	if (do_panic || panic_on_oops)
-		panic("Non maskable interrupt");
-	oops_end(flags, NULL, SIGBUS);
-	nmi_exit();
-	local_irq_enable();
-	do_exit(SIGBUS);
-}
-
-static int __init oops_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	if (!strcmp(s, "panic"))
-		panic_on_oops = 1;
-	return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-	return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-	code_bytes = simple_strtoul(s, NULL, 0);
-	if (code_bytes > 8192)
-		code_bytes = 8192;
-
-	return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597e..43ceb3f 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1157,6 +1157,9 @@
 END(mcount)
 
 ENTRY(ftrace_caller)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	pushl %eax
 	pushl %ecx
 	pushl %edx
@@ -1171,6 +1174,11 @@
 	popl %edx
 	popl %ecx
 	popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	jmp ftrace_stub
+#endif
 
 .globl ftrace_stub
 ftrace_stub:
@@ -1180,8 +1188,18 @@
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 ENTRY(mcount)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	cmpl $ftrace_stub, ftrace_trace_function
 	jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	cmpl $ftrace_stub, ftrace_graph_return
+	jnz ftrace_graph_caller
+
+	cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz ftrace_graph_caller
+#endif
 .globl ftrace_stub
 ftrace_stub:
 	ret
@@ -1200,12 +1218,43 @@
 	popl %edx
 	popl %ecx
 	popl %eax
-
 	jmp ftrace_stub
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+	cmpl $0, function_trace_stop
+	jne ftrace_stub
+
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %edx
+	lea 0x4(%ebp), %eax
+	subl $MCOUNT_INSN_SIZE, %edx
+	call prepare_ftrace_return
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+	pushl $0
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	call ftrace_return_to_handler
+	movl %eax, 0xc(%esp)
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+#endif
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
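
On the !DYNAMIC_FTRACE path above, mcount now also branches to ftrace_graph_caller whenever a graph-return or graph-entry hook is registered. What the trampoline pair arranges, modelled in C (a simplification: the real bookkeeping lives in the generic tracing code, and push_return_trace() below is a stand-in for it):

	extern void return_to_handler(void);	/* the trampoline above */

	/* called by ftrace_graph_caller with the return slot and call site */
	void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
	{
		unsigned long old = *parent;	/* the real return address */

		/* save (old, self_addr) on current->ret_stack; stand-in */
		if (push_return_trace(old, self_addr) == 0)
			/* divert the traced function's return */
			*parent = (unsigned long)&return_to_handler;
	}

When the traced function returns, return_to_handler runs instead: it calls ftrace_return_to_handler(), which pops the saved original address, then jumps back there (the movl %eax, 0xc(%esp) / ret sequence above).
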
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332..54e0bbd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -68,6 +68,8 @@
 END(mcount)
 
 ENTRY(ftrace_caller)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
 
 	/* taken from glibc */
 	subq $0x38, %rsp
@@ -96,6 +98,12 @@
 	movq (%rsp), %rax
 	addq $0x38, %rsp
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	jmp ftrace_stub
+#endif
+
 .globl ftrace_stub
 ftrace_stub:
 	retq
@@ -103,8 +111,20 @@
 
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 ENTRY(mcount)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	cmpq $ftrace_stub, ftrace_trace_function
 	jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	cmpq $ftrace_stub, ftrace_graph_return
+	jnz ftrace_graph_caller
+
+	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz ftrace_graph_caller
+#endif
+
 .globl ftrace_stub
 ftrace_stub:
 	retq
@@ -140,6 +160,69 @@
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+	cmpl $0, function_trace_stop
+	jne ftrace_stub
+
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	leaq 8(%rbp), %rdi
+	movq 0x38(%rsp), %rsi
+	subq $MCOUNT_INSN_SIZE, %rsi
+
+	call	prepare_ftrace_return
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+	retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+	subq  $80, %rsp
+
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+	movq %r10, 56(%rsp)
+	movq %r11, 64(%rsp)
+
+	call ftrace_return_to_handler
+
+	movq %rax, 72(%rsp)
+	movq 64(%rsp), %r11
+	movq 56(%rsp), %r10
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $72, %rsp
+	retq
+#endif
+
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif	
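Note the asymmetry in return_to_handler above: it reserves 80 bytes but releases only 72, because ftrace_return_to_handler()'s result is stored at 72(%rsp) and becomes the address that the final retq pops. This only works because prepare_ftrace_return() has earlier swapped the real return address for return_to_handler and parked the original on a per-task shadow stack. A minimal single-threaded userspace model of that shadow stack (FTRACE_RETFUNC_DEPTH and the field names come from this patch; everything else is a sketch):

#include <stdio.h>

#define FTRACE_RETFUNC_DEPTH 50

struct ret_entry {
	unsigned long ret;		/* original return address */
	unsigned long func;		/* traced function */
	unsigned long long calltime;
};

static struct ret_entry ret_stack[FTRACE_RETFUNC_DEPTH];
static int curr_ret_stack = -1;		/* -1 == empty */
static unsigned long trace_overrun;

/* Mirrors push_return_trace(): refuse to push when the stack is full,
 * in which case the caller leaves the real return address in place. */
static int push_return(unsigned long ret, unsigned long func,
		       unsigned long long time)
{
	if (curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
		trace_overrun++;
		return -1;
	}
	curr_ret_stack++;
	ret_stack[curr_ret_stack].ret = ret;
	ret_stack[curr_ret_stack].func = func;
	ret_stack[curr_ret_stack].calltime = time;
	return 0;
}

/* Mirrors pop_return_trace(): hand the original return address back. */
static unsigned long pop_return(void)
{
	return ret_stack[curr_ret_stack--].ret;
}

int main(void)
{
	push_return(0x1000, 0x2000, 42);
	printf("original ret: %#lx\n", pop_return());	/* 0x1000 */
	return 0;
}
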
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index 0aa2c44..53699c9 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,11 @@
 #include <asm/io.h>
 #include <asm/nmi.h>
 #include <asm/smp.h>
+#include <asm/atomic.h>
 #include <asm/apicdef.h>
 #include <mach_mpparse.h>
+#include <asm/genapic.h>
+#include <asm/setup.h>
 
 /*
  * ES7000 chipsets
@@ -161,6 +164,43 @@
 	return gsi;
 }
 
+static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+{
+	unsigned long vect = 0, psaival = 0;
+
+	if (psai == NULL)
+		return -1;
+
+	vect = ((unsigned long)__pa(eip)/0x1000) << 16;
+	psaival = (0x1000000 | vect | cpu);
+
+	while (*psai & 0x1000000)
+		;
+
+	*psai = psaival;
+
+	return 0;
+}
+
+static void noop_wait_for_deassert(atomic_t *deassert_not_used)
+{
+}
+
+static int __init es7000_update_genapic(void)
+{
+	genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+
+	/* MPENTIUMIII */
+	if (boot_cpu_data.x86 == 6 &&
+	    (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) {
+		es7000_update_genapic_to_cluster();
+		genapic->wait_for_init_deassert = noop_wait_for_deassert;
+		genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+	}
+
+	return 0;
+}
+
 void __init
 setup_unisys(void)
 {
@@ -176,6 +216,8 @@
 	else
 		es7000_plat = ES7000_CLASSIC;
 	ioapic_renumber_irq = es7000_rename_gsi;
+
+	x86_quirks->update_genapic = es7000_update_genapic;
 }
 
 /*
@@ -317,26 +359,6 @@
 	return status;
 }
 
-int
-es7000_start_cpu(int cpu, unsigned long eip)
-{
-	unsigned long vect = 0, psaival = 0;
-
-	if (psai == NULL)
-		return -1;
-
-	vect = ((unsigned long)__pa(eip)/0x1000) << 16;
-	psaival = (0x1000000 | vect | cpu);
-
-	while (*psai & 0x1000000)
-                ;
-
-	*psai = psaival;
-
-	return 0;
-
-}
-
 void __init
 es7000_sw_apic(void)
 {
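wakeup_secondary_cpu_via_mip() above is a mailbox handshake: bit 24 of *psai acts as a busy flag, so the caller spins until the previous command has been consumed and then posts (busy-bit | page-number-of-eip << 16 | cpu). A small userspace rendering of just the encoding and the wait (the constants are lifted from the function; the local variable standing in for the MIP mailbox is an assumption):

#include <stdio.h>

#define MIP_BUSY	0x1000000UL	/* bit 24: command still pending */

/* Encode a wakeup command exactly as wakeup_secondary_cpu_via_mip() does. */
static unsigned long mip_encode(unsigned long eip_phys, int cpu)
{
	unsigned long vect = (eip_phys / 0x1000) << 16;	/* 4K page number */

	return MIP_BUSY | vect | (unsigned long)cpu;
}

int main(void)
{
	volatile unsigned long psai = 0;	/* stands in for the MIP mailbox */

	while (psai & MIP_BUSY)
		;	/* the real code busy-waits on the previous command */
	psai = mip_encode(0x9f000, 1);

	printf("posted %#lx\n", psai);
	return 0;
}
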
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac..1b43086 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
 
 #include <asm/ftrace.h>
+#include <linux/ftrace.h>
 #include <asm/nops.h>
+#include <asm/nmi.h>
 
 
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#ifdef CONFIG_DYNAMIC_FTRACE
 
 union ftrace_code_union {
 	char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@
 	} __attribute__((packed));
 };
 
-
 static int ftrace_calc_offset(long ip, long addr)
 {
 	return (int)(addr - ip);
 }
 
-unsigned char *ftrace_nop_replace(void)
-{
-	return ftrace_nop;
-}
-
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
 	static union ftrace_code_union calc;
 
@@ -56,7 +53,142 @@
 	return calc.code;
 }
 
-int
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUs from executing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: an IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ *    and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code.
+ * 5) Clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is that it does not matter if more than one CPU writes
+ * the same content to the code location. Likewise, it is safe to write
+ * to a code location that a CPU is currently executing, as long as the
+ * contents being written match what is already there.
+ */
+
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status;		/* holds return value of text write */
+static int mod_code_write;		/* set when NMI should do the write */
+static void *mod_code_ip;		/* holds the IP to write to */
+static void *mod_code_newcode;		/* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+	int r;
+
+	r = snprintf(buf, size, "%u %u",
+		     nmi_wait_count,
+		     atomic_read(&nmi_update_count));
+	return r;
+}
+
+static void ftrace_mod_code(void)
+{
+	/*
+	 * Yes, more than one CPU can be writing to mod_code_status.
+	 *    (and the code itself)
+	 * But if one were to fail, then they all should, and if one were
+	 * to succeed, then they all should.
+	 */
+	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+					     MCOUNT_INSN_SIZE);
+}
+
+void ftrace_nmi_enter(void)
+{
+	atomic_inc(&in_nmi);
+	/* in_nmi must be visible before we read the write flag */
+	smp_mb();
+	if (mod_code_write) {
+		ftrace_mod_code();
+		atomic_inc(&nmi_update_count);
+	}
+}
+
+void ftrace_nmi_exit(void)
+{
+	/* Finish all executions before clearing in_nmi */
+	smp_wmb();
+	atomic_dec(&in_nmi);
+}
+
+static void wait_for_nmi(void)
+{
+	int waited = 0;
+
+	while (atomic_read(&in_nmi)) {
+		waited = 1;
+		cpu_relax();
+	}
+
+	if (waited)
+		nmi_wait_count++;
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+	mod_code_ip = (void *)ip;
+	mod_code_newcode = new_code;
+
+	/* The buffers need to be visible before we let NMIs write them */
+	smp_wmb();
+
+	mod_code_write = 1;
+
+	/* Make sure write bit is visible before we wait on NMIs */
+	smp_mb();
+
+	wait_for_nmi();
+
+	/* Make sure all running NMIs have finished before we write the code */
+	smp_mb();
+
+	ftrace_mod_code();
+
+	/* Make sure the write happens before clearing the bit */
+	smp_wmb();
+
+	mod_code_write = 0;
+
+	/* Make sure NMIs see the cleared bit */
+	smp_mb();
+
+	wait_for_nmi();
+
+	return mod_code_status;
+}
+
+
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+static unsigned char *ftrace_nop_replace(void)
+{
+	return ftrace_nop;
+}
+
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 {
@@ -81,7 +213,7 @@
 		return -EINVAL;
 
 	/* replace the text with the new text */
-	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+	if (do_ftrace_mod_code(ip, new_code))
 		return -EPERM;
 
 	sync_core();
@@ -89,6 +221,29 @@
 	return 0;
 }
 
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *new, *old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_call_replace(ip, addr);
+	new = ftrace_nop_replace();
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *new, *old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_nop_replace();
+	new = ftrace_call_replace(ip, addr);
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
 	unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@
 
 	return 0;
 }
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static int ftrace_mod_jmp(unsigned long ip,
+			  int old_offset, int new_offset)
+{
+	unsigned char code[MCOUNT_INSN_SIZE];
+
+	if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
+		return -EINVAL;
+
+	*(int *)(&code[1]) = new_offset;
+
+	if (do_ftrace_mod_code(ip, &code))
+		return -EPERM;
+
+	return 0;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	int old_offset, new_offset;
+
+	old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+	new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	int old_offset, new_offset;
+
+	old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+	new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * These functions are picked from those used in
+ * this file for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+	atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+	atomic_dec(&in_nmi);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/* Add a function return address to the trace stack on thread info. */
+static int push_return_trace(unsigned long ret, unsigned long long time,
+				unsigned long func, int *depth)
+{
+	int index;
+
+	if (!current->ret_stack)
+		return -EBUSY;
+
+	/* The return trace stack is full */
+	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+		atomic_inc(&current->trace_overrun);
+		return -EBUSY;
+	}
+
+	index = ++current->curr_ret_stack;
+	barrier();
+	current->ret_stack[index].ret = ret;
+	current->ret_stack[index].func = func;
+	current->ret_stack[index].calltime = time;
+	*depth = index;
+
+	return 0;
+}
+
+/* Retrieve a function return address from the trace stack on thread info. */
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+	int index;
+
+	index = current->curr_ret_stack;
+
+	if (unlikely(index < 0)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic, otherwise we have nowhere to go */
+		*ret = (unsigned long)panic;
+		return;
+	}
+
+	*ret = current->ret_stack[index].ret;
+	trace->func = current->ret_stack[index].func;
+	trace->calltime = current->ret_stack[index].calltime;
+	trace->overrun = atomic_read(&current->trace_overrun);
+	trace->depth = index;
+	barrier();
+	current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * Returns the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+	struct ftrace_graph_ret trace;
+	unsigned long ret;
+
+	pop_return_trace(&trace, &ret);
+	trace.rettime = cpu_clock(raw_smp_processor_id());
+	ftrace_graph_return(&trace);
+
+	if (unlikely(!ret)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic. What else to do? */
+		ret = (unsigned long)panic;
+	}
+
+	return ret;
+}
+
+/*
+ * Hook the return address and push it onto the stack of return
+ * addresses in the current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+	unsigned long old;
+	unsigned long long calltime;
+	int faulted;
+	struct ftrace_graph_ent trace;
+	unsigned long return_hooker = (unsigned long)
+				&return_to_handler;
+
+	/* NMIs are currently unsupported */
+	if (unlikely(atomic_read(&in_nmi)))
+		return;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	/*
+	 * Protect against a fault, even if it shouldn't
+	 * happen. This tracer is too intrusive to
+	 * forgo such protection.
+	 */
+	asm volatile(
+		"1: " _ASM_MOV " (%[parent_old]), %[old]\n"
+		"2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
+		"   movl $0, %[faulted]\n"
+
+		".section .fixup, \"ax\"\n"
+		"3: movl $1, %[faulted]\n"
+		".previous\n"
+
+		_ASM_EXTABLE(1b, 3b)
+		_ASM_EXTABLE(2b, 3b)
+
+		: [parent_replaced] "=r" (parent), [old] "=r" (old),
+		  [faulted] "=r" (faulted)
+		: [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+		: "memory"
+	);
+
+	if (unlikely(faulted)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		return;
+	}
+
+	if (unlikely(!__kernel_text_address(old))) {
+		ftrace_graph_stop();
+		*parent = old;
+		WARN_ON(1);
+		return;
+	}
+
+	calltime = cpu_clock(raw_smp_processor_id());
+
+	if (push_return_trace(old, calltime,
+				self_addr, &trace.depth) == -EBUSY) {
+		*parent = old;
+		return;
+	}
+
+	trace.func = self_addr;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		*parent = old;
+	}
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
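The core of the new ftrace.c is the NMI-safe write protocol spelled out in the long comment above: publish the buffers, raise the flag, wait out in-flight NMIs, write, clear the flag, wait again. A compressed userspace model using C11 atomics where the kernel uses smp_mb()/smp_wmb() (memcpy stands in for probe_kernel_write(), and an "NMI" here would just be another thread; this is a sketch of the ordering, not of the kernel API):

#include <stdatomic.h>
#include <string.h>

#define INSN_SIZE 5			/* MCOUNT_INSN_SIZE on x86 */

static atomic_int in_nmi;
static atomic_int mod_code_write;
static void *mod_code_ip;
static const void *mod_code_newcode;

static void mod_code(void)
{
	/* Safe even if several contexts do it: all write the same bytes. */
	memcpy(mod_code_ip, mod_code_newcode, INSN_SIZE);
}

/* ftrace_nmi_enter()'s job: help with the write if one is pending. */
static void nmi_enter(void)
{
	atomic_fetch_add(&in_nmi, 1);	/* seq_cst: orders against the flag */
	if (atomic_load(&mod_code_write))
		mod_code();
}

static void nmi_exit(void)
{
	atomic_fetch_sub(&in_nmi, 1);
}

static void wait_for_nmi(void)
{
	while (atomic_load(&in_nmi))
		;			/* cpu_relax() in the kernel */
}

/* do_ftrace_mod_code()'s shape, with atomics supplying the barriers. */
static void do_mod_code(void *ip, const void *newcode)
{
	mod_code_ip = ip;
	mod_code_newcode = newcode;	/* buffers visible before the flag... */

	atomic_store(&mod_code_write, 1);
	wait_for_nmi();			/* late NMIs now self-serve the write */
	mod_code();
	atomic_store(&mod_code_write, 0);
	wait_for_nmi();			/* nobody may still be writing on exit */
}

int main(void)
{
	static unsigned char text[INSN_SIZE] = { 0x90, 0x90, 0x90, 0x90, 0x90 };
	static const unsigned char call[INSN_SIZE] = { 0xe8, 0, 0, 0, 0 };

	do_mod_code(text, call);
	(void)nmi_enter; (void)nmi_exit;	/* exercised by an NMI thread */
	return 0;
}
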
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9..2bced78 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/setup.h>
 
 extern struct genapic apic_flat;
 extern struct genapic apic_physflat;
@@ -53,6 +54,9 @@
 			genapic = &apic_physflat;
 		printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
 	}
+
+	if (x86_quirks->update_genapic)
+		x86_quirks->update_genapic();
 }
 
 /* Same for both flat and physical. */
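The new update_genapic hook gives a subarchitecture one late chance to patch the chosen genapic (es7000 and NumaQ use it elsewhere in this series to install their own wakeup_cpu). The shape is just an optional callback in a quirks table, guarded at the call site exactly as above; a minimal sketch with illustrative names (none of these are the kernel's):

#include <stdio.h>

struct genapic_model {
	const char *name;
	int (*wakeup_cpu)(int cpu, unsigned long eip);
};

struct platform_quirks {
	int (*update_genapic)(struct genapic_model *g);	/* optional */
};

static int mip_wakeup(int cpu, unsigned long eip) { return 0; }

static int my_update_genapic(struct genapic_model *g)
{
	g->wakeup_cpu = mip_wakeup;	/* swap in the platform's method */
	return 0;
}

static struct platform_quirks quirks = {
	.update_genapic = my_update_genapic,
};

int main(void)
{
	struct genapic_model apic = { .name = "flat" };

	/* Same guard as setup_apic_routing(): the hook may be absent. */
	if (quirks.update_genapic)
		quirks.update_genapic(&apic);

	printf("%s: wakeup hook %s\n", apic.name,
	       apic.wakeup_cpu ? "installed" : "absent");
	return 0;
}
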
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1184210..d7f0993 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,55 +108,6 @@
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
-struct irq_cfg {
-	unsigned int irq;
-	struct irq_pin_list *irq_2_pin;
-	cpumask_t domain;
-	cpumask_t old_domain;
-	unsigned move_cleanup_count;
-	u8 vector;
-	u8 move_in_progress : 1;
-};
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
-};
-
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
-
-static struct irq_cfg *irq_cfg(unsigned int irq)
-{
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
-}
-
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
-{
-	return irq_cfg(irq);
-}
-
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
  * This is performance-critical, we want to do it O(1)
@@ -170,30 +121,118 @@
 	struct irq_pin_list *next;
 };
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
-
-static void __init irq_2_pin_init(void)
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_pin_list *pin;
+	int node;
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	node = cpu_to_node(cpu);
 
-	irq_2_pin_ptr = &pin[0];
+	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+struct irq_cfg {
+	struct irq_pin_list *irq_2_pin;
+	cpumask_t domain;
+	cpumask_t old_domain;
+	unsigned move_cleanup_count;
+	u8 vector;
+	u8 move_in_progress : 1;
+};
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
+static struct irq_cfg irq_cfgx[NR_IRQS] = {
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+};
+
+void __init arch_early_irq_init(void)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
+
+	return cfg;
+}
+
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+{
+	struct irq_cfg *cfg;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+	return cfg;
+}
+
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
+
+	cfg = desc->chip_data;
+	if (!cfg) {
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+		if (!desc->chip_data) {
+			printk(KERN_ERR "can not alloc irq_cfg\n");
+			BUG_ON(1);
+		}
+	}
+}
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
 }
 
 struct io_apic {
@@ -237,11 +276,10 @@
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	entry = cfg->irq_2_pin;
@@ -323,13 +361,12 @@
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
 	int apic, pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
 
-	cfg = irq_cfg(irq);
 	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
@@ -359,24 +396,27 @@
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
-static void set_ioapic_affinity_irq(unsigned int irq,
-				    const struct cpumask *mask)
+static void set_ioapic_affinity_irq_desc(struct irq_desc *desc,
+					 const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, *mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, *mask))
 		return;
 
+	set_extra_move_desc(desc, *mask);
+
 	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -384,12 +424,21 @@
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__target_IO_APIC_irq(irq, dest, cfg->vector);
+	__target_IO_APIC_irq(irq, dest, cfg);
 	cpumask_copy(&desc->affinity, mask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
+
+static void set_ioapic_affinity_irq(unsigned int irq,
+				    const struct cpumask *mask)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	set_ioapic_affinity_irq_desc(desc, mask);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -397,16 +446,18 @@
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+					apic, pin);
+			return;
+		}
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,7 +472,7 @@
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
@@ -430,11 +481,10 @@
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -451,18 +501,16 @@
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
 				int mask_and, int mask_or,
 				void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	cfg = irq_cfg(irq);
 	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
 		unsigned int reg;
 		pin = entry->pin;
@@ -475,9 +523,9 @@
 	}
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -492,47 +540,64 @@
 	readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
 			IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned long flags;
 
+	BUG_ON(!cfg);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
+	__mask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_IO_APIC_irq_desc(desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_IO_APIC_irq_desc(desc);
+}
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -809,7 +874,7 @@
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1034,7 +1099,7 @@
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1115,13 @@
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
-
-	cfg = irq_cfg(irq);
-
-	/* Only try and allocate irqs on cpus that are present */
-	cpus_and(mask, mask, cpu_online_map);
 
 	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
 		return -EBUSY;
 
+	/* Only try and allocate irqs on cpus that are present */
+	cpus_and(mask, mask, cpu_online_map);
+
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
@@ -1113,24 +1175,22 @@
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1162,9 +1222,13 @@
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		if (!desc)
+			continue;
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
@@ -1215,11 +1279,8 @@
 }
 #endif
 
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1311,7 +1372,7 @@
 	return 0;
 }
 
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
 			      int trigger, int polarity)
 {
 	struct irq_cfg *cfg;
@@ -1321,10 +1382,10 @@
 	if (!IO_APIC_IRQ(irq))
 		return;
 
-	cfg = irq_cfg(irq);
+	cfg = desc->chip_data;
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1341,12 +1402,12 @@
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
-	ioapic_register_intr(irq, trigger);
-	if (irq < 16)
+	ioapic_register_intr(irq, desc, trigger);
+	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic, pin, entry);
@@ -1356,6 +1417,9 @@
 {
 	int apic, pin, idx, irq;
 	int notcon = 0;
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = boot_cpu_id;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1387,9 +1451,15 @@
 			if (multi_timer_check(apic, irq))
 				continue;
 #endif
-			add_pin_to_irq(irq, apic, pin);
+			desc = irq_to_desc_alloc_cpu(irq, cpu);
+			if (!desc) {
+				printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+				continue;
+			}
+			cfg = desc->chip_data;
+			add_pin_to_irq_cpu(cfg, cpu, apic, pin);
 
-			setup_IO_APIC_irq(apic, pin, irq,
+			setup_IO_APIC_irq(apic, pin, irq, desc,
 					irq_trigger(idx), irq_polarity(idx));
 		}
 	}
@@ -1448,6 +1518,7 @@
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1608,13 @@
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+
+		if (!desc)
+			continue;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,14 +2098,16 @@
 {
 	int was_pending = 0;
 	unsigned long flags;
+	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
+	if (irq < NR_IRQS_LEGACY) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_IO_APIC_irq(irq);
+	cfg = irq_cfg(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
@@ -2092,35 +2170,37 @@
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
 	int modify_ioapic_rte;
 	unsigned int dest;
 	unsigned long flags;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
+	irq = desc->irq;
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	desc = irq_to_desc(irq);
 	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		__target_IO_APIC_irq(irq, dest, cfg);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -2142,14 +2222,14 @@
 	desc->affinity = mask;
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 {
 	int ret = -1;
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 
-	mask_IO_APIC_irq(irq);
+	mask_IO_APIC_irq_desc(desc);
 
-	if (io_apic_level_ack_pending(irq)) {
+	if (io_apic_level_ack_pending(cfg)) {
 		/*
 		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
@@ -2161,14 +2241,15 @@
 	}
 
 	/* everything is clear. we have right of way */
-	migrate_ioapic_irq(irq, desc->pending_mask);
+	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
 	cpus_clear(desc->pending_mask);
 
 unmask:
-	unmask_IO_APIC_irq(irq);
+	unmask_IO_APIC_irq_desc(desc);
+
 	return ret;
 }
 
@@ -2178,6 +2259,9 @@
 	struct irq_desc *desc;
 
 	for_each_irq_desc(irq, desc) {
+		if (!desc)
+			continue;
+
 		if (desc->status & IRQ_MOVE_PENDING) {
 			unsigned long flags;
 
@@ -2198,19 +2282,24 @@
 /*
  * Migrates the IRQ destination in the process context.
  */
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+					    const struct cpumask *mask)
+{
+	if (desc->status & IRQ_LEVEL) {
+		desc->status |= IRQ_MOVE_PENDING;
+		cpumask_copy(&desc->pending_mask, mask);
+		migrate_irq_remapped_level_desc(desc);
+		return;
+	}
+
+	migrate_ioapic_irq_desc(desc, mask);
+}
 static void set_ir_ioapic_affinity_irq(unsigned int irq,
 				       const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (desc->status & IRQ_LEVEL) {
-		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(&desc->pending_mask, mask);
-		migrate_irq_remapped_level(irq);
-		return;
-	}
-
-	migrate_ioapic_irq(irq, *mask);
+	set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif
 
@@ -2230,6 +2319,9 @@
 		struct irq_cfg *cfg;
 		irq = __get_cpu_var(vector_irq)[vector];
 
+		if (irq == -1)
+			continue;
+
 		desc = irq_to_desc(irq);
 		if (!desc)
 			continue;
@@ -2251,9 +2343,10 @@
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
 	if (likely(!cfg->move_in_progress))
@@ -2271,8 +2364,9 @@
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
+
 #ifdef CONFIG_INTR_REMAP
 static void ack_x2apic_level(unsigned int irq)
 {
@@ -2283,11 +2377,14 @@
 {
 	ack_x2APIC_irq();
 }
+
 #endif
 
 static void ack_apic_edge(unsigned int irq)
 {
-	irq_complete_move(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	irq_complete_move(&desc);
 	move_native_irq(irq);
 	ack_APIC_irq();
 }
@@ -2296,18 +2393,21 @@
 
 static void ack_apic_level(unsigned int irq)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
+
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
 #endif
+	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(&desc);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
-		mask_IO_APIC_irq(irq);
+		mask_IO_APIC_irq_desc(desc);
 	}
 #endif
 
@@ -2331,7 +2431,8 @@
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = desc->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2370,17 +2471,18 @@
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(irq))
+		cfg = desc->chip_data;
+		if (!io_apic_level_ack_pending(cfg))
 			move_masked_irq(irq);
-		unmask_IO_APIC_irq(irq);
+		unmask_IO_APIC_irq_desc(desc);
 	}
 
 #ifdef CONFIG_X86_32
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
 #endif
@@ -2431,20 +2533,22 @@
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		if (!desc)
+			continue;
+
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
 			 * interrupt if we can..
 			 */
-			if (irq < 16)
+			if (irq < NR_IRQS_LEGACY)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
 	}
 }
@@ -2469,7 +2573,7 @@
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
 {
 	ack_APIC_irq();
 }
@@ -2481,11 +2585,8 @@
 	.ack		= ack_lapic_irq,
 };
 
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 	desc->status &= ~IRQ_LEVEL;
 	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
@@ -2589,7 +2690,9 @@
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_desc *desc = irq_to_desc(0);
+	struct irq_cfg *cfg = desc->chip_data;
+	int cpu = boot_cpu_id;
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2604,7 +2707,7 @@
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2655,10 +2758,10 @@
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(desc);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
@@ -2684,9 +2787,9 @@
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(desc);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2718,7 +2821,7 @@
 	apic_printk(APIC_QUIET, KERN_INFO
 		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
-	lapic_register_intr(0);
+	lapic_register_intr(0, desc);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
@@ -2903,22 +3006,26 @@
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
-
-	irq_want = nr_irqs - 1;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu = boot_cpu_id;
+	struct irq_desc *desc_new = NULL;
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
-	for (new = irq_want; new > 0; new--) {
+	for (new = irq_want; new < NR_IRQS; new++) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		if (!desc_new) {
+			printk(KERN_INFO "can not get irq_desc for %d\n", new);
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		}
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
+			continue;
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -2926,15 +3033,21 @@
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init cleared it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
 
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
 int create_irq(void)
 {
+	unsigned int irq_want;
 	int irq;
 
-	irq = create_irq_nr(nr_irqs - 1);
+	irq_want = nr_irqs_gsi;
+	irq = create_irq_nr(irq_want);
 
 	if (irq == 0)
 		irq = -1;
@@ -2945,14 +3058,22 @@
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup clears it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2967,12 +3088,12 @@
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3030,34 +3151,34 @@
 #ifdef CONFIG_SMP
 static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, *mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, *mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, *mask);
+
 	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	read_msi_msg(irq, &msg);
+	read_msi_msg_desc(desc, &msg);
 
 	msg.data &= ~MSI_DATA_VECTOR_MASK;
 	msg.data |= MSI_DATA_VECTOR(cfg->vector);
 	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
-	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
+	write_msi_msg_desc(desc, &msg);
 	cpumask_copy(&desc->affinity, mask);
 }
-
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
@@ -3066,11 +3187,11 @@
 static void ir_set_msi_irq_affinity(unsigned int irq,
 				    const struct cpumask *mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
-	struct irq_desc *desc;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
@@ -3078,10 +3199,12 @@
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, *mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, *mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3105,9 +3228,9 @@
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	cpumask_copy(&desc->affinity, mask);
 }
+
 #endif
 #endif /* CONFIG_SMP */
 
@@ -3166,7 +3289,7 @@
 }
 #endif
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
 	int ret;
 	struct msi_msg msg;
@@ -3175,7 +3298,7 @@
 	if (ret < 0)
 		return ret;
 
-	set_irq_msi(irq, desc);
+	set_irq_msi(irq, msidesc);
 	write_msi_msg(irq, &msg);
 
 #ifdef CONFIG_INTR_REMAP
@@ -3195,26 +3318,13 @@
 	return 0;
 }
 
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
-{
-	unsigned int irq;
-
-	irq = dev->bus->number;
-	irq <<= 8;
-	irq |= dev->devfn;
-	irq <<= 12;
-
-	return irq;
-}
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
 {
 	unsigned int irq;
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
-
+	irq_want = nr_irqs_gsi;
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
 		return -1;
@@ -3228,7 +3338,7 @@
 		goto error;
 no_ir:
 #endif
-	ret = setup_msi_irq(dev, desc, irq);
+	ret = setup_msi_irq(dev, msidesc, irq);
 	if (ret < 0) {
 		destroy_irq(irq);
 		return ret;
@@ -3246,7 +3356,7 @@
 {
 	unsigned int irq;
 	int ret, sub_handle;
-	struct msi_desc *desc;
+	struct msi_desc *msidesc;
 	unsigned int irq_want;
 
 #ifdef CONFIG_INTR_REMAP
@@ -3254,10 +3364,11 @@
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
-	list_for_each_entry(desc, &dev->msi_list, list) {
-		irq = create_irq_nr(irq_want--);
+	list_for_each_entry(msidesc, &dev->msi_list, list) {
+		irq = create_irq_nr(irq_want);
+		irq_want++;
 		if (irq == 0)
 			return -1;
 #ifdef CONFIG_INTR_REMAP
@@ -3289,7 +3400,7 @@
 		}
 no_ir:
 #endif
-		ret = setup_msi_irq(dev, desc, irq);
+		ret = setup_msi_irq(dev, msidesc, irq);
 		if (ret < 0)
 			goto error;
 		sub_handle++;
@@ -3310,19 +3421,21 @@
 #ifdef CONFIG_SMP
 static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, *mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, *mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, *mask);
+
 	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3334,9 +3447,9 @@
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	cpumask_copy(&desc->affinity, mask);
 }
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip dmar_msi_type = {
@@ -3370,8 +3483,8 @@
 #ifdef CONFIG_SMP
 static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
@@ -3379,10 +3492,12 @@
 	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, *mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, *mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, *mask);
+
 	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3394,9 +3509,9 @@
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	cpumask_copy(&desc->affinity, mask);
 }
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip hpet_msi_type = {
@@ -3451,25 +3566,27 @@
 
 static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, *mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, *mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, *mask);
+
 	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	cpumask_copy(&desc->affinity, mask);
 }
+
 #endif
 
 static struct irq_chip ht_irq_chip = {
@@ -3489,13 +3606,13 @@
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3541,7 +3658,9 @@
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3550,8 +3669,6 @@
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3603,9 +3720,16 @@
 	return reg_01.bits.entries;
 }
 
-int __init probe_nr_irqs(void)
+void __init probe_nr_irqs_gsi(void)
 {
-	return NR_IRQS;
+	int idx;
+	int nr = 0;
+
+	for (idx = 0; idx < nr_ioapics; idx++)
+		nr += io_apic_get_redir_entries(idx) + 1;
+
+	if (nr > nr_irqs_gsi)
+		nr_irqs_gsi = nr;
 }
 
 /* --------------------------------------------------------------------------
@@ -3704,19 +3828,31 @@
 
 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
 {
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = boot_cpu_id;
+
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
 			ioapic);
 		return -EINVAL;
 	}
 
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	if (!desc) {
+		printk(KERN_INFO "can not get irq_desc %d\n", irq);
+		return 0;
+	}
+
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
+	if (irq >= NR_IRQS_LEGACY) {
+		cfg = desc->chip_data;
+		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+	}
 
-	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
 
 	return 0;
 }
@@ -3770,9 +3906,10 @@
 			 * when you have too many devices, because at that time only boot
 			 * cpu is online.
 			 */
-			cfg = irq_cfg(irq);
+			desc = irq_to_desc(irq);
+			cfg = desc->chip_data;
 			if (!cfg->vector) {
-				setup_IO_APIC_irq(ioapic, pin, irq,
+				setup_IO_APIC_irq(ioapic, pin, irq, desc,
 						  irq_trigger(irq_entry),
 						  irq_polarity(irq_entry));
 				continue;
@@ -3782,7 +3919,6 @@
 			/*
 			 * Honour affinities which have been set in early boot
 			 */
-			desc = irq_to_desc(irq);
 			if (desc->status &
 			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
 				mask = desc->affinity;
@@ -3791,10 +3927,10 @@
 
 #ifdef CONFIG_INTR_REMAP
 			if (intr_remapping_enabled)
-				set_ir_ioapic_affinity_irq(irq, &mask);
+				set_ir_ioapic_affinity_irq_desc(desc, &mask);
 			else
 #endif
-				set_ioapic_affinity_irq(irq, &mask);
+				set_ioapic_affinity_irq_desc(desc, &mask);
 		}
 
 	}
@@ -3843,7 +3979,6 @@
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
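The thread running through the io_apic.c changes is the retirement of the fixed irq_cfgx[NR_IRQS] table: each irq_desc now carries its own struct irq_cfg in chip_data, allocated on first use (and, via kzalloc_node(), on the node of the CPU asking for it). Stripped of locking and NUMA placement, the lookup/allocate shape is roughly this (desc_table and MAX_DESC are stand-ins for the real sparse-IRQ machinery):

#include <stdio.h>
#include <stdlib.h>

struct irq_cfg { unsigned char vector; };

struct irq_desc {
	int irq;
	void *chip_data;		/* this IRQ's struct irq_cfg */
};

#define MAX_DESC 64
static struct irq_desc *desc_table[MAX_DESC];	/* sparse: NULL until used */

/* Like irq_to_desc_alloc_cpu(): allocate descriptor and cfg lazily. */
static struct irq_desc *desc_alloc(int irq)
{
	struct irq_desc *desc;

	if (irq < 0 || irq >= MAX_DESC)
		return NULL;
	if (desc_table[irq])
		return desc_table[irq];

	desc = calloc(1, sizeof(*desc));
	if (!desc)
		return NULL;
	desc->irq = irq;
	desc->chip_data = calloc(1, sizeof(struct irq_cfg));
	desc_table[irq] = desc;
	return desc;
}

/* Like the reworked irq_cfg(): derive the cfg from the descriptor. */
static struct irq_cfg *irq_cfg(int irq)
{
	return desc_table[irq] ? desc_table[irq]->chip_data : NULL;
}

int main(void)
{
	desc_alloc(40);
	printf("irq 40 cfg %p, irq 41 cfg %p\n",
	       (void *)irq_cfg(40), (void *)irq_cfg(41));	/* 41: (nil) */
	return 0;
}
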
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc5..3f1d9d1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -118,6 +118,9 @@
 	}
 
 	desc = irq_to_desc(i);
+	if (!desc)
+		return 0;
+
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 87870a4..9cf9cbb 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -242,6 +242,8 @@
 	for_each_irq_desc(irq, desc) {
 		cpumask_t mask;
 
+		if (!desc)
+			continue;
 		if (irq == 2)
 			continue;
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 7d37f84..27f2307 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -94,6 +94,8 @@
 		int break_affinity = 0;
 		int set_affinity = 1;
 
+		if (!desc)
+			continue;
 		if (irq == 2)
 			continue;
 
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa98..6a92f47 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@
 	/*
 	 * 16 old-style INTA-cycle interrupts:
 	 */
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff02353..40c1e62 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0f4c1fd..45e3b69 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -586,26 +586,23 @@
 {
 	struct intel_mp_floating *mpf = mpf_found;
 
+	if (!mpf)
+		return;
+
+	if (acpi_lapic && early)
+		return;
+
+	/*
+	 * MPS doesn't support hyperthreading: only the apic id of
+	 * thread 0 appears in the MPS table
+	 */
+	if (acpi_lapic && acpi_ioapic)
+		return;
+
 	if (x86_quirks->mach_get_smp_config) {
 		if (x86_quirks->mach_get_smp_config(early))
 			return;
 	}
-	if (acpi_lapic && early)
-		return;
-	/*
-	 * ACPI supports both logical (e.g. Hyper-Threading) and physical
-	 * processors, where MPS only supports physical.
-	 */
-	if (acpi_lapic && acpi_ioapic) {
-		printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
-		       "information\n");
-		return;
-	} else if (acpi_lapic)
-		printk(KERN_INFO "Using ACPI for processor (LAPIC) "
-		       "configuration information\n");
-
-	if (!mpf)
-		return;
 
 	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
 	       mpf->mpf_specification);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39..0deea37 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,7 +31,7 @@
 #include <asm/numaq.h>
 #include <asm/topology.h>
 #include <asm/processor.h>
-#include <asm/mpspec.h>
+#include <asm/genapic.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
 
@@ -235,6 +235,13 @@
 	return 1;
 }
 
+static int __init numaq_update_genapic(void)
+{
+	genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
+
+	return 0;
+}
+
 static struct x86_quirks numaq_x86_quirks __initdata = {
 	.arch_pre_time_init	= numaq_pre_time_init,
 	.arch_time_init		= NULL,
@@ -250,6 +257,7 @@
 	.mpc_oem_pci_bus	= mpc_oem_pci_bus,
 	.smp_read_mpc_oem	= smp_read_mpc_oem,
 	.setup_ioapic_ids	= numaq_setup_ioapic_ids,
+	.update_genapic		= numaq_update_genapic,
 };
 
 void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772..95d811a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,7 +7,9 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/ftrace.h>
 #include <asm/system.h>
+#include <asm/apic.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -100,6 +102,9 @@
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
+		struct power_trace it;
+
+		trace_power_start(&it, POWER_CSTATE, 1);
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +117,7 @@
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
+		trace_power_end(&it);
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -122,6 +128,21 @@
 EXPORT_SYMBOL(default_idle);
 #endif
 
+void stop_this_cpu(void *dummy)
+{
+	local_irq_disable();
+	/*
+	 * Remove this CPU:
+	 */
+	cpu_clear(smp_processor_id(), cpu_online_map);
+	disable_local_APIC();
+
+	for (;;) {
+		if (hlt_works(smp_processor_id()))
+			halt();
+	}
+}
+
 static void do_nothing(void *unused)
 {
 }
@@ -154,24 +175,31 @@
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
 	if (!need_resched()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
+	trace_power_end(&it);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
+	struct power_trace it;
 	if (!need_resched()) {
+		trace_power_start(&it, POWER_CSTATE, 1);
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
+		trace_power_end(&it);
 	} else
 		local_irq_enable();
 }
@@ -183,9 +211,13 @@
  */
 static void poll_idle(void)
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, 0);
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
+	trace_power_end(&it);
 }
 
 /*
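The trace_power_start()/trace_power_end() pairs added to the idle routines bracket the traced region with a stack-local struct power_trace that carries the start state across it. A userspace analogue of the same bracket (clock_gettime() stands in for the tracer's clock; the struct only echoes the patch's shape):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

struct power_trace {
	struct timespec start;
	int type, state;
};

static void trace_power_start(struct power_trace *it, int type, int state)
{
	it->type = type;
	it->state = state;
	clock_gettime(CLOCK_MONOTONIC, &it->start);
}

static void trace_power_end(struct power_trace *it)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	printf("power event type=%d state=%d lasted %ld us\n",
	       it->type, it->state,
	       (long)(now.tv_sec - it->start.tv_sec) * 1000000L +
	       (now.tv_nsec - it->start.tv_nsec) / 1000L);
}

int main(void)
{
	struct power_trace it;

	trace_power_start(&it, 1 /* POWER_CSTATE */, 1);
	usleep(1000);		/* stands in for hlt/mwait in the idle loop */
	trace_power_end(&it);
	return 0;
}
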
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302f..24c2276 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -548,7 +549,8 @@
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * for example.
  */
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120f..fbb321d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/ftrace.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -551,8 +552,9 @@
  * - could test fs/gs bitsliced
  *
  * Kprobes not supported here. Set the probe on schedule instead.
+ * The function graph tracer is not supported here either.
  */
-struct task_struct *
+__notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c1..2c8ec1b 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -668,14 +668,14 @@
 	size_t bts_index, bts_end;
 	int error;
 
-	error = ds_get_bts_end(child, &bts_end);
+	error = ds_get_bts_end(child->bts, &bts_end);
 	if (error < 0)
 		return error;
 
 	if (bts_end <= index)
 		return -EINVAL;
 
-	error = ds_get_bts_index(child, &bts_index);
+	error = ds_get_bts_index(child->bts, &bts_index);
 	if (error < 0)
 		return error;
 
@@ -684,7 +684,7 @@
 	if (bts_end <= bts_index)
 		bts_index -= bts_end;
 
-	error = ds_access_bts(child, bts_index, &bts_record);
+	error = ds_access_bts(child->bts, bts_index, &bts_record);
 	if (error < 0)
 		return error;
 
@@ -705,14 +705,14 @@
 	size_t end, i;
 	int error;
 
-	error = ds_get_bts_index(child, &end);
+	error = ds_get_bts_index(child->bts, &end);
 	if (error < 0)
 		return error;
 
 	if (size < (end * sizeof(struct bts_struct)))
 		return -EIO;
 
-	error = ds_access_bts(child, 0, (const void **)&raw);
+	error = ds_access_bts(child->bts, 0, (const void **)&raw);
 	if (error < 0)
 		return error;
 
@@ -723,18 +723,13 @@
 			return -EFAULT;
 	}
 
-	error = ds_clear_bts(child);
+	error = ds_clear_bts(child->bts);
 	if (error < 0)
 		return error;
 
 	return end;
 }
 
-static void ptrace_bts_ovfl(struct task_struct *child)
-{
-	send_sig(child->thread.bts_ovfl_signal, child, 0);
-}
-
 static int ptrace_bts_config(struct task_struct *child,
 			     long cfg_size,
 			     const struct ptrace_bts_config __user *ucfg)
@@ -760,24 +755,46 @@
 		goto errout;
 
 	if (cfg.flags & PTRACE_BTS_O_ALLOC) {
-		ds_ovfl_callback_t ovfl = NULL;
+		bts_ovfl_callback_t ovfl = NULL;
 		unsigned int sig = 0;
 
-		/* we ignore the error in case we were not tracing child */
-		(void)ds_release_bts(child);
+		error = -EINVAL;
+		if (cfg.size < (10 * bts_cfg.sizeof_bts))
+			goto errout;
 
 		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
 			if (!cfg.signal)
 				goto errout;
 
+			error = -EOPNOTSUPP;
+			goto errout;
+
 			sig  = cfg.signal;
-			ovfl = ptrace_bts_ovfl;
 		}
 
-		error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
-		if (error < 0)
+		if (child->bts) {
+			(void)ds_release_bts(child->bts);
+			kfree(child->bts_buffer);
+
+			child->bts = NULL;
+			child->bts_buffer = NULL;
+		}
+
+		error = -ENOMEM;
+		child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
+		if (!child->bts_buffer)
 			goto errout;
 
+		child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
+					    ovfl, /* th = */ (size_t)-1);
+		if (IS_ERR(child->bts)) {
+			error = PTR_ERR(child->bts);
+			kfree(child->bts_buffer);
+			child->bts = NULL;
+			child->bts_buffer = NULL;
+			goto errout;
+		}
+
 		child->thread.bts_ovfl_signal = sig;
 	}
 
@@ -823,15 +840,15 @@
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 
-	error = ds_get_bts_end(child, &end);
+	error = ds_get_bts_end(child->bts, &end);
 	if (error < 0)
 		return error;
 
-	error = ds_access_bts(child, /* index = */ 0, &base);
+	error = ds_access_bts(child->bts, /* index = */ 0, &base);
 	if (error < 0)
 		return error;
 
-	error = ds_access_bts(child, /* index = */ end, &max);
+	error = ds_access_bts(child->bts, /* index = */ end, &max);
 	if (error < 0)
 		return error;
 
@@ -884,10 +901,7 @@
 		return -EINVAL;
 	}
 
-	/* The writing task will be the switched-to task on a context
-	 * switch. It needs to write into the switched-from task's BTS
-	 * buffer. */
-	return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+	return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
 }
 
 void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -929,17 +943,16 @@
 	switch (c->x86) {
 	case 0x6:
 		switch (c->x86_model) {
+		case 0 ... 0xC:
+			/* sorry, don't know about them */
+			break;
 		case 0xD:
 		case 0xE: /* Pentium M */
 			bts_configure(&bts_cfg_pentium_m);
 			break;
-		case 0xF: /* Core2 */
-        case 0x1C: /* Atom */
+		default: /* Core2, Atom, ... */
 			bts_configure(&bts_cfg_core2);
 			break;
-		default:
-			/* sorry, don't know about them */
-			break;
 		}
 		break;
 	case 0xF:
@@ -973,13 +986,17 @@
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 #ifdef CONFIG_X86_PTRACE_BTS
-	(void)ds_release_bts(child);
+	if (child->bts) {
+		(void)ds_release_bts(child->bts);
+		kfree(child->bts_buffer);
+		child->bts_buffer = NULL;
 
-	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-	if (!child->thread.debugctlmsr)
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+		if (!child->thread.debugctlmsr)
+			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
 
-	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+	}
 #endif /* CONFIG_X86_PTRACE_BTS */
 }
 
@@ -1111,9 +1128,16 @@
 			(child, data, (struct ptrace_bts_config __user *)addr);
 		break;
 
-	case PTRACE_BTS_SIZE:
-		ret = ds_get_bts_index(child, /* pos = */ NULL);
+	case PTRACE_BTS_SIZE: {
+		size_t size;
+
+		ret = ds_get_bts_index(child->bts, &size);
+		if (ret == 0) {
+			BUG_ON(size != (int) size);
+			ret = (int) size;
+		}
 		break;
+	}
 
 	case PTRACE_BTS_GET:
 		ret = ptrace_bts_read_record
@@ -1121,7 +1145,7 @@
 		break;
 
 	case PTRACE_BTS_CLEAR:
-		ret = ds_clear_bts(child);
+		ret = ds_clear_bts(child->bts);
 		break;
 
 	case PTRACE_BTS_DRAIN:
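ds_request_bts() now hands back a tracer handle instead of a plain error
code, so callers move from negative-errno checks to the kernel's ERR_PTR
convention, as ptrace_bts_config() does above. The bare pattern, with the
handle type assumed from the new child->bts usage:

	struct bts_tracer *tracer;

	tracer = ds_request_bts(task, buffer, size,
				/* ovfl = */ NULL, /* th = */ (size_t)-1);
	if (IS_ERR(tracer))
		return PTR_ERR(tracer);	/* decode the embedded errno */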
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index cc5a254..0e3dbc7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,10 @@
 static int reboot_cpu = -1;
 #endif
 
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old]
+/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
+bool port_cf9_safe = false;
+
+/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
    warm   Don't set the cold reboot flag
    cold   Set the cold reboot flag
    bios   Reboot by jumping through the BIOS (only for X86_32)
@@ -45,6 +48,7 @@
    kbd    Use the keyboard controller. cold reset (default)
    acpi   Use the RESET_REG in the FADT
    efi    Use efi reset_system runtime service
+   pci    Use the so-called "PCI reset register", CF9
    force  Avoid anything that could hang.
  */
 static int __init reboot_setup(char *str)
@@ -79,6 +83,7 @@
 		case 'k':
 		case 't':
 		case 'e':
+		case 'p':
 			reboot_type = *str;
 			break;
 
@@ -404,12 +409,27 @@
 			reboot_type = BOOT_KBD;
 			break;
 
-
 		case BOOT_EFI:
 			if (efi_enabled)
-				efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
+				efi.reset_system(reboot_mode ?
+						 EFI_RESET_WARM :
+						 EFI_RESET_COLD,
 						 EFI_SUCCESS, 0, NULL);
+			reboot_type = BOOT_KBD;
+			break;
 
+		case BOOT_CF9:
+			port_cf9_safe = true;
+			/* fall through */
+
+		case BOOT_CF9_COND:
+			if (port_cf9_safe) {
+				u8 cf9 = inb(0xcf9) & ~6;
+				outb(cf9|2, 0xcf9); /* Request hard reset */
+				udelay(50);
+				outb(cf9|6, 0xcf9); /* Actually do the reset */
+				udelay(50);
+			}
 			reboot_type = BOOT_KBD;
 			break;
 		}
@@ -470,6 +490,11 @@
 
 static void native_machine_halt(void)
 {
+	/* stop other cpus and apics */
+	machine_shutdown();
+
+	/* stop this cpu */
+	stop_this_cpu(NULL);
 }
 
 static void native_machine_power_off(void)
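The CF9 method drives the "PCI reset register" at I/O port 0xcf9: bit 1
selects a hard (full) reset and bit 2 triggers it, so the register is
first armed and then fired. Standalone, the same sequence reads:

	u8 cf9 = inb(0xcf9) & ~6;	/* preserve other bits, clear 1 and 2 */

	outb(cf9 | 2, 0xcf9);		/* request a hard reset */
	udelay(50);
	outb(cf9 | 6, 0xcf9);		/* perform the reset */
	udelay(50);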
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9d5674f..b901895 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -583,7 +583,20 @@
 early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 
-static struct x86_quirks default_x86_quirks __initdata;
+static int __init default_update_genapic(void)
+{
+#ifdef CONFIG_X86_SMP
+# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
+	genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
+# endif
+#endif
+
+	return 0;
+}
+
+static struct x86_quirks default_x86_quirks __initdata = {
+	.update_genapic         = default_update_genapic,
+};
 
 struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
 
@@ -1082,7 +1095,7 @@
 	ioapic_init_mappings();
 
 	/* need to wait for io_apic is mapped */
-	nr_irqs = probe_nr_irqs();
+	probe_nr_irqs_gsi();
 
 	kvm_guest_init();
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19..3f92b13 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -140,19 +140,6 @@
 		send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
 }
 
-static void stop_this_cpu(void *dummy)
-{
-	local_irq_disable();
-	/*
-	 * Remove this CPU:
-	 */
-	cpu_clear(smp_processor_id(), cpu_online_map);
-	disable_local_APIC();
-	if (hlt_works(smp_processor_id()))
-		for (;;) halt();
-	for (;;);
-}
-
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 468c2f9..9d58134 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
 #include <asm/mtrr.h>
 #include <asm/vmi.h>
 #include <asm/genapic.h>
+#include <asm/setup.h>
 #include <linux/mc146818rtc.h>
 
 #include <mach_apic.h>
@@ -530,7 +531,7 @@
 	pr_debug("Before bogocount - setting activated=1.\n");
 }
 
-static inline void __inquire_remote_apic(int apicid)
+void __inquire_remote_apic(int apicid)
 {
 	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
 	char *names[] = { "ID", "VERSION", "SPIV" };
@@ -569,14 +570,13 @@
 	}
 }
 
-#ifdef WAKE_SECONDARY_VIA_NMI
 /*
  * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  * won't ... remember to clear down the APIC, etc later.
  */
-static int __devinit
-wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
 	int maxlvt;
@@ -593,7 +593,7 @@
 	 * Give the other CPU some time to accept the IPI.
 	 */
 	udelay(200);
-	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
 		maxlvt = lapic_get_maxlvt();
 		if (maxlvt > 3)			/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
@@ -608,11 +608,9 @@
 
 	return (send_status | accept_status);
 }
-#endif	/* WAKE_SECONDARY_VIA_NMI */
 
-#ifdef WAKE_SECONDARY_VIA_INIT
-static int __devinit
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
 	int maxlvt, num_starts, j;
@@ -731,7 +729,6 @@
 
 	return (send_status | accept_status);
 }
-#endif	/* WAKE_SECONDARY_VIA_INIT */
 
 struct create_idle {
 	struct work_struct work;
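With the WAKE_SECONDARY_VIA_* #ifdef blocks gone, both wakeup routines are
always built, and the choice between them is presumably made at run time
through the genapic->wakeup_cpu pointer that the update_genapic quirks set.
A hedged sketch of the dispatch in the CPU bring-up path:

	/* assumed call site; only the function pointer is shown in this diff */
	boot_error = genapic->wakeup_cpu(apicid, start_eip);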
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6..10786af 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/stacktrace.h>
 
 static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		ret_addr;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+	const struct pt_regs *regs = task_pt_regs(current);
+	const void __user *fp = (const void __user *)regs->bp;
+
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = regs->ip;
+
+	while (trace->nr_entries < trace->max_entries) {
+		struct stack_frame frame;
+
+		frame.next_fp = NULL;
+		frame.ret_addr = 0;
+		if (!copy_stack_frame(fp, &frame))
+			break;
+		if ((unsigned long)fp < regs->sp)
+			break;
+		if (frame.ret_addr) {
+			trace->entries[trace->nr_entries++] =
+				frame.ret_addr;
+		}
+		if (fp == frame.next_fp)
+			break;
+		fp = frame.next_fp;
+	}
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+	/*
+	 * Trace user stack if we are not a kernel thread
+	 */
+	if (current->mm) {
+		__save_stack_trace_user(trace);
+	}
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
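save_stack_trace_user() fills a caller-provided struct stack_trace by
walking the user-space frame-pointer chain and terminates the record with
ULONG_MAX. A minimal usage sketch, assuming the standard stack_trace
fields:

	unsigned long entries[32];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= ARRAY_SIZE(entries),
		.entries	= entries,
	};

	save_stack_trace_user(&trace);	/* walks current->mm's frames */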
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b669..6f3d3d4 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>