[PATCH] genirq: x86_64 irq: make vector_irq per cpu

This refactors the irq handling code to make the vectors a per cpu
resource so the same vector number can be simultaneously used on multiple
cpus for different irqs.

This should make systems that were hitting limits on the total number of
irqs much more livable.

[akpm@osdl.org: build fix]
[akpm@osdl.org: __target_IO_APIC_irq is unneeded on UP]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rajesh Shah <rajesh.shah@intel.com>
Cc: Andi Kleen <ak@muc.de>
Cc: "Protasevich, Natalie" <Natalie.Protasevich@UNISYS.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

commit: 550f2299ac8ffaba943cf211380d3a8d3fa75301 [log] [tgz]
author: Eric W. Biederman <ebiederm@xmission.com> Wed Oct 04 02:16:51 2006 -0700
committer: Linus Torvalds <torvalds@g5.osdl.org> Wed Oct 04 07:55:29 2006 -0700
tree: 9a8cb7b0d62f48f581f66ab4951aa6c9793131c0
parent: e500f57436b9056a245216c53113613928155eba [diff]
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index bddde43..0612a33 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c

@@ -394,7 +394,7 @@
  */
 
 static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
-int vector_irq[NR_VECTORS] __read_mostly = {
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 	[0 ... FIRST_EXTERNAL_VECTOR - 1] = -1,
 	[FIRST_EXTERNAL_VECTOR + 0] = 0,
 	[FIRST_EXTERNAL_VECTOR + 1] = 1,
@@ -520,7 +520,7 @@
 	 * IRQ0 must be given a fixed assignment and initialized,
 	 * because it's used before the IO-APIC is set up.
 	 */
-	vector_irq[FIRST_DEVICE_VECTOR] = 0;
+	__get_cpu_var(vector_irq)[FIRST_DEVICE_VECTOR] = 0;
 
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 9f84949..7d53f66 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c

@@ -44,7 +44,7 @@
 #include <asm/nmi.h>
 #include <asm/msidef.h>
 
-static int assign_irq_vector(int irq);
+static int assign_irq_vector(int irq, cpumask_t mask);
 
 #define __apicdebuginit  __init
 
@@ -135,11 +135,35 @@
 }
 
 #ifdef CONFIG_SMP
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+	int apic, pin;
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	BUG_ON(irq >= NR_IRQS);
+	for (;;) {
+		unsigned int reg;
+		apic = entry->apic;
+		pin = entry->pin;
+		if (pin == -1)
+			break;
+		io_apic_write(apic, 0x11 + pin*2, dest);
+		reg = io_apic_read(apic, 0x10 + pin*2);
+		reg &= ~0x000000ff;
+		reg |= vector;
+		io_apic_modify(apic, reg);
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+}
+
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
+	int vector;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -147,7 +171,13 @@
 
 	cpus_and(mask, tmp, CPU_MASK_ALL);
 
-	dest = cpu_mask_to_apicid(mask);
+	vector = assign_irq_vector(irq, mask);
+	if (vector < 0)
+		return;
+
+	cpus_clear(tmp);
+	cpu_set(vector >> 8, tmp);
+	dest = cpu_mask_to_apicid(tmp);
 
 	/*
 	 * Only the high 8 bits are valid.
@@ -155,7 +185,7 @@
 	dest = SET_APIC_LOGICAL_ID(dest);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = dest, )
+	__target_IO_APIC_irq(irq, dest, vector & 0xff);
 	set_native_irq_info(irq, mask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
@@ -512,7 +542,7 @@
 
 	tries = NR_IRQS;
   try_again:
-	vector = assign_irq_vector(gsi);
+	vector = assign_irq_vector(gsi, TARGET_CPUS);
 
 	/*
 	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
@@ -591,45 +621,77 @@
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
+unsigned int irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_EXTERNAL_VECTOR, 0 };
 
-static int __assign_irq_vector(int irq)
+static int __assign_irq_vector(int irq, cpumask_t mask)
 {
-	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
-	int vector;
+	/*
+	 * NOTE! The local APIC isn't very good at handling
+	 * multiple interrupts at the same interrupt level.
+	 * As the interrupt level is determined by taking the
+	 * vector number and shifting that right by 4, we
+	 * want to spread these out a bit so that they don't
+	 * all fall in the same interrupt level.
+	 *
+	 * Also, we've got to be careful not to trash gate
+	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
+	 */
+	static struct {
+		int vector;
+		int offset;
+	} pos[NR_CPUS] = { [ 0 ... NR_CPUS - 1] = {FIRST_DEVICE_VECTOR, 0} };
+	int old_vector = -1;
+	int cpu;
 
 	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
 
-	if (IO_APIC_VECTOR(irq) > 0) {
-		return IO_APIC_VECTOR(irq);
+	if (IO_APIC_VECTOR(irq) > 0)
+		old_vector = IO_APIC_VECTOR(irq);
+	if ((old_vector > 0) && cpu_isset(old_vector >> 8, mask)) {
+		return old_vector;
 	}
+
+	for_each_cpu_mask(cpu, mask) {
+		int vector, offset;
+		vector = pos[cpu].vector;
+		offset = pos[cpu].offset;
 next:
-	current_vector += 8;
-	if (current_vector == IA32_SYSCALL_VECTOR)
-		goto next;
-
-	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		/* If we run out of vectors on large boxen, must share them. */
-		offset = (offset + 1) % 8;
-		current_vector = FIRST_DEVICE_VECTOR + offset;
+		vector += 8;
+		if (vector >= FIRST_SYSTEM_VECTOR) {
+			/* If we run out of vectors on large boxen, must share them. */
+			offset = (offset + 1) % 8;
+			vector = FIRST_DEVICE_VECTOR + offset;
+		}
+		if (unlikely(pos[cpu].vector == vector))
+			continue;
+		if (vector == IA32_SYSCALL_VECTOR)
+			goto next;
+		if (per_cpu(vector_irq, cpu)[vector] != -1)
+			goto next;
+		/* Found one! */
+		pos[cpu].vector = vector;
+		pos[cpu].offset = offset;
+		if (old_vector >= 0) {
+			int old_cpu = old_vector >> 8;
+			old_vector &= 0xff;
+			per_cpu(vector_irq, old_cpu)[old_vector] = -1;
+		}
+		per_cpu(vector_irq, cpu)[vector] = irq;
+		vector |= cpu << 8;
+		IO_APIC_VECTOR(irq) = vector;
+		return vector;
 	}
-
-	vector = current_vector;
-	vector_irq[vector] = irq;
-	IO_APIC_VECTOR(irq) = vector;
-
-	return vector;
+	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq)
+static int assign_irq_vector(int irq, cpumask_t mask)
 {
 	int vector;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	vector = __assign_irq_vector(irq);
+	vector = __assign_irq_vector(irq, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
-
 	return vector;
 }
 
@@ -699,8 +761,15 @@
 			continue;
 
 		if (IO_APIC_IRQ(irq)) {
-			vector = assign_irq_vector(irq);
-			entry.vector = vector;
+			cpumask_t mask;
+			vector = assign_irq_vector(irq, TARGET_CPUS);
+			if (vector < 0)
+				continue;
+
+			cpus_clear(mask);
+			cpu_set(vector >> 8, mask);
+			entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
+			entry.vector = vector & 0xff;
 
 			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
 			if (!apic && (irq < 16))
@@ -1197,7 +1266,14 @@
 
 static int ioapic_retrigger_irq(unsigned int irq)
 {
-	send_IPI_self(IO_APIC_VECTOR(irq));
+	cpumask_t mask;
+	unsigned vector;
+
+	vector = irq_vector[irq];
+	cpus_clear(mask);
+	cpu_set(vector >> 8, mask);
+
+	send_IPI_mask(mask, vector & 0xff);
 
 	return 1;
 }
@@ -1419,7 +1495,7 @@
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	vector = assign_irq_vector(0);
+	vector = assign_irq_vector(0, TARGET_CPUS);
 
 	/*
 	 * Subtle, code in do_timer_interrupt() expects an AEOI
@@ -1662,7 +1738,7 @@
 			continue;
 		if (irq_vector[new] != 0)
 			continue;
-		vector = __assign_irq_vector(new);
+		vector = __assign_irq_vector(new, TARGET_CPUS);
 		if (likely(vector > 0))
 			irq = new;
 		break;
@@ -1698,12 +1774,12 @@
 	int vector;
 	unsigned dest;
 
-	vector = assign_irq_vector(irq);
+	vector = assign_irq_vector(irq, TARGET_CPUS);
 	if (vector >= 0) {
 		cpumask_t tmp;
 
 		cpus_clear(tmp);
-		cpu_set(first_cpu(cpu_online_map), tmp);
+		cpu_set(vector >> 8, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
 		msg->address_hi = MSI_ADDR_BASE_HI;
@@ -1738,9 +1814,13 @@
 	int vector;
 	unsigned dest;
 
-	vector = assign_irq_vector(irq);
+	vector = assign_irq_vector(irq, mask);
 	if (vector > 0) {
-		dest = cpu_mask_to_apicid(mask);
+		cpumask_t tmp;
+
+		cpus_clear(tmp);
+		cpu_set(vector >> 8, tmp);
+		dest = cpu_mask_to_apicid(tmp);
 
 		msg->data &= ~MSI_DATA_VECTOR_MASK;
 		msg->data |= MSI_DATA_VECTOR(vector);
@@ -1783,6 +1863,8 @@
 {
 	struct IO_APIC_route_entry entry;
 	unsigned long flags;
+	int vector;
+	cpumask_t mask;
 
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -1790,6 +1872,21 @@
 		return -EINVAL;
 	}
 
+	irq = gsi_irq_sharing(irq);
+	/*
+	 * IRQs < 16 are already in the irq_2_pin[] map
+	 */
+	if (irq >= 16)
+		add_pin_to_irq(irq, ioapic, pin);
+
+
+	vector = assign_irq_vector(irq, TARGET_CPUS);
+	if (vector < 0)
+		return vector;
+
+	cpus_clear(mask);
+	cpu_set(vector >> 8, mask);
+
 	/*
 	 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
 	 * Note that we mask (disable) IRQs now -- these get enabled when the
@@ -1800,19 +1897,11 @@
 
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.dest_mode = INT_DEST_MODE;
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
 	entry.trigger = triggering;
 	entry.polarity = polarity;
 	entry.mask = 1;					 /* Disabled (masked) */
-
-	irq = gsi_irq_sharing(irq);
-	/*
-	 * IRQs < 16 are already in the irq_2_pin[] map
-	 */
-	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
-
-	entry.vector = assign_irq_vector(irq);
+	entry.vector = vector & 0xff;
 
 	apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
 		"IRQ %d Mode:%i Active:%i)\n", ioapic, 

diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 4542fb0..506f27c 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c

@@ -110,7 +110,7 @@
 
 	exit_idle();
 	irq_enter();
-	irq = vector_irq[vector];
+	irq = __get_cpu_var(vector_irq)[vector];
 
 	if (unlikely(irq >= NR_IRQS)) {
 		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",

diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 9f6a0bf..53d0d9f 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h

@@ -19,6 +19,7 @@
 #include <asm/irq.h>
 #include <linux/profile.h>
 #include <linux/smp.h>
+#include <linux/percpu.h>
 #endif
 
 #define NMI_VECTOR		0x02
@@ -73,8 +74,9 @@
 
 
 #ifndef __ASSEMBLY__
-extern u8 irq_vector[NR_IRQ_VECTORS];
-extern int vector_irq[NR_VECTORS];
+extern unsigned int irq_vector[NR_IRQ_VECTORS];
+typedef int vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
 #define IO_APIC_VECTOR(irq)	(irq_vector[irq])
 
 /*

diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h
index b8f8728..5006c6e 100644
--- a/include/asm-x86_64/irq.h
+++ b/include/asm-x86_64/irq.h

@@ -31,8 +31,8 @@
 
 #define FIRST_SYSTEM_VECTOR	0xef   /* duplicated in hw_irq.h */
 
-#define NR_IRQS 224
-#define NR_IRQ_VECTORS (32 * NR_CPUS)
+#define NR_IRQS (NR_VECTORS + (32 *NR_CPUS))
+#define NR_IRQ_VECTORS NR_IRQS
 
 static __inline__ int irq_canonicalize(int irq)
 {
commit	550f2299ac8ffaba943cf211380d3a8d3fa75301	[log] [tgz]
author	Eric W. Biederman <ebiederm@xmission.com>	Wed Oct 04 02:16:51 2006 -0700
committer	Linus Torvalds <torvalds@g5.osdl.org>	Wed Oct 04 07:55:29 2006 -0700
tree	9a8cb7b0d62f48f581f66ab4951aa6c9793131c0
parent	e500f57436b9056a245216c53113613928155eba [diff]