x86: Avoid race condition in pci_enable_msix()
Keep chip_data in create_irq_nr and destroy_irq.
When two drivers are setting up MSI-X at the same time via
pci_enable_msix() there is a race. See this dmesg excerpt:
[ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X
[ 85.170611] alloc irq_desc for 99 on node -1
[ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X
[ 85.170614] alloc kstat_irqs on node -1
[ 85.170616] alloc irq_2_iommu on node -1
[ 85.170617] alloc irq_desc for 100 on node -1
[ 85.170619] alloc kstat_irqs on node -1
[ 85.170621] alloc irq_2_iommu on node -1
[ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X
[ 85.170626] alloc irq_desc for 101 on node -1
[ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X
[ 85.170630] alloc kstat_irqs on node -1
[ 85.170631] alloc irq_2_iommu on node -1
[ 85.170635] alloc irq_desc for 102 on node -1
[ 85.170636] alloc kstat_irqs on node -1
[ 85.170639] alloc irq_2_iommu on node -1
[ 85.170646] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000088
As you can see igb and ixgbe are both alternating on create_irq_nr()
via pci_enable_msix() in their probe function.
ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe
chooses irq_desc_ptrs[102] and exits the loop, drops vector_lock and
calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data =
NULL via dynamic_irq_init().
igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[]
via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this:
cfg_new = irq_desc_ptrs[102]->chip_data;
if (cfg_new->vector != 0)
continue;
This hits the NULL deref.
Another possible race exists via pci_disable_msix() in a driver or in
any of the error paths that call free_msi_irqs():
destroy_irq()
dynamic_irq_cleanup() which sets desc->chip_data = NULL
...race window...
desc->chip_data = cfg;
Remove the save and restore code for cfg in create_irq_nr() and
destroy_irq() and take the desc->lock when checking the irq_cfg.
Reported-and-analyzed-by: Brandon Philips <bphilips@suse.de>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1265793639-15071-3-git-send-email-yinghai@kernel.org>
Signed-off-by: Brandon Philips <bphilips@suse.de>
Cc: stable@kernel.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa2..d70394f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
#include "internals.h"
-/**
- * dynamic_irq_init - initialize a dynamically allocated irq
- * @irq: irq number to initialize
- */
-void dynamic_irq_init(unsigned int irq)
+static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
{
struct irq_desc *desc;
unsigned long flags;
@@ -41,7 +37,8 @@
desc->depth = 1;
desc->msi_desc = NULL;
desc->handler_data = NULL;
- desc->chip_data = NULL;
+ if (!keep_chip_data)
+ desc->chip_data = NULL;
desc->action = NULL;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@
}
/**
- * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * dynamic_irq_init - initialize a dynamically allocated irq
* @irq: irq number to initialize
*/
-void dynamic_irq_cleanup(unsigned int irq)
+void dynamic_irq_init(unsigned int irq)
+{
+ dynamic_irq_init_x(irq, false);
+}
+
+/**
+ * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
+ * @irq: irq number to initialize
+ *
+ * does not set irq_to_desc(irq)->chip_data to NULL
+ */
+void dynamic_irq_init_keep_chip_data(unsigned int irq)
+{
+ dynamic_irq_init_x(irq, true);
+}
+
+static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -77,7 +90,8 @@
}
desc->msi_desc = NULL;
desc->handler_data = NULL;
- desc->chip_data = NULL;
+ if (!keep_chip_data)
+ desc->chip_data = NULL;
desc->handle_irq = handle_bad_irq;
desc->chip = &no_irq_chip;
desc->name = NULL;
@@ -85,6 +99,26 @@
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
+/**
+ * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * @irq: irq number to clean up
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+ dynamic_irq_cleanup_x(irq, false);
+}
+
+/**
+ * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
+ * @irq: irq number to clean up
+ *
+ * does not set irq_to_desc(irq)->chip_data to NULL
+ */
+void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
+{
+ dynamic_irq_cleanup_x(irq, true);
+}
+
/**
* set_irq_chip - set the irq chip for an irq