kernel/xen.pcifront.next-2.6.38.patch

15285 lines
416 KiB
Diff

diff --git a/arch/ia64/include/asm/xen/hypercall.h b/arch/ia64/include/asm/xen/hypercall.h
index 96fc623..ed28bcd 100644
--- a/arch/ia64/include/asm/xen/hypercall.h
+++ b/arch/ia64/include/asm/xen/hypercall.h
@@ -107,7 +107,7 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2,
static inline int
xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg)
{
- return _hypercall2(int, sched_op_new, cmd, arg);
+ return _hypercall2(int, sched_op, cmd, arg);
}
static inline long
diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c
index fd66b04..419c862 100644
--- a/arch/ia64/xen/suspend.c
+++ b/arch/ia64/xen/suspend.c
@@ -37,19 +37,14 @@ xen_mm_unpin_all(void)
/* nothing */
}
-void xen_pre_device_suspend(void)
-{
- /* nothing */
-}
-
void
-xen_pre_suspend()
+xen_arch_pre_suspend()
{
/* nothing */
}
void
-xen_post_suspend(int suspend_cancelled)
+xen_arch_post_suspend(int suspend_cancelled)
{
if (suspend_cancelled)
return;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a3c28ae..8508bfe 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
static inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
- return _hypercall2(int, sched_op_new, cmd, arg);
+ return _hypercall2(int, sched_op, cmd, arg);
}
static inline long
@@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
#endif
static inline int
-HYPERVISOR_suspend(unsigned long srec)
+HYPERVISOR_suspend(unsigned long start_info_mfn)
{
- return _hypercall3(int, sched_op, SCHEDOP_shutdown,
- SHUTDOWN_suspend, srec);
+ struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
+
+ /*
+ * For a PV guest the tools require that the start_info mfn be
+ * present in rdx/edx when the hypercall is made. Per the
+ * hypercall calling convention this is the third hypercall
+ * argument, which is start_info_mfn here.
+ */
+ return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
}
static inline int
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f25bdf2..64a619d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -29,8 +29,10 @@ typedef struct xpaddr {
/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
#define INVALID_P2M_ENTRY (~0UL)
-#define FOREIGN_FRAME_BIT (1UL<<31)
+#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
+#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
/* Maximum amount of memory we can handle in a domain in pages */
#define MAX_DOMAIN_PAGES \
@@ -41,12 +43,19 @@ extern unsigned int machine_to_phys_order;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern unsigned long set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e);
-extern int m2p_add_override(unsigned long mfn, struct page *page);
-extern int m2p_remove_override(struct page *page);
+extern int m2p_add_override(unsigned long mfn, struct page *page,
+ bool clear_pte);
+extern int m2p_remove_override(struct page *page, bool clear_pte);
extern struct page *m2p_find_override(unsigned long mfn);
extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+#ifdef CONFIG_XEN_DEBUG_FS
+extern int p2m_dump_show(struct seq_file *m, void *v);
+#endif
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
@@ -57,7 +66,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
mfn = get_phys_to_machine(pfn);
if (mfn != INVALID_P2M_ENTRY)
- mfn &= ~FOREIGN_FRAME_BIT;
+ mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
return mfn;
}
@@ -73,25 +82,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
static inline unsigned long mfn_to_pfn(unsigned long mfn)
{
unsigned long pfn;
+ int ret = 0;
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
+ if (unlikely((mfn >> machine_to_phys_order) != 0)) {
+ pfn = ~0;
+ goto try_override;
+ }
pfn = 0;
/*
* The array access can fail (e.g., device space beyond end of RAM).
* In such cases it doesn't matter what we return (we return garbage),
* but we must handle the fault without crashing!
*/
- __get_user(pfn, &machine_to_phys_mapping[mfn]);
-
- /*
- * If this appears to be a foreign mfn (because the pfn
- * doesn't map back to the mfn), then check the local override
- * table to see if there's a better pfn to use.
+ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+try_override:
+ /* ret might be < 0 if there are no entries in the m2p for mfn */
+ if (ret < 0)
+ pfn = ~0;
+ else if (get_phys_to_machine(pfn) != mfn)
+ /*
+ * If this appears to be a foreign mfn (because the pfn
+ * doesn't map back to the mfn), then check the local override
+ * table to see if there's a better pfn to use.
+ *
+ * m2p_find_override_pfn returns ~0 if it doesn't find anything.
+ */
+ pfn = m2p_find_override_pfn(mfn, ~0);
+
+ /*
+ * pfn is ~0 if there are no entries in the m2p for mfn or if the
+ * entry doesn't map back to the mfn and m2p_override doesn't have a
+ * valid entry for it.
*/
- if (get_phys_to_machine(pfn) != mfn)
- pfn = m2p_find_override_pfn(mfn, pfn);
+ if (pfn == ~0 &&
+ get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
+ pfn = mfn;
return pfn;
}
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 2329b3e..4fbda9a 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -15,10 +15,26 @@ static inline int pci_xen_hvm_init(void)
#endif
#if defined(CONFIG_XEN_DOM0)
void __init xen_setup_pirqs(void);
+int xen_find_device_domain_owner(struct pci_dev *dev);
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
+int xen_unregister_device_domain_owner(struct pci_dev *dev);
#else
static inline void __init xen_setup_pirqs(void)
{
}
+static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
+static inline int xen_register_device_domain_owner(struct pci_dev *dev,
+ uint16_t domain)
+{
+ return -1;
+}
+static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
#endif
#if defined(CONFIG_PCI_MSI)
@@ -27,16 +43,16 @@ static inline void __init xen_setup_pirqs(void)
* its own functions.
*/
struct xen_pci_frontend_ops {
- int (*enable_msi)(struct pci_dev *dev, int **vectors);
+ int (*enable_msi)(struct pci_dev *dev, int vectors[]);
void (*disable_msi)(struct pci_dev *dev);
- int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+ int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
void (*disable_msix)(struct pci_dev *dev);
};
extern struct xen_pci_frontend_ops *xen_pci_frontend;
static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
- int **vectors)
+ int vectors[])
{
if (xen_pci_frontend && xen_pci_frontend->enable_msi)
return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +64,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
xen_pci_frontend->disable_msi(dev);
}
static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
- int **vectors, int nvec)
+ int vectors[], int nvec)
{
if (xen_pci_frontend && xen_pci_frontend->enable_msix)
return xen_pci_frontend->enable_msix(dev, vectors, nvec);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42a..66637bd 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -283,6 +283,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
if (!after_bootmem && !start) {
pud_t *pud;
pmd_t *pmd;
+ unsigned long addr;
+ u64 size, memblock_addr;
mmu_cr4_features = read_cr4();
@@ -291,11 +293,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* located on different 2M pages. cleanup_highmap(), however,
* can only consider _end when it runs, so destroy any
* mappings beyond _brk_end here.
+ * Respect memblock reserved regions.
*/
pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
pmd = pmd_offset(pud, _brk_end - 1);
- while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
- pmd_clear(pmd);
+ addr = (_brk_end + PMD_SIZE - 1) & PMD_MASK;
+ while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) {
+ memblock_addr = memblock_x86_find_in_range_size(__pa(addr),
+ &size, PMD_SIZE);
+ if (memblock_addr == (u64) __pa(addr) && size >= PMD_SIZE)
+ pmd_clear(pmd);
+ addr += PMD_SIZE;
+ }
}
#endif
__flush_tlb_all();
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a0..309c0a0 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -20,7 +20,8 @@
#include <asm/xen/pci.h>
#ifdef CONFIG_ACPI
-static int xen_hvm_register_pirq(u32 gsi, int triggering)
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+ int trigger, int polarity)
{
int rc, irq;
struct physdev_map_pirq map_irq;
@@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
return -1;
}
- if (triggering == ACPI_EDGE_SENSITIVE) {
+ if (trigger == ACPI_EDGE_SENSITIVE) {
shareable = 0;
name = "ioapic-edge";
} else {
@@ -55,12 +56,6 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
return irq;
}
-
-static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
- int trigger, int polarity)
-{
- return xen_hvm_register_pirq(gsi, trigger);
-}
#endif
#if defined(CONFIG_PCI_MSI)
@@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- int irq, pirq, ret = 0;
+ int irq, pirq;
struct msi_desc *msidesc;
struct msi_msg msg;
@@ -99,39 +94,33 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
__read_msi_msg(msidesc, &msg);
pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
- if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
- xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
- if (irq < 0)
+ if (msg.data != XEN_PIRQ_MSI_DATA ||
+ xen_irq_from_pirq(pirq) < 0) {
+ pirq = xen_allocate_pirq_msi(dev, msidesc);
+ if (pirq < 0)
goto error;
- ret = set_irq_msi(irq, msidesc);
- if (ret < 0)
- goto error_while;
- printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
- " pirq=%d\n", irq, pirq);
- return 0;
+ xen_msi_compose_msg(dev, pirq, &msg);
+ __write_msi_msg(msidesc, &msg);
+ dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
+ } else {
+ dev_dbg(&dev->dev,
+ "xen: msi already bound to pirq=%d\n", pirq);
}
- xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ));
- if (irq < 0 || pirq < 0)
+ irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
+ (type == PCI_CAP_ID_MSIX) ?
+ "msi-x" : "msi",
+ DOMID_SELF);
+ if (irq < 0)
goto error;
- printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
- xen_msi_compose_msg(dev, pirq, &msg);
- ret = set_irq_msi(irq, msidesc);
- if (ret < 0)
- goto error_while;
- write_msi_msg(irq, &msg);
+ dev_dbg(&dev->dev,
+ "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
}
return 0;
-error_while:
- unbind_from_irqhandler(irq, NULL);
error:
- if (ret == -ENODEV)
- dev_err(&dev->dev, "Xen PCI frontend has not registered" \
- " MSI/MSI-X support!\n");
-
- return ret;
+ dev_err(&dev->dev,
+ "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+ return -ENODEV;
}
/*
@@ -150,35 +139,27 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return -ENOMEM;
if (type == PCI_CAP_ID_MSIX)
- ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
+ ret = xen_pci_frontend_enable_msix(dev, v, nvec);
else
- ret = xen_pci_frontend_enable_msi(dev, &v);
+ ret = xen_pci_frontend_enable_msi(dev, v);
if (ret)
goto error;
i = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = xen_allocate_pirq(v[i], 0, /* not sharable */
- (type == PCI_CAP_ID_MSIX) ?
- "pcifront-msi-x" : "pcifront-msi");
- if (irq < 0) {
- ret = -1;
+ irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
+ (type == PCI_CAP_ID_MSIX) ?
+ "pcifront-msi-x" :
+ "pcifront-msi",
+ DOMID_SELF);
+ if (irq < 0)
goto free;
- }
-
- ret = set_irq_msi(irq, msidesc);
- if (ret)
- goto error_while;
i++;
}
kfree(v);
return 0;
-error_while:
- unbind_from_irqhandler(irq, NULL);
error:
- if (ret == -ENODEV)
- dev_err(&dev->dev, "Xen PCI frontend has not registered" \
- " MSI/MSI-X support!\n");
+ dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
free:
kfree(v);
return ret;
@@ -193,6 +174,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
xen_pci_frontend_disable_msix(dev);
else
xen_pci_frontend_disable_msi(dev);
+
+ /* Free the IRQ's and the msidesc using the generic code. */
+ default_teardown_msi_irqs(dev);
}
static void xen_teardown_msi_irq(unsigned int irq)
@@ -200,47 +184,91 @@ static void xen_teardown_msi_irq(unsigned int irq)
xen_destroy_irq(irq);
}
+#ifdef CONFIG_XEN_DOM0
static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- int irq, ret;
+ int ret = 0;
struct msi_desc *msidesc;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = xen_create_msi_irq(dev, msidesc, type);
- if (irq < 0)
- return -1;
+ struct physdev_map_pirq map_irq;
+ domid_t domid;
- ret = set_irq_msi(irq, msidesc);
- if (ret)
- goto error;
- }
- return 0;
+ domid = ret = xen_find_device_domain_owner(dev);
+ /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
+ * hence check ret value for < 0. */
+ if (ret < 0)
+ domid = DOMID_SELF;
-error:
- xen_destroy_irq(irq);
+ memset(&map_irq, 0, sizeof(map_irq));
+ map_irq.domid = domid;
+ map_irq.type = MAP_PIRQ_TYPE_MSI;
+ map_irq.index = -1;
+ map_irq.pirq = -1;
+ map_irq.bus = dev->bus->number;
+ map_irq.devfn = dev->devfn;
+
+ if (type == PCI_CAP_ID_MSIX) {
+ int pos;
+ u32 table_offset, bir;
+
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+ pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
+ &table_offset);
+ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
+ map_irq.table_base = pci_resource_start(dev, bir);
+ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ }
+
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (ret) {
+ dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
+ ret, domid);
+ goto out;
+ }
+
+ ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
+ map_irq.pirq, map_irq.index,
+ (type == PCI_CAP_ID_MSIX) ?
+ "msi-x" : "msi",
+ domid);
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+out:
return ret;
}
#endif
+#endif
static int xen_pcifront_enable_irq(struct pci_dev *dev)
{
int rc;
int share = 1;
+ u8 gsi;
- dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
-
- if (dev->irq < 0)
- return -EINVAL;
+ rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+ rc);
+ return rc;
+ }
- if (dev->irq < NR_IRQS_LEGACY)
+ if (gsi < NR_IRQS_LEGACY)
share = 0;
- rc = xen_allocate_pirq(dev->irq, share, "pcifront");
+ rc = xen_allocate_pirq(gsi, share, "pcifront");
if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
- dev->irq, rc);
+ dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n",
+ gsi, rc);
return rc;
}
+
+ dev->irq = rc;
+ dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
return 0;
}
@@ -427,3 +455,76 @@ void __init xen_setup_pirqs(void)
}
}
#endif
+
+struct xen_device_domain_owner {
+ domid_t domain;
+ struct pci_dev *dev;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_domain_list_spinlock);
+static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
+
+static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ list_for_each_entry(owner, &dev_domain_list, list) {
+ if (owner->dev == dev)
+ return owner;
+ }
+ return NULL;
+}
+
+int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+ int domain = -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (owner)
+ domain = owner->domain;
+ spin_unlock(&dev_domain_list_spinlock);
+ return domain;
+}
+EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
+
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
+{
+ struct xen_device_domain_owner *owner;
+
+ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
+ if (!owner)
+ return -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ if (find_device(dev)) {
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return -EEXIST;
+ }
+ owner->domain = domain;
+ owner->dev = dev;
+ list_add_tail(&owner->list, &dev_domain_list);
+ spin_unlock(&dev_domain_list_spinlock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
+
+int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (!owner) {
+ spin_unlock(&dev_domain_list_spinlock);
+ return -ENODEV;
+ }
+ list_del(&owner->list);
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892..e4343fe 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -48,3 +48,11 @@ config XEN_DEBUG_FS
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
+
+config XEN_DEBUG
+ bool "Enable Xen debug checks"
+ depends on XEN
+ default n
+ help
+ Enable various WARN_ON checks in the Xen MMU code.
+ Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542ef..49dbd78 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1284,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor)
xen_setup_features();
- pv_info = xen_info;
- pv_info.kernel_rpl = 0;
+ pv_info.name = "Xen HVM";
xen_domain_type = XEN_HVM_DOMAIN;
return 0;
}
-void xen_hvm_init_shared_info(void)
+void __ref xen_hvm_init_shared_info(void)
{
int cpu;
struct xen_add_to_physmap xatp;
@@ -1331,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_UP_PREPARE:
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ if (xen_have_vector_callback)
+ xen_init_lock_cpu(cpu);
break;
default:
break;
@@ -1355,6 +1356,7 @@ static void __init xen_hvm_guest_init(void)
if (xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;
+ xen_hvm_smp_init();
register_cpu_notifier(&xen_hvm_cpu_notifier);
xen_unplug_emulated_devices();
have_vcpu_info_placement = 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61..0c376a2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
+#include <linux/seq_file.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -416,8 +417,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
pteval_t flags = val & PTE_FLAGS_MASK;
- unsigned long mfn = pfn_to_mfn(pfn);
+ unsigned long mfn;
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ mfn = get_phys_to_machine(pfn);
+ else
+ mfn = pfn;
/*
* If there's no mfn for the pfn, then just create an
* empty non-present pte. Unfortunately this loses
@@ -427,8 +432,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
if (unlikely(mfn == INVALID_P2M_ENTRY)) {
mfn = 0;
flags = 0;
+ } else {
+ /*
+ * Paramount to do this test _after_ the
+ * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+ * IDENTITY_FRAME_BIT resolves to true.
+ */
+ mfn &= ~FOREIGN_FRAME_BIT;
+ if (mfn & IDENTITY_FRAME_BIT) {
+ mfn &= ~IDENTITY_FRAME_BIT;
+ flags |= _PAGE_IOMAP;
+ }
}
-
val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
}
@@ -532,6 +547,41 @@ pte_t xen_make_pte(pteval_t pte)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
+#ifdef CONFIG_XEN_DEBUG
+pte_t xen_make_pte_debug(pteval_t pte)
+{
+ phys_addr_t addr = (pte & PTE_PFN_MASK);
+ phys_addr_t other_addr;
+ bool io_page = false;
+ pte_t _pte;
+
+ if (pte & _PAGE_IOMAP)
+ io_page = true;
+
+ _pte = xen_make_pte(pte);
+
+ if (!addr)
+ return _pte;
+
+ if (io_page &&
+ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+ other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
+ WARN(addr != other_addr,
+ "0x%lx is using VM_IO, but it is 0x%lx!\n",
+ (unsigned long)addr, (unsigned long)other_addr);
+ } else {
+ pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
+ other_addr = (_pte.pte & PTE_PFN_MASK);
+ WARN((addr == other_addr) && (!io_page) && (!iomap_set),
+ "0x%lx is missing VM_IO (and wasn't fixed)!\n",
+ (unsigned long)addr);
+ }
+
+ return _pte;
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
+#endif
+
pgd_t xen_make_pgd(pgdval_t pgd)
{
pgd = pte_pfn_to_mfn(pgd);
@@ -1942,6 +1992,9 @@ __init void xen_ident_map_ISA(void)
static __init void xen_post_allocator_init(void)
{
+#ifdef CONFIG_XEN_DEBUG
+ pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
+#endif
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
@@ -2074,7 +2127,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
in_frames[i] = virt_to_mfn(vaddr);
MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
- set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+ __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
if (out_frames)
out_frames[i] = virt_to_pfn(vaddr);
@@ -2353,6 +2406,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
#ifdef CONFIG_XEN_DEBUG_FS
+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+ .open = p2m_dump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static struct dentry *d_mmu_debug;
static int __init xen_mmu_debugfs(void)
@@ -2408,6 +2473,7 @@ static int __init xen_mmu_debugfs(void)
debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
&mmu_stats.prot_commit_batched);
+ debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
return 0;
}
fs_initcall(xen_mmu_debugfs);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index fd12d7c..dd5e735 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -30,6 +30,7 @@
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>
+#include <linux/seq_file.h>
#include <asm/cache.h>
#include <asm/setup.h>
@@ -59,9 +60,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+/* We might hit two boundary violations at the start and end, at max each
+ * boundary violation will require three middle nodes. */
+RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
+
static inline unsigned p2m_top_index(unsigned long pfn)
{
BUG_ON(pfn >= MAX_P2M_PFN);
@@ -136,7 +143,7 @@ static void p2m_init(unsigned long *p2m)
* - After resume we're called from within stop_machine, but the mfn
* tree should alreay be completely allocated.
*/
-void xen_build_mfn_list_list(void)
+void __ref xen_build_mfn_list_list(void)
{
unsigned long pfn;
@@ -221,6 +228,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_top_init(p2m_top);
+ p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_identity);
+
/*
* The domain builder gives us a pre-constructed p2m array in
* mfn_list for all the pages initially given to us, so we just
@@ -266,6 +276,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
+ /*
+ * The INVALID_P2M_ENTRY is filled in both p2m_*identity
+ * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
+ * would be wrong.
+ */
+ if (p2m_top[topidx][mididx] == p2m_identity)
+ return IDENTITY_FRAME(pfn);
+
return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -335,9 +353,11 @@ static bool alloc_p2m(unsigned long pfn)
p2m_top_mfn_p[topidx] = mid_mfn;
}
- if (p2m_top[topidx][mididx] == p2m_missing) {
+ if (p2m_top[topidx][mididx] == p2m_identity ||
+ p2m_top[topidx][mididx] == p2m_missing) {
/* p2m leaf page is missing */
unsigned long *p2m;
+ unsigned long *p2m_orig = p2m_top[topidx][mididx];
p2m = alloc_p2m_page();
if (!p2m)
@@ -345,7 +365,7 @@ static bool alloc_p2m(unsigned long pfn)
p2m_init(p2m);
- if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+ if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
free_p2m_page(p2m);
else
mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -354,11 +374,91 @@ static bool alloc_p2m(unsigned long pfn)
return true;
}
+bool __early_alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx, idx;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ /* Pfff.. No boundary cross-over, lets get out. */
+ if (!idx)
+ return false;
+
+ WARN(p2m_top[topidx][mididx] == p2m_identity,
+ "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
+ topidx, mididx);
+
+ /*
+ * Could be done by xen_build_dynamic_phys_to_machine..
+ */
+ if (p2m_top[topidx][mididx] != p2m_missing)
+ return false;
+
+ /* Boundary cross-over for the edges: */
+ if (idx) {
+ unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+ p2m_init(p2m);
+
+ p2m_top[topidx][mididx] = p2m;
+
+ }
+ return idx != 0;
+}
+unsigned long set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e)
+{
+ unsigned long pfn;
+
+ if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
+ return 0;
+
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+ return pfn_e - pfn_s;
+
+ if (pfn_s > pfn_e)
+ return 0;
+
+ for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
+ pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
+ pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
+ {
+ unsigned topidx = p2m_top_index(pfn);
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+ p2m_mid_init(mid);
+
+ p2m_top[topidx] = mid;
+ }
+ }
+
+ __early_alloc_p2m(pfn_s);
+ __early_alloc_p2m(pfn_e);
+
+ for (pfn = pfn_s; pfn < pfn_e; pfn++)
+ if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
+ break;
+
+ if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+ "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
+ (pfn_e - pfn_s) - (pfn - pfn_s)))
+ printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+
+ return pfn - pfn_s;
+}
+
/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
unsigned topidx, mididx, idx;
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
if (unlikely(pfn >= MAX_P2M_PFN)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
@@ -368,6 +468,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
+ /* For sparse holes were the p2m leaf has real PFN along with
+ * PCI holes, stick in the PFN as the MFN value.
+ */
+ if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+ if (p2m_top[topidx][mididx] == p2m_identity)
+ return true;
+
+ /* Swap over from MISSING to IDENTITY if needed. */
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
+ p2m_identity) != p2m_missing);
+ return true;
+ }
+ }
+
if (p2m_top[topidx][mididx] == p2m_missing)
return mfn == INVALID_P2M_ENTRY;
@@ -378,11 +493,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
if (!alloc_p2m(pfn))
return false;
@@ -417,11 +527,11 @@ static unsigned long mfn_hash(unsigned long mfn)
}
/* Add an MFN override for a particular page */
-int m2p_add_override(unsigned long mfn, struct page *page)
+int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
{
unsigned long flags;
unsigned long pfn;
- unsigned long address;
+ unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
@@ -429,7 +539,6 @@ int m2p_add_override(unsigned long mfn, struct page *page)
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
-
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
"m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
@@ -439,10 +548,9 @@ int m2p_add_override(unsigned long mfn, struct page *page)
page->index = pfn_to_mfn(pfn);
__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
- if (!PageHighMem(page))
+ if (clear_pte && !PageHighMem(page))
/* Just zap old mapping for now */
pte_clear(&init_mm, address, ptep);
-
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
@@ -450,12 +558,12 @@ int m2p_add_override(unsigned long mfn, struct page *page)
return 0;
}
-int m2p_remove_override(struct page *page)
+int m2p_remove_override(struct page *page, bool clear_pte)
{
unsigned long flags;
unsigned long mfn;
unsigned long pfn;
- unsigned long address;
+ unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
@@ -478,7 +586,7 @@ int m2p_remove_override(struct page *page)
spin_unlock_irqrestore(&m2p_override_lock, flags);
__set_phys_to_machine(pfn, page->index);
- if (!PageHighMem(page))
+ if (clear_pte && !PageHighMem(page))
set_pte_at(&init_mm, address, ptep,
pfn_pte(pfn, PAGE_KERNEL));
/* No tlb flush necessary because the caller already
@@ -520,3 +628,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
return ret;
}
EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+int p2m_dump_show(struct seq_file *m, void *v)
+{
+ static const char * const level_name[] = { "top", "middle",
+ "entry", "abnormal" };
+ static const char * const type_name[] = { "identity", "missing",
+ "pfn", "abnormal"};
+#define TYPE_IDENTITY 0
+#define TYPE_MISSING 1
+#define TYPE_PFN 2
+#define TYPE_UNKNOWN 3
+ unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
+ unsigned int uninitialized_var(prev_level);
+ unsigned int uninitialized_var(prev_type);
+
+ if (!p2m_top)
+ return 0;
+
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned idx = p2m_index(pfn);
+ unsigned lvl, type;
+
+ lvl = 4;
+ type = TYPE_UNKNOWN;
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ lvl = 0; type = TYPE_MISSING;
+ } else if (p2m_top[topidx] == NULL) {
+ lvl = 0; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx] == NULL) {
+ lvl = 1; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx] == p2m_identity) {
+ lvl = 1; type = TYPE_IDENTITY;
+ } else if (p2m_top[topidx][mididx] == p2m_missing) {
+ lvl = 1; type = TYPE_MISSING;
+ } else if (p2m_top[topidx][mididx][idx] == 0) {
+ lvl = 2; type = TYPE_UNKNOWN;
+ } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
+ lvl = 2; type = TYPE_IDENTITY;
+ } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
+ lvl = 2; type = TYPE_MISSING;
+ } else if (p2m_top[topidx][mididx][idx] == pfn) {
+ lvl = 2; type = TYPE_PFN;
+ } else if (p2m_top[topidx][mididx][idx] != pfn) {
+ lvl = 2; type = TYPE_PFN;
+ }
+ if (pfn == 0) {
+ prev_level = lvl;
+ prev_type = type;
+ }
+ if (pfn == MAX_DOMAIN_PAGES-1) {
+ lvl = 3;
+ type = TYPE_UNKNOWN;
+ }
+ if (prev_type != type) {
+ seq_printf(m, " [0x%lx->0x%lx] %s\n",
+ prev_pfn_type, pfn, type_name[prev_type]);
+ prev_pfn_type = pfn;
+ prev_type = type;
+ }
+ if (prev_level != lvl) {
+ seq_printf(m, " [0x%lx->0x%lx] level %s\n",
+ prev_pfn_level, pfn, level_name[prev_level]);
+ prev_pfn_level = pfn;
+ prev_level = lvl;
+ }
+ }
+ return 0;
+#undef TYPE_IDENTITY
+#undef TYPE_MISSING
+#undef TYPE_PFN
+#undef TYPE_UNKNOWN
+}
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a5..edeaff2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
static __init void xen_add_extra_mem(unsigned long pages)
{
+ unsigned long pfn;
+
u64 size = (u64)pages * PAGE_SIZE;
u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
xen_extra_mem_size += size;
xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+
+ for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+ __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
start, end, ret);
if (ret == 1) {
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
len++;
}
}
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
return released;
}
+static unsigned long __init xen_set_identity(const struct e820entry *list,
+ ssize_t map_size)
+{
+ phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
+ phys_addr_t start_pci = last;
+ const struct e820entry *entry;
+ unsigned long identity = 0;
+ int i;
+
+ for (i = 0, entry = list; i < map_size; i++, entry++) {
+ phys_addr_t start = entry->addr;
+ phys_addr_t end = start + entry->size;
+
+ if (start < last)
+ start = last;
+
+ if (end <= start)
+ continue;
+
+ /* Skip over the 1MB region. */
+ if (last > end)
+ continue;
+
+ if (entry->type == E820_RAM) {
+ if (start > start_pci)
+ identity += set_phys_range_identity(
+ PFN_UP(start_pci), PFN_DOWN(start));
+
+ /* Without saving 'last' we would gooble RAM too
+ * at the end of the loop. */
+ last = end;
+ start_pci = end;
+ continue;
+ }
+ start_pci = min(start, start_pci);
+ last = end;
+ }
+ if (last > start_pci)
+ identity += set_phys_range_identity(
+ PFN_UP(start_pci), PFN_DOWN(last));
+ return identity;
+}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
static struct e820entry map[E820MAX] __initdata;
+ static struct e820entry map_raw[E820MAX] __initdata;
unsigned long max_pfn = xen_start_info->nr_pages;
unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
struct xen_memory_map memmap;
unsigned long extra_pages = 0;
unsigned long extra_limit;
+ unsigned long identity_pages = 0;
int i;
int op;
@@ -176,6 +225,7 @@ char * __init xen_memory_setup(void)
}
BUG_ON(rc);
+ memcpy(map_raw, map, sizeof(map));
e820.nr_map = 0;
xen_extra_mem_start = mem_end;
for (i = 0; i < memmap.nr_entries; i++) {
@@ -194,6 +244,14 @@ char * __init xen_memory_setup(void)
end -= delta;
extra_pages += PFN_DOWN(delta);
+ /*
+ * Set RAM below 4GB that is not for us to be unusable.
+ * This prevents "System RAM" address space from being
+ * used as potential resource for I/O address (happens
+ * when 'allocate_resource' is called).
+ */
+ if (delta && end < 0x100000000UL)
+ e820_add_region(end, delta, E820_UNUSABLE);
}
if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -251,6 +309,13 @@ char * __init xen_memory_setup(void)
xen_add_extra_mem(extra_pages);
+ /*
+ * Set P2M for all non-RAM pages and E820 gaps to be identity
+ * type PFNs. We supply it with the non-sanitized version
+ * of the E820.
+ */
+ identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
+ printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
return "Xen";
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c79..3061244 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
xen_fill_possible_map();
xen_init_spinlocks();
}
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ WARN_ON(xen_smp_intr_init(0));
+
+ if (!xen_have_vector_callback)
+ return;
+ xen_init_lock_cpu(0);
+ xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+ int rc;
+ rc = native_cpu_up(cpu);
+ WARN_ON (xen_smp_intr_init(cpu));
+ return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+ unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+ native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+ smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+ smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+ smp_ops.cpu_up = xen_hvm_cpu_up;
+ smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+ smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a..45329c8 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
#include "xen-ops.h"
#include "mmu.h"
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
{
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
BUG();
}
-void xen_hvm_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
{
+#ifdef CONFIG_XEN_PVHVM
int cpu;
xen_hvm_init_shared_info();
xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
xen_setup_runstate_info(cpu);
}
}
+#endif
}
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 067759e..2e2d370 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -397,7 +397,9 @@ void xen_setup_timer(int cpu)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
+ IRQF_DISABLED|IRQF_PERCPU|
+ IRQF_NOBALANCING|IRQF_TIMER|
+ IRQF_FORCE_RESUME,
name, NULL);
evt = &per_cpu(xen_clock_events, cpu);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf9..3112f55 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
extern cpumask_var_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
#endif
#ifdef CONFIG_PARAVIRT_SPINLOCKS
diff --git a/block/blk-core.c b/block/blk-core.c
index 2f4002f..77836fc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -455,6 +455,7 @@ void blk_put_queue(struct request_queue *q)
{
kobject_put(&q->kobj);
}
+EXPORT_SYMBOL_GPL(blk_put_queue);
void blk_cleanup_queue(struct request_queue *q)
{
@@ -662,6 +663,7 @@ int blk_get_queue(struct request_queue *q)
return 1;
}
+EXPORT_SYMBOL_GPL(blk_get_queue);
static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d7aa39e..9cb8668 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -120,6 +120,10 @@ static DEFINE_SPINLOCK(minor_lock);
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
#define DEV_NAME "xvd" /* name in /dev */
@@ -281,7 +285,7 @@ static int blkif_queue_request(struct request *req)
info->shadow[id].request = req;
ring_req->id = id;
- ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
+ ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
ring_req->handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
@@ -317,7 +321,7 @@ static int blkif_queue_request(struct request *req)
rq_data_dir(req) );
info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
- ring_req->seg[i] =
+ ring_req->u.rw.seg[i] =
(struct blkif_request_segment) {
.gref = ref,
.first_sect = fsect,
@@ -434,6 +438,65 @@ static void xlvbd_flush(struct blkfront_info *info)
info->feature_flush ? "enabled" : "disabled");
}
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+ int major;
+ major = BLKIF_MAJOR(vdevice);
+ *minor = BLKIF_MINOR(vdevice);
+ switch (major) {
+ case XEN_IDE0_MAJOR:
+ *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = ((*minor / 64) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_IDE1_MAJOR:
+ *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK0_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK1_MAJOR:
+ case XEN_SCSI_DISK2_MAJOR:
+ case XEN_SCSI_DISK3_MAJOR:
+ case XEN_SCSI_DISK4_MAJOR:
+ case XEN_SCSI_DISK5_MAJOR:
+ case XEN_SCSI_DISK6_MAJOR:
+ case XEN_SCSI_DISK7_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK8_MAJOR:
+ case XEN_SCSI_DISK9_MAJOR:
+ case XEN_SCSI_DISK10_MAJOR:
+ case XEN_SCSI_DISK11_MAJOR:
+ case XEN_SCSI_DISK12_MAJOR:
+ case XEN_SCSI_DISK13_MAJOR:
+ case XEN_SCSI_DISK14_MAJOR:
+ case XEN_SCSI_DISK15_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XENVBD_MAJOR:
+ *offset = *minor / PARTS_PER_DISK;
+ break;
+ default:
+ printk(KERN_WARNING "blkfront: your disk configuration is "
+ "incorrect, please use an xvd device instead\n");
+ return -ENODEV;
+ }
+ return 0;
+}
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info,
@@ -441,7 +504,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
{
struct gendisk *gd;
int nr_minors = 1;
- int err = -ENODEV;
+ int err;
unsigned int offset;
int minor;
int nr_parts;
@@ -456,12 +519,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
}
if (!VDEV_IS_EXTENDED(info->vdevice)) {
- minor = BLKIF_MINOR(info->vdevice);
- nr_parts = PARTS_PER_DISK;
+ err = xen_translate_vdev(info->vdevice, &minor, &offset);
+ if (err)
+ return err;
+ nr_parts = PARTS_PER_DISK;
} else {
minor = BLKIF_MINOR_EXT(info->vdevice);
nr_parts = PARTS_PER_EXT_DISK;
+ offset = minor / nr_parts;
+ if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
+ printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+ "emulated IDE disks,\n\t choose an xvd device name"
+ "from xvde on\n", info->vdevice);
}
+ err = -ENODEV;
if ((minor % nr_parts) == 0)
nr_minors = nr_parts;
@@ -475,8 +546,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
if (gd == NULL)
goto release;
- offset = minor / nr_parts;
-
if (nr_minors > 1) {
if (offset < 26)
sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
@@ -615,7 +684,7 @@ static void blkif_completion(struct blk_shadow *s)
{
int i;
for (i = 0; i < s->req.nr_segments; i++)
- gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+ gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
}
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -932,7 +1001,7 @@ static int blkif_recover(struct blkfront_info *info)
/* Rewrite any grant references invalidated by susp/resume. */
for (j = 0; j < req->nr_segments; j++)
gnttab_grant_foreign_access_ref(
- req->seg[j].gref,
+ req->u.rw.seg[j].gref,
info->xbdev->otherend_id,
pfn_to_mfn(info->shadow[req->id].frame[j]),
rq_data_dir(info->shadow[req->id].request));
diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c
index 26347b7..3706156 100644
--- a/drivers/gpu/drm/nouveau/nouveau_mem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_mem.c
@@ -412,7 +412,8 @@ nouveau_mem_vram_init(struct drm_device *dev)
ret = ttm_bo_device_init(&dev_priv->ttm.bdev,
dev_priv->ttm.bo_global_ref.ref.object,
&nouveau_bo_driver, DRM_FILE_PAGE_OFFSET,
- dma_bits <= 32 ? true : false);
+ dma_bits <= 32 ? true : false,
+ dev->dev);
if (ret) {
NV_ERROR(dev, "Error initialising bo driver: %d\n", ret);
return ret;
diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
index 9a250eb..07b1151 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
@@ -12,6 +12,7 @@ struct nouveau_sgdma_be {
struct drm_device *dev;
dma_addr_t *pages;
+ bool *ttm_alloced;
unsigned nr_pages;
u64 offset;
@@ -20,7 +21,8 @@ struct nouveau_sgdma_be {
static int
nouveau_sgdma_populate(struct ttm_backend *be, unsigned long num_pages,
- struct page **pages, struct page *dummy_read_page)
+ struct page **pages, struct page *dummy_read_page,
+ dma_addr_t *dma_addrs)
{
struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)be;
struct drm_device *dev = nvbe->dev;
@@ -34,15 +36,25 @@ nouveau_sgdma_populate(struct ttm_backend *be, unsigned long num_pages,
if (!nvbe->pages)
return -ENOMEM;
+ nvbe->ttm_alloced = kmalloc(sizeof(bool) * num_pages, GFP_KERNEL);
+ if (!nvbe->ttm_alloced)
+ return -ENOMEM;
+
nvbe->nr_pages = 0;
while (num_pages--) {
- nvbe->pages[nvbe->nr_pages] =
- pci_map_page(dev->pdev, pages[nvbe->nr_pages], 0,
+ if (dma_addrs[nvbe->nr_pages] != DMA_ERROR_CODE) {
+ nvbe->pages[nvbe->nr_pages] =
+ dma_addrs[nvbe->nr_pages];
+ nvbe->ttm_alloced[nvbe->nr_pages] = true;
+ } else {
+ nvbe->pages[nvbe->nr_pages] =
+ pci_map_page(dev->pdev, pages[nvbe->nr_pages], 0,
PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
- if (pci_dma_mapping_error(dev->pdev,
- nvbe->pages[nvbe->nr_pages])) {
- be->func->clear(be);
- return -EFAULT;
+ if (pci_dma_mapping_error(dev->pdev,
+ nvbe->pages[nvbe->nr_pages])) {
+ be->func->clear(be);
+ return -EFAULT;
+ }
}
nvbe->nr_pages++;
@@ -65,11 +77,14 @@ nouveau_sgdma_clear(struct ttm_backend *be)
be->func->unbind(be);
while (nvbe->nr_pages--) {
- pci_unmap_page(dev->pdev, nvbe->pages[nvbe->nr_pages],
+ if (!nvbe->ttm_alloced[nvbe->nr_pages])
+ pci_unmap_page(dev->pdev, nvbe->pages[nvbe->nr_pages],
PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
}
kfree(nvbe->pages);
+ kfree(nvbe->ttm_alloced);
nvbe->pages = NULL;
+ nvbe->ttm_alloced = NULL;
nvbe->nr_pages = 0;
}
}
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index d270b3f..f643133 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -3048,9 +3048,6 @@ int evergreen_init(struct radeon_device *rdev)
{
int r;
- r = radeon_dummy_page_init(rdev);
- if (r)
- return r;
/* This don't do much */
r = radeon_gem_init(rdev);
if (r)
@@ -3162,7 +3159,6 @@ void evergreen_fini(struct radeon_device *rdev)
radeon_atombios_fini(rdev);
kfree(rdev->bios);
rdev->bios = NULL;
- radeon_dummy_page_fini(rdev);
}
static void evergreen_pcie_gen2_enable(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index de88624..36efc45 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2509,9 +2509,6 @@ int r600_init(struct radeon_device *rdev)
{
int r;
- r = radeon_dummy_page_init(rdev);
- if (r)
- return r;
if (r600_debugfs_mc_info_init(rdev)) {
DRM_ERROR("Failed to register debugfs file for mc !\n");
}
@@ -2625,7 +2622,6 @@ void r600_fini(struct radeon_device *rdev)
radeon_atombios_fini(rdev);
kfree(rdev->bios);
rdev->bios = NULL;
- radeon_dummy_page_fini(rdev);
}
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 56c48b6..c5955d3 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -319,6 +319,7 @@ struct radeon_gart {
union radeon_gart_table table;
struct page **pages;
dma_addr_t *pages_addr;
+ bool *ttm_alloced;
bool ready;
};
@@ -331,7 +332,8 @@ void radeon_gart_fini(struct radeon_device *rdev);
void radeon_gart_unbind(struct radeon_device *rdev, unsigned offset,
int pages);
int radeon_gart_bind(struct radeon_device *rdev, unsigned offset,
- int pages, struct page **pagelist);
+ int pages, struct page **pagelist,
+ dma_addr_t *dma_addr);
/*
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index 6501611..de4a86f 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -149,8 +149,9 @@ void radeon_gart_unbind(struct radeon_device *rdev, unsigned offset,
p = t / (PAGE_SIZE / RADEON_GPU_PAGE_SIZE);
for (i = 0; i < pages; i++, p++) {
if (rdev->gart.pages[p]) {
- pci_unmap_page(rdev->pdev, rdev->gart.pages_addr[p],
- PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+ if (!rdev->gart.ttm_alloced[p])
+ pci_unmap_page(rdev->pdev, rdev->gart.pages_addr[p],
+ PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
rdev->gart.pages[p] = NULL;
rdev->gart.pages_addr[p] = rdev->dummy_page.addr;
page_base = rdev->gart.pages_addr[p];
@@ -165,7 +166,7 @@ void radeon_gart_unbind(struct radeon_device *rdev, unsigned offset,
}
int radeon_gart_bind(struct radeon_device *rdev, unsigned offset,
- int pages, struct page **pagelist)
+ int pages, struct page **pagelist, dma_addr_t *dma_addr)
{
unsigned t;
unsigned p;
@@ -180,15 +181,22 @@ int radeon_gart_bind(struct radeon_device *rdev, unsigned offset,
p = t / (PAGE_SIZE / RADEON_GPU_PAGE_SIZE);
for (i = 0; i < pages; i++, p++) {
- /* we need to support large memory configurations */
- /* assume that unbind have already been call on the range */
- rdev->gart.pages_addr[p] = pci_map_page(rdev->pdev, pagelist[i],
+ /* On TTM path, we only use the DMA API if TTM_PAGE_FLAG_DMA32
+ * is requested. */
+ if (dma_addr[i] != DMA_ERROR_CODE) {
+ rdev->gart.ttm_alloced[p] = true;
+ rdev->gart.pages_addr[p] = dma_addr[i];
+ } else {
+ /* we need to support large memory configurations */
+ /* assume that unbind have already been call on the range */
+ rdev->gart.pages_addr[p] = pci_map_page(rdev->pdev, pagelist[i],
0, PAGE_SIZE,
PCI_DMA_BIDIRECTIONAL);
- if (pci_dma_mapping_error(rdev->pdev, rdev->gart.pages_addr[p])) {
- /* FIXME: failed to map page (return -ENOMEM?) */
- radeon_gart_unbind(rdev, offset, pages);
- return -ENOMEM;
+ if (pci_dma_mapping_error(rdev->pdev, rdev->gart.pages_addr[p])) {
+ /* FIXME: failed to map page (return -ENOMEM?) */
+ radeon_gart_unbind(rdev, offset, pages);
+ return -ENOMEM;
+ }
}
rdev->gart.pages[p] = pagelist[i];
page_base = rdev->gart.pages_addr[p];
@@ -251,6 +259,12 @@ int radeon_gart_init(struct radeon_device *rdev)
radeon_gart_fini(rdev);
return -ENOMEM;
}
+ rdev->gart.ttm_alloced = kzalloc(sizeof(bool) *
+ rdev->gart.num_cpu_pages, GFP_KERNEL);
+ if (rdev->gart.ttm_alloced == NULL) {
+ radeon_gart_fini(rdev);
+ return -ENOMEM;
+ }
/* set GART entry to point to the dummy page by default */
for (i = 0; i < rdev->gart.num_cpu_pages; i++) {
rdev->gart.pages_addr[i] = rdev->dummy_page.addr;
@@ -267,6 +281,9 @@ void radeon_gart_fini(struct radeon_device *rdev)
rdev->gart.ready = false;
kfree(rdev->gart.pages);
kfree(rdev->gart.pages_addr);
+ kfree(rdev->gart.ttm_alloced);
rdev->gart.pages = NULL;
rdev->gart.pages_addr = NULL;
+ rdev->gart.ttm_alloced = NULL;
+ radeon_dummy_page_fini(rdev);
}
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index e5b2cf1..371890c 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -517,7 +517,8 @@ int radeon_ttm_init(struct radeon_device *rdev)
r = ttm_bo_device_init(&rdev->mman.bdev,
rdev->mman.bo_global_ref.ref.object,
&radeon_bo_driver, DRM_FILE_PAGE_OFFSET,
- rdev->need_dma32);
+ rdev->need_dma32,
+ rdev->dev);
if (r) {
DRM_ERROR("failed initializing buffer object driver(%d).\n", r);
return r;
@@ -647,6 +648,7 @@ struct radeon_ttm_backend {
unsigned long num_pages;
struct page **pages;
struct page *dummy_read_page;
+ dma_addr_t *dma_addrs;
bool populated;
bool bound;
unsigned offset;
@@ -655,12 +657,14 @@ struct radeon_ttm_backend {
static int radeon_ttm_backend_populate(struct ttm_backend *backend,
unsigned long num_pages,
struct page **pages,
- struct page *dummy_read_page)
+ struct page *dummy_read_page,
+ dma_addr_t *dma_addrs)
{
struct radeon_ttm_backend *gtt;
gtt = container_of(backend, struct radeon_ttm_backend, backend);
gtt->pages = pages;
+ gtt->dma_addrs = dma_addrs;
gtt->num_pages = num_pages;
gtt->dummy_read_page = dummy_read_page;
gtt->populated = true;
@@ -673,6 +677,7 @@ static void radeon_ttm_backend_clear(struct ttm_backend *backend)
gtt = container_of(backend, struct radeon_ttm_backend, backend);
gtt->pages = NULL;
+ gtt->dma_addrs = NULL;
gtt->num_pages = 0;
gtt->dummy_read_page = NULL;
gtt->populated = false;
@@ -693,7 +698,7 @@ static int radeon_ttm_backend_bind(struct ttm_backend *backend,
gtt->num_pages, bo_mem, backend);
}
r = radeon_gart_bind(gtt->rdev, gtt->offset,
- gtt->num_pages, gtt->pages);
+ gtt->num_pages, gtt->pages, gtt->dma_addrs);
if (r) {
DRM_ERROR("failed to bind %lu pages at 0x%08X\n",
gtt->num_pages, gtt->offset);
diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c
index d8ba676..6a312e6 100644
--- a/drivers/gpu/drm/radeon/rv770.c
+++ b/drivers/gpu/drm/radeon/rv770.c
@@ -1256,9 +1256,6 @@ int rv770_init(struct radeon_device *rdev)
{
int r;
- r = radeon_dummy_page_init(rdev);
- if (r)
- return r;
/* This don't do much */
r = radeon_gem_init(rdev);
if (r)
@@ -1373,7 +1370,6 @@ void rv770_fini(struct radeon_device *rdev)
radeon_atombios_fini(rdev);
kfree(rdev->bios);
rdev->bios = NULL;
- radeon_dummy_page_fini(rdev);
}
static void rv770_pcie_gen2_enable(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c b/drivers/gpu/drm/ttm/ttm_agp_backend.c
index f999e36..1c4a72f 100644
--- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
+++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
@@ -47,7 +47,8 @@ struct ttm_agp_backend {
static int ttm_agp_populate(struct ttm_backend *backend,
unsigned long num_pages, struct page **pages,
- struct page *dummy_read_page)
+ struct page *dummy_read_page,
+ dma_addr_t *dma_addrs)
{
struct ttm_agp_backend *agp_be =
container_of(backend, struct ttm_agp_backend, backend);
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index af61fc2..278a2d3 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1526,12 +1526,14 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob,
struct ttm_bo_driver *driver,
uint64_t file_page_offset,
- bool need_dma32)
+ bool need_dma32,
+ struct device *dev)
{
int ret = -EINVAL;
rwlock_init(&bdev->vm_lock);
bdev->driver = driver;
+ bdev->dev = dev;
memset(bdev->man, 0, sizeof(bdev->man));
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index b1e02ff..35849db 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -38,6 +38,7 @@
#include <linux/mm.h>
#include <linux/seq_file.h> /* for seq_printf */
#include <linux/slab.h>
+#include <linux/dma-mapping.h>
#include <asm/atomic.h>
@@ -662,7 +663,8 @@ out:
* cached pages.
*/
int ttm_get_pages(struct list_head *pages, int flags,
- enum ttm_caching_state cstate, unsigned count)
+ enum ttm_caching_state cstate, unsigned count,
+ dma_addr_t *dma_address, struct device *dev)
{
struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
struct page *p = NULL;
@@ -681,14 +683,22 @@ int ttm_get_pages(struct list_head *pages, int flags,
gfp_flags |= GFP_HIGHUSER;
for (r = 0; r < count; ++r) {
- p = alloc_page(gfp_flags);
+ if ((flags & TTM_PAGE_FLAG_DMA32) && dma_address) {
+ void *addr;
+ addr = dma_alloc_coherent(dev, PAGE_SIZE,
+ &dma_address[r],
+ gfp_flags);
+ if (addr == NULL)
+ return -ENOMEM;
+ p = virt_to_page(addr);
+ } else
+ p = alloc_page(gfp_flags);
if (!p) {
printk(KERN_ERR TTM_PFX
"Unable to allocate page.");
return -ENOMEM;
}
-
list_add(&p->lru, pages);
}
return 0;
@@ -720,7 +730,7 @@ int ttm_get_pages(struct list_head *pages, int flags,
printk(KERN_ERR TTM_PFX
"Failed to allocate extra pages "
"for large request.");
- ttm_put_pages(pages, 0, flags, cstate);
+ ttm_put_pages(pages, 0, flags, cstate, NULL, NULL);
return r;
}
}
@@ -731,17 +741,30 @@ int ttm_get_pages(struct list_head *pages, int flags,
/* Put all pages in pages list to correct pool to wait for reuse */
void ttm_put_pages(struct list_head *pages, unsigned page_count, int flags,
- enum ttm_caching_state cstate)
+ enum ttm_caching_state cstate, dma_addr_t *dma_address,
+ struct device *dev)
{
unsigned long irq_flags;
struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
struct page *p, *tmp;
+ unsigned r;
if (pool == NULL) {
/* No pool for this memory type so free the pages */
+ r = page_count-1;
list_for_each_entry_safe(p, tmp, pages, lru) {
- __free_page(p);
+ if ((flags & TTM_PAGE_FLAG_DMA32) && dma_address) {
+ void *addr = page_address(p);
+ WARN_ON(!addr || !dma_address[r]);
+ if (addr)
+ dma_free_coherent(dev, PAGE_SIZE,
+ addr,
+ dma_address[r]);
+ dma_address[r] = 0;
+ } else
+ __free_page(p);
+ r--;
}
/* Make the pages list empty */
INIT_LIST_HEAD(pages);
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index af789dc..354f9d9 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -49,12 +49,16 @@ static int ttm_tt_swapin(struct ttm_tt *ttm);
static void ttm_tt_alloc_page_directory(struct ttm_tt *ttm)
{
ttm->pages = drm_calloc_large(ttm->num_pages, sizeof(*ttm->pages));
+ ttm->dma_address = drm_calloc_large(ttm->num_pages,
+ sizeof(*ttm->dma_address));
}
static void ttm_tt_free_page_directory(struct ttm_tt *ttm)
{
drm_free_large(ttm->pages);
ttm->pages = NULL;
+ drm_free_large(ttm->dma_address);
+ ttm->dma_address = NULL;
}
static void ttm_tt_free_user_pages(struct ttm_tt *ttm)
@@ -105,7 +109,8 @@ static struct page *__ttm_tt_get_page(struct ttm_tt *ttm, int index)
INIT_LIST_HEAD(&h);
- ret = ttm_get_pages(&h, ttm->page_flags, ttm->caching_state, 1);
+ ret = ttm_get_pages(&h, ttm->page_flags, ttm->caching_state, 1,
+ &ttm->dma_address[index], ttm->dev);
if (ret != 0)
return NULL;
@@ -164,7 +169,7 @@ int ttm_tt_populate(struct ttm_tt *ttm)
}
be->func->populate(be, ttm->num_pages, ttm->pages,
- ttm->dummy_read_page);
+ ttm->dummy_read_page, ttm->dma_address);
ttm->state = tt_unbound;
return 0;
}
@@ -298,7 +303,8 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm)
count++;
}
}
- ttm_put_pages(&h, count, ttm->page_flags, ttm->caching_state);
+ ttm_put_pages(&h, count, ttm->page_flags, ttm->caching_state,
+ ttm->dma_address, ttm->dev);
ttm->state = tt_unpopulated;
ttm->first_himem_page = ttm->num_pages;
ttm->last_lomem_page = -1;
@@ -391,6 +397,7 @@ struct ttm_tt *ttm_tt_create(struct ttm_bo_device *bdev, unsigned long size,
ttm->last_lomem_page = -1;
ttm->caching_state = tt_cached;
ttm->page_flags = page_flags;
+ ttm->dev = bdev->dev;
ttm->dummy_read_page = dummy_read_page;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c
index 80bc37b..87e43e0 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c
@@ -102,7 +102,8 @@ struct vmw_ttm_backend {
static int vmw_ttm_populate(struct ttm_backend *backend,
unsigned long num_pages, struct page **pages,
- struct page *dummy_read_page)
+ struct page *dummy_read_page,
+ dma_addr_t *dma_addrs)
{
struct vmw_ttm_backend *vmw_be =
container_of(backend, struct vmw_ttm_backend, backend);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 10ca97e..803d979 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -322,11 +322,11 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
dev_priv->active_master = &dev_priv->fbdev_master;
-
ret = ttm_bo_device_init(&dev_priv->bdev,
dev_priv->bo_global_ref.ref.object,
&vmw_bo_driver, VMWGFX_FILE_PAGE_OFFSET,
- false);
+ false,
+ dev->dev);
if (unlikely(ret != 0)) {
DRM_ERROR("Failed initializing TTM buffer object driver.\n");
goto out_err1;
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0382332..1826d5d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2966,12 +2966,38 @@ config XEN_NETDEV_FRONTEND
select XEN_XENBUS_FRONTEND
default y
help
- The network device frontend driver allows the kernel to
- access network devices exported exported by a virtual
- machine containing a physical network device driver. The
- frontend driver is intended for unprivileged guest domains;
- if you are compiling a kernel for a Xen guest, you almost
- certainly want to enable this.
+ This driver provides support for Xen paravirtual network
+ devices exported by a Xen network driver domain (often
+ domain 0).
+
+ The corresponding Linux backend driver is enabled by the
+ CONFIG_XEN_NETDEV_BACKEND option.
+
+ If you are compiling a kernel for use as Xen guest, you
+ should say Y here. To compile this driver as a module, chose
+ M here: the module will be called xen-netfront.
+
+config XEN_NETDEV_BACKEND
+ tristate "Xen backend network device"
+ depends on XEN_BACKEND
+ help
+ This driver allows the kernel to act as a Xen network driver
+ domain which exports paravirtual network devices to other
+ Xen domains. These devices can be accessed by any operating
+ system that implements a compatible front end.
+
+ The corresponding Linux frontend driver is enabled by the
+ CONFIG_XEN_NETDEV_FRONTEND configuration option.
+
+ The backend driver presents a standard network device
+ endpoint for each paravirtual network device to the driver
+ domain network stack. These can then be bridged or routed
+ etc in order to provide full network connectivity.
+
+ If you are compiling a kernel to run in a Xen network driver
+ domain (often this is domain 0) you should say Y here. To
+ compile this driver as a module, chose M here: the module
+ will be called xen-netback.
config ISERIES_VETH
tristate "iSeries Virtual Ethernet driver support"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index b90738d..145dfd7 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -171,6 +171,7 @@ obj-$(CONFIG_SLIP) += slip.o
obj-$(CONFIG_SLHC) += slhc.o
obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
+obj-$(CONFIG_XEN_NETDEV_BACKEND) += xen-netback/
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
diff --git a/drivers/net/xen-netback/Makefile b/drivers/net/xen-netback/Makefile
new file mode 100644
index 0000000..e346e81
--- /dev/null
+++ b/drivers/net/xen-netback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
+
+xen-netback-y := netback.o xenbus.o interface.o
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
new file mode 100644
index 0000000..21f4c0c
--- /dev/null
+++ b/drivers/net/xen-netback/common.h
@@ -0,0 +1,162 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_NETBACK__COMMON_H__
+#define __XEN_NETBACK__COMMON_H__
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/io.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include <xen/interface/io/netif.h>
+#include <xen/interface/grant_table.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+
+struct xen_netbk;
+
+struct xenvif {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+
+ /* Reference to netback processing backend. */
+ struct xen_netbk *netbk;
+
+ u8 fe_dev_addr[6];
+
+ /* Physical parameters of the comms window. */
+ grant_handle_t tx_shmem_handle;
+ grant_ref_t tx_shmem_ref;
+ grant_handle_t rx_shmem_handle;
+ grant_ref_t rx_shmem_ref;
+ unsigned int irq;
+
+ /* List of frontends to notify after a batch of frames sent. */
+ struct list_head notify_list;
+
+ /* The shared rings and indexes. */
+ struct xen_netif_tx_back_ring tx;
+ struct xen_netif_rx_back_ring rx;
+ struct vm_struct *tx_comms_area;
+ struct vm_struct *rx_comms_area;
+
+ /* Flags that must not be set in dev->features */
+ int features_disabled;
+
+ /* Frontend feature information. */
+ u8 can_sg:1;
+ u8 gso:1;
+ u8 gso_prefix:1;
+ u8 csum:1;
+
+ /* Internal feature information. */
+ u8 can_queue:1; /* can queue packets for receiver? */
+
+ /*
+ * Allow xenvif_start_xmit() to peek ahead in the rx request
+ * ring. This is a prediction of what rx_req_cons will be
+ * once all queued skbs are put on the ring.
+ */
+ RING_IDX rx_req_cons_peek;
+
+ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+ unsigned long credit_bytes;
+ unsigned long credit_usec;
+ unsigned long remaining_credit;
+ struct timer_list credit_timeout;
+
+ /* Statistics */
+ int rx_gso_checksum_fixup;
+
+ /* Miscellaneous private stuff. */
+ struct list_head schedule_list;
+ atomic_t refcnt;
+ struct net_device *dev;
+ struct net_device_stats stats;
+
+ wait_queue_head_t waiting_to_free;
+};
+
+#define XEN_NETIF_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
+#define XEN_NETIF_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
+
+struct xenvif *xenvif_alloc(struct device *parent,
+ domid_t domid,
+ unsigned int handle);
+
+int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
+ unsigned long rx_ring_ref, unsigned int evtchn);
+void xenvif_disconnect(struct xenvif *vif);
+
+void xenvif_get(struct xenvif *vif);
+void xenvif_put(struct xenvif *vif);
+
+int xenvif_xenbus_init(void);
+
+int xenvif_schedulable(struct xenvif *vif);
+
+int xen_netbk_rx_ring_full(struct xenvif *vif);
+
+int xen_netbk_must_stop_queue(struct xenvif *vif);
+
+/* (Un)Map communication rings. */
+void xen_netbk_unmap_frontend_rings(struct xenvif *vif);
+int xen_netbk_map_frontend_rings(struct xenvif *vif,
+ grant_ref_t tx_ring_ref,
+ grant_ref_t rx_ring_ref);
+
+/* (De)Register a xenvif with the netback backend. */
+void xen_netbk_add_xenvif(struct xenvif *vif);
+void xen_netbk_remove_xenvif(struct xenvif *vif);
+
+/* (De)Schedule backend processing for a xenvif */
+void xen_netbk_schedule_xenvif(struct xenvif *vif);
+void xen_netbk_deschedule_xenvif(struct xenvif *vif);
+
+/* Check for SKBs from frontend and schedule backend processing */
+void xen_netbk_check_rx_xenvif(struct xenvif *vif);
+/* Receive an SKB from the frontend */
+void xenvif_receive_skb(struct xenvif *vif, struct sk_buff *skb);
+
+/* Queue an SKB for transmission to the frontend */
+void xen_netbk_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb);
+/* Notify xenvif that ring now has space to send an skb to the frontend */
+void xenvif_notify_tx_completion(struct xenvif *vif);
+
+/* Returns number of ring slots required to send an skb to the frontend */
+unsigned int xen_netbk_count_skb_slots(struct xenvif *vif, struct sk_buff *skb);
+
+#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
new file mode 100644
index 0000000..1614ba5
--- /dev/null
+++ b/drivers/net/xen-netback/interface.c
@@ -0,0 +1,424 @@
+/*
+ * Network-device interface management.
+ *
+ * Copyright (c) 2004-2005, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+
+#include <xen/events.h>
+#include <asm/xen/hypercall.h>
+
+#define XENVIF_QUEUE_LENGTH 32
+
+void xenvif_get(struct xenvif *vif)
+{
+ atomic_inc(&vif->refcnt);
+}
+
+void xenvif_put(struct xenvif *vif)
+{
+ if (atomic_dec_and_test(&vif->refcnt))
+ wake_up(&vif->waiting_to_free);
+}
+
+int xenvif_schedulable(struct xenvif *vif)
+{
+ return netif_running(vif->dev) && netif_carrier_ok(vif->dev);
+}
+
+static int xenvif_rx_schedulable(struct xenvif *vif)
+{
+ return xenvif_schedulable(vif) && !xen_netbk_rx_ring_full(vif);
+}
+
+static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
+{
+ struct xenvif *vif = dev_id;
+
+ if (vif->netbk == NULL)
+ return IRQ_NONE;
+
+ xen_netbk_schedule_xenvif(vif);
+
+ if (xenvif_rx_schedulable(vif))
+ netif_wake_queue(vif->dev);
+
+ return IRQ_HANDLED;
+}
+
+static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct xenvif *vif = netdev_priv(dev);
+
+ BUG_ON(skb->dev != dev);
+
+ if (vif->netbk == NULL)
+ goto drop;
+
+ /* Drop the packet if the target domain has no receive buffers. */
+ if (!xenvif_rx_schedulable(vif))
+ goto drop;
+
+ /* Reserve ring slots for the worst-case number of fragments. */
+ vif->rx_req_cons_peek += xen_netbk_count_skb_slots(vif, skb);
+ xenvif_get(vif);
+
+ if (vif->can_queue && xen_netbk_must_stop_queue(vif))
+ netif_stop_queue(dev);
+
+ xen_netbk_queue_tx_skb(vif, skb);
+
+ return NETDEV_TX_OK;
+
+ drop:
+ vif->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+void xenvif_receive_skb(struct xenvif *vif, struct sk_buff *skb)
+{
+ netif_rx_ni(skb);
+ vif->dev->last_rx = jiffies;
+}
+
+void xenvif_notify_tx_completion(struct xenvif *vif)
+{
+ if (netif_queue_stopped(vif->dev) && xenvif_rx_schedulable(vif))
+ netif_wake_queue(vif->dev);
+}
+
+static struct net_device_stats *xenvif_get_stats(struct net_device *dev)
+{
+ struct xenvif *vif = netdev_priv(dev);
+ return &vif->stats;
+}
+
+static void xenvif_up(struct xenvif *vif)
+{
+ xen_netbk_add_xenvif(vif);
+ enable_irq(vif->irq);
+ xen_netbk_check_rx_xenvif(vif);
+}
+
+static void xenvif_down(struct xenvif *vif)
+{
+ disable_irq(vif->irq);
+ xen_netbk_deschedule_xenvif(vif);
+ xen_netbk_remove_xenvif(vif);
+}
+
+static int xenvif_open(struct net_device *dev)
+{
+ struct xenvif *vif = netdev_priv(dev);
+ if (netif_carrier_ok(dev))
+ xenvif_up(vif);
+ netif_start_queue(dev);
+ return 0;
+}
+
+static int xenvif_close(struct net_device *dev)
+{
+ struct xenvif *vif = netdev_priv(dev);
+ if (netif_carrier_ok(dev))
+ xenvif_down(vif);
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static int xenvif_change_mtu(struct net_device *dev, int mtu)
+{
+ struct xenvif *vif = netdev_priv(dev);
+ int max = vif->can_sg ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+ if (mtu > max)
+ return -EINVAL;
+ dev->mtu = mtu;
+ return 0;
+}
+
+static void xenvif_set_features(struct xenvif *vif)
+{
+ struct net_device *dev = vif->dev;
+ int features = dev->features;
+
+ if (vif->can_sg)
+ features |= NETIF_F_SG;
+ if (vif->gso || vif->gso_prefix)
+ features |= NETIF_F_TSO;
+ if (vif->csum)
+ features |= NETIF_F_IP_CSUM;
+
+ features &= ~(vif->features_disabled);
+
+ if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN)
+ dev->mtu = ETH_DATA_LEN;
+
+ dev->features = features;
+}
+
+static int xenvif_set_tx_csum(struct net_device *dev, u32 data)
+{
+ struct xenvif *vif = netdev_priv(dev);
+ if (data) {
+ if (!vif->csum)
+ return -EOPNOTSUPP;
+ vif->features_disabled &= ~NETIF_F_IP_CSUM;
+ } else {
+ vif->features_disabled |= NETIF_F_IP_CSUM;
+ }
+
+ xenvif_set_features(vif);
+ return 0;
+}
+
+static int xenvif_set_sg(struct net_device *dev, u32 data)
+{
+ struct xenvif *vif = netdev_priv(dev);
+ if (data) {
+ if (!vif->can_sg)
+ return -EOPNOTSUPP;
+ vif->features_disabled &= ~NETIF_F_SG;
+ } else {
+ vif->features_disabled |= NETIF_F_SG;
+ }
+
+ xenvif_set_features(vif);
+ return 0;
+}
+
+static int xenvif_set_tso(struct net_device *dev, u32 data)
+{
+ struct xenvif *vif = netdev_priv(dev);
+ if (data) {
+ if (!vif->gso && !vif->gso_prefix)
+ return -EOPNOTSUPP;
+ vif->features_disabled &= ~NETIF_F_TSO;
+ } else {
+ vif->features_disabled |= NETIF_F_TSO;
+ }
+
+ xenvif_set_features(vif);
+ return 0;
+}
+
+static const struct xenvif_stat {
+ char name[ETH_GSTRING_LEN];
+ u16 offset;
+} xenvif_stats[] = {
+ {
+ "rx_gso_checksum_fixup",
+ offsetof(struct xenvif, rx_gso_checksum_fixup)
+ },
+};
+
+static int xenvif_get_sset_count(struct net_device *dev, int string_set)
+{
+ switch (string_set) {
+ case ETH_SS_STATS:
+ return ARRAY_SIZE(xenvif_stats);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void xenvif_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, u64 * data)
+{
+ void *vif = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(xenvif_stats); i++)
+ data[i] = *(int *)(vif + xenvif_stats[i].offset);
+}
+
+static void xenvif_get_strings(struct net_device *dev, u32 stringset, u8 * data)
+{
+ int i;
+
+ switch (stringset) {
+ case ETH_SS_STATS:
+ for (i = 0; i < ARRAY_SIZE(xenvif_stats); i++)
+ memcpy(data + i * ETH_GSTRING_LEN,
+ xenvif_stats[i].name, ETH_GSTRING_LEN);
+ break;
+ }
+}
+
+static struct ethtool_ops xenvif_ethtool_ops = {
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = xenvif_set_tx_csum,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = xenvif_set_sg,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = xenvif_set_tso,
+ .get_link = ethtool_op_get_link,
+
+ .get_sset_count = xenvif_get_sset_count,
+ .get_ethtool_stats = xenvif_get_ethtool_stats,
+ .get_strings = xenvif_get_strings,
+};
+
+static struct net_device_ops xenvif_netdev_ops = {
+ .ndo_start_xmit = xenvif_start_xmit,
+ .ndo_get_stats = xenvif_get_stats,
+ .ndo_open = xenvif_open,
+ .ndo_stop = xenvif_close,
+ .ndo_change_mtu = xenvif_change_mtu,
+};
+
+struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
+ unsigned int handle)
+{
+ int err;
+ struct net_device *dev;
+ struct xenvif *vif;
+ char name[IFNAMSIZ] = {};
+
+ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+ dev = alloc_netdev(sizeof(struct xenvif), name, ether_setup);
+ if (dev == NULL) {
+ pr_warn("Could not allocate netdev\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ SET_NETDEV_DEV(dev, parent);
+
+ vif = netdev_priv(dev);
+ vif->domid = domid;
+ vif->handle = handle;
+ vif->netbk = NULL;
+ vif->can_sg = 1;
+ vif->csum = 1;
+ atomic_set(&vif->refcnt, 1);
+ init_waitqueue_head(&vif->waiting_to_free);
+ vif->dev = dev;
+ INIT_LIST_HEAD(&vif->schedule_list);
+ INIT_LIST_HEAD(&vif->notify_list);
+
+ vif->credit_bytes = vif->remaining_credit = ~0UL;
+ vif->credit_usec = 0UL;
+ init_timer(&vif->credit_timeout);
+ /* Initialize 'expires' now: it's used to track the credit window. */
+ vif->credit_timeout.expires = jiffies;
+
+ dev->netdev_ops = &xenvif_netdev_ops;
+ xenvif_set_features(vif);
+ SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops);
+
+ dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
+
+ /*
+ * Initialise a dummy MAC address. We choose the numerically
+ * largest non-broadcast address to prevent the address getting
+ * stolen by an Ethernet bridge for STP purposes.
+ * (FE:FF:FF:FF:FF:FF)
+ */
+ memset(dev->dev_addr, 0xFF, ETH_ALEN);
+ dev->dev_addr[0] &= ~0x01;
+
+ netif_carrier_off(dev);
+
+ err = register_netdev(dev);
+ if (err) {
+ netdev_warn(dev, "Could not register device: err=%d\n", err);
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+
+ netdev_dbg(dev, "Successfully created xenvif\n");
+ return vif;
+}
+
+int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
+ unsigned long rx_ring_ref, unsigned int evtchn)
+{
+ int err = -ENOMEM;
+
+ /* Already connected through? */
+ if (vif->irq)
+ return 0;
+
+ xenvif_set_features(vif);
+
+ err = xen_netbk_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
+ if (err < 0)
+ goto err;
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ vif->domid, evtchn, xenvif_interrupt, 0,
+ vif->dev->name, vif);
+ if (err < 0)
+ goto err_unmap;
+ vif->irq = err;
+ disable_irq(vif->irq);
+
+ xenvif_get(vif);
+
+ rtnl_lock();
+ netif_carrier_on(vif->dev);
+ if (netif_running(vif->dev))
+ xenvif_up(vif);
+ rtnl_unlock();
+
+ return 0;
+err_unmap:
+ xen_netbk_unmap_frontend_rings(vif);
+err:
+ return err;
+}
+
+void xenvif_disconnect(struct xenvif *vif)
+{
+ struct net_device *dev = vif->dev;
+ if (netif_carrier_ok(dev)) {
+ rtnl_lock();
+ netif_carrier_off(dev); /* discard queued packets */
+ if (netif_running(dev))
+ xenvif_down(vif);
+ rtnl_unlock();
+ xenvif_put(vif);
+ }
+
+ atomic_dec(&vif->refcnt);
+ wait_event(vif->waiting_to_free, atomic_read(&vif->refcnt) == 0);
+
+ del_timer_sync(&vif->credit_timeout);
+
+ if (vif->irq)
+ unbind_from_irqhandler(vif->irq, vif);
+
+ unregister_netdev(vif->dev);
+
+ xen_netbk_unmap_frontend_rings(vif);
+
+ free_netdev(vif->dev);
+}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
new file mode 100644
index 0000000..c2669b8
--- /dev/null
+++ b/drivers/net/xen-netback/netback.c
@@ -0,0 +1,1745 @@
+/*
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * drivers/net/xen-netfront.c
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#include <linux/kthread.h>
+#include <linux/if_vlan.h>
+#include <linux/udp.h>
+
+#include <net/tcp.h>
+
+#include <xen/events.h>
+#include <xen/interface/memory.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+struct pending_tx_info {
+ struct xen_netif_tx_request req;
+ struct xenvif *vif;
+};
+typedef unsigned int pending_ring_idx_t;
+
+struct netbk_rx_meta {
+ int id;
+ int size;
+ int gso_size;
+};
+
+#define MAX_PENDING_REQS 256
+
+#define MAX_BUFFER_OFFSET PAGE_SIZE
+
+/* extra field used in struct page */
+union page_ext {
+ struct {
+#if BITS_PER_LONG < 64
+#define IDX_WIDTH 8
+#define GROUP_WIDTH (BITS_PER_LONG - IDX_WIDTH)
+ unsigned int group:GROUP_WIDTH;
+ unsigned int idx:IDX_WIDTH;
+#else
+ unsigned int group, idx;
+#endif
+ } e;
+ void *mapping;
+};
+
+struct xen_netbk {
+ wait_queue_head_t wq;
+ struct task_struct *task;
+
+ struct sk_buff_head rx_queue;
+ struct sk_buff_head tx_queue;
+
+ struct timer_list net_timer;
+
+ struct page *mmap_pages[MAX_PENDING_REQS];
+
+ pending_ring_idx_t pending_prod;
+ pending_ring_idx_t pending_cons;
+ struct list_head net_schedule_list;
+
+ /* Protect the net_schedule_list in netif. */
+ spinlock_t net_schedule_list_lock;
+
+ atomic_t netfront_count;
+
+ struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+ struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
+
+ u16 pending_ring[MAX_PENDING_REQS];
+
+ /*
+ * Given MAX_BUFFER_OFFSET of 4096 the worst case is that each
+ * head/fragment page uses 2 copy operations because it
+ * straddles two buffers in the frontend.
+ */
+ struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE];
+ struct netbk_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE];
+};
+
+static struct xen_netbk *xen_netbk;
+static int xen_netbk_group_nr;
+
+void xen_netbk_add_xenvif(struct xenvif *vif)
+{
+ int i;
+ int min_netfront_count;
+ int min_group = 0;
+ struct xen_netbk *netbk;
+
+ min_netfront_count = atomic_read(&xen_netbk[0].netfront_count);
+ for (i = 0; i < xen_netbk_group_nr; i++) {
+ int netfront_count = atomic_read(&xen_netbk[i].netfront_count);
+ if (netfront_count < min_netfront_count) {
+ min_group = i;
+ min_netfront_count = netfront_count;
+ }
+ }
+
+ netbk = &xen_netbk[min_group];
+
+ vif->netbk = netbk;
+ atomic_inc(&netbk->netfront_count);
+}
+
+void xen_netbk_remove_xenvif(struct xenvif *vif)
+{
+ struct xen_netbk *netbk = vif->netbk;
+ vif->netbk = NULL;
+ atomic_dec(&netbk->netfront_count);
+}
+
+static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx);
+static void make_tx_response(struct xenvif *vif,
+ struct xen_netif_tx_request *txp,
+ s8 st);
+static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
+ u16 id,
+ s8 st,
+ u16 offset,
+ u16 size,
+ u16 flags);
+
+static inline unsigned long idx_to_pfn(struct xen_netbk *netbk,
+ unsigned int idx)
+{
+ return page_to_pfn(netbk->mmap_pages[idx]);
+}
+
+static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk,
+ unsigned int idx)
+{
+ return (unsigned long)pfn_to_kaddr(idx_to_pfn(netbk, idx));
+}
+
+/* extra field used in struct page */
+static inline void set_page_ext(struct page *pg, struct xen_netbk *netbk,
+ unsigned int idx)
+{
+ unsigned int group = netbk - xen_netbk;
+ union page_ext ext = { .e = { .group = group + 1, .idx = idx } };
+
+ BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping));
+ pg->mapping = ext.mapping;
+}
+
+static int get_page_ext(struct page *pg,
+ unsigned int *pgroup, unsigned int *pidx)
+{
+ union page_ext ext = { .mapping = pg->mapping };
+ struct xen_netbk *netbk;
+ unsigned int group, idx;
+
+ group = ext.e.group - 1;
+
+ if (group < 0 || group >= xen_netbk_group_nr)
+ return 0;
+
+ netbk = &xen_netbk[group];
+
+ idx = ext.e.idx;
+
+ if ((idx < 0) || (idx >= MAX_PENDING_REQS))
+ return 0;
+
+ if (netbk->mmap_pages[idx] != pg)
+ return 0;
+
+ *pgroup = group;
+ *pidx = idx;
+
+ return 1;
+}
+
+/*
+ * This is the amount of packet we copy rather than map, so that the
+ * guest can't fiddle with the contents of the headers while we do
+ * packet processing on them (netfilter, routing, etc).
+ */
+#define PKT_PROT_LEN (ETH_HLEN + \
+ VLAN_HLEN + \
+ sizeof(struct iphdr) + MAX_IPOPTLEN + \
+ sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
+
+static inline pending_ring_idx_t pending_index(unsigned i)
+{
+ return i & (MAX_PENDING_REQS-1);
+}
+
+static inline pending_ring_idx_t nr_pending_reqs(struct xen_netbk *netbk)
+{
+ return MAX_PENDING_REQS -
+ netbk->pending_prod + netbk->pending_cons;
+}
+
+static void xen_netbk_kick_thread(struct xen_netbk *netbk)
+{
+ wake_up(&netbk->wq);
+}
+
+static int max_required_rx_slots(struct xenvif *vif)
+{
+ int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
+
+ if (vif->can_sg || vif->gso || vif->gso_prefix)
+ max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
+
+ return max;
+}
+
+int xen_netbk_rx_ring_full(struct xenvif *vif)
+{
+ RING_IDX peek = vif->rx_req_cons_peek;
+ RING_IDX needed = max_required_rx_slots(vif);
+
+ return ((vif->rx.sring->req_prod - peek) < needed) ||
+ ((vif->rx.rsp_prod_pvt + XEN_NETIF_RX_RING_SIZE - peek) < needed);
+}
+
+int xen_netbk_must_stop_queue(struct xenvif *vif)
+{
+ if (!xen_netbk_rx_ring_full(vif))
+ return 0;
+
+ vif->rx.sring->req_event = vif->rx_req_cons_peek +
+ max_required_rx_slots(vif);
+ mb(); /* request notification /then/ check the queue */
+
+ return xen_netbk_rx_ring_full(vif);
+}
+
+/*
+ * Returns true if we should start a new receive buffer instead of
+ * adding 'size' bytes to a buffer which currently contains 'offset'
+ * bytes.
+ */
+static bool start_new_rx_buffer(int offset, unsigned long size, int head)
+{
+ /* simple case: we have completely filled the current buffer. */
+ if (offset == MAX_BUFFER_OFFSET)
+ return true;
+
+ /*
+ * complex case: start a fresh buffer if the current frag
+ * would overflow the current buffer but only if:
+ * (i) this frag would fit completely in the next buffer
+ * and (ii) there is already some data in the current buffer
+ * and (iii) this is not the head buffer.
+ *
+ * Where:
+ * - (i) stops us splitting a frag into two copies
+ * unless the frag is too large for a single buffer.
+ * - (ii) stops us from leaving a buffer pointlessly empty.
+ * - (iii) stops us leaving the first buffer
+ * empty. Strictly speaking this is already covered
+ * by (ii) but is explicitly checked because
+ * netfront relies on the first buffer being
+ * non-empty and can crash otherwise.
+ *
+ * This means we will effectively linearise small
+ * frags but do not needlessly split large buffers
+ * into multiple copies tend to give large frags their
+ * own buffers as before.
+ */
+ if ((offset + size > MAX_BUFFER_OFFSET) &&
+ (size <= MAX_BUFFER_OFFSET) && offset && !head)
+ return true;
+
+ return false;
+}
+
+/*
+ * Figure out how many ring slots we're going to need to send @skb to
+ * the guest. This function is essentially a dry run of
+ * netbk_gop_frag_copy.
+ */
+unsigned int xen_netbk_count_skb_slots(struct xenvif *vif, struct sk_buff *skb)
+{
+ unsigned int count;
+ int i, copy_off;
+
+ count = DIV_ROUND_UP(
+ offset_in_page(skb->data)+skb_headlen(skb), PAGE_SIZE);
+
+ copy_off = skb_headlen(skb) % PAGE_SIZE;
+
+ if (skb_shinfo(skb)->gso_size)
+ count++;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ unsigned long size = skb_shinfo(skb)->frags[i].size;
+ unsigned long bytes;
+ while (size > 0) {
+ BUG_ON(copy_off > MAX_BUFFER_OFFSET);
+
+ if (start_new_rx_buffer(copy_off, size, 0)) {
+ count++;
+ copy_off = 0;
+ }
+
+ bytes = size;
+ if (copy_off + bytes > MAX_BUFFER_OFFSET)
+ bytes = MAX_BUFFER_OFFSET - copy_off;
+
+ copy_off += bytes;
+ size -= bytes;
+ }
+ }
+ return count;
+}
+
+struct netrx_pending_operations {
+ unsigned copy_prod, copy_cons;
+ unsigned meta_prod, meta_cons;
+ struct gnttab_copy *copy;
+ struct netbk_rx_meta *meta;
+ int copy_off;
+ grant_ref_t copy_gref;
+};
+
+static struct netbk_rx_meta *get_next_rx_buffer(struct xenvif *vif,
+ struct netrx_pending_operations *npo)
+{
+ struct netbk_rx_meta *meta;
+ struct xen_netif_rx_request *req;
+
+ req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
+
+ meta = npo->meta + npo->meta_prod++;
+ meta->gso_size = 0;
+ meta->size = 0;
+ meta->id = req->id;
+
+ npo->copy_off = 0;
+ npo->copy_gref = req->gref;
+
+ return meta;
+}
+
+/*
+ * Set up the grant operations for this fragment. If it's a flipping
+ * interface, we also set up the unmap request from here.
+ */
+static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
+ struct netrx_pending_operations *npo,
+ struct page *page, unsigned long size,
+ unsigned long offset, int *head)
+{
+ struct gnttab_copy *copy_gop;
+ struct netbk_rx_meta *meta;
+ /*
+ * These variables a used iff get_page_ext returns true,
+ * in which case they are guaranteed to be initialized.
+ */
+ unsigned int uninitialized_var(group), uninitialized_var(idx);
+ int foreign = get_page_ext(page, &group, &idx);
+ unsigned long bytes;
+
+ /* Data must not cross a page boundary. */
+ BUG_ON(size + offset > PAGE_SIZE);
+
+ meta = npo->meta + npo->meta_prod - 1;
+
+ while (size > 0) {
+ BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
+
+ if (start_new_rx_buffer(npo->copy_off, size, *head)) {
+ /*
+ * Netfront requires there to be some data in the head
+ * buffer.
+ */
+ BUG_ON(*head);
+
+ meta = get_next_rx_buffer(vif, npo);
+ }
+
+ bytes = size;
+ if (npo->copy_off + bytes > MAX_BUFFER_OFFSET)
+ bytes = MAX_BUFFER_OFFSET - npo->copy_off;
+
+ copy_gop = npo->copy + npo->copy_prod++;
+ copy_gop->flags = GNTCOPY_dest_gref;
+ if (foreign) {
+ struct xen_netbk *netbk = &xen_netbk[group];
+ struct pending_tx_info *src_pend;
+
+ src_pend = &netbk->pending_tx_info[idx];
+
+ copy_gop->source.domid = src_pend->vif->domid;
+ copy_gop->source.u.ref = src_pend->req.gref;
+ copy_gop->flags |= GNTCOPY_source_gref;
+ } else {
+ void *vaddr = page_address(page);
+ copy_gop->source.domid = DOMID_SELF;
+ copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
+ }
+ copy_gop->source.offset = offset;
+ copy_gop->dest.domid = vif->domid;
+
+ copy_gop->dest.offset = npo->copy_off;
+ copy_gop->dest.u.ref = npo->copy_gref;
+ copy_gop->len = bytes;
+
+ npo->copy_off += bytes;
+ meta->size += bytes;
+
+ offset += bytes;
+ size -= bytes;
+
+ /* Leave a gap for the GSO descriptor. */
+ if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix)
+ vif->rx.req_cons++;
+
+ *head = 0; /* There must be something in this buffer now. */
+
+ }
+}
+
+/*
+ * Prepare an SKB to be transmitted to the frontend.
+ *
+ * This function is responsible for allocating grant operations, meta
+ * structures, etc.
+ *
+ * It returns the number of meta structures consumed. The number of
+ * ring slots used is always equal to the number of meta slots used
+ * plus the number of GSO descriptors used. Currently, we use either
+ * zero GSO descriptors (for non-GSO packets) or one descriptor (for
+ * frontend-side LRO).
+ */
+static int netbk_gop_skb(struct sk_buff *skb,
+ struct netrx_pending_operations *npo)
+{
+ struct xenvif *vif = netdev_priv(skb->dev);
+ int nr_frags = skb_shinfo(skb)->nr_frags;
+ int i;
+ struct xen_netif_rx_request *req;
+ struct netbk_rx_meta *meta;
+ unsigned char *data;
+ int head = 1;
+ int old_meta_prod;
+
+ old_meta_prod = npo->meta_prod;
+
+ /* Set up a GSO prefix descriptor, if necessary */
+ if (skb_shinfo(skb)->gso_size && vif->gso_prefix) {
+ req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
+ meta = npo->meta + npo->meta_prod++;
+ meta->gso_size = skb_shinfo(skb)->gso_size;
+ meta->size = 0;
+ meta->id = req->id;
+ }
+
+ req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
+ meta = npo->meta + npo->meta_prod++;
+
+ if (!vif->gso_prefix)
+ meta->gso_size = skb_shinfo(skb)->gso_size;
+ else
+ meta->gso_size = 0;
+
+ meta->size = 0;
+ meta->id = req->id;
+ npo->copy_off = 0;
+ npo->copy_gref = req->gref;
+
+ data = skb->data;
+ while (data < skb_tail_pointer(skb)) {
+ unsigned int offset = offset_in_page(data);
+ unsigned int len = PAGE_SIZE - offset;
+
+ if (data + len > skb_tail_pointer(skb))
+ len = skb_tail_pointer(skb) - data;
+
+ netbk_gop_frag_copy(vif, skb, npo,
+ virt_to_page(data), len, offset, &head);
+ data += len;
+ }
+
+ for (i = 0; i < nr_frags; i++) {
+ netbk_gop_frag_copy(vif, skb, npo,
+ skb_shinfo(skb)->frags[i].page,
+ skb_shinfo(skb)->frags[i].size,
+ skb_shinfo(skb)->frags[i].page_offset,
+ &head);
+ }
+
+ return npo->meta_prod - old_meta_prod;
+}
+
+/*
+ * This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
+ * used to set up the operations on the top of
+ * netrx_pending_operations, which have since been done. Check that
+ * they didn't give any errors and advance over them.
+ */
+static int netbk_check_gop(struct xenvif *vif, int nr_meta_slots,
+ struct netrx_pending_operations *npo)
+{
+ struct gnttab_copy *copy_op;
+ int status = XEN_NETIF_RSP_OKAY;
+ int i;
+
+ for (i = 0; i < nr_meta_slots; i++) {
+ copy_op = npo->copy + npo->copy_cons++;
+ if (copy_op->status != GNTST_okay) {
+ netdev_dbg(vif->dev,
+ "Bad status %d from copy to DOM%d.\n",
+ copy_op->status, vif->domid);
+ status = XEN_NETIF_RSP_ERROR;
+ }
+ }
+
+ return status;
+}
+
+static void netbk_add_frag_responses(struct xenvif *vif, int status,
+ struct netbk_rx_meta *meta,
+ int nr_meta_slots)
+{
+ int i;
+ unsigned long offset;
+
+ /* No fragments used */
+ if (nr_meta_slots <= 1)
+ return;
+
+ nr_meta_slots--;
+
+ for (i = 0; i < nr_meta_slots; i++) {
+ int flags;
+ if (i == nr_meta_slots - 1)
+ flags = 0;
+ else
+ flags = XEN_NETRXF_more_data;
+
+ offset = 0;
+ make_rx_response(vif, meta[i].id, status, offset,
+ meta[i].size, flags);
+ }
+}
+
+struct skb_cb_overlay {
+ int meta_slots_used;
+};
+
+static void xen_netbk_rx_action(struct xen_netbk *netbk)
+{
+ struct xenvif *vif = NULL, *tmp;
+ s8 status;
+ u16 irq, flags;
+ struct xen_netif_rx_response *resp;
+ struct sk_buff_head rxq;
+ struct sk_buff *skb;
+ LIST_HEAD(notify);
+ int ret;
+ int nr_frags;
+ int count;
+ unsigned long offset;
+ struct skb_cb_overlay *sco;
+
+ struct netrx_pending_operations npo = {
+ .copy = netbk->grant_copy_op,
+ .meta = netbk->meta,
+ };
+
+ skb_queue_head_init(&rxq);
+
+ count = 0;
+
+ while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) {
+ vif = netdev_priv(skb->dev);
+ nr_frags = skb_shinfo(skb)->nr_frags;
+
+ sco = (struct skb_cb_overlay *)skb->cb;
+ sco->meta_slots_used = netbk_gop_skb(skb, &npo);
+
+ count += nr_frags + 1;
+
+ __skb_queue_tail(&rxq, skb);
+
+ /* Filled the batch queue? */
+ if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
+ break;
+ }
+
+ BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta));
+
+ if (!npo.copy_prod)
+ return;
+
+ BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op,
+ npo.copy_prod);
+ BUG_ON(ret != 0);
+
+ while ((skb = __skb_dequeue(&rxq)) != NULL) {
+ sco = (struct skb_cb_overlay *)skb->cb;
+
+ vif = netdev_priv(skb->dev);
+
+ if (netbk->meta[npo.meta_cons].gso_size && vif->gso_prefix) {
+ resp = RING_GET_RESPONSE(&vif->rx,
+ vif->rx.rsp_prod_pvt++);
+
+ resp->flags = XEN_NETRXF_gso_prefix | XEN_NETRXF_more_data;
+
+ resp->offset = netbk->meta[npo.meta_cons].gso_size;
+ resp->id = netbk->meta[npo.meta_cons].id;
+ resp->status = sco->meta_slots_used;
+
+ npo.meta_cons++;
+ sco->meta_slots_used--;
+ }
+
+
+ vif->stats.tx_bytes += skb->len;
+ vif->stats.tx_packets++;
+
+ status = netbk_check_gop(vif, sco->meta_slots_used, &npo);
+
+ if (sco->meta_slots_used == 1)
+ flags = 0;
+ else
+ flags = XEN_NETRXF_more_data;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+ flags |= XEN_NETRXF_csum_blank | XEN_NETRXF_data_validated;
+ else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+ /* remote but checksummed. */
+ flags |= XEN_NETRXF_data_validated;
+
+ offset = 0;
+ resp = make_rx_response(vif, netbk->meta[npo.meta_cons].id,
+ status, offset,
+ netbk->meta[npo.meta_cons].size,
+ flags);
+
+ if (netbk->meta[npo.meta_cons].gso_size && !vif->gso_prefix) {
+ struct xen_netif_extra_info *gso =
+ (struct xen_netif_extra_info *)
+ RING_GET_RESPONSE(&vif->rx,
+ vif->rx.rsp_prod_pvt++);
+
+ resp->flags |= XEN_NETRXF_extra_info;
+
+ gso->u.gso.size = netbk->meta[npo.meta_cons].gso_size;
+ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+ gso->u.gso.pad = 0;
+ gso->u.gso.features = 0;
+
+ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+ gso->flags = 0;
+ }
+
+ netbk_add_frag_responses(vif, status,
+ netbk->meta + npo.meta_cons + 1,
+ sco->meta_slots_used);
+
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->rx, ret);
+ irq = vif->irq;
+ if (ret && list_empty(&vif->notify_list))
+ list_add_tail(&vif->notify_list, &notify);
+
+ xenvif_notify_tx_completion(vif);
+
+ xenvif_put(vif);
+ npo.meta_cons += sco->meta_slots_used;
+ dev_kfree_skb(skb);
+ }
+
+ list_for_each_entry_safe(vif, tmp, &notify, notify_list) {
+ notify_remote_via_irq(vif->irq);
+ list_del_init(&vif->notify_list);
+ }
+
+ /* More work to do? */
+ if (!skb_queue_empty(&netbk->rx_queue) &&
+ !timer_pending(&netbk->net_timer))
+ xen_netbk_kick_thread(netbk);
+}
+
+void xen_netbk_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb)
+{
+ struct xen_netbk *netbk = vif->netbk;
+
+ skb_queue_tail(&netbk->rx_queue, skb);
+
+ xen_netbk_kick_thread(netbk);
+}
+
+static void xen_netbk_alarm(unsigned long data)
+{
+ struct xen_netbk *netbk = (struct xen_netbk *)data;
+ xen_netbk_kick_thread(netbk);
+}
+
+static int __on_net_schedule_list(struct xenvif *vif)
+{
+ return !list_empty(&vif->schedule_list);
+}
+
+/* Must be called with net_schedule_list_lock held */
+static void remove_from_net_schedule_list(struct xenvif *vif)
+{
+ if (likely(__on_net_schedule_list(vif))) {
+ list_del_init(&vif->schedule_list);
+ xenvif_put(vif);
+ }
+}
+
+static struct xenvif *poll_net_schedule_list(struct xen_netbk *netbk)
+{
+ struct xenvif *vif = NULL;
+
+ spin_lock_irq(&netbk->net_schedule_list_lock);
+ if (list_empty(&netbk->net_schedule_list))
+ goto out;
+
+ vif = list_first_entry(&netbk->net_schedule_list,
+ struct xenvif, schedule_list);
+ if (!vif)
+ goto out;
+
+ xenvif_get(vif);
+
+ remove_from_net_schedule_list(vif);
+out:
+ spin_unlock_irq(&netbk->net_schedule_list_lock);
+ return vif;
+}
+
+void xen_netbk_schedule_xenvif(struct xenvif *vif)
+{
+ unsigned long flags;
+ struct xen_netbk *netbk = vif->netbk;
+
+ if (__on_net_schedule_list(vif))
+ goto kick;
+
+ spin_lock_irqsave(&netbk->net_schedule_list_lock, flags);
+ if (!__on_net_schedule_list(vif) &&
+ likely(xenvif_schedulable(vif))) {
+ list_add_tail(&vif->schedule_list, &netbk->net_schedule_list);
+ xenvif_get(vif);
+ }
+ spin_unlock_irqrestore(&netbk->net_schedule_list_lock, flags);
+
+kick:
+ smp_mb();
+ if ((nr_pending_reqs(netbk) < (MAX_PENDING_REQS/2)) &&
+ !list_empty(&netbk->net_schedule_list))
+ xen_netbk_kick_thread(netbk);
+}
+
+void xen_netbk_deschedule_xenvif(struct xenvif *vif)
+{
+ struct xen_netbk *netbk = vif->netbk;
+ spin_lock_irq(&netbk->net_schedule_list_lock);
+ remove_from_net_schedule_list(vif);
+ spin_unlock_irq(&netbk->net_schedule_list_lock);
+}
+
+void xen_netbk_check_rx_xenvif(struct xenvif *vif)
+{
+ int more_to_do;
+
+ RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do);
+
+ if (more_to_do)
+ xen_netbk_schedule_xenvif(vif);
+}
+
+static void tx_add_credit(struct xenvif *vif)
+{
+ unsigned long max_burst, max_credit;
+
+ /*
+ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
+ * Otherwise the interface can seize up due to insufficient credit.
+ */
+ max_burst = RING_GET_REQUEST(&vif->tx, vif->tx.req_cons)->size;
+ max_burst = min(max_burst, 131072UL);
+ max_burst = max(max_burst, vif->credit_bytes);
+
+ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
+ max_credit = vif->remaining_credit + vif->credit_bytes;
+ if (max_credit < vif->remaining_credit)
+ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
+
+ vif->remaining_credit = min(max_credit, max_burst);
+}
+
+static void tx_credit_callback(unsigned long data)
+{
+ struct xenvif *vif = (struct xenvif *)data;
+ tx_add_credit(vif);
+ xen_netbk_check_rx_xenvif(vif);
+}
+
+static void netbk_tx_err(struct xenvif *vif,
+ struct xen_netif_tx_request *txp, RING_IDX end)
+{
+ RING_IDX cons = vif->tx.req_cons;
+
+ do {
+ make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+ if (cons >= end)
+ break;
+ txp = RING_GET_REQUEST(&vif->tx, cons++);
+ } while (1);
+ vif->tx.req_cons = cons;
+ xen_netbk_check_rx_xenvif(vif);
+ xenvif_put(vif);
+}
+
+static int netbk_count_requests(struct xenvif *vif,
+ struct xen_netif_tx_request *first,
+ struct xen_netif_tx_request *txp,
+ int work_to_do)
+{
+ RING_IDX cons = vif->tx.req_cons;
+ int frags = 0;
+
+ if (!(first->flags & XEN_NETTXF_more_data))
+ return 0;
+
+ do {
+ if (frags >= work_to_do) {
+ netdev_dbg(vif->dev, "Need more frags\n");
+ return -frags;
+ }
+
+ if (unlikely(frags >= MAX_SKB_FRAGS)) {
+ netdev_dbg(vif->dev, "Too many frags\n");
+ return -frags;
+ }
+
+ memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
+ sizeof(*txp));
+ if (txp->size > first->size) {
+ netdev_dbg(vif->dev, "Frags galore\n");
+ return -frags;
+ }
+
+ first->size -= txp->size;
+ frags++;
+
+ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+ netdev_dbg(vif->dev, "txp->offset: %x, size: %u\n",
+ txp->offset, txp->size);
+ return -frags;
+ }
+ } while ((txp++)->flags & XEN_NETTXF_more_data);
+ return frags;
+}
+
+static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
+ struct sk_buff *skb,
+ unsigned long pending_idx)
+{
+ struct page *page;
+ page = alloc_page(GFP_KERNEL|__GFP_COLD);
+ if (!page)
+ return NULL;
+ set_page_ext(page, netbk, pending_idx);
+ netbk->mmap_pages[pending_idx] = page;
+ return page;
+}
+
+static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
+ struct xenvif *vif,
+ struct sk_buff *skb,
+ struct xen_netif_tx_request *txp,
+ struct gnttab_copy *gop)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ skb_frag_t *frags = shinfo->frags;
+ unsigned long pending_idx = *((u16 *)skb->data);
+ int i, start;
+
+ /* Skip first skb fragment if it is on same page as header fragment. */
+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+ for (i = start; i < shinfo->nr_frags; i++, txp++) {
+ struct page *page;
+ pending_ring_idx_t index;
+ struct pending_tx_info *pending_tx_info =
+ netbk->pending_tx_info;
+
+ index = pending_index(netbk->pending_cons++);
+ pending_idx = netbk->pending_ring[index];
+ page = xen_netbk_alloc_page(netbk, skb, pending_idx);
+ if (!page)
+ return NULL;
+
+ netbk->mmap_pages[pending_idx] = page;
+
+ gop->source.u.ref = txp->gref;
+ gop->source.domid = vif->domid;
+ gop->source.offset = txp->offset;
+
+ gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+ gop->dest.domid = DOMID_SELF;
+ gop->dest.offset = txp->offset;
+
+ gop->len = txp->size;
+ gop->flags = GNTCOPY_source_gref;
+
+ gop++;
+
+ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+ xenvif_get(vif);
+ pending_tx_info[pending_idx].vif = vif;
+ frags[i].page = (void *)pending_idx;
+ }
+
+ return gop;
+}
+
+static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
+ struct sk_buff *skb,
+ struct gnttab_copy **gopp)
+{
+ struct gnttab_copy *gop = *gopp;
+ int pending_idx = *((u16 *)skb->data);
+ struct pending_tx_info *pending_tx_info = netbk->pending_tx_info;
+ struct xenvif *vif = pending_tx_info[pending_idx].vif;
+ struct xen_netif_tx_request *txp;
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int nr_frags = shinfo->nr_frags;
+ int i, err, start;
+
+ /* Check status of header. */
+ err = gop->status;
+ if (unlikely(err)) {
+ pending_ring_idx_t index;
+ index = pending_index(netbk->pending_prod++);
+ txp = &pending_tx_info[pending_idx].req;
+ make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+ netbk->pending_ring[index] = pending_idx;
+ xenvif_put(vif);
+ }
+
+ /* Skip first skb fragment if it is on same page as header fragment. */
+ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+ for (i = start; i < nr_frags; i++) {
+ int j, newerr;
+ pending_ring_idx_t index;
+
+ pending_idx = (unsigned long)shinfo->frags[i].page;
+
+ /* Check error status: if okay then remember grant handle. */
+ newerr = (++gop)->status;
+ if (likely(!newerr)) {
+ /* Had a previous error? Invalidate this fragment. */
+ if (unlikely(err))
+ xen_netbk_idx_release(netbk, pending_idx);
+ continue;
+ }
+
+ /* Error on this fragment: respond to client with an error. */
+ txp = &netbk->pending_tx_info[pending_idx].req;
+ make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+ index = pending_index(netbk->pending_prod++);
+ netbk->pending_ring[index] = pending_idx;
+ xenvif_put(vif);
+
+ /* Not the first error? Preceding frags already invalidated. */
+ if (err)
+ continue;
+
+ /* First error: invalidate header and preceding fragments. */
+ pending_idx = *((u16 *)skb->data);
+ xen_netbk_idx_release(netbk, pending_idx);
+ for (j = start; j < i; j++) {
+ pending_idx = (unsigned long)shinfo->frags[i].page;
+ xen_netbk_idx_release(netbk, pending_idx);
+ }
+
+ /* Remember the error: invalidate all subsequent fragments. */
+ err = newerr;
+ }
+
+ *gopp = gop + 1;
+ return err;
+}
+
+static void xen_netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int nr_frags = shinfo->nr_frags;
+ int i;
+
+ for (i = 0; i < nr_frags; i++) {
+ skb_frag_t *frag = shinfo->frags + i;
+ struct xen_netif_tx_request *txp;
+ unsigned long pending_idx;
+
+ pending_idx = (unsigned long)frag->page;
+
+ txp = &netbk->pending_tx_info[pending_idx].req;
+ frag->page = virt_to_page(idx_to_kaddr(netbk, pending_idx));
+ frag->size = txp->size;
+ frag->page_offset = txp->offset;
+
+ skb->len += txp->size;
+ skb->data_len += txp->size;
+ skb->truesize += txp->size;
+
+ /* Take an extra reference to offset xen_netbk_idx_release */
+ get_page(netbk->mmap_pages[pending_idx]);
+ xen_netbk_idx_release(netbk, pending_idx);
+ }
+}
+
+static int xen_netbk_get_extras(struct xenvif *vif,
+ struct xen_netif_extra_info *extras,
+ int work_to_do)
+{
+ struct xen_netif_extra_info extra;
+ RING_IDX cons = vif->tx.req_cons;
+
+ do {
+ if (unlikely(work_to_do-- <= 0)) {
+ netdev_dbg(vif->dev, "Missing extra info\n");
+ return -EBADR;
+ }
+
+ memcpy(&extra, RING_GET_REQUEST(&vif->tx, cons),
+ sizeof(extra));
+ if (unlikely(!extra.type ||
+ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+ vif->tx.req_cons = ++cons;
+ netdev_dbg(vif->dev,
+ "Invalid extra type: %d\n", extra.type);
+ return -EINVAL;
+ }
+
+ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
+ vif->tx.req_cons = ++cons;
+ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+ return work_to_do;
+}
+
+static int netbk_set_skb_gso(struct xenvif *vif,
+ struct sk_buff *skb,
+ struct xen_netif_extra_info *gso)
+{
+ if (!gso->u.gso.size) {
+ netdev_dbg(vif->dev, "GSO size must not be zero.\n");
+ return -EINVAL;
+ }
+
+ /* Currently only TCPv4 S.O. is supported. */
+ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+ netdev_dbg(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
+ return -EINVAL;
+ }
+
+ skb_shinfo(skb)->gso_size = gso->u.gso.size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+
+ return 0;
+}
+
+static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ unsigned char *th;
+ int err = -EPROTO;
+ int recalculate_partial_csum = 0;
+
+ /*
+ * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
+ * peers can fail to set NETRXF_csum_blank when sending a GSO
+ * frame. In this case force the SKB to CHECKSUM_PARTIAL and
+ * recalculate the partial checksum.
+ */
+ if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
+ vif->rx_gso_checksum_fixup++;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ recalculate_partial_csum = 1;
+ }
+
+ /* A non-CHECKSUM_PARTIAL SKB does not require setup. */
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ return 0;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto out;
+
+ iph = (void *)skb->data;
+ th = skb->data + 4 * iph->ihl;
+ if (th >= skb_tail_pointer(skb))
+ goto out;
+
+ skb->csum_start = th - skb->head;
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ skb->csum_offset = offsetof(struct tcphdr, check);
+
+ if (recalculate_partial_csum) {
+ struct tcphdr *tcph = (struct tcphdr *)th;
+ tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - iph->ihl*4,
+ IPPROTO_TCP, 0);
+ }
+ break;
+ case IPPROTO_UDP:
+ skb->csum_offset = offsetof(struct udphdr, check);
+
+ if (recalculate_partial_csum) {
+ struct udphdr *udph = (struct udphdr *)th;
+ udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - iph->ihl*4,
+ IPPROTO_UDP, 0);
+ }
+ break;
+ default:
+ if (net_ratelimit())
+ netdev_err(vif->dev,
+ "Attempting to checksum a non-TCP/UDP packet, dropping a protocol %d packet\n",
+ iph->protocol);
+ goto out;
+ }
+
+ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
+ goto out;
+
+ err = 0;
+
+out:
+ return err;
+}
+
+static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
+{
+ unsigned long now = jiffies;
+ unsigned long next_credit =
+ vif->credit_timeout.expires +
+ msecs_to_jiffies(vif->credit_usec / 1000);
+
+ /* Timer could already be pending in rare cases. */
+ if (timer_pending(&vif->credit_timeout))
+ return true;
+
+ /* Passed the point where we can replenish credit? */
+ if (time_after_eq(now, next_credit)) {
+ vif->credit_timeout.expires = now;
+ tx_add_credit(vif);
+ }
+
+ /* Still too big to send right now? Set a callback. */
+ if (size > vif->remaining_credit) {
+ vif->credit_timeout.data =
+ (unsigned long)vif;
+ vif->credit_timeout.function =
+ tx_credit_callback;
+ mod_timer(&vif->credit_timeout,
+ next_credit);
+
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
+{
+ struct gnttab_copy *gop = netbk->tx_copy_ops, *request_gop;
+ struct sk_buff *skb;
+ int ret;
+
+ while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+ !list_empty(&netbk->net_schedule_list)) {
+ struct xenvif *vif;
+ struct xen_netif_tx_request txreq;
+ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+ struct page *page;
+ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
+ u16 pending_idx;
+ RING_IDX idx;
+ int work_to_do;
+ unsigned int data_len;
+ pending_ring_idx_t index;
+
+ /* Get a netif from the list with work to do. */
+ vif = poll_net_schedule_list(netbk);
+ if (!vif)
+ continue;
+
+ RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, work_to_do);
+ if (!work_to_do) {
+ xenvif_put(vif);
+ continue;
+ }
+
+ idx = vif->tx.req_cons;
+ rmb(); /* Ensure that we see the request before we copy it. */
+ memcpy(&txreq, RING_GET_REQUEST(&vif->tx, idx), sizeof(txreq));
+
+ /* Credit-based scheduling. */
+ if (txreq.size > vif->remaining_credit &&
+ tx_credit_exceeded(vif, txreq.size)) {
+ xenvif_put(vif);
+ continue;
+ }
+
+ vif->remaining_credit -= txreq.size;
+
+ work_to_do--;
+ vif->tx.req_cons = ++idx;
+
+ memset(extras, 0, sizeof(extras));
+ if (txreq.flags & XEN_NETTXF_extra_info) {
+ work_to_do = xen_netbk_get_extras(vif, extras,
+ work_to_do);
+ idx = vif->tx.req_cons;
+ if (unlikely(work_to_do < 0)) {
+ netbk_tx_err(vif, &txreq, idx);
+ continue;
+ }
+ }
+
+ ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do);
+ if (unlikely(ret < 0)) {
+ netbk_tx_err(vif, &txreq, idx - ret);
+ continue;
+ }
+ idx += ret;
+
+ if (unlikely(txreq.size < ETH_HLEN)) {
+ netdev_dbg(vif->dev,
+ "Bad packet size: %d\n", txreq.size);
+ netbk_tx_err(vif, &txreq, idx);
+ continue;
+ }
+
+ /* No crossing a page as the payload mustn't fragment. */
+ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
+ netdev_dbg(vif->dev,
+ "txreq.offset: %x, size: %u, end: %lu\n",
+ txreq.offset, txreq.size,
+ (txreq.offset&~PAGE_MASK) + txreq.size);
+ netbk_tx_err(vif, &txreq, idx);
+ continue;
+ }
+
+ index = pending_index(netbk->pending_cons);
+ pending_idx = netbk->pending_ring[index];
+
+ data_len = (txreq.size > PKT_PROT_LEN &&
+ ret < MAX_SKB_FRAGS) ?
+ PKT_PROT_LEN : txreq.size;
+
+ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(skb == NULL)) {
+ netdev_dbg(vif->dev,
+ "Can't allocate a skb in start_xmit.\n");
+ netbk_tx_err(vif, &txreq, idx);
+ break;
+ }
+
+ /* Packets passed to netif_rx() must have some headroom. */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+ struct xen_netif_extra_info *gso;
+ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+ if (netbk_set_skb_gso(vif, skb, gso)) {
+ kfree_skb(skb);
+ netbk_tx_err(vif, &txreq, idx);
+ continue;
+ }
+ }
+
+ /* XXX could copy straight to head */
+ page = xen_netbk_alloc_page(netbk, skb, pending_idx);
+ if (!page) {
+ kfree_skb(skb);
+ netbk_tx_err(vif, &txreq, idx);
+ continue;
+ }
+
+ netbk->mmap_pages[pending_idx] = page;
+
+ gop->source.u.ref = txreq.gref;
+ gop->source.domid = vif->domid;
+ gop->source.offset = txreq.offset;
+
+ gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+ gop->dest.domid = DOMID_SELF;
+ gop->dest.offset = txreq.offset;
+
+ gop->len = txreq.size;
+ gop->flags = GNTCOPY_source_gref;
+
+ gop++;
+
+ memcpy(&netbk->pending_tx_info[pending_idx].req,
+ &txreq, sizeof(txreq));
+ netbk->pending_tx_info[pending_idx].vif = vif;
+ *((u16 *)skb->data) = pending_idx;
+
+ __skb_put(skb, data_len);
+
+ skb_shinfo(skb)->nr_frags = ret;
+ if (data_len < txreq.size) {
+ skb_shinfo(skb)->nr_frags++;
+ skb_shinfo(skb)->frags[0].page =
+ (void *)(unsigned long)pending_idx;
+ } else {
+ /* Discriminate from any valid pending_idx value. */
+ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
+ }
+
+ __skb_queue_tail(&netbk->tx_queue, skb);
+
+ netbk->pending_cons++;
+
+ request_gop = xen_netbk_get_requests(netbk, vif,
+ skb, txfrags, gop);
+ if (request_gop == NULL) {
+ kfree_skb(skb);
+ netbk_tx_err(vif, &txreq, idx);
+ continue;
+ }
+ gop = request_gop;
+
+ vif->tx.req_cons = idx;
+ xen_netbk_check_rx_xenvif(vif);
+
+ if ((gop-netbk->tx_copy_ops) >= ARRAY_SIZE(netbk->tx_copy_ops))
+ break;
+ }
+
+ return gop - netbk->tx_copy_ops;
+}
+
+static void xen_netbk_tx_submit(struct xen_netbk *netbk)
+{
+ struct gnttab_copy *gop = netbk->tx_copy_ops;
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) {
+ struct xen_netif_tx_request *txp;
+ struct xenvif *vif;
+ u16 pending_idx;
+ unsigned data_len;
+
+ pending_idx = *((u16 *)skb->data);
+ vif = netbk->pending_tx_info[pending_idx].vif;
+ txp = &netbk->pending_tx_info[pending_idx].req;
+
+ /* Check the remap error code. */
+ if (unlikely(xen_netbk_tx_check_gop(netbk, skb, &gop))) {
+ netdev_dbg(vif->dev, "netback grant failed.\n");
+ skb_shinfo(skb)->nr_frags = 0;
+ kfree_skb(skb);
+ continue;
+ }
+
+ data_len = skb->len;
+ memcpy(skb->data,
+ (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset),
+ data_len);
+ if (data_len < txp->size) {
+ /* Append the packet payload as a fragment. */
+ txp->offset += data_len;
+ txp->size -= data_len;
+ } else {
+ /* Schedule a response immediately. */
+ xen_netbk_idx_release(netbk, pending_idx);
+ }
+
+ if (txp->flags & XEN_NETTXF_csum_blank)
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ else if (txp->flags & XEN_NETTXF_data_validated)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ xen_netbk_fill_frags(netbk, skb);
+
+ /*
+ * If the initial fragment was < PKT_PROT_LEN then
+ * pull through some bytes from the other fragments to
+ * increase the linear region to PKT_PROT_LEN bytes.
+ */
+ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
+ int target = min_t(int, skb->len, PKT_PROT_LEN);
+ __pskb_pull_tail(skb, target - skb_headlen(skb));
+ }
+
+ skb->dev = vif->dev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ if (checksum_setup(vif, skb)) {
+ netdev_dbg(vif->dev,
+ "Can't setup checksum in net_tx_action\n");
+ kfree_skb(skb);
+ continue;
+ }
+
+ vif->stats.rx_bytes += skb->len;
+ vif->stats.rx_packets++;
+
+ xenvif_receive_skb(vif, skb);
+ }
+}
+
+/* Called after netfront has transmitted */
+static void xen_netbk_tx_action(struct xen_netbk *netbk)
+{
+ unsigned nr_gops;
+ int ret;
+
+ nr_gops = xen_netbk_tx_build_gops(netbk);
+
+ if (nr_gops == 0)
+ return;
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,
+ netbk->tx_copy_ops, nr_gops);
+ BUG_ON(ret);
+
+ xen_netbk_tx_submit(netbk);
+
+}
+
+static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx)
+{
+ struct xenvif *vif;
+ struct pending_tx_info *pending_tx_info;
+ pending_ring_idx_t index;
+
+ /* Already complete? */
+ if (netbk->mmap_pages[pending_idx] == NULL)
+ return;
+
+ pending_tx_info = &netbk->pending_tx_info[pending_idx];
+
+ vif = pending_tx_info->vif;
+
+ make_tx_response(vif, &pending_tx_info->req, XEN_NETIF_RSP_OKAY);
+
+ index = pending_index(netbk->pending_prod++);
+ netbk->pending_ring[index] = pending_idx;
+
+ xenvif_put(vif);
+
+ netbk->mmap_pages[pending_idx]->mapping = 0;
+ put_page(netbk->mmap_pages[pending_idx]);
+ netbk->mmap_pages[pending_idx] = NULL;
+}
+
+static void make_tx_response(struct xenvif *vif,
+ struct xen_netif_tx_request *txp,
+ s8 st)
+{
+ RING_IDX i = vif->tx.rsp_prod_pvt;
+ struct xen_netif_tx_response *resp;
+ int notify;
+
+ resp = RING_GET_RESPONSE(&vif->tx, i);
+ resp->id = txp->id;
+ resp->status = st;
+
+ if (txp->flags & XEN_NETTXF_extra_info)
+ RING_GET_RESPONSE(&vif->tx, ++i)->status = XEN_NETIF_RSP_NULL;
+
+ vif->tx.rsp_prod_pvt = ++i;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->tx, notify);
+ if (notify)
+ notify_remote_via_irq(vif->irq);
+}
+
+static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
+ u16 id,
+ s8 st,
+ u16 offset,
+ u16 size,
+ u16 flags)
+{
+ RING_IDX i = vif->rx.rsp_prod_pvt;
+ struct xen_netif_rx_response *resp;
+
+ resp = RING_GET_RESPONSE(&vif->rx, i);
+ resp->offset = offset;
+ resp->flags = flags;
+ resp->id = id;
+ resp->status = (s16)size;
+ if (st < 0)
+ resp->status = (s16)st;
+
+ vif->rx.rsp_prod_pvt = ++i;
+
+ return resp;
+}
+
+static inline int rx_work_todo(struct xen_netbk *netbk)
+{
+ return !skb_queue_empty(&netbk->rx_queue);
+}
+
+static inline int tx_work_todo(struct xen_netbk *netbk)
+{
+
+ if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+ !list_empty(&netbk->net_schedule_list))
+ return 1;
+
+ return 0;
+}
+
+static int xen_netbk_kthread(void *data)
+{
+ struct xen_netbk *netbk = data;
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(netbk->wq,
+ rx_work_todo(netbk) ||
+ tx_work_todo(netbk) ||
+ kthread_should_stop());
+ cond_resched();
+
+ if (kthread_should_stop())
+ break;
+
+ if (rx_work_todo(netbk))
+ xen_netbk_rx_action(netbk);
+
+ if (tx_work_todo(netbk))
+ xen_netbk_tx_action(netbk);
+ }
+
+ return 0;
+}
+
+void xen_netbk_unmap_frontend_rings(struct xenvif *vif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ if (vif->tx.sring) {
+ gnttab_set_unmap_op(&op, (unsigned long)vif->tx_comms_area->addr,
+ GNTMAP_host_map, vif->tx_shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+ }
+
+ if (vif->rx.sring) {
+ gnttab_set_unmap_op(&op, (unsigned long)vif->rx_comms_area->addr,
+ GNTMAP_host_map, vif->rx_shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+ }
+ if (vif->rx_comms_area)
+ free_vm_area(vif->rx_comms_area);
+ if (vif->tx_comms_area)
+ free_vm_area(vif->tx_comms_area);
+}
+
+int xen_netbk_map_frontend_rings(struct xenvif *vif,
+ grant_ref_t tx_ring_ref,
+ grant_ref_t rx_ring_ref)
+{
+ struct gnttab_map_grant_ref op;
+ struct xen_netif_tx_sring *txs;
+ struct xen_netif_rx_sring *rxs;
+
+ int err = -ENOMEM;
+
+ vif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
+ if (vif->tx_comms_area == NULL)
+ goto err;
+
+ vif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
+ if (vif->rx_comms_area == NULL)
+ goto err;
+
+ gnttab_set_map_op(&op, (unsigned long)vif->tx_comms_area->addr,
+ GNTMAP_host_map, tx_ring_ref, vif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ netdev_warn(vif->dev,
+ "failed to map tx ring. err=%d status=%d\n",
+ err, op.status);
+ err = op.status;
+ goto err;
+ }
+
+ vif->tx_shmem_ref = tx_ring_ref;
+ vif->tx_shmem_handle = op.handle;
+
+ txs = (struct xen_netif_tx_sring *)vif->tx_comms_area->addr;
+ BACK_RING_INIT(&vif->tx, txs, PAGE_SIZE);
+
+ gnttab_set_map_op(&op, (unsigned long)vif->rx_comms_area->addr,
+ GNTMAP_host_map, rx_ring_ref, vif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ netdev_warn(vif->dev,
+ "failed to map rx ring. err=%d status=%d\n",
+ err, op.status);
+ err = op.status;
+ goto err;
+ }
+
+ vif->rx_shmem_ref = rx_ring_ref;
+ vif->rx_shmem_handle = op.handle;
+ vif->rx_req_cons_peek = 0;
+
+ rxs = (struct xen_netif_rx_sring *)vif->rx_comms_area->addr;
+ BACK_RING_INIT(&vif->rx, rxs, PAGE_SIZE);
+
+ return 0;
+
+err:
+ xen_netbk_unmap_frontend_rings(vif);
+ return err;
+}
+
+static int __init netback_init(void)
+{
+ int i;
+ int rc = 0;
+ int group;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ xen_netbk_group_nr = num_online_cpus();
+ xen_netbk = vzalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr);
+ if (!xen_netbk) {
+ printk(KERN_ALERT "%s: out of memory\n", __func__);
+ return -ENOMEM;
+ }
+
+ for (group = 0; group < xen_netbk_group_nr; group++) {
+ struct xen_netbk *netbk = &xen_netbk[group];
+ skb_queue_head_init(&netbk->rx_queue);
+ skb_queue_head_init(&netbk->tx_queue);
+
+ init_timer(&netbk->net_timer);
+ netbk->net_timer.data = (unsigned long)netbk;
+ netbk->net_timer.function = xen_netbk_alarm;
+
+ netbk->pending_cons = 0;
+ netbk->pending_prod = MAX_PENDING_REQS;
+ for (i = 0; i < MAX_PENDING_REQS; i++)
+ netbk->pending_ring[i] = i;
+
+ init_waitqueue_head(&netbk->wq);
+ netbk->task = kthread_create(xen_netbk_kthread,
+ (void *)netbk,
+ "netback/%u", group);
+
+ if (IS_ERR(netbk->task)) {
+ printk(KERN_ALERT "kthread_run() fails at netback\n");
+ del_timer(&netbk->net_timer);
+ rc = PTR_ERR(netbk->task);
+ goto failed_init;
+ }
+
+ kthread_bind(netbk->task, group);
+
+ INIT_LIST_HEAD(&netbk->net_schedule_list);
+
+ spin_lock_init(&netbk->net_schedule_list_lock);
+
+ atomic_set(&netbk->netfront_count, 0);
+
+ wake_up_process(netbk->task);
+ }
+
+ rc = xenvif_xenbus_init();
+ if (rc)
+ goto failed_init;
+
+ return 0;
+
+failed_init:
+ while (--group >= 0) {
+ struct xen_netbk *netbk = &xen_netbk[group];
+ for (i = 0; i < MAX_PENDING_REQS; i++) {
+ if (netbk->mmap_pages[i])
+ __free_page(netbk->mmap_pages[i]);
+ }
+ del_timer(&netbk->net_timer);
+ kthread_stop(netbk->task);
+ }
+ vfree(xen_netbk);
+ return rc;
+
+}
+
+module_init(netback_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
new file mode 100644
index 0000000..22b8c35
--- /dev/null
+++ b/drivers/net/xen-netback/xenbus.c
@@ -0,0 +1,490 @@
+/*
+ * Xenbus code for netif backend
+ *
+ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include "common.h"
+
+struct backend_info {
+ struct xenbus_device *dev;
+ struct xenvif *vif;
+ enum xenbus_state frontend_state;
+ struct xenbus_watch hotplug_status_watch;
+ int have_hotplug_status_watch:1;
+};
+
+static int connect_rings(struct backend_info *);
+static void connect(struct backend_info *);
+static void backend_create_xenvif(struct backend_info *be);
+static void unregister_hotplug_status_watch(struct backend_info *be);
+
+static int netback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ unregister_hotplug_status_watch(be);
+ if (be->vif) {
+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
+ xenvif_disconnect(be->vif);
+ be->vif = NULL;
+ }
+ kfree(be);
+ dev_set_drvdata(&dev->dev, NULL);
+ return 0;
+}
+
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and switch to InitWait.
+ */
+static int netback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ const char *message;
+ struct xenbus_transaction xbt;
+ int err;
+ int sg;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+
+ be->dev = dev;
+ dev_set_drvdata(&dev->dev, be);
+
+ sg = 1;
+
+ do {
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto fail;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
+ if (err) {
+ message = "writing feature-sg";
+ goto abort_transaction;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
+ "%d", sg);
+ if (err) {
+ message = "writing feature-gso-tcpv4";
+ goto abort_transaction;
+ }
+
+ /* We support rx-copy path. */
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-rx-copy", "%d", 1);
+ if (err) {
+ message = "writing feature-rx-copy";
+ goto abort_transaction;
+ }
+
+ /*
+ * We don't support rx-flip path (except old guests who don't
+ * grok this feature flag).
+ */
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-rx-flip", "%d", 0);
+ if (err) {
+ message = "writing feature-rx-flip";
+ goto abort_transaction;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ } while (err == -EAGAIN);
+
+ if (err) {
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto fail;
+ }
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+
+ /* This kicks hotplug scripts, so do it immediately. */
+ backend_create_xenvif(be);
+
+ return 0;
+
+abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ xenbus_dev_fatal(dev, err, "%s", message);
+fail:
+ pr_debug("failed");
+ netback_remove(dev);
+ return err;
+}
+
+
+/*
+ * Handle the creation of the hotplug script environment. We add the script
+ * and vif variables to the environment, for the benefit of the vif-* hotplug
+ * scripts.
+ */
+static int netback_uevent(struct xenbus_device *xdev,
+ struct kobj_uevent_env *env)
+{
+ struct backend_info *be = dev_get_drvdata(&xdev->dev);
+ char *val;
+
+ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
+ if (IS_ERR(val)) {
+ int err = PTR_ERR(val);
+ xenbus_dev_fatal(xdev, err, "reading script");
+ return err;
+ } else {
+ if (add_uevent_var(env, "script=%s", val)) {
+ kfree(val);
+ return -ENOMEM;
+ }
+ kfree(val);
+ }
+
+ if (!be || !be->vif)
+ return 0;
+
+ return add_uevent_var(env, "vif=%s", be->vif->dev->name);
+}
+
+
+static void backend_create_xenvif(struct backend_info *be)
+{
+ int err;
+ long handle;
+ struct xenbus_device *dev = be->dev;
+
+ if (be->vif != NULL)
+ return;
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading handle");
+ return;
+ }
+
+ be->vif = xenvif_alloc(&dev->dev, dev->otherend_id, handle);
+ if (IS_ERR(be->vif)) {
+ err = PTR_ERR(be->vif);
+ be->vif = NULL;
+ xenbus_dev_fatal(dev, err, "creating interface");
+ return;
+ }
+
+ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
+}
+
+
+static void disconnect_backend(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ if (be->vif) {
+ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
+ xenvif_disconnect(be->vif);
+ be->vif = NULL;
+ }
+}
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ pr_debug("frontend state %s", xenbus_strstate(frontend_state));
+
+ be->frontend_state = frontend_state;
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+ __func__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ break;
+
+ case XenbusStateConnected:
+ if (dev->state == XenbusStateConnected)
+ break;
+ backend_create_xenvif(be);
+ if (be->vif)
+ connect(be);
+ break;
+
+ case XenbusStateClosing:
+ if (be->vif)
+ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
+ disconnect_backend(dev);
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+static void xen_net_read_rate(struct xenbus_device *dev,
+ unsigned long *bytes, unsigned long *usec)
+{
+ char *s, *e;
+ unsigned long b, u;
+ char *ratestr;
+
+ /* Default to unlimited bandwidth. */
+ *bytes = ~0UL;
+ *usec = 0;
+
+ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
+ if (IS_ERR(ratestr))
+ return;
+
+ s = ratestr;
+ b = simple_strtoul(s, &e, 10);
+ if ((s == e) || (*e != ','))
+ goto fail;
+
+ s = e + 1;
+ u = simple_strtoul(s, &e, 10);
+ if ((s == e) || (*e != '\0'))
+ goto fail;
+
+ *bytes = b;
+ *usec = u;
+
+ kfree(ratestr);
+ return;
+
+ fail:
+ pr_warn("Failed to parse network rate limit. Traffic unlimited.\n");
+ kfree(ratestr);
+}
+
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+ char *s, *e, *macstr;
+ int i;
+
+ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+ if (IS_ERR(macstr))
+ return PTR_ERR(macstr);
+
+ for (i = 0; i < ETH_ALEN; i++) {
+ mac[i] = simple_strtoul(s, &e, 16);
+ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
+ kfree(macstr);
+ return -ENOENT;
+ }
+ s = e+1;
+ }
+
+ kfree(macstr);
+ return 0;
+}
+
+static void unregister_hotplug_status_watch(struct backend_info *be)
+{
+ if (be->have_hotplug_status_watch) {
+ unregister_xenbus_watch(&be->hotplug_status_watch);
+ kfree(be->hotplug_status_watch.node);
+ }
+ be->have_hotplug_status_watch = 0;
+}
+
+static void hotplug_status_changed(struct xenbus_watch *watch,
+ const char **vec,
+ unsigned int vec_size)
+{
+ struct backend_info *be = container_of(watch,
+ struct backend_info,
+ hotplug_status_watch);
+ char *str;
+ unsigned int len;
+
+ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
+ if (IS_ERR(str))
+ return;
+ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
+ xenbus_switch_state(be->dev, XenbusStateConnected);
+ /* Not interested in this watch anymore. */
+ unregister_hotplug_status_watch(be);
+ }
+ kfree(str);
+}
+
+static void connect(struct backend_info *be)
+{
+ int err;
+ struct xenbus_device *dev = be->dev;
+
+ err = connect_rings(be);
+ if (err)
+ return;
+
+ err = xen_net_read_mac(dev, be->vif->fe_dev_addr);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
+ return;
+ }
+
+ xen_net_read_rate(dev, &be->vif->credit_bytes,
+ &be->vif->credit_usec);
+ be->vif->remaining_credit = be->vif->credit_bytes;
+
+ unregister_hotplug_status_watch(be);
+ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
+ hotplug_status_changed,
+ "%s/%s", dev->nodename, "hotplug-status");
+ if (err) {
+ /* Switch now, since we can't do a watch. */
+ xenbus_switch_state(dev, XenbusStateConnected);
+ } else {
+ be->have_hotplug_status_watch = 1;
+ }
+
+ netif_wake_queue(be->vif->dev);
+}
+
+
+static int connect_rings(struct backend_info *be)
+{
+ struct xenvif *vif = be->vif;
+ struct xenbus_device *dev = be->dev;
+ unsigned long tx_ring_ref, rx_ring_ref;
+ unsigned int evtchn, rx_copy;
+ int err;
+ int val;
+
+ err = xenbus_gather(XBT_NIL, dev->otherend,
+ "tx-ring-ref", "%lu", &tx_ring_ref,
+ "rx-ring-ref", "%lu", &rx_ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
+ &rx_copy);
+ if (err == -ENOENT) {
+ err = 0;
+ rx_copy = 0;
+ }
+ if (err < 0) {
+ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
+ dev->otherend);
+ return err;
+ }
+ if (!rx_copy)
+ return -EOPNOTSUPP;
+
+ if (vif->dev->tx_queue_len != 0) {
+ if (xenbus_scanf(XBT_NIL, dev->otherend,
+ "feature-rx-notify", "%d", &val) < 0)
+ val = 0;
+ if (val)
+ vif->can_queue = 1;
+ else
+ /* Must be non-zero for pfifo_fast to work. */
+ vif->dev->tx_queue_len = 1;
+ }
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg",
+ "%d", &val) < 0)
+ val = 0;
+ vif->can_sg = !!val;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4",
+ "%d", &val) < 0)
+ val = 0;
+ vif->gso = !!val;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix",
+ "%d", &val) < 0)
+ val = 0;
+ vif->gso_prefix = !!val;
+
+ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
+ "%d", &val) < 0)
+ val = 0;
+ vif->csum = !val;
+
+ /* Map the shared frame, irq etc. */
+ err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "mapping shared-frames %lu/%lu port %u",
+ tx_ring_ref, rx_ring_ref, evtchn);
+ return err;
+ }
+ return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id netback_ids[] = {
+ { "vif" },
+ { "" }
+};
+
+
+static struct xenbus_driver netback = {
+ .name = "vif",
+ .owner = THIS_MODULE,
+ .ids = netback_ids,
+ .probe = netback_probe,
+ .remove = netback_remove,
+ .uevent = netback_uevent,
+ .otherend_changed = frontend_changed,
+};
+
+int xenvif_xenbus_init(void)
+{
+ return xenbus_register_backend(&netback);
+}
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index da1f121..a6ab973 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -359,7 +359,7 @@ static void xennet_tx_buf_gc(struct net_device *dev)
struct xen_netif_tx_response *txrsp;
txrsp = RING_GET_RESPONSE(&np->tx, cons);
- if (txrsp->status == NETIF_RSP_NULL)
+ if (txrsp->status == XEN_NETIF_RSP_NULL)
continue;
id = txrsp->id;
@@ -416,7 +416,7 @@ static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
larger than a page), split it it into page-sized chunks. */
while (len > PAGE_SIZE - offset) {
tx->size = PAGE_SIZE - offset;
- tx->flags |= NETTXF_more_data;
+ tx->flags |= XEN_NETTXF_more_data;
len -= tx->size;
data += tx->size;
offset = 0;
@@ -442,7 +442,7 @@ static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
for (i = 0; i < frags; i++) {
skb_frag_t *frag = skb_shinfo(skb)->frags + i;
- tx->flags |= NETTXF_more_data;
+ tx->flags |= XEN_NETTXF_more_data;
id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
np->tx_skbs[id].skb = skb_get(skb);
@@ -517,10 +517,10 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
tx->flags = 0;
if (skb->ip_summed == CHECKSUM_PARTIAL)
/* local packet? */
- tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
+ tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated;
else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
/* remote but checksummed. */
- tx->flags |= NETTXF_data_validated;
+ tx->flags |= XEN_NETTXF_data_validated;
if (skb_shinfo(skb)->gso_size) {
struct xen_netif_extra_info *gso;
@@ -531,7 +531,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
if (extra)
extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
else
- tx->flags |= NETTXF_extra_info;
+ tx->flags |= XEN_NETTXF_extra_info;
gso->u.gso.size = skb_shinfo(skb)->gso_size;
gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
@@ -651,7 +651,7 @@ static int xennet_get_responses(struct netfront_info *np,
int err = 0;
unsigned long ret;
- if (rx->flags & NETRXF_extra_info) {
+ if (rx->flags & XEN_NETRXF_extra_info) {
err = xennet_get_extras(np, extras, rp);
cons = np->rx.rsp_cons;
}
@@ -688,7 +688,7 @@ static int xennet_get_responses(struct netfront_info *np,
__skb_queue_tail(list, skb);
next:
- if (!(rx->flags & NETRXF_more_data))
+ if (!(rx->flags & XEN_NETRXF_more_data))
break;
if (cons + frags == rp) {
@@ -983,9 +983,9 @@ err:
skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
skb->len += skb->data_len;
- if (rx->flags & NETRXF_csum_blank)
+ if (rx->flags & XEN_NETRXF_csum_blank)
skb->ip_summed = CHECKSUM_PARTIAL;
- else if (rx->flags & NETRXF_data_validated)
+ else if (rx->flags & XEN_NETRXF_data_validated)
skb->ip_summed = CHECKSUM_UNNECESSARY;
__skb_queue_tail(&rxq, skb);
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 3a5a6fc..492b7d8 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -243,7 +243,7 @@ struct pci_ops pcifront_bus_ops = {
#ifdef CONFIG_PCI_MSI
static int pci_frontend_enable_msix(struct pci_dev *dev,
- int **vector, int nvec)
+ int vector[], int nvec)
{
int err;
int i;
@@ -277,18 +277,24 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
if (likely(!err)) {
if (likely(!op.value)) {
/* we get the result */
- for (i = 0; i < nvec; i++)
- *(*vector+i) = op.msix_entries[i].vector;
- return 0;
+ for (i = 0; i < nvec; i++) {
+ if (op.msix_entries[i].vector <= 0) {
+ dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
+ i, op.msix_entries[i].vector);
+ err = -EINVAL;
+ vector[i] = -1;
+ continue;
+ }
+ vector[i] = op.msix_entries[i].vector;
+ }
} else {
printk(KERN_DEBUG "enable msix get value %x\n",
op.value);
- return op.value;
}
} else {
dev_err(&dev->dev, "enable msix get err %x\n", err);
- return err;
}
+ return err;
}
static void pci_frontend_disable_msix(struct pci_dev *dev)
@@ -310,7 +316,7 @@ static void pci_frontend_disable_msix(struct pci_dev *dev)
dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
}
-static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[])
{
int err;
struct xen_pci_op op = {
@@ -324,7 +330,13 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
err = do_pci_op(pdev, &op);
if (likely(!err)) {
- *(*vector) = op.value;
+ vector[0] = op.value;
+ if (op.value <= 0) {
+ dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
+ op.value);
+ err = -EINVAL;
+ vector[0] = -1;
+ }
} else {
dev_err(&dev->dev, "pci frontend enable msi failed for dev "
"%x:%x\n", op.bus, op.devfn);
@@ -733,8 +745,7 @@ static void free_pdev(struct pcifront_device *pdev)
pcifront_free_roots(pdev);
- /*For PCIE_AER error handling job*/
- flush_scheduled_work();
+ cancel_work_sync(&pdev->op_work);
if (pdev->irq >= 0)
unbind_from_irqhandler(pdev->irq, pdev);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 07bec09..e5ecae6 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -37,6 +37,79 @@ config XEN_BACKEND
Support for backend device drivers that provide I/O services
to other virtual machines.
+config XEN_BLKDEV_BACKEND
+ tristate "Block-device backend driver"
+ depends on XEN_BACKEND && BLOCK
+ help
+ The block-device backend driver allows the kernel to export its
+ block devices to other guests via a high-performance shared-memory
+ interface.
+
+config XEN_PCIDEV_BACKEND
+ tristate "PCI-device backend driver"
+ depends on PCI
+ depends on XEN_BACKEND
+ help
+ The PCI device backend driver allows the kernel to export arbitrary
+ PCI devices to other guests. If you select this to be a module, you
+ will need to make sure no other driver has bound to the device(s)
+ you want to make visible to other guests.
+
+choice
+ prompt "PCI Backend Mode"
+ depends on XEN_PCIDEV_BACKEND
+ default XEN_PCIDEV_BACKEND_VPCI if !IA64
+ default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
+
+config XEN_PCIDEV_BACKEND_VPCI
+ bool "Virtual PCI"
+ ---help---
+ This PCI Backend hides the true PCI topology and makes the frontend
+ think there is a single PCI bus with only the exported devices on it.
+ For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
+ second device at 02:1a.1 will be re-assigned to 00:01.1.
+
+config XEN_PCIDEV_BACKEND_PASS
+ bool "Passthrough"
+ ---help---
+ This PCI Backend provides a real view of the PCI topology to the
+ frontend (for example, a device at 06:01.b will still appear at
+ 06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
+ PCI devices to its driver domains. This may be required for drivers
+ which depend on finding their hardward in certain bus/slot
+ locations.
+
+config XEN_PCIDEV_BACKEND_SLOT
+ bool "Slot"
+ ---help---
+ This PCI Backend hides the true PCI topology and makes the frontend
+ think there is a single PCI bus with only the exported devices on it.
+ Contrary to the virtual PCI backend, a function becomes a new slot.
+ For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
+ second device at 02:1a.1 will be re-assigned to 00:01.0.
+
+config XEN_PCIDEV_BACKEND_CONTROLLER
+ bool "Controller"
+ depends on IA64
+ ---help---
+ This PCI backend virtualizes the PCI bus topology by providing a
+ virtual bus per PCI root device. Devices which are physically under
+ the same root bus will appear on the same virtual bus. For systems
+ with complex I/O addressing, this is the only backend which supports
+ extended I/O port spaces and MMIO translation offsets. This backend
+ also supports slot virtualization. For example, a device at
+ 0000:01:02.1 will be re-assigned to 0000:00:00.0. A second device
+ at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
+ re-assigned to 0000:00:01.0. A third device at 0000:16:05.0 (under
+ a different PCI root bus) will be re-assigned to 0000:01:00.0.
+
+endchoice
+
+config XEN_PCIDEV_BE_DEBUG
+ bool "PCI Backend Debugging"
+ depends on XEN_PCIDEV_BACKEND
+
+
config XENFS
tristate "Xen filesystem"
default y
@@ -76,10 +149,20 @@ config XEN_XENBUS_FRONTEND
config XEN_GNTDEV
tristate "userspace grant access device driver"
depends on XEN
+ default m
select MMU_NOTIFIER
help
Allows userspace processes to use grants.
+config XEN_GRANT_DEV_ALLOC
+ tristate "User-space grant reference allocator driver"
+ depends on XEN
+ default m
+ help
+ Allows userspace processes to create pages with access granted
+ to other domains. This can be used to implement frontend drivers
+ or as part of an inter-domain shared memory channel.
+
config XEN_PLATFORM_PCI
tristate "xen platform pci device driver"
depends on XEN_PVHVM && PCI
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 5088cc2..c1cb873 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,7 +9,10 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
obj-$(CONFIG_XEN_BALLOON) += balloon.o
obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
+obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o
obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o
@@ -18,5 +21,6 @@ obj-$(CONFIG_XEN_DOM0) += pci.o
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
+xen-gntalloc-y := gntalloc.o
xen-platform-pci-y := platform-pci.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 43f9f02..718050a 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -232,7 +232,7 @@ static int increase_reservation(unsigned long nr_pages)
set_phys_to_machine(pfn, frame_list[i]);
/* Link back into the page tables if not highmem. */
- if (pfn < max_low_pfn) {
+ if (!xen_hvm_domain() && pfn < max_low_pfn) {
int ret;
ret = HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT),
@@ -280,7 +280,7 @@ static int decrease_reservation(unsigned long nr_pages)
scrub_page(page);
- if (!PageHighMem(page)) {
+ if (!xen_hvm_domain() && !PageHighMem(page)) {
ret = HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT),
__pte_ma(0), 0);
@@ -296,7 +296,7 @@ static int decrease_reservation(unsigned long nr_pages)
/* No more mappings: invalidate P2M and add to balloon. */
for (i = 0; i < nr_pages; i++) {
pfn = mfn_to_pfn(frame_list[i]);
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
balloon_append(pfn_to_page(pfn));
}
@@ -392,15 +392,19 @@ static struct notifier_block xenstore_notifier;
static int __init balloon_init(void)
{
- unsigned long pfn, extra_pfn_end;
+ unsigned long pfn, nr_pages, extra_pfn_end;
struct page *page;
- if (!xen_pv_domain())
+ if (!xen_domain())
return -ENODEV;
pr_info("xen_balloon: Initialising balloon driver.\n");
- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
+ if (xen_pv_domain())
+ nr_pages = xen_start_info->nr_pages;
+ else
+ nr_pages = max_pfn;
+ balloon_stats.current_pages = min(nr_pages, max_pfn);
balloon_stats.target_pages = balloon_stats.current_pages;
balloon_stats.balloon_low = 0;
balloon_stats.balloon_high = 0;
diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
new file mode 100644
index 0000000..f1ae1ff
--- /dev/null
+++ b/drivers/xen/blkback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+
+xen-blkback-y := blkback.o xenbus.o interface.o vbd.o
diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
new file mode 100644
index 0000000..15790ae
--- /dev/null
+++ b/drivers/xen/blkback/blkback.c
@@ -0,0 +1,708 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * arch/xen/drivers/blkif/frontend
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+
+#include <xen/events.h>
+#include <xen/page.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ *
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ *
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ u64 id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+} pending_req_t;
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+struct xen_blkbk {
+ pending_req_t *pending_reqs;
+ struct list_head pending_free;
+ spinlock_t pending_free_lock;
+ wait_queue_head_t pending_free_wq;
+ struct page **pending_pages;
+ grant_handle_t *pending_grant_handles;
+};
+
+static struct xen_blkbk *blkbk;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+ return (req - blkbk->pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+ unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
+ return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+#define pending_handle(_req, _seg) \
+ (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+ struct blkif_request *req,
+ pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+ pending_req_t *req = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkbk->pending_free_lock, flags);
+ if (!list_empty(&blkbk->pending_free)) {
+ req = list_entry(blkbk->pending_free.next, pending_req_t, free_list);
+ list_del(&req->free_list);
+ }
+ spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+ return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+ unsigned long flags;
+ int was_empty;
+
+ spin_lock_irqsave(&blkbk->pending_free_lock, flags);
+ was_empty = list_empty(&blkbk->pending_free);
+ list_add(&req->free_list, &blkbk->pending_free);
+ spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+ if (was_empty)
+ wake_up(&blkbk->pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+ if (blkif->plug == NULL)
+ return;
+ if (blkif->plug->unplug_fn)
+ blkif->plug->unplug_fn(blkif->plug);
+ blk_put_queue(blkif->plug);
+ blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q == blkif->plug)
+ return;
+ unplug_queue(blkif);
+ blk_get_queue(q);
+ blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int i, invcount = 0;
+ grant_handle_t handle;
+ int ret;
+
+ for (i = 0; i < req->nr_pages; i++) {
+ handle = pending_handle(req, i);
+ if (handle == BLKBACK_INVALID_HANDLE)
+ continue;
+ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+ GNTMAP_host_map, handle);
+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+ invcount++;
+ }
+
+ ret = HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, unmap, invcount);
+ BUG_ON(ret);
+ /* Note, we use invcount, so nr->pages, so we can't index
+ * using vaddr(req, i). */
+ for (i = 0; i < invcount; i++) {
+ ret = m2p_remove_override(
+ virt_to_page(unmap[i].host_addr), false);
+ if (ret) {
+ printk(KERN_ALERT "Failed to remove M2P override for " \
+ "%lx\n", (unsigned long)unmap[i].host_addr);
+ continue;
+ }
+ }
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(blkif_t *blkif)
+{
+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
+ current->comm, blkif->st_oo_req,
+ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+ blkif->st_rd_req = 0;
+ blkif->st_wr_req = 0;
+ blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+ blkif_t *blkif = arg;
+ struct vbd *vbd = &blkif->vbd;
+
+ blkif_get(blkif);
+
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: started\n", current->comm);
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+ if (unlikely(vbd->size != vbd_size(vbd)))
+ vbd_resize(blkif);
+
+ wait_event_interruptible(
+ blkif->wq,
+ blkif->waiting_reqs || kthread_should_stop());
+ wait_event_interruptible(
+ blkbk->pending_free_wq,
+ !list_empty(&blkbk->pending_free) || kthread_should_stop());
+
+ blkif->waiting_reqs = 0;
+ smp_mb(); /* clear flag *before* checking for work */
+
+ if (do_block_io_op(blkif))
+ blkif->waiting_reqs = 1;
+ unplug_queue(blkif);
+
+ if (log_stats && time_after(jiffies, blkif->st_print))
+ print_stats(blkif);
+ }
+
+ if (log_stats)
+ print_stats(blkif);
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+ blkif->xenblkd = NULL;
+ blkif_put(blkif);
+
+ return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int error)
+{
+ /* An error fails the entire request. */
+ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+ (error == -EOPNOTSUPP)) {
+ DPRINTK("blkback: write barrier op failed, not supported\n");
+ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+ } else if (error) {
+ DPRINTK("Buffer not up-to-date at end of operation, "
+ "error=%d\n", error);
+ pending_req->status = BLKIF_RSP_ERROR;
+ }
+
+ if (atomic_dec_and_test(&pending_req->pendcnt)) {
+ fast_flush_area(pending_req);
+ make_response(pending_req->blkif, pending_req->id,
+ pending_req->operation, pending_req->status);
+ blkif_put(pending_req->blkif);
+ free_req(pending_req);
+ }
+}
+
+static void end_block_io_op(struct bio *bio, int error)
+{
+ __end_block_io_op(bio->bi_private, error);
+ bio_put(bio);
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+static void blkif_notify_work(blkif_t *blkif)
+{
+ blkif->waiting_reqs = 1;
+ wake_up(&blkif->wq);
+}
+
+irqreturn_t blkif_be_int(int irq, void *dev_id)
+{
+ blkif_notify_work(dev_id);
+ return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif)
+{
+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
+ struct blkif_request req;
+ pending_req_t *pending_req;
+ RING_IDX rc, rp;
+ int more_to_do = 0;
+
+ rc = blk_rings->common.req_cons;
+ rp = blk_rings->common.sring->req_prod;
+ rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+ while (rc != rp) {
+
+ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+ break;
+
+ if (kthread_should_stop()) {
+ more_to_do = 1;
+ break;
+ }
+
+ pending_req = alloc_req();
+ if (NULL == pending_req) {
+ blkif->st_oo_req++;
+ more_to_do = 1;
+ break;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+ /* Apply all sanity checks to /private copy/ of request. */
+ barrier();
+
+ switch (req.operation) {
+ case BLKIF_OP_READ:
+ blkif->st_rd_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+ case BLKIF_OP_WRITE_BARRIER:
+ blkif->st_br_req++;
+ /* fall through */
+ case BLKIF_OP_WRITE:
+ blkif->st_wr_req++;
+ dispatch_rw_block_io(blkif, &req, pending_req);
+ break;
+ default:
+ /* A good sign something is wrong: sleep for a while to
+ * avoid excessive CPU consumption by a bad guest. */
+ msleep(1);
+ DPRINTK("error: unknown block io operation [%d]\n",
+ req.operation);
+ make_response(blkif, req.id, req.operation,
+ BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ break;
+ }
+
+ /* Yield point for this unbounded loop. */
+ cond_resched();
+ }
+
+ return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+ struct blkif_request *req,
+ pending_req_t *pending_req)
+{
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct phys_req preq;
+ struct {
+ unsigned long buf; unsigned int nsec;
+ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int nseg;
+ struct bio *bio = NULL;
+ int ret, i;
+ int operation;
+
+ switch (req->operation) {
+ case BLKIF_OP_READ:
+ operation = READ;
+ break;
+ case BLKIF_OP_WRITE:
+ operation = WRITE;
+ break;
+ case BLKIF_OP_WRITE_BARRIER:
+ operation = REQ_FLUSH | REQ_FUA;
+ break;
+ default:
+ operation = 0; /* make gcc happy */
+ BUG();
+ }
+
+ /* Check that number of segments is sane. */
+ nseg = req->nr_segments;
+ if (unlikely(nseg == 0 && operation != (REQ_FLUSH | REQ_FUA)) ||
+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ DPRINTK("Bad number of segments in request (%d)\n", nseg);
+ goto fail_response;
+ }
+
+ preq.dev = req->handle;
+ preq.sector_number = req->u.rw.sector_number;
+ preq.nr_sects = 0;
+
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = req->operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+
+ for (i = 0; i < nseg; i++) {
+ uint32_t flags;
+
+ seg[i].nsec = req->u.rw.seg[i].last_sect -
+ req->u.rw.seg[i].first_sect + 1;
+
+ if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
+ goto fail_response;
+ preq.nr_sects += seg[i].nsec;
+
+ flags = GNTMAP_host_map;
+ if (operation != READ)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+ req->u.rw.seg[i].gref, blkif->domid);
+ }
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+ BUG_ON(ret);
+
+ for (i = 0; i < nseg; i++) {
+ if (unlikely(map[i].status != 0)) {
+ DPRINTK("invalid buffer -- could not remap it\n");
+ map[i].handle = BLKBACK_INVALID_HANDLE;
+ ret |= 1;
+ }
+
+ pending_handle(pending_req, i) = map[i].handle;
+
+ if (ret)
+ continue;
+
+ ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
+ blkbk->pending_page(pending_req, i), false);
+ if (ret) {
+ printk(KERN_ALERT "Failed to install M2P override for"\
+ " %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret);
+ continue;
+ }
+
+ seg[i].buf = map[i].dev_bus_addr |
+ (req->u.rw.seg[i].first_sect << 9);
+ }
+
+ if (ret)
+ goto fail_flush;
+
+ if (vbd_translate(&preq, blkif, operation) != 0) {
+ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+ operation == READ ? "read" : "write",
+ preq.sector_number,
+ preq.sector_number + preq.nr_sects, preq.dev);
+ goto fail_flush;
+ }
+
+ plug_queue(blkif, preq.bdev);
+ atomic_set(&pending_req->pendcnt, 1);
+ blkif_get(blkif);
+
+ for (i = 0; i < nseg; i++) {
+ if (((int)preq.sector_number|(int)seg[i].nsec) &
+ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+ DPRINTK("Misaligned I/O request from domain %d",
+ blkif->domid);
+ goto fail_put_bio;
+ }
+
+ while ((bio == NULL) ||
+ (bio_add_page(bio,
+ blkbk->pending_page(pending_req, i),
+ seg[i].nsec << 9,
+ seg[i].buf & ~PAGE_MASK) == 0)) {
+ if (bio) {
+ atomic_inc(&pending_req->pendcnt);
+ submit_bio(operation, bio);
+ }
+
+ bio = bio_alloc(GFP_KERNEL, nseg-i);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ bio->bi_sector = preq.sector_number;
+ }
+
+ preq.sector_number += seg[i].nsec;
+ }
+
+ if (!bio) {
+ BUG_ON(operation != (REQ_FLUSH | REQ_FUA));
+ bio = bio_alloc(GFP_KERNEL, 0);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ bio->bi_sector = -1;
+ }
+
+ submit_bio(operation, bio);
+
+ if (operation == READ)
+ blkif->st_rd_sect += preq.nr_sects;
+ else if (operation == WRITE || operation == (REQ_FLUSH | REQ_FUA))
+ blkif->st_wr_sect += preq.nr_sects;
+
+ return;
+
+ fail_flush:
+ fast_flush_area(pending_req);
+ fail_response:
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req);
+ msleep(1); /* back off a bit */
+ return;
+
+ fail_put_bio:
+ __end_block_io_op(pending_req, -EINVAL);
+ if (bio)
+ bio_put(bio);
+ unplug_queue(blkif);
+ msleep(1); /* back off a bit */
+ return;
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, u64 id,
+ unsigned short op, int st)
+{
+ struct blkif_response resp;
+ unsigned long flags;
+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
+ int more_to_do = 0;
+ int notify;
+
+ resp.id = id;
+ resp.operation = op;
+ resp.status = st;
+
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ /* Place on the response ring for the relevant domain. */
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.rsp_prod_pvt++;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
+ /*
+ * Tail check for pending requests. Allows frontend to avoid
+ * notifications if requests are already in flight (lower
+ * overheads and promotes batching).
+ */
+ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+ more_to_do = 1;
+ }
+
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+ if (more_to_do)
+ blkif_notify_work(blkif);
+ if (notify)
+ notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+ int i, mmap_pages;
+ int rc = 0;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ blkbk = (struct xen_blkbk *)vmalloc(sizeof(struct xen_blkbk));
+ if (!blkbk) {
+ printk(KERN_ALERT "%s: out of memory!\n", __func__);
+ return -ENOMEM;
+ }
+
+ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+ blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) *
+ blkif_reqs, GFP_KERNEL);
+ blkbk->pending_grant_handles = vzalloc(sizeof(blkbk->pending_grant_handles[0]) *
+ mmap_pages);
+ blkbk->pending_pages = vzalloc(sizeof(blkbk->pending_pages[0]) * mmap_pages);
+
+ if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || !blkbk->pending_pages) {
+ rc = -ENOMEM;
+ goto out_of_memory;
+ }
+
+ for (i = 0; i < mmap_pages; i++) {
+ blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+ blkbk->pending_pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (blkbk->pending_pages[i] == NULL) {
+ rc = -ENOMEM;
+ goto out_of_memory;
+ }
+ }
+ rc = blkif_interface_init();
+ if (rc)
+ goto failed_init;
+
+ memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs));
+
+ INIT_LIST_HEAD(&blkbk->pending_free);
+ spin_lock_init(&blkbk->pending_free_lock);
+ init_waitqueue_head(&blkbk->pending_free_wq);
+
+ for (i = 0; i < blkif_reqs; i++)
+ list_add_tail(&blkbk->pending_reqs[i].free_list, &blkbk->pending_free);
+
+ rc = blkif_xenbus_init();
+ if (rc)
+ goto failed_init;
+
+ return 0;
+
+ out_of_memory:
+ printk(KERN_ERR "%s: out of memory\n", __func__);
+ failed_init:
+ kfree(blkbk->pending_reqs);
+ vfree(blkbk->pending_grant_handles);
+ for (i = 0; i < mmap_pages; i++) {
+ if (blkbk->pending_pages[i])
+ __free_page(blkbk->pending_pages[i]);
+ }
+ vfree(blkbk->pending_pages);
+ vfree(blkbk);
+ blkbk = NULL;
+ return rc;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
new file mode 100644
index 0000000..0f91830
--- /dev/null
+++ b/drivers/xen/blkback/common.h
@@ -0,0 +1,141 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+
+#define DPRINTK(_f, _a...) \
+ pr_debug("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+
+struct vbd {
+ blkif_vdev_t handle; /* what the domain refers to this vbd as */
+ unsigned char readonly; /* Non-zero -> read-only */
+ unsigned char type; /* VDISK_xxx */
+ u32 pdevice; /* phys device that this vbd maps to */
+ struct block_device *bdev;
+ sector_t size; /* Cached size parameter */
+};
+
+struct backend_info;
+
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned int irq;
+ /* Comms information. */
+ enum blkif_protocol blk_protocol;
+ union blkif_back_rings blk_rings;
+ struct vm_struct *blk_ring_area;
+ /* The VBD attached to this interface. */
+ struct vbd vbd;
+ /* Back pointer to the backend_info. */
+ struct backend_info *be;
+ /* Private fields. */
+ spinlock_t blk_ring_lock;
+ atomic_t refcnt;
+
+ wait_queue_head_t wq;
+ struct task_struct *xenblkd;
+ unsigned int waiting_reqs;
+ struct request_queue *plug;
+
+ /* statistics */
+ unsigned long st_print;
+ int st_rd_req;
+ int st_wr_req;
+ int st_oo_req;
+ int st_br_req;
+ int st_rd_sect;
+ int st_wr_sect;
+
+ wait_queue_head_t waiting_to_free;
+
+ grant_handle_t shmem_handle;
+ grant_ref_t shmem_ref;
+} blkif_t;
+
+blkif_t *blkif_alloc(domid_t domid);
+void blkif_disconnect(blkif_t *blkif);
+void blkif_free(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
+void vbd_resize(blkif_t *blkif);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b) \
+ do { \
+ if (atomic_dec_and_test(&(_b)->refcnt)) \
+ wake_up(&(_b)->waiting_to_free);\
+ } while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
+ unsigned minor, int readonly, int cdrom);
+void vbd_free(struct vbd *vbd);
+
+unsigned long long vbd_size(struct vbd *vbd);
+unsigned int vbd_info(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+struct phys_req {
+ unsigned short dev;
+ unsigned short nr_sects;
+ struct block_device *bdev;
+ blkif_sector_t sector_number;
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
+
+int blkif_interface_init(void);
+
+int blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id);
+int blkif_schedule(void *arg);
+
+int blkback_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
new file mode 100644
index 0000000..e397a41
--- /dev/null
+++ b/drivers/xen/blkback/interface.c
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <linux/kthread.h>
+
+static struct kmem_cache *blkif_cachep;
+
+blkif_t *blkif_alloc(domid_t domid)
+{
+ blkif_t *blkif;
+
+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+ if (!blkif)
+ return ERR_PTR(-ENOMEM);
+
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+ spin_lock_init(&blkif->blk_ring_lock);
+ atomic_set(&blkif->refcnt, 1);
+ init_waitqueue_head(&blkif->wq);
+ blkif->st_print = jiffies;
+ init_waitqueue_head(&blkif->waiting_to_free);
+
+ return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+ struct gnttab_map_grant_ref op;
+
+ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, shared_page, blkif->domid);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+ BUG();
+
+ if (op.status) {
+ DPRINTK(" Grant table operation failure !\n");
+ return op.status;
+ }
+
+ blkif->shmem_ref = shared_page;
+ blkif->shmem_handle = op.handle;
+
+ return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+ GNTMAP_host_map, blkif->shmem_handle);
+
+ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+ BUG();
+}
+
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+ int err;
+
+ /* Already connected through? */
+ if (blkif->irq)
+ return 0;
+
+ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
+ return -ENOMEM;
+
+ err = map_frontend_page(blkif, shared_page);
+ if (err) {
+ free_vm_area(blkif->blk_ring_area);
+ return err;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ {
+ struct blkif_sring *sring;
+ sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ struct blkif_x86_32_sring *sring_x86_32;
+ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ struct blkif_x86_64_sring *sring_x86_64;
+ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+ if (err < 0)
+ {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ return err;
+ }
+ blkif->irq = err;
+
+ return 0;
+}
+
+void blkif_disconnect(blkif_t *blkif)
+{
+ if (blkif->xenblkd) {
+ kthread_stop(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ }
+
+ atomic_dec(&blkif->refcnt);
+ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+ atomic_inc(&blkif->refcnt);
+
+ if (blkif->irq) {
+ unbind_from_irqhandler(blkif->irq, blkif);
+ blkif->irq = 0;
+ }
+
+ if (blkif->blk_rings.common.sring) {
+ unmap_frontend_page(blkif);
+ free_vm_area(blkif->blk_ring_area);
+ blkif->blk_rings.common.sring = NULL;
+ }
+}
+
+void blkif_free(blkif_t *blkif)
+{
+ if (!atomic_dec_and_test(&blkif->refcnt))
+ BUG();
+ kmem_cache_free(blkif_cachep, blkif);
+}
+
+int __init blkif_interface_init(void)
+{
+ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
+ 0, 0, NULL);
+ if (!blkif_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
new file mode 100644
index 0000000..8c91a2f
--- /dev/null
+++ b/drivers/xen/blkback/vbd.c
@@ -0,0 +1,163 @@
+/******************************************************************************
+ * blkback/vbd.c
+ *
+ * Routines for managing virtual block devices (VBDs).
+ *
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
+ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
+
+unsigned long long vbd_size(struct vbd *vbd)
+{
+ return vbd_sz(vbd);
+}
+
+unsigned int vbd_info(struct vbd *vbd)
+{
+ return vbd->type | (vbd->readonly?VDISK_READONLY:0);
+}
+
+unsigned long vbd_secsize(struct vbd *vbd)
+{
+ return bdev_logical_block_size(vbd->bdev);
+}
+
+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
+ unsigned minor, int readonly, int cdrom)
+{
+ struct vbd *vbd;
+ struct block_device *bdev;
+
+ vbd = &blkif->vbd;
+ vbd->handle = handle;
+ vbd->readonly = readonly;
+ vbd->type = 0;
+
+ vbd->pdevice = MKDEV(major, minor);
+
+ bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
+ FMODE_READ : FMODE_WRITE, NULL);
+
+ if (IS_ERR(bdev)) {
+ DPRINTK("vbd_creat: device %08x could not be opened.\n",
+ vbd->pdevice);
+ return -ENOENT;
+ }
+
+ vbd->bdev = bdev;
+ vbd->size = vbd_size(vbd);
+
+ if (vbd->bdev->bd_disk == NULL) {
+ DPRINTK("vbd_creat: device %08x doesn't exist.\n",
+ vbd->pdevice);
+ vbd_free(vbd);
+ return -ENOENT;
+ }
+
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+ vbd->type |= VDISK_CDROM;
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+ vbd->type |= VDISK_REMOVABLE;
+
+ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+ handle, blkif->domid);
+ return 0;
+}
+
+void vbd_free(struct vbd *vbd)
+{
+ if (vbd->bdev)
+ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
+ vbd->bdev = NULL;
+}
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
+{
+ struct vbd *vbd = &blkif->vbd;
+ int rc = -EACCES;
+
+ if ((operation != READ) && vbd->readonly)
+ goto out;
+
+ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
+ goto out;
+
+ req->dev = vbd->pdevice;
+ req->bdev = vbd->bdev;
+ rc = 0;
+
+ out:
+ return rc;
+}
+
+void vbd_resize(blkif_t *blkif)
+{
+ struct vbd *vbd = &blkif->vbd;
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = blkback_xenbus(blkif->be);
+ unsigned long long new_size = vbd_size(vbd);
+
+ printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n",
+ blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
+ printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size);
+ vbd->size = new_size;
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ printk(KERN_WARNING "Error starting transaction");
+ return;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
+ vbd_size(vbd));
+ if (err) {
+ printk(KERN_WARNING "Error writing new size");
+ goto abort;
+ }
+ /*
+ * Write the current state; we will use this to synchronize
+ * the front-end. If the current state is "connected" the
+ * front-end will get the new size information online.
+ */
+ err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+ if (err) {
+ printk(KERN_WARNING "Error writing the state");
+ goto abort;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ printk(KERN_WARNING "Error ending transaction");
+abort:
+ xenbus_transaction_end(xbt, 1);
+}
diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
new file mode 100644
index 0000000..031bc3d
--- /dev/null
+++ b/drivers/xen/blkback/xenbus.c
@@ -0,0 +1,559 @@
+/* Xenbus code for blkif backend
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ Copyright (C) 2005 XenSource Ltd
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include "common.h"
+
+#undef DPRINTK
+#define DPRINTK(fmt, args...) \
+ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
+ __FUNCTION__, __LINE__, ##args)
+
+struct backend_info
+{
+ struct xenbus_device *dev;
+ blkif_t *blkif;
+ struct xenbus_watch backend_watch;
+ unsigned major;
+ unsigned minor;
+ char *mode;
+};
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+ unsigned int);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be)
+{
+ return be->dev;
+}
+
+static int blkback_name(blkif_t *blkif, char *buf)
+{
+ char *devpath, *devname;
+ struct xenbus_device *dev = blkif->be->dev;
+
+ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+ if (IS_ERR(devpath))
+ return PTR_ERR(devpath);
+
+ if ((devname = strstr(devpath, "/dev/")) != NULL)
+ devname += strlen("/dev/");
+ else
+ devname = devpath;
+
+ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
+ kfree(devpath);
+
+ return 0;
+}
+
+static void update_blkif_status(blkif_t *blkif)
+{
+ int err;
+ char name[TASK_COMM_LEN];
+
+ /* Not ready to connect? */
+ if (!blkif->irq || !blkif->vbd.bdev)
+ return;
+
+ /* Already connected? */
+ if (blkif->be->dev->state == XenbusStateConnected)
+ return;
+
+ /* Attempt to connect: exit if we fail to. */
+ connect(blkif->be);
+ if (blkif->be->dev->state != XenbusStateConnected)
+ return;
+
+ err = blkback_name(blkif, name);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+ return;
+ }
+
+ err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "block flush");
+ return;
+ }
+ invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
+ if (IS_ERR(blkif->xenblkd)) {
+ err = PTR_ERR(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+ }
+}
+
+
+/****************************************************************
+ * sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *_dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct xenbus_device *dev = to_xenbus_device(_dev); \
+ struct backend_info *be = dev_get_drvdata(&dev->dev); \
+ \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+
+static struct attribute *vbdstat_attrs[] = {
+ &dev_attr_oo_req.attr,
+ &dev_attr_rd_req.attr,
+ &dev_attr_wr_req.attr,
+ &dev_attr_br_req.attr,
+ &dev_attr_rd_sect.attr,
+ &dev_attr_wr_sect.attr,
+ NULL
+};
+
+static struct attribute_group vbdstat_group = {
+ .name = "statistics",
+ .attrs = vbdstat_attrs,
+};
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+ int error;
+
+ error = device_create_file(&dev->dev, &dev_attr_physical_device);
+ if (error)
+ goto fail1;
+
+ error = device_create_file(&dev->dev, &dev_attr_mode);
+ if (error)
+ goto fail2;
+
+ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
+ if (error)
+ goto fail3;
+
+ return 0;
+
+fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+fail2: device_remove_file(&dev->dev, &dev_attr_mode);
+fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
+ return error;
+}
+
+void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+ device_remove_file(&dev->dev, &dev_attr_mode);
+ device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+static int blkback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ DPRINTK("");
+
+ if (be->major || be->minor)
+ xenvbd_sysfs_delif(dev);
+
+ if (be->backend_watch.node) {
+ unregister_xenbus_watch(&be->backend_watch);
+ kfree(be->backend_watch.node);
+ be->backend_watch.node = NULL;
+ }
+
+ if (be->blkif) {
+ blkif_disconnect(be->blkif);
+ vbd_free(&be->blkif->vbd);
+ blkif_free(be->blkif);
+ be->blkif = NULL;
+ }
+
+ kfree(be);
+ dev_set_drvdata(&dev->dev, NULL);
+ return 0;
+}
+
+int blkback_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state)
+{
+ struct xenbus_device *dev = be->dev;
+ int err;
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+ "%d", state);
+ if (err)
+ xenbus_dev_fatal(dev, err, "writing feature-barrier");
+
+ return err;
+}
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers. Switch to InitWait.
+ */
+static int blkback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+ be->dev = dev;
+ dev_set_drvdata(&dev->dev, be);
+
+ be->blkif = blkif_alloc(dev->otherend_id);
+ if (IS_ERR(be->blkif)) {
+ err = PTR_ERR(be->blkif);
+ be->blkif = NULL;
+ xenbus_dev_fatal(dev, err, "creating block interface");
+ goto fail;
+ }
+
+ /* setup back pointer */
+ be->blkif->be = be;
+
+ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
+ "%s/%s", dev->nodename, "physical-device");
+ if (err)
+ goto fail;
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ DPRINTK("failed");
+ blkback_remove(dev);
+ return err;
+}
+
+
+/**
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node. Read it and the mode node, and create a vbd. If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int err;
+ unsigned major;
+ unsigned minor;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_watch);
+ struct xenbus_device *dev = be->dev;
+ int cdrom = 0;
+ char *device_type;
+
+ DPRINTK("");
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+ &major, &minor);
+ if (XENBUS_EXIST_ERR(err)) {
+ /* Since this watch will fire once immediately after it is
+ registered, we expect this. Ignore it, and wait for the
+ hotplug scripts. */
+ return;
+ }
+ if (err != 2) {
+ xenbus_dev_fatal(dev, err, "reading physical-device");
+ return;
+ }
+
+ if ((be->major || be->minor) &&
+ ((be->major != major) || (be->minor != minor))) {
+ printk(KERN_WARNING
+ "blkback: changing physical device (from %x:%x to "
+ "%x:%x) not supported.\n", be->major, be->minor,
+ major, minor);
+ return;
+ }
+
+ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+ if (IS_ERR(be->mode)) {
+ err = PTR_ERR(be->mode);
+ be->mode = NULL;
+ xenbus_dev_fatal(dev, err, "reading mode");
+ return;
+ }
+
+ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+ if (!IS_ERR(device_type)) {
+ cdrom = strcmp(device_type, "cdrom") == 0;
+ kfree(device_type);
+ }
+
+ if (be->major == 0 && be->minor == 0) {
+ /* Front end dir is a number, which is used as the handle. */
+
+ char *p = strrchr(dev->otherend, '/') + 1;
+ long handle = simple_strtoul(p, NULL, 0);
+
+ be->major = major;
+ be->minor = minor;
+
+ err = vbd_create(be->blkif, handle, major, minor,
+ (NULL == strchr(be->mode, 'w')), cdrom);
+ if (err) {
+ be->major = be->minor = 0;
+ xenbus_dev_fatal(dev, err, "creating vbd structure");
+ return;
+ }
+
+ err = xenvbd_sysfs_addif(dev);
+ if (err) {
+ vbd_free(&be->blkif->vbd);
+ be->major = be->minor = 0;
+ xenbus_dev_fatal(dev, err, "creating sysfs entries");
+ return;
+ }
+
+ /* We're potentially connected now */
+ update_blkif_status(be->blkif);
+ }
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+ int err;
+
+ DPRINTK("%s", xenbus_strstate(frontend_state));
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+ __FUNCTION__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ /* Ensure we connect even when two watches fire in
+ close successsion and we miss the intermediate value
+ of frontend_state. */
+ if (dev->state == XenbusStateConnected)
+ break;
+
+ /* Enforce precondition before potential leak point.
+ * blkif_disconnect() is idempotent.
+ */
+ blkif_disconnect(be->blkif);
+
+ err = connect_ring(be);
+ if (err)
+ break;
+ update_blkif_status(be->blkif);
+ break;
+
+ case XenbusStateClosing:
+ blkif_disconnect(be->blkif);
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ /* implies blkif_disconnect() via blkback_remove() */
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+/* ** Connection ** */
+
+
+/**
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = be->dev;
+
+ DPRINTK("%s", dev->otherend);
+
+ /* Supply the information about the device the frontend needs */
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ return;
+ }
+
+ err = blkback_barrier(xbt, be, 1);
+ if (err)
+ goto abort;
+
+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+ vbd_size(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sectors",
+ dev->nodename);
+ goto abort;
+ }
+
+ /* FIXME: use a typename instead */
+ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+ vbd_info(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/info",
+ dev->nodename);
+ goto abort;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+ vbd_secsize(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+ dev->nodename);
+ goto abort;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ xenbus_dev_fatal(dev, err, "ending transaction");
+
+ err = xenbus_switch_state(dev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(dev, err, "switching to Connected state",
+ dev->nodename);
+
+ return;
+ abort:
+ xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ unsigned long ring_ref;
+ unsigned int evtchn;
+ char protocol[64] = "";
+ int err;
+
+ DPRINTK("%s", dev->otherend);
+
+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+ "%63s", protocol, NULL);
+ if (err)
+ strcpy(protocol, "unspecified, assuming native");
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+ else {
+ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+ return -1;
+ }
+ printk(KERN_INFO
+ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
+ ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+
+ /* Map the shared frame, irq etc. */
+ err = blkif_map(be->blkif, ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+ ring_ref, evtchn);
+ return err;
+ }
+
+ return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blkback_ids[] = {
+ { "vbd" },
+ { "" }
+};
+
+
+static struct xenbus_driver blkback = {
+ .name = "vbd",
+ .owner = THIS_MODULE,
+ .ids = blkback_ids,
+ .probe = blkback_probe,
+ .remove = blkback_remove,
+ .otherend_changed = frontend_changed
+};
+
+
+int blkif_xenbus_init(void)
+{
+ return xenbus_register_backend(&blkback);
+}
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 7468147..8b7fc9a6 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -97,6 +97,7 @@ struct irq_info
unsigned short gsi;
unsigned char vector;
unsigned char flags;
+ uint16_t domid;
} pirq;
} u;
};
@@ -114,7 +115,7 @@ struct cpu_evtchn_s {
static __initdata struct cpu_evtchn_s init_evtchn_mask = {
.bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
};
-static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask;
static inline unsigned long *cpu_evtchn_mask(int cpu)
{
@@ -153,11 +154,13 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
}
static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq,
- unsigned short gsi, unsigned short vector)
+ unsigned short gsi, unsigned short vector,
+ domid_t domid)
{
return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
.cpu = 0,
- .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } };
+ .u.pirq = { .pirq = pirq, .gsi = gsi,
+ .vector = vector, .domid = domid } };
}
/*
@@ -277,7 +280,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
BUG_ON(irq == -1);
#ifdef CONFIG_SMP
- cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
+ cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu));
#endif
clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
@@ -294,7 +297,7 @@ static void init_evtchn_cpu_bindings(void)
/* By default all event channels notify CPU#0. */
for_each_irq_desc(i, desc) {
- cpumask_copy(desc->affinity, cpumask_of(0));
+ cpumask_copy(desc->irq_data.affinity, cpumask_of(0));
}
#endif
@@ -376,81 +379,69 @@ static void unmask_evtchn(int port)
put_cpu();
}
-static int get_nr_hw_irqs(void)
+static int xen_allocate_irq_dynamic(void)
{
- int ret = 1;
+ int first = 0;
+ int irq;
#ifdef CONFIG_X86_IO_APIC
- ret = get_nr_irqs_gsi();
+ /*
+ * For an HVM guest or domain 0 which see "real" (emulated or
+ * actual repectively) GSIs we allocate dynamic IRQs
+ * e.g. those corresponding to event channels or MSIs
+ * etc. from the range above those "real" GSIs to avoid
+ * collisions.
+ */
+ if (xen_initial_domain() || xen_hvm_domain())
+ first = get_nr_irqs_gsi();
#endif
- return ret;
-}
+retry:
+ irq = irq_alloc_desc_from(first, -1);
-static int find_unbound_pirq(int type)
-{
- int rc, i;
- struct physdev_get_free_pirq op_get_free_pirq;
- op_get_free_pirq.type = type;
+ if (irq == -ENOMEM && first > NR_IRQS_LEGACY) {
+ printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n");
+ first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY);
+ goto retry;
+ }
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
- if (!rc)
- return op_get_free_pirq.pirq;
+ if (irq < 0)
+ panic("No available IRQ to bind to: increase nr_irqs!\n");
- for (i = 0; i < nr_irqs; i++) {
- if (pirq_to_irq[i] < 0)
- return i;
- }
- return -1;
+ return irq;
}
-static int find_unbound_irq(void)
+static int xen_allocate_irq_gsi(unsigned gsi)
{
- struct irq_data *data;
- int irq, res;
- int bottom = get_nr_hw_irqs();
- int top = nr_irqs-1;
-
- if (bottom == nr_irqs)
- goto no_irqs;
+ int irq;
- /* This loop starts from the top of IRQ space and goes down.
- * We need this b/c if we have a PCI device in a Xen PV guest
- * we do not have an IO-APIC (though the backend might have them)
- * mapped in. To not have a collision of physical IRQs with the Xen
- * event channels start at the top of the IRQ space for virtual IRQs.
+ /*
+ * A PV guest has no concept of a GSI (since it has no ACPI
+ * nor access to/knowledge of the physical APICs). Therefore
+ * all IRQs are dynamically allocated from the entire IRQ
+ * space.
*/
- for (irq = top; irq > bottom; irq--) {
- data = irq_get_irq_data(irq);
- /* only 15->0 have init'd desc; handle irq > 16 */
- if (!data)
- break;
- if (data->chip == &no_irq_chip)
- break;
- if (data->chip != &xen_dynamic_chip)
- continue;
- if (irq_info[irq].type == IRQT_UNBOUND)
- return irq;
- }
+ if (xen_pv_domain() && !xen_initial_domain())
+ return xen_allocate_irq_dynamic();
- if (irq == bottom)
- goto no_irqs;
+ /* Legacy IRQ descriptors are already allocated by the arch. */
+ if (gsi < NR_IRQS_LEGACY)
+ return gsi;
- res = irq_alloc_desc_at(irq, -1);
-
- if (WARN_ON(res != irq))
- return -1;
+ irq = irq_alloc_desc_at(gsi, -1);
+ if (irq < 0)
+ panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq);
return irq;
-
-no_irqs:
- panic("No available IRQ to bind to: increase nr_irqs!\n");
}
-static bool identity_mapped_irq(unsigned irq)
+static void xen_free_irq(unsigned irq)
{
- /* identity map all the hardware irqs */
- return irq < get_nr_hw_irqs();
+ /* Legacy IRQ descriptors are managed by the arch. */
+ if (irq < NR_IRQS_LEGACY)
+ return;
+
+ irq_free_desc(irq);
}
static void pirq_unmask_notify(int irq)
@@ -486,7 +477,7 @@ static bool probing_irq(int irq)
return desc && desc->action == NULL;
}
-static unsigned int startup_pirq(unsigned int irq)
+static unsigned int __startup_pirq(unsigned int irq)
{
struct evtchn_bind_pirq bind_pirq;
struct irq_info *info = info_for_irq(irq);
@@ -524,9 +515,15 @@ out:
return 0;
}
-static void shutdown_pirq(unsigned int irq)
+static unsigned int startup_pirq(struct irq_data *data)
+{
+ return __startup_pirq(data->irq);
+}
+
+static void shutdown_pirq(struct irq_data *data)
{
struct evtchn_close close;
+ unsigned int irq = data->irq;
struct irq_info *info = info_for_irq(irq);
int evtchn = evtchn_from_irq(irq);
@@ -546,20 +543,20 @@ static void shutdown_pirq(unsigned int irq)
info->evtchn = 0;
}
-static void enable_pirq(unsigned int irq)
+static void enable_pirq(struct irq_data *data)
{
- startup_pirq(irq);
+ startup_pirq(data);
}
-static void disable_pirq(unsigned int irq)
+static void disable_pirq(struct irq_data *data)
{
}
-static void ack_pirq(unsigned int irq)
+static void ack_pirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
- move_native_irq(irq);
+ move_native_irq(data->irq);
if (VALID_EVTCHN(evtchn)) {
mask_evtchn(evtchn);
@@ -567,23 +564,6 @@ static void ack_pirq(unsigned int irq)
}
}
-static void end_pirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (WARN_ON(!desc))
- return;
-
- if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
- (IRQ_DISABLED|IRQ_PENDING)) {
- shutdown_pirq(irq);
- } else if (VALID_EVTCHN(evtchn)) {
- unmask_evtchn(evtchn);
- pirq_unmask_notify(irq);
- }
-}
-
static int find_irq_by_gsi(unsigned gsi)
{
int irq;
@@ -638,14 +618,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
goto out; /* XXX need refcount? */
}
- /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
- * we are using the !xen_initial_domain() to drop in the function.*/
- if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
- xen_pv_domain())) {
- irq = gsi;
- irq_alloc_desc_at(irq, -1);
- } else
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_gsi(gsi);
set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
handle_level_irq, name);
@@ -658,12 +631,12 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
* this in the priv domain. */
if (xen_initial_domain() &&
HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
- irq_free_desc(irq);
+ xen_free_irq(irq);
irq = -ENOSPC;
goto out;
}
- irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector);
+ irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector, DOMID_SELF);
irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
pirq_to_irq[pirq] = irq;
@@ -674,87 +647,47 @@ out:
}
#ifdef CONFIG_PCI_MSI
-#include <linux/msi.h>
-#include "../pci/msi.h"
-
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
{
- spin_lock(&irq_mapping_update_lock);
-
- if (alloc & XEN_ALLOC_IRQ) {
- *irq = find_unbound_irq();
- if (*irq == -1)
- goto out;
- }
-
- if (alloc & XEN_ALLOC_PIRQ) {
- *pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
- if (*pirq == -1)
- goto out;
- }
+ int rc;
+ struct physdev_get_free_pirq op_get_free_pirq;
- set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
- handle_level_irq, name);
+ op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
- irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
- pirq_to_irq[*pirq] = *irq;
+ WARN_ONCE(rc == -ENOSYS,
+ "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
-out:
- spin_unlock(&irq_mapping_update_lock);
+ return rc ? -1 : op_get_free_pirq.pirq;
}
-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ int pirq, int vector, const char *name,
+ domid_t domid)
{
- int irq = -1;
- struct physdev_map_pirq map_irq;
- int rc;
- int pos;
- u32 table_offset, bir;
-
- memset(&map_irq, 0, sizeof(map_irq));
- map_irq.domid = DOMID_SELF;
- map_irq.type = MAP_PIRQ_TYPE_MSI;
- map_irq.index = -1;
- map_irq.pirq = -1;
- map_irq.bus = dev->bus->number;
- map_irq.devfn = dev->devfn;
-
- if (type == PCI_CAP_ID_MSIX) {
- pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-
- pci_read_config_dword(dev, msix_table_offset_reg(pos),
- &table_offset);
- bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
-
- map_irq.table_base = pci_resource_start(dev, bir);
- map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
- }
+ int irq, ret;
spin_lock(&irq_mapping_update_lock);
- irq = find_unbound_irq();
-
+ irq = xen_allocate_irq_dynamic();
if (irq == -1)
goto out;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
- if (rc) {
- printk(KERN_WARNING "xen map irq failed %d\n", rc);
-
- irq_free_desc(irq);
-
- irq = -1;
- goto out;
- }
- irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
-
set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
- handle_level_irq,
- (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
+ handle_level_irq, name);
+ irq_info[irq] = mk_pirq_info(0, pirq, 0, vector, domid);
+ pirq_to_irq[pirq] = irq;
+ ret = set_irq_msi(irq, msidesc);
+ if (ret < 0)
+ goto error_irq;
out:
spin_unlock(&irq_mapping_update_lock);
return irq;
+error_irq:
+ spin_unlock(&irq_mapping_update_lock);
+ xen_free_irq(irq);
+ return -1;
}
#endif
@@ -773,17 +706,25 @@ int xen_destroy_irq(int irq)
if (xen_initial_domain()) {
unmap_irq.pirq = info->u.pirq.pirq;
- unmap_irq.domid = DOMID_SELF;
+ unmap_irq.domid = info->u.pirq.domid;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
- if (rc) {
+ /* If another domain quits without making the pci_disable_msix
+ * call, the Xen hypervisor takes care of freeing the PIRQs
+ * (free_domain_pirqs).
+ */
+ if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF))
+ printk(KERN_INFO "domain %d does not have %d anymore\n",
+ info->u.pirq.domid, info->u.pirq.pirq);
+ else if (rc) {
printk(KERN_WARNING "unmap irq failed %d\n", rc);
goto out;
}
- pirq_to_irq[info->u.pirq.pirq] = -1;
}
+ pirq_to_irq[info->u.pirq.pirq] = -1;
+
irq_info[irq] = mk_unbound_info();
- irq_free_desc(irq);
+ xen_free_irq(irq);
out:
spin_unlock(&irq_mapping_update_lock);
@@ -805,6 +746,12 @@ int xen_irq_from_pirq(unsigned pirq)
return pirq_to_irq[pirq];
}
+
+int xen_pirq_from_irq(unsigned irq)
+{
+ return pirq_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
int bind_evtchn_to_irq(unsigned int evtchn)
{
int irq;
@@ -814,7 +761,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
irq = evtchn_to_irq[evtchn];
if (irq == -1) {
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_dynamic();
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
handle_fasteoi_irq, "event");
@@ -839,7 +786,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
irq = per_cpu(ipi_to_irq, cpu)[ipi];
if (irq == -1) {
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_dynamic();
if (irq < 0)
goto out;
@@ -864,6 +811,21 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
return irq;
}
+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+ unsigned int remote_port)
+{
+ struct evtchn_bind_interdomain bind_interdomain;
+ int err;
+
+ bind_interdomain.remote_dom = remote_domain;
+ bind_interdomain.remote_port = remote_port;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+ &bind_interdomain);
+
+ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
+}
+
int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
@@ -875,7 +837,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
- irq = find_unbound_irq();
+ irq = xen_allocate_irq_dynamic();
set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
handle_percpu_irq, "virq");
@@ -934,7 +896,7 @@ static void unbind_from_irq(unsigned int irq)
if (irq_info[irq].type != IRQT_UNBOUND) {
irq_info[irq] = mk_unbound_info();
- irq_free_desc(irq);
+ xen_free_irq(irq);
}
spin_unlock(&irq_mapping_update_lock);
@@ -959,6 +921,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
@@ -990,7 +975,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
if (irq < 0)
return irq;
- irqflags |= IRQF_NO_SUSPEND;
+ irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
@@ -1234,11 +1219,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
return 0;
}
-static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
+ bool force)
{
unsigned tcpu = cpumask_first(dest);
- return rebind_irq_to_cpu(irq, tcpu);
+ return rebind_irq_to_cpu(data->irq, tcpu);
}
int resend_irq_on_evtchn(unsigned int irq)
@@ -1257,35 +1243,35 @@ int resend_irq_on_evtchn(unsigned int irq)
return 1;
}
-static void enable_dynirq(unsigned int irq)
+static void enable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
if (VALID_EVTCHN(evtchn))
unmask_evtchn(evtchn);
}
-static void disable_dynirq(unsigned int irq)
+static void disable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
if (VALID_EVTCHN(evtchn))
mask_evtchn(evtchn);
}
-static void ack_dynirq(unsigned int irq)
+static void ack_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
- move_masked_irq(irq);
+ move_masked_irq(data->irq);
if (VALID_EVTCHN(evtchn))
unmask_evtchn(evtchn);
}
-static int retrigger_dynirq(unsigned int irq)
+static int retrigger_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(irq);
+ int evtchn = evtchn_from_irq(data->irq);
struct shared_info *sh = HYPERVISOR_shared_info;
int ret = 0;
@@ -1334,7 +1320,7 @@ static void restore_cpu_pirqs(void)
printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
- startup_pirq(irq);
+ __startup_pirq(irq);
}
}
@@ -1442,10 +1428,21 @@ void xen_poll_irq(int irq)
xen_poll_irq_timeout(irq, 0 /* no timeout */);
}
+/* Check whether the IRQ line is shared with other guests. */
+int xen_test_irq_shared(int irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+ struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq };
+
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ return 0;
+ return !(irq_status.flags & XENIRQSTAT_shared);
+}
+EXPORT_SYMBOL_GPL(xen_test_irq_shared);
+
void xen_irq_resume(void)
{
unsigned int cpu, irq, evtchn;
- struct irq_desc *desc;
init_evtchn_cpu_bindings();
@@ -1465,66 +1462,48 @@ void xen_irq_resume(void)
restore_cpu_ipis(cpu);
}
- /*
- * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
- * are not handled by the IRQ core.
- */
- for_each_irq_desc(irq, desc) {
- if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
- continue;
- if (desc->status & IRQ_DISABLED)
- continue;
-
- evtchn = evtchn_from_irq(irq);
- if (evtchn == -1)
- continue;
-
- unmask_evtchn(evtchn);
- }
-
restore_cpu_pirqs();
}
static struct irq_chip xen_dynamic_chip __read_mostly = {
- .name = "xen-dyn",
+ .name = "xen-dyn",
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
+ .irq_disable = disable_dynirq,
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
- .eoi = ack_dynirq,
- .set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
+ .irq_eoi = ack_dynirq,
+ .irq_set_affinity = set_affinity_irq,
+ .irq_retrigger = retrigger_dynirq,
};
static struct irq_chip xen_pirq_chip __read_mostly = {
- .name = "xen-pirq",
+ .name = "xen-pirq",
- .startup = startup_pirq,
- .shutdown = shutdown_pirq,
+ .irq_startup = startup_pirq,
+ .irq_shutdown = shutdown_pirq,
- .enable = enable_pirq,
- .unmask = enable_pirq,
+ .irq_enable = enable_pirq,
+ .irq_unmask = enable_pirq,
- .disable = disable_pirq,
- .mask = disable_pirq,
+ .irq_disable = disable_pirq,
+ .irq_mask = disable_pirq,
- .ack = ack_pirq,
- .end = end_pirq,
+ .irq_ack = ack_pirq,
- .set_affinity = set_affinity_irq,
+ .irq_set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
+ .irq_retrigger = retrigger_dynirq,
};
static struct irq_chip xen_percpu_chip __read_mostly = {
- .name = "xen-percpu",
+ .name = "xen-percpu",
- .disable = disable_dynirq,
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
+ .irq_disable = disable_dynirq,
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
- .ack = ack_dynirq,
+ .irq_ack = ack_dynirq,
};
int xen_set_callback_via(uint64_t via)
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
new file mode 100644
index 0000000..a7ffdfe
--- /dev/null
+++ b/drivers/xen/gntalloc.c
@@ -0,0 +1,545 @@
+/******************************************************************************
+ * gntalloc.c
+ *
+ * Device for creating grant references (in user-space) that may be shared
+ * with other domains.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * This driver exists to allow userspace programs in Linux to allocate kernel
+ * memory that will later be shared with another domain. Without this device,
+ * Linux userspace programs cannot create grant references.
+ *
+ * How this stuff works:
+ * X -> granting a page to Y
+ * Y -> mapping the grant from X
+ *
+ * 1. X uses the gntalloc device to allocate a page of kernel memory, P.
+ * 2. X creates an entry in the grant table that says domid(Y) can access P.
+ * This is done without a hypercall unless the grant table needs expansion.
+ * 3. X gives the grant reference identifier, GREF, to Y.
+ * 4. Y maps the page, either directly into kernel memory for use in a backend
+ * driver, or via a the gntdev device to map into the address space of an
+ * application running in Y. This is the first point at which Xen does any
+ * tracking of the page.
+ * 5. A program in X mmap()s a segment of the gntalloc device that corresponds
+ * to the shared page, and can now communicate with Y over the shared page.
+ *
+ *
+ * NOTE TO USERSPACE LIBRARIES:
+ * The grant allocation and mmap()ing are, naturally, two separate operations.
+ * You set up the sharing by calling the create ioctl() and then the mmap().
+ * Teardown requires munmap() and either close() or ioctl().
+ *
+ * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant
+ * reference, this device can be used to consume kernel memory by leaving grant
+ * references mapped by another domain when an application exits. Therefore,
+ * there is a global limit on the number of pages that can be allocated. When
+ * all references to the page are unmapped, it will be freed during the next
+ * grant operation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/gntalloc.h>
+#include <xen/events.h>
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by "
+ "the gntalloc device");
+
+static LIST_HEAD(gref_list);
+static DEFINE_SPINLOCK(gref_lock);
+static int gref_size;
+
+struct notify_info {
+ uint16_t pgoff:12; /* Bits 0-11: Offset of the byte to clear */
+ uint16_t flags:2; /* Bits 12-13: Unmap notification flags */
+ int event; /* Port (event channel) to notify */
+};
+
+/* Metadata on a grant reference. */
+struct gntalloc_gref {
+ struct list_head next_gref; /* list entry gref_list */
+ struct list_head next_file; /* list entry file->list, if open */
+ struct page *page; /* The shared page */
+ uint64_t file_index; /* File offset for mmap() */
+ unsigned int users; /* Use count - when zero, waiting on Xen */
+ grant_ref_t gref_id; /* The grant reference number */
+ struct notify_info notify; /* Unmap notification */
+};
+
+struct gntalloc_file_private_data {
+ struct list_head list;
+ uint64_t index;
+};
+
+static void __del_gref(struct gntalloc_gref *gref);
+
+static void do_cleanup(void)
+{
+ struct gntalloc_gref *gref, *n;
+ list_for_each_entry_safe(gref, n, &gref_list, next_gref) {
+ if (!gref->users)
+ __del_gref(gref);
+ }
+}
+
+static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
+ uint32_t *gref_ids, struct gntalloc_file_private_data *priv)
+{
+ int i, rc, readonly;
+ LIST_HEAD(queue_gref);
+ LIST_HEAD(queue_file);
+ struct gntalloc_gref *gref;
+
+ readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE);
+ rc = -ENOMEM;
+ for (i = 0; i < op->count; i++) {
+ gref = kzalloc(sizeof(*gref), GFP_KERNEL);
+ if (!gref)
+ goto undo;
+ list_add_tail(&gref->next_gref, &queue_gref);
+ list_add_tail(&gref->next_file, &queue_file);
+ gref->users = 1;
+ gref->file_index = op->index + i * PAGE_SIZE;
+ gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!gref->page)
+ goto undo;
+
+ /* Grant foreign access to the page. */
+ gref->gref_id = gnttab_grant_foreign_access(op->domid,
+ pfn_to_mfn(page_to_pfn(gref->page)), readonly);
+ if (gref->gref_id < 0) {
+ rc = gref->gref_id;
+ goto undo;
+ }
+ gref_ids[i] = gref->gref_id;
+ }
+
+ /* Add to gref lists. */
+ spin_lock(&gref_lock);
+ list_splice_tail(&queue_gref, &gref_list);
+ list_splice_tail(&queue_file, &priv->list);
+ spin_unlock(&gref_lock);
+
+ return 0;
+
+undo:
+ spin_lock(&gref_lock);
+ gref_size -= (op->count - i);
+
+ list_for_each_entry(gref, &queue_file, next_file) {
+ /* __del_gref does not remove from queue_file */
+ __del_gref(gref);
+ }
+
+ /* It's possible for the target domain to map the just-allocated grant
+ * references by blindly guessing their IDs; if this is done, then
+ * __del_gref will leave them in the queue_gref list. They need to be
+ * added to the global list so that we can free them when they are no
+ * longer referenced.
+ */
+ if (unlikely(!list_empty(&queue_gref)))
+ list_splice_tail(&queue_gref, &gref_list);
+ spin_unlock(&gref_lock);
+ return rc;
+}
+
+static void __del_gref(struct gntalloc_gref *gref)
+{
+ if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+ uint8_t *tmp = kmap(gref->page);
+ tmp[gref->notify.pgoff] = 0;
+ kunmap(gref->page);
+ }
+ if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+ notify_remote_via_evtchn(gref->notify.event);
+
+ gref->notify.flags = 0;
+
+ if (gref->gref_id > 0) {
+ if (gnttab_query_foreign_access(gref->gref_id))
+ return;
+
+ if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
+ return;
+ }
+
+ gref_size--;
+ list_del(&gref->next_gref);
+
+ if (gref->page)
+ __free_page(gref->page);
+
+ kfree(gref);
+}
+
+/* finds contiguous grant references in a file, returns the first */
+static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv,
+ uint64_t index, uint32_t count)
+{
+ struct gntalloc_gref *rv = NULL, *gref;
+ list_for_each_entry(gref, &priv->list, next_file) {
+ if (gref->file_index == index && !rv)
+ rv = gref;
+ if (rv) {
+ if (gref->file_index != index)
+ return NULL;
+ index += PAGE_SIZE;
+ count--;
+ if (count == 0)
+ return rv;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * -------------------------------------
+ * File operations.
+ * -------------------------------------
+ */
+static int gntalloc_open(struct inode *inode, struct file *filp)
+{
+ struct gntalloc_file_private_data *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ goto out_nomem;
+ INIT_LIST_HEAD(&priv->list);
+
+ filp->private_data = priv;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ return 0;
+
+out_nomem:
+ return -ENOMEM;
+}
+
+static int gntalloc_release(struct inode *inode, struct file *filp)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+ struct gntalloc_gref *gref;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ spin_lock(&gref_lock);
+ while (!list_empty(&priv->list)) {
+ gref = list_entry(priv->list.next,
+ struct gntalloc_gref, next_file);
+ list_del(&gref->next_file);
+ gref->users--;
+ if (gref->users == 0)
+ __del_gref(gref);
+ }
+ kfree(priv);
+ spin_unlock(&gref_lock);
+
+ return 0;
+}
+
+static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv,
+ struct ioctl_gntalloc_alloc_gref __user *arg)
+{
+ int rc = 0;
+ struct ioctl_gntalloc_alloc_gref op;
+ uint32_t *gref_ids;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ if (copy_from_user(&op, arg, sizeof(op))) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ gref_ids = kzalloc(sizeof(gref_ids[0]) * op.count, GFP_TEMPORARY);
+ if (!gref_ids) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock(&gref_lock);
+ /* Clean up pages that were at zero (local) users but were still mapped
+ * by remote domains. Since those pages count towards the limit that we
+ * are about to enforce, removing them here is a good idea.
+ */
+ do_cleanup();
+ if (gref_size + op.count > limit) {
+ spin_unlock(&gref_lock);
+ rc = -ENOSPC;
+ goto out_free;
+ }
+ gref_size += op.count;
+ op.index = priv->index;
+ priv->index += op.count * PAGE_SIZE;
+ spin_unlock(&gref_lock);
+
+ rc = add_grefs(&op, gref_ids, priv);
+ if (rc < 0)
+ goto out_free;
+
+ /* Once we finish add_grefs, it is unsafe to touch the new reference,
+ * since it is possible for a concurrent ioctl to remove it (by guessing
+ * its index). If the userspace application doesn't provide valid memory
+ * to write the IDs to, then it will need to close the file in order to
+ * release - which it will do by segfaulting when it tries to access the
+ * IDs to close them.
+ */
+ if (copy_to_user(arg, &op, sizeof(op))) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+ if (copy_to_user(arg->gref_ids, gref_ids,
+ sizeof(gref_ids[0]) * op.count)) {
+ rc = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ kfree(gref_ids);
+out:
+ return rc;
+}
+
+static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv,
+ void __user *arg)
+{
+ int i, rc = 0;
+ struct ioctl_gntalloc_dealloc_gref op;
+ struct gntalloc_gref *gref, *n;
+
+ pr_debug("%s: priv %p\n", __func__, priv);
+
+ if (copy_from_user(&op, arg, sizeof(op))) {
+ rc = -EFAULT;
+ goto dealloc_grant_out;
+ }
+
+ spin_lock(&gref_lock);
+ gref = find_grefs(priv, op.index, op.count);
+ if (gref) {
+ /* Remove from the file list only, and decrease reference count.
+ * The later call to do_cleanup() will remove from gref_list and
+ * free the memory if the pages aren't mapped anywhere.
+ */
+ for (i = 0; i < op.count; i++) {
+ n = list_entry(gref->next_file.next,
+ struct gntalloc_gref, next_file);
+ list_del(&gref->next_file);
+ gref->users--;
+ gref = n;
+ }
+ } else {
+ rc = -EINVAL;
+ }
+
+ do_cleanup();
+
+ spin_unlock(&gref_lock);
+dealloc_grant_out:
+ return rc;
+}
+
+static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv,
+ void __user *arg)
+{
+ struct ioctl_gntalloc_unmap_notify op;
+ struct gntalloc_gref *gref;
+ uint64_t index;
+ int pgoff;
+ int rc;
+
+ if (copy_from_user(&op, arg, sizeof(op)))
+ return -EFAULT;
+
+ index = op.index & ~(PAGE_SIZE - 1);
+ pgoff = op.index & (PAGE_SIZE - 1);
+
+ spin_lock(&gref_lock);
+
+ gref = find_grefs(priv, index, 1);
+ if (!gref) {
+ rc = -ENOENT;
+ goto unlock_out;
+ }
+
+ if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+
+ gref->notify.flags = op.action;
+ gref->notify.pgoff = pgoff;
+ gref->notify.event = op.event_channel_port;
+ rc = 0;
+ unlock_out:
+ spin_unlock(&gref_lock);
+ return rc;
+}
+
+static long gntalloc_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+
+ switch (cmd) {
+ case IOCTL_GNTALLOC_ALLOC_GREF:
+ return gntalloc_ioctl_alloc(priv, (void __user *)arg);
+
+ case IOCTL_GNTALLOC_DEALLOC_GREF:
+ return gntalloc_ioctl_dealloc(priv, (void __user *)arg);
+
+ case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY:
+ return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg);
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static void gntalloc_vma_close(struct vm_area_struct *vma)
+{
+ struct gntalloc_gref *gref = vma->vm_private_data;
+ if (!gref)
+ return;
+
+ spin_lock(&gref_lock);
+ gref->users--;
+ if (gref->users == 0)
+ __del_gref(gref);
+ spin_unlock(&gref_lock);
+}
+
+static struct vm_operations_struct gntalloc_vmops = {
+ .close = gntalloc_vma_close,
+};
+
+static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct gntalloc_file_private_data *priv = filp->private_data;
+ struct gntalloc_gref *gref;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ int rv, i;
+
+ pr_debug("%s: priv %p, page %lu+%d\n", __func__,
+ priv, vma->vm_pgoff, count);
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ printk(KERN_ERR "%s: Mapping must be shared.\n", __func__);
+ return -EINVAL;
+ }
+
+ spin_lock(&gref_lock);
+ gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count);
+ if (gref == NULL) {
+ rv = -ENOENT;
+ pr_debug("%s: Could not find grant reference",
+ __func__);
+ goto out_unlock;
+ }
+
+ vma->vm_private_data = gref;
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTCOPY;
+ vma->vm_flags |= VM_PFNMAP | VM_PFN_AT_MMAP;
+
+ vma->vm_ops = &gntalloc_vmops;
+
+ for (i = 0; i < count; i++) {
+ gref->users++;
+ rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+ gref->page);
+ if (rv)
+ goto out_unlock;
+
+ gref = list_entry(gref->next_file.next,
+ struct gntalloc_gref, next_file);
+ }
+ rv = 0;
+
+out_unlock:
+ spin_unlock(&gref_lock);
+ return rv;
+}
+
+static const struct file_operations gntalloc_fops = {
+ .owner = THIS_MODULE,
+ .open = gntalloc_open,
+ .release = gntalloc_release,
+ .unlocked_ioctl = gntalloc_ioctl,
+ .mmap = gntalloc_mmap
+};
+
+/*
+ * -------------------------------------
+ * Module creation/destruction.
+ * -------------------------------------
+ */
+static struct miscdevice gntalloc_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntalloc",
+ .fops = &gntalloc_fops,
+};
+
+static int __init gntalloc_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&gntalloc_miscdev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register misc gntalloc device\n");
+ return err;
+ }
+
+ pr_debug("Created grant allocation device at %d,%d\n",
+ MISC_MAJOR, gntalloc_miscdev.minor);
+
+ return 0;
+}
+
+static void __exit gntalloc_exit(void)
+{
+ misc_deregister(&gntalloc_miscdev);
+}
+
+module_init(gntalloc_init);
+module_exit(gntalloc_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, "
+ "Daniel De Graaf <dgdegra@tycho.nsa.gov>");
+MODULE_DESCRIPTION("User-space grant reference allocator driver");
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 1e31cdc..d43ff30 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -32,10 +32,12 @@
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
+#include <linux/highmem.h>
#include <xen/xen.h>
#include <xen/grant_table.h>
#include <xen/gntdev.h>
+#include <xen/events.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
@@ -45,35 +47,46 @@ MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
"Gerd Hoffmann <kraxel@redhat.com>");
MODULE_DESCRIPTION("User-space granted page access driver");
-static int limit = 1024;
+static int limit = 1024*1024;
module_param(limit, int, 0644);
-MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
- "once by a gntdev instance");
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
+ "the gntdev device");
+
+static atomic_t pages_mapped = ATOMIC_INIT(0);
+
+static int use_ptemod;
struct gntdev_priv {
struct list_head maps;
- uint32_t used;
- uint32_t limit;
/* lock protects maps from concurrent changes */
spinlock_t lock;
struct mm_struct *mm;
struct mmu_notifier mn;
};
+struct unmap_notify {
+ int flags;
+ /* Address relative to the start of the grant_map */
+ int addr;
+ int event;
+};
+
struct grant_map {
struct list_head next;
- struct gntdev_priv *priv;
struct vm_area_struct *vma;
int index;
int count;
int flags;
- int is_mapped;
+ atomic_t users;
+ struct unmap_notify notify;
struct ioctl_gntdev_grant_ref *grants;
struct gnttab_map_grant_ref *map_ops;
struct gnttab_unmap_grant_ref *unmap_ops;
struct page **pages;
};
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages);
+
/* ------------------------------------------------------------------ */
static void gntdev_print_maps(struct gntdev_priv *priv,
@@ -82,9 +95,7 @@ static void gntdev_print_maps(struct gntdev_priv *priv,
#ifdef DEBUG
struct grant_map *map;
- pr_debug("maps list (priv %p, usage %d/%d)\n",
- priv, priv->used, priv->limit);
-
+ pr_debug("%s: maps list (priv %p)\n", __func__, priv);
list_for_each_entry(map, &priv->maps, next)
pr_debug(" index %2d, count %2d %s\n",
map->index, map->count,
@@ -115,14 +126,13 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
if (add->pages[i] == NULL)
goto err;
+ add->map_ops[i].handle = -1;
+ add->unmap_ops[i].handle = -1;
}
add->index = 0;
add->count = count;
- add->priv = priv;
-
- if (add->count + priv->used > priv->limit)
- goto err;
+ atomic_set(&add->users, 1);
return add;
@@ -154,7 +164,6 @@ static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
list_add_tail(&add->next, &priv->maps);
done:
- priv->used += add->count;
gntdev_print_maps(priv, "[new]", add->index);
}
@@ -166,57 +175,57 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
list_for_each_entry(map, &priv->maps, next) {
if (map->index != index)
continue;
- if (map->count != count)
- continue;
- return map;
- }
- return NULL;
-}
-
-static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
- unsigned long vaddr)
-{
- struct grant_map *map;
-
- list_for_each_entry(map, &priv->maps, next) {
- if (!map->vma)
- continue;
- if (vaddr < map->vma->vm_start)
- continue;
- if (vaddr >= map->vma->vm_end)
+ if (count && map->count != count)
continue;
return map;
}
return NULL;
}
-static int gntdev_del_map(struct grant_map *map)
+static void gntdev_put_map(struct grant_map *map)
{
int i;
- if (map->vma)
- return -EBUSY;
- for (i = 0; i < map->count; i++)
- if (map->unmap_ops[i].handle)
- return -EBUSY;
+ if (!map)
+ return;
- map->priv->used -= map->count;
- list_del(&map->next);
- return 0;
-}
+ if (!atomic_dec_and_test(&map->users))
+ return;
-static void gntdev_free_map(struct grant_map *map)
-{
- int i;
+ atomic_sub(map->count, &pages_mapped);
- if (!map)
- return;
+ if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
+ notify_remote_via_evtchn(map->notify.event);
+ }
+
+ if (map->pages) {
+ if (!use_ptemod)
+ unmap_grant_pages(map, 0, map->count);
- if (map->pages)
for (i = 0; i < map->count; i++) {
- if (map->pages[i])
+ uint32_t check, *tmp;
+ if (!map->pages[i])
+ continue;
+ /* XXX When unmapping in an HVM domain, Xen will
+ * sometimes end up mapping the GFN to an invalid MFN.
+ * In this case, writes will be discarded and reads will
+ * return all 0xFF bytes. Leak these unusable GFNs
+ * until Xen supports fixing their p2m mapping.
+ *
+ * Confirmed present in Xen 4.1-RC3 with HVM source
+ */
+ tmp = kmap(map->pages[i]);
+ *tmp = 0xdeaddead;
+ mb();
+ check = *tmp;
+ kunmap(map->pages[i]);
+ if (check == 0xdeaddead)
__free_page(map->pages[i]);
+ else
+ pr_debug("Discard page %d=%ld\n", i,
+ page_to_pfn(map->pages[i]));
}
+ }
kfree(map->pages);
kfree(map->grants);
kfree(map->map_ops);
@@ -231,24 +240,39 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token,
{
struct grant_map *map = data;
unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+ int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
u64 pte_maddr;
BUG_ON(pgnr >= map->count);
pte_maddr = arbitrary_virt_to_machine(pte).maddr;
- gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
- GNTMAP_contains_pte | map->flags,
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
map->grants[pgnr].ref,
map->grants[pgnr].domid);
- gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
- GNTMAP_contains_pte | map->flags,
- 0 /* handle */);
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
+ -1 /* handle */);
return 0;
}
static int map_grant_pages(struct grant_map *map)
{
int i, err = 0;
+ phys_addr_t addr;
+
+ if (!use_ptemod) {
+ /* Note: it could already be mapped */
+ if (map->map_ops[0].handle != -1)
+ return 0;
+ for (i = 0; i < map->count; i++) {
+ addr = (phys_addr_t)
+ pfn_to_kaddr(page_to_pfn(map->pages[i]));
+ gnttab_set_map_op(&map->map_ops[i], addr, map->flags,
+ map->grants[i].ref,
+ map->grants[i].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[i], addr,
+ map->flags, -1 /* handle */);
+ }
+ }
pr_debug("map %d+%d\n", map->index, map->count);
err = gnttab_map_refs(map->map_ops, map->pages, map->count);
@@ -258,28 +282,81 @@ static int map_grant_pages(struct grant_map *map)
for (i = 0; i < map->count; i++) {
if (map->map_ops[i].status)
err = -EINVAL;
- map->unmap_ops[i].handle = map->map_ops[i].handle;
+ else {
+ BUG_ON(map->map_ops[i].handle == -1);
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ pr_debug("map handle=%d\n", map->map_ops[i].handle);
+ }
}
return err;
}
-static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+static int __unmap_grant_pages(struct grant_map *map, int offset, int pages)
{
int i, err = 0;
- pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
- err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
+ if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+ int pgno = (map->notify.addr >> PAGE_SHIFT);
+ if (pgno >= offset && pgno < offset + pages && use_ptemod) {
+ void __user *tmp = (void __user *)
+ map->vma->vm_start + map->notify.addr;
+ err = copy_to_user(tmp, &err, 1);
+ if (err)
+ return err;
+ map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+ } else if (pgno >= offset && pgno < offset + pages) {
+ uint8_t *tmp = kmap(map->pages[pgno]);
+ tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
+ kunmap(map->pages[pgno]);
+ map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+ }
+ }
+
+ err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages + offset, pages);
if (err)
return err;
for (i = 0; i < pages; i++) {
if (map->unmap_ops[offset+i].status)
err = -EINVAL;
- map->unmap_ops[offset+i].handle = 0;
+ pr_debug("unmap handle=%d st=%d\n",
+ map->unmap_ops[offset+i].handle,
+ map->unmap_ops[offset+i].status);
+ map->unmap_ops[offset+i].handle = -1;
}
return err;
}
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int range, err = 0;
+
+ pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+
+ /* It is possible the requested range will have a "hole" where we
+ * already unmapped some of the grants. Only unmap valid ranges.
+ */
+ while (pages && !err) {
+ while (pages && map->unmap_ops[offset].handle == -1) {
+ offset++;
+ pages--;
+ }
+ range = 0;
+ while (range < pages) {
+ if (map->unmap_ops[offset+range].handle == -1) {
+ range--;
+ break;
+ }
+ range++;
+ }
+ err = __unmap_grant_pages(map, offset, range);
+ offset += range;
+ pages -= range;
+ }
+
+ return err;
+}
+
/* ------------------------------------------------------------------ */
static void gntdev_vma_close(struct vm_area_struct *vma)
@@ -287,22 +364,13 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
struct grant_map *map = vma->vm_private_data;
pr_debug("close %p\n", vma);
- map->is_mapped = 0;
map->vma = NULL;
vma->vm_private_data = NULL;
-}
-
-static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
- vmf->virtual_address, vmf->pgoff);
- vmf->flags = VM_FAULT_ERROR;
- return 0;
+ gntdev_put_map(map);
}
static struct vm_operations_struct gntdev_vmops = {
.close = gntdev_vma_close,
- .fault = gntdev_vma_fault,
};
/* ------------------------------------------------------------------ */
@@ -320,8 +388,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
list_for_each_entry(map, &priv->maps, next) {
if (!map->vma)
continue;
- if (!map->is_mapped)
- continue;
if (map->vma->vm_start >= end)
continue;
if (map->vma->vm_end <= start)
@@ -386,16 +452,17 @@ static int gntdev_open(struct inode *inode, struct file *flip)
INIT_LIST_HEAD(&priv->maps);
spin_lock_init(&priv->lock);
- priv->limit = limit;
- priv->mm = get_task_mm(current);
- if (!priv->mm) {
- kfree(priv);
- return -ENOMEM;
+ if (use_ptemod) {
+ priv->mm = get_task_mm(current);
+ if (!priv->mm) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+ priv->mn.ops = &gntdev_mmu_ops;
+ ret = mmu_notifier_register(&priv->mn, priv->mm);
+ mmput(priv->mm);
}
- priv->mn.ops = &gntdev_mmu_ops;
- ret = mmu_notifier_register(&priv->mn, priv->mm);
- mmput(priv->mm);
if (ret) {
kfree(priv);
@@ -412,21 +479,19 @@ static int gntdev_release(struct inode *inode, struct file *flip)
{
struct gntdev_priv *priv = flip->private_data;
struct grant_map *map;
- int err;
pr_debug("priv %p\n", priv);
spin_lock(&priv->lock);
while (!list_empty(&priv->maps)) {
map = list_entry(priv->maps.next, struct grant_map, next);
- err = gntdev_del_map(map);
- if (WARN_ON(err))
- gntdev_free_map(map);
-
+ list_del(&map->next);
+ gntdev_put_map(map);
}
spin_unlock(&priv->lock);
- mmu_notifier_unregister(&priv->mn, priv->mm);
+ if (use_ptemod)
+ mmu_notifier_unregister(&priv->mn, priv->mm);
kfree(priv);
return 0;
}
@@ -443,16 +508,21 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
pr_debug("priv %p, add %d\n", priv, op.count);
if (unlikely(op.count <= 0))
return -EINVAL;
- if (unlikely(op.count > priv->limit))
- return -EINVAL;
err = -ENOMEM;
map = gntdev_alloc_map(priv, op.count);
if (!map)
return err;
+
+ if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) {
+ pr_debug("can't map: over limit\n");
+ gntdev_put_map(map);
+ return err;
+ }
+
if (copy_from_user(map->grants, &u->refs,
sizeof(map->grants[0]) * op.count) != 0) {
- gntdev_free_map(map);
+ gntdev_put_map(map);
return err;
}
@@ -461,13 +531,9 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
op.index = map->index << PAGE_SHIFT;
spin_unlock(&priv->lock);
- if (copy_to_user(u, &op, sizeof(op)) != 0) {
- spin_lock(&priv->lock);
- gntdev_del_map(map);
- spin_unlock(&priv->lock);
- gntdev_free_map(map);
- return err;
- }
+ if (copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+
return 0;
}
@@ -484,11 +550,12 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
spin_lock(&priv->lock);
map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
- if (map)
- err = gntdev_del_map(map);
+ if (map) {
+ list_del(&map->next);
+ gntdev_put_map(map);
+ err = 0;
+ }
spin_unlock(&priv->lock);
- if (!err)
- gntdev_free_map(map);
return err;
}
@@ -496,43 +563,66 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
struct ioctl_gntdev_get_offset_for_vaddr __user *u)
{
struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct vm_area_struct *vma;
struct grant_map *map;
if (copy_from_user(&op, u, sizeof(op)) != 0)
return -EFAULT;
pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
- spin_lock(&priv->lock);
- map = gntdev_find_map_vaddr(priv, op.vaddr);
- if (map == NULL ||
- map->vma->vm_start != op.vaddr) {
- spin_unlock(&priv->lock);
+ vma = find_vma(current->mm, op.vaddr);
+ if (!vma || vma->vm_ops != &gntdev_vmops)
return -EINVAL;
- }
+
+ map = vma->vm_private_data;
+ if (!map)
+ return -EINVAL;
+
op.offset = map->index << PAGE_SHIFT;
op.count = map->count;
- spin_unlock(&priv->lock);
if (copy_to_user(u, &op, sizeof(op)) != 0)
return -EFAULT;
return 0;
}
-static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
- struct ioctl_gntdev_set_max_grants __user *u)
+static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
{
- struct ioctl_gntdev_set_max_grants op;
+ struct ioctl_gntdev_unmap_notify op;
+ struct grant_map *map;
+ int rc;
- if (copy_from_user(&op, u, sizeof(op)) != 0)
+ if (copy_from_user(&op, u, sizeof(op)))
return -EFAULT;
- pr_debug("priv %p, limit %d\n", priv, op.count);
- if (op.count > limit)
- return -E2BIG;
+
+ if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
+ return -EINVAL;
spin_lock(&priv->lock);
- priv->limit = op.count;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ uint64_t begin = map->index << PAGE_SHIFT;
+ uint64_t end = (map->index + map->count) << PAGE_SHIFT;
+ if (op.index >= begin && op.index < end)
+ goto found;
+ }
+ rc = -ENOENT;
+ goto unlock_out;
+
+ found:
+ if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) &&
+ (map->flags & GNTMAP_readonly)) {
+ rc = -EINVAL;
+ goto unlock_out;
+ }
+
+ map->notify.flags = op.action;
+ map->notify.addr = op.index - (map->index << PAGE_SHIFT);
+ map->notify.event = op.event_channel_port;
+ rc = 0;
+ unlock_out:
spin_unlock(&priv->lock);
- return 0;
+ return rc;
}
static long gntdev_ioctl(struct file *flip,
@@ -551,8 +641,8 @@ static long gntdev_ioctl(struct file *flip,
case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
- case IOCTL_GNTDEV_SET_MAX_GRANTS:
- return gntdev_ioctl_set_max_grants(priv, ptr);
+ case IOCTL_GNTDEV_SET_UNMAP_NOTIFY:
+ return gntdev_ioctl_notify(priv, ptr);
default:
pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
@@ -568,7 +658,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
int index = vma->vm_pgoff;
int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
struct grant_map *map;
- int err = -EINVAL;
+ int i, err = -EINVAL;
if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
return -EINVAL;
@@ -580,47 +670,70 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
map = gntdev_find_map_index(priv, index, count);
if (!map)
goto unlock_out;
- if (map->vma)
+ if (use_ptemod && map->vma)
goto unlock_out;
- if (priv->mm != vma->vm_mm) {
+ if (use_ptemod && priv->mm != vma->vm_mm) {
printk(KERN_WARNING "Huh? Other mm?\n");
goto unlock_out;
}
+ atomic_inc(&map->users);
+
vma->vm_ops = &gntdev_vmops;
vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;
vma->vm_private_data = map;
- map->vma = vma;
- map->flags = GNTMAP_host_map | GNTMAP_application_map;
- if (!(vma->vm_flags & VM_WRITE))
- map->flags |= GNTMAP_readonly;
+ if (use_ptemod)
+ map->vma = vma;
+
+ if (map->flags) {
+ if ((vma->vm_flags & VM_WRITE) &&
+ (map->flags & GNTMAP_readonly))
+ return -EINVAL;
+ } else {
+ map->flags = GNTMAP_host_map;
+ if (!(vma->vm_flags & VM_WRITE))
+ map->flags |= GNTMAP_readonly;
+ }
spin_unlock(&priv->lock);
- err = apply_to_page_range(vma->vm_mm, vma->vm_start,
- vma->vm_end - vma->vm_start,
- find_grant_ptes, map);
- if (err) {
- printk(KERN_WARNING "find_grant_ptes() failure.\n");
- return err;
+ if (use_ptemod) {
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
+ find_grant_ptes, map);
+ if (err) {
+ printk(KERN_WARNING "find_grant_ptes() failure.\n");
+ goto out_put_map;
+ }
}
err = map_grant_pages(map);
- if (err) {
- printk(KERN_WARNING "map_grant_pages() failure.\n");
- return err;
- }
+ if (err)
+ goto out_put_map;
- map->is_mapped = 1;
+ if (!use_ptemod) {
+ for (i = 0; i < count; i++) {
+ err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
+ map->pages[i]);
+ if (err)
+ goto out_put_map;
+ }
+ }
return 0;
unlock_out:
spin_unlock(&priv->lock);
return err;
+
+out_put_map:
+ if (use_ptemod)
+ map->vma = NULL;
+ gntdev_put_map(map);
+ return err;
}
static const struct file_operations gntdev_fops = {
@@ -646,6 +759,8 @@ static int __init gntdev_init(void)
if (!xen_domain())
return -ENODEV;
+ use_ptemod = xen_pv_domain();
+
err = misc_register(&gntdev_miscdev);
if (err != 0) {
printk(KERN_ERR "Could not register gntdev device\n");
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 9ef54eb..1a9bc2b 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -458,14 +458,19 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
if (ret)
return ret;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return ret;
+
for (i = 0; i < count; i++) {
- /* m2p override only supported for GNTMAP_contains_pte mappings */
- if (!(map_ops[i].flags & GNTMAP_contains_pte))
- continue;
- pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ if (map_ops[i].flags & GNTMAP_contains_pte) {
+ pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
(map_ops[i].host_addr & ~PAGE_MASK));
- mfn = pte_mfn(*pte);
- ret = m2p_add_override(mfn, pages[i]);
+ mfn = pte_mfn(*pte);
+ } else {
+ mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
+ }
+ ret = m2p_add_override(mfn, pages[i],
+ map_ops[i].flags & GNTMAP_contains_pte);
if (ret)
return ret;
}
@@ -483,8 +488,13 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
if (ret)
return ret;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return ret;
+
for (i = 0; i < count; i++) {
- ret = m2p_remove_override(pages[i]);
+ /* We do not have the means of checking if GNTMAP_contains_pte
+ * is set. */
+ ret = m2p_remove_override(pages[i], true /* clear the PTE */);
if (ret)
return ret;
}
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2417727..ebb2928 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -34,42 +34,38 @@ enum shutdown_state {
/* Ignore multiple shutdown requests. */
static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
-#ifdef CONFIG_PM_SLEEP
-static int xen_hvm_suspend(void *data)
-{
- int err;
- struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
- int *cancelled = data;
-
- BUG_ON(!irqs_disabled());
-
- err = sysdev_suspend(PMSG_SUSPEND);
- if (err) {
- printk(KERN_ERR "xen_hvm_suspend: sysdev_suspend failed: %d\n",
- err);
- return err;
- }
-
- *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+struct suspend_info {
+ int cancelled;
+ unsigned long arg; /* extra hypercall argument */
+ void (*pre)(void);
+ void (*post)(int cancelled);
+};
- xen_hvm_post_suspend(*cancelled);
+static void xen_hvm_post_suspend(int cancelled)
+{
+ xen_arch_hvm_post_suspend(cancelled);
gnttab_resume();
+}
- if (!*cancelled) {
- xen_irq_resume();
- xen_console_resume();
- xen_timer_resume();
- }
-
- sysdev_resume();
+static void xen_pre_suspend(void)
+{
+ xen_mm_pin_all();
+ gnttab_suspend();
+ xen_arch_pre_suspend();
+}
- return 0;
+static void xen_post_suspend(int cancelled)
+{
+ xen_arch_post_suspend(cancelled);
+ gnttab_resume();
+ xen_mm_unpin_all();
}
+#ifdef CONFIG_PM_SLEEP
static int xen_suspend(void *data)
{
+ struct suspend_info *si = data;
int err;
- int *cancelled = data;
BUG_ON(!irqs_disabled());
@@ -80,22 +76,20 @@ static int xen_suspend(void *data)
return err;
}
- xen_mm_pin_all();
- gnttab_suspend();
- xen_pre_suspend();
+ if (si->pre)
+ si->pre();
/*
* This hypercall returns 1 if suspend was cancelled
* or the domain was merely checkpointed, and 0 if it
* is resuming in a new domain.
*/
- *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+ si->cancelled = HYPERVISOR_suspend(si->arg);
- xen_post_suspend(*cancelled);
- gnttab_resume();
- xen_mm_unpin_all();
+ if (si->post)
+ si->post(si->cancelled);
- if (!*cancelled) {
+ if (!si->cancelled) {
xen_irq_resume();
xen_console_resume();
xen_timer_resume();
@@ -109,7 +103,7 @@ static int xen_suspend(void *data)
static void do_suspend(void)
{
int err;
- int cancelled = 1;
+ struct suspend_info si;
shutting_down = SHUTDOWN_SUSPEND;
@@ -139,20 +133,29 @@ static void do_suspend(void)
goto out_resume;
}
- if (xen_hvm_domain())
- err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
- else
- err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+ si.cancelled = 1;
+
+ if (xen_hvm_domain()) {
+ si.arg = 0UL;
+ si.pre = NULL;
+ si.post = &xen_hvm_post_suspend;
+ } else {
+ si.arg = virt_to_mfn(xen_start_info);
+ si.pre = &xen_pre_suspend;
+ si.post = &xen_post_suspend;
+ }
+
+ err = stop_machine(xen_suspend, &si, cpumask_of(0));
dpm_resume_noirq(PMSG_RESUME);
if (err) {
printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
- cancelled = 1;
+ si.cancelled = 1;
}
out_resume:
- if (!cancelled) {
+ if (!si.cancelled) {
xen_arch_resume();
xs_resume();
} else
@@ -172,12 +175,39 @@ out:
}
#endif /* CONFIG_PM_SLEEP */
+struct shutdown_handler {
+ const char *command;
+ void (*cb)(void);
+};
+
+static void do_poweroff(void)
+{
+ shutting_down = SHUTDOWN_POWEROFF;
+ orderly_poweroff(false);
+}
+
+static void do_reboot(void)
+{
+ shutting_down = SHUTDOWN_POWEROFF; /* ? */
+ ctrl_alt_del();
+}
+
static void shutdown_handler(struct xenbus_watch *watch,
const char **vec, unsigned int len)
{
char *str;
struct xenbus_transaction xbt;
int err;
+ static struct shutdown_handler handlers[] = {
+ { "poweroff", do_poweroff },
+ { "halt", do_poweroff },
+ { "reboot", do_reboot },
+#ifdef CONFIG_PM_SLEEP
+ { "suspend", do_suspend },
+#endif
+ {NULL, NULL},
+ };
+ static struct shutdown_handler *handler;
if (shutting_down != SHUTDOWN_INVALID)
return;
@@ -194,7 +224,14 @@ static void shutdown_handler(struct xenbus_watch *watch,
return;
}
- xenbus_write(xbt, "control", "shutdown", "");
+ for (handler = &handlers[0]; handler->command; handler++) {
+ if (strcmp(str, handler->command) == 0)
+ break;
+ }
+
+ /* Only acknowledge commands which we are prepared to handle. */
+ if (handler->cb)
+ xenbus_write(xbt, "control", "shutdown", "");
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN) {
@@ -202,17 +239,8 @@ static void shutdown_handler(struct xenbus_watch *watch,
goto again;
}
- if (strcmp(str, "poweroff") == 0 ||
- strcmp(str, "halt") == 0) {
- shutting_down = SHUTDOWN_POWEROFF;
- orderly_poweroff(false);
- } else if (strcmp(str, "reboot") == 0) {
- shutting_down = SHUTDOWN_POWEROFF; /* ? */
- ctrl_alt_del();
-#ifdef CONFIG_PM_SLEEP
- } else if (strcmp(str, "suspend") == 0) {
- do_suspend();
-#endif
+ if (handler->cb) {
+ handler->cb();
} else {
printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID;
@@ -291,27 +319,18 @@ static int shutdown_event(struct notifier_block *notifier,
return NOTIFY_DONE;
}
-static int __init __setup_shutdown_event(void)
-{
- /* Delay initialization in the PV on HVM case */
- if (xen_hvm_domain())
- return 0;
-
- if (!xen_pv_domain())
- return -ENODEV;
-
- return xen_setup_shutdown_event();
-}
-
int xen_setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
};
+
+ if (!xen_domain())
+ return -ENODEV;
register_xenstore_notifier(&xenstore_notifier);
return 0;
}
EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
-subsys_initcall(__setup_shutdown_event);
+subsys_initcall(xen_setup_shutdown_event);
diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile
new file mode 100644
index 0000000..38bc123
--- /dev/null
+++ b/drivers/xen/pciback/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+
+xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
+xen-pciback-y += conf_space.o conf_space_header.o \
+ conf_space_capability.o \
+ conf_space_capability_vpd.o \
+ conf_space_capability_pm.o \
+ conf_space_quirks.o
+xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
+
+ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c
new file mode 100644
index 0000000..eb6bba0
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.c
@@ -0,0 +1,435 @@
+/*
+ * PCI Backend - Functions for creating a virtual configuration space for
+ * exported PCI Devices.
+ * It's dangerous to allow PCI Driver Domains to change their
+ * device's resources (memory, i/o ports, interrupts). We need to
+ * restrict changes to certain PCI Configuration registers:
+ * BARs, INTERRUPT_PIN, most registers in the header...
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+static int permissive;
+module_param(permissive, bool, 0644);
+
+#define DEFINE_PCI_CONFIG(op, size, type) \
+int pciback_##op##_config_##size \
+(struct pci_dev *dev, int offset, type value, void *data) \
+{ \
+ return pci_##op##_config_##size(dev, offset, value); \
+}
+
+DEFINE_PCI_CONFIG(read, byte, u8 *)
+DEFINE_PCI_CONFIG(read, word, u16 *)
+DEFINE_PCI_CONFIG(read, dword, u32 *)
+
+DEFINE_PCI_CONFIG(write, byte, u8)
+DEFINE_PCI_CONFIG(write, word, u16)
+DEFINE_PCI_CONFIG(write, dword, u32)
+
+static int conf_space_read(struct pci_dev *dev,
+ const struct config_field_entry *entry,
+ int offset, u32 *value)
+{
+ int ret = 0;
+ const struct config_field *field = entry->field;
+
+ *value = 0;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.read)
+ ret = field->u.b.read(dev, offset, (u8 *) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.read)
+ ret = field->u.w.read(dev, offset, (u16 *) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.read)
+ ret = field->u.dw.read(dev, offset, value, entry->data);
+ break;
+ }
+ return ret;
+}
+
+static int conf_space_write(struct pci_dev *dev,
+ const struct config_field_entry *entry,
+ int offset, u32 value)
+{
+ int ret = 0;
+ const struct config_field *field = entry->field;
+
+ switch (field->size) {
+ case 1:
+ if (field->u.b.write)
+ ret = field->u.b.write(dev, offset, (u8) value,
+ entry->data);
+ break;
+ case 2:
+ if (field->u.w.write)
+ ret = field->u.w.write(dev, offset, (u16) value,
+ entry->data);
+ break;
+ case 4:
+ if (field->u.dw.write)
+ ret = field->u.dw.write(dev, offset, value,
+ entry->data);
+ break;
+ }
+ return ret;
+}
+
+static inline u32 get_mask(int size)
+{
+ if (size == 1)
+ return 0xff;
+ else if (size == 2)
+ return 0xffff;
+ else
+ return 0xffffffff;
+}
+
+static inline int valid_request(int offset, int size)
+{
+ /* Validate request (no un-aligned requests) */
+ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
+ return 1;
+ return 0;
+}
+
+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
+ int offset)
+{
+ if (offset >= 0) {
+ new_val_mask <<= (offset * 8);
+ new_val <<= (offset * 8);
+ } else {
+ new_val_mask >>= (offset * -8);
+ new_val >>= (offset * -8);
+ }
+ val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
+ return val;
+}
+
+static int pcibios_err_to_errno(int err)
+{
+ switch (err) {
+ case PCIBIOS_SUCCESSFUL:
+ return XEN_PCI_ERR_success;
+ case PCIBIOS_DEVICE_NOT_FOUND:
+ return XEN_PCI_ERR_dev_not_found;
+ case PCIBIOS_BAD_REGISTER_NUMBER:
+ return XEN_PCI_ERR_invalid_offset;
+ case PCIBIOS_FUNC_NOT_SUPPORTED:
+ return XEN_PCI_ERR_not_implemented;
+ case PCIBIOS_SET_FAILED:
+ return XEN_PCI_ERR_access_denied;
+ }
+ return err;
+}
+
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+ u32 *ret_val)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+ int req_start, req_end, field_start, field_end;
+ /* if read fails for any reason, return 0
+ * (as if device didn't respond) */
+ u32 value = 0, tmp_val;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
+ pci_name(dev), size, offset);
+
+ if (!valid_request(offset, size)) {
+ err = XEN_PCI_ERR_invalid_offset;
+ goto out;
+ }
+
+ /* Get the real value first, then modify as appropriate */
+ switch (size) {
+ case 1:
+ err = pci_read_config_byte(dev, offset, (u8 *) &value);
+ break;
+ case 2:
+ err = pci_read_config_word(dev, offset, (u16 *) &value);
+ break;
+ case 4:
+ err = pci_read_config_dword(dev, offset, &value);
+ break;
+ }
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ err = conf_space_read(dev, cfg_entry, field_start,
+ &tmp_val);
+ if (err)
+ goto out;
+
+ value = merge_value(value, tmp_val,
+ get_mask(field->size),
+ field_start - req_start);
+ }
+ }
+
+out:
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ *ret_val = value;
+ return pcibios_err_to_errno(err);
+}
+
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+ int err = 0, handled = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+ u32 tmp_val;
+ int req_start, req_end, field_start, field_end;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ "pciback: %s: write request %d bytes at 0x%x = %x\n",
+ pci_name(dev), size, offset, value);
+
+ if (!valid_request(offset, size))
+ return XEN_PCI_ERR_invalid_offset;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ req_start = offset;
+ req_end = offset + size;
+ field_start = OFFSET(cfg_entry);
+ field_end = OFFSET(cfg_entry) + field->size;
+
+ if ((req_start >= field_start && req_start < field_end)
+ || (req_end > field_start && req_end <= field_end)) {
+ tmp_val = 0;
+
+ err = pciback_config_read(dev, field_start,
+ field->size, &tmp_val);
+ if (err)
+ break;
+
+ tmp_val = merge_value(tmp_val, value, get_mask(size),
+ req_start - field_start);
+
+ err = conf_space_write(dev, cfg_entry, field_start,
+ tmp_val);
+
+ /* handled is set true here, but not every byte
+ * may have been written! Properly detecting if
+ * every byte is handled is unnecessary as the
+ * flag is used to detect devices that need
+ * special helpers to work correctly.
+ */
+ handled = 1;
+ }
+ }
+
+ if (!handled && !err) {
+ /* By default, anything not specificially handled above is
+ * read-only. The permissive flag changes this behavior so
+ * that anything not specifically handled above is writable.
+ * This means that some fields may still be read-only because
+ * they have entries in the config_field list that intercept
+ * the write and do nothing. */
+ if (dev_data->permissive || permissive) {
+ switch (size) {
+ case 1:
+ err = pci_write_config_byte(dev, offset,
+ (u8) value);
+ break;
+ case 2:
+ err = pci_write_config_word(dev, offset,
+ (u16) value);
+ break;
+ case 4:
+ err = pci_write_config_dword(dev, offset,
+ (u32) value);
+ break;
+ }
+ } else if (!dev_data->warned_on_write) {
+ dev_data->warned_on_write = 1;
+ dev_warn(&dev->dev, "Driver tried to write to a "
+ "read-only configuration space field at offset"
+ " 0x%x, size %d. This may be harmless, but if "
+ "you have problems with your device:\n"
+ "1) see permissive attribute in sysfs\n"
+ "2) report problems to the xen-devel "
+ "mailing list along with details of your "
+ "device obtained from lspci.\n", offset, size);
+ }
+ }
+
+ return pcibios_err_to_errno(err);
+}
+
+void pciback_config_free_dyn_fields(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ const struct config_field *field;
+
+ dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
+ "configuration space fields\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->clean) {
+ field->clean((struct config_field *)field);
+
+ kfree(cfg_entry->data);
+
+ list_del(&cfg_entry->list);
+ kfree(cfg_entry);
+ }
+
+ }
+}
+
+void pciback_config_reset_dev(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ const struct config_field_entry *cfg_entry;
+ const struct config_field *field;
+
+ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+
+ if (field->reset)
+ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+ }
+}
+
+void pciback_config_free_dev(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry, *t;
+ const struct config_field *field;
+
+ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
+ if (!dev_data)
+ return;
+
+ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+ list_del(&cfg_entry->list);
+
+ field = cfg_entry->field;
+
+ if (field->release)
+ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+ kfree(cfg_entry);
+ }
+}
+
+int pciback_config_add_field_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int base_offset)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+ void *tmp;
+
+ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+ if (!cfg_entry) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cfg_entry->data = NULL;
+ cfg_entry->field = field;
+ cfg_entry->base_offset = base_offset;
+
+ /* silently ignore duplicate fields */
+ err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
+ if (err)
+ goto out;
+
+ if (field->init) {
+ tmp = field->init(dev, OFFSET(cfg_entry));
+
+ if (IS_ERR(tmp)) {
+ err = PTR_ERR(tmp);
+ goto out;
+ }
+
+ cfg_entry->data = tmp;
+ }
+
+ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+ OFFSET(cfg_entry));
+ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+out:
+ if (err)
+ kfree(cfg_entry);
+
+ return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs) so that we can
+ * keep the client from manipulating them directly.
+ */
+int pciback_config_init_dev(struct pci_dev *dev)
+{
+ int err = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
+ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
+ INIT_LIST_HEAD(&dev_data->config_fields);
+
+ err = pciback_config_header_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = pciback_config_capability_add_fields(dev);
+ if (err)
+ goto out;
+
+ err = pciback_config_quirks_init(dev);
+
+out:
+ return err;
+}
+
+int pciback_config_init(void)
+{
+ return pciback_config_capability_init();
+}
diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
new file mode 100644
index 0000000..50ebef2
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.h
@@ -0,0 +1,126 @@
+/*
+ * PCI Backend - Common data structures for overriding the configuration space
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
+#define __XEN_PCIBACK_CONF_SPACE_H__
+
+#include <linux/list.h>
+#include <linux/err.h>
+
+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
+typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
+typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
+typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
+
+typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
+ void *data);
+typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
+ void *data);
+typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
+ void *data);
+typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
+ void *data);
+typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
+ void *data);
+typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
+ void *data);
+
+/* These are the fields within the configuration space which we
+ * are interested in intercepting reads/writes to and changing their
+ * values.
+ */
+struct config_field {
+ unsigned int offset;
+ unsigned int size;
+ unsigned int mask;
+ conf_field_init init;
+ conf_field_reset reset;
+ conf_field_free release;
+ void (*clean) (struct config_field *field);
+ union {
+ struct {
+ conf_dword_write write;
+ conf_dword_read read;
+ } dw;
+ struct {
+ conf_word_write write;
+ conf_word_read read;
+ } w;
+ struct {
+ conf_byte_write write;
+ conf_byte_read read;
+ } b;
+ } u;
+ struct list_head list;
+};
+
+struct config_field_entry {
+ struct list_head list;
+ const struct config_field *field;
+ unsigned int base_offset;
+ void *data;
+};
+
+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
+/* Add fields to a device - the add_fields macro expects to get a pointer to
+ * the first entry in an array (of which the ending is marked by size==0)
+ */
+int pciback_config_add_field_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int offset);
+
+static inline int pciback_config_add_field(struct pci_dev *dev,
+ const struct config_field *field)
+{
+ return pciback_config_add_field_offset(dev, field, 0);
+}
+
+static inline int pciback_config_add_fields(struct pci_dev *dev,
+ const struct config_field *field)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = pciback_config_add_field(dev, &field[i]);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
+ const struct config_field *field,
+ unsigned int offset)
+{
+ int i, err = 0;
+ for (i = 0; field[i].size != 0; i++) {
+ err = pciback_config_add_field_offset(dev, &field[i], offset);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/* Read/Write the real configuration space */
+int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
+ void *data);
+int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value,
+ void *data);
+int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
+ void *data);
+int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
+ void *data);
+int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
+ void *data);
+int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
+ void *data);
+
+int pciback_config_capability_init(void);
+
+int pciback_config_header_add_fields(struct pci_dev *dev);
+int pciback_config_capability_add_fields(struct pci_dev *dev);
+
+#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c
new file mode 100644
index 0000000..0ea84d6
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.c
@@ -0,0 +1,66 @@
+/*
+ * PCI Backend - Handles the virtual fields found on the capability lists
+ * in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static LIST_HEAD(capabilities);
+
+static const struct config_field caplist_header[] = {
+ {
+ .offset = PCI_CAP_LIST_ID,
+ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = NULL,
+ },
+ {}
+};
+
+static inline void register_capability(struct pciback_config_capability *cap)
+{
+ list_add_tail(&cap->cap_list, &capabilities);
+}
+
+int pciback_config_capability_add_fields(struct pci_dev *dev)
+{
+ int err = 0;
+ struct pciback_config_capability *cap;
+ int cap_offset;
+
+ list_for_each_entry(cap, &capabilities, cap_list) {
+ cap_offset = pci_find_capability(dev, cap->capability);
+ if (cap_offset) {
+ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
+ cap->capability, cap_offset);
+
+ err = pciback_config_add_fields_offset(dev,
+ caplist_header,
+ cap_offset);
+ if (err)
+ goto out;
+ err = pciback_config_add_fields_offset(dev,
+ cap->fields,
+ cap_offset);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ return err;
+}
+
+int pciback_config_capability_init(void)
+{
+ register_capability(&pciback_config_capability_vpd);
+ register_capability(&pciback_config_capability_pm);
+
+ return 0;
+}
diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
new file mode 100644
index 0000000..8da3ac4
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.h
@@ -0,0 +1,26 @@
+/*
+ * PCI Backend - Data structures for special overlays for structures on
+ * the capability list.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
+#define __PCIBACK_CONFIG_CAPABILITY_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_capability {
+ struct list_head cap_list;
+
+ int capability;
+
+ /* If the device has the capability found above, add these fields */
+ const struct config_field *fields;
+};
+
+extern struct pciback_config_capability pciback_config_capability_vpd;
+extern struct pciback_config_capability pciback_config_capability_pm;
+
+#endif
diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
new file mode 100644
index 0000000..041e4aa
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_msi.c
@@ -0,0 +1,136 @@
+/*
+ * PCI Backend -- Configuration overlay for MSI capability
+ */
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+#include <xen/interface/io/pciif.h>
+#include <xen/events.h>
+#include "pciback.h"
+
+int pciback_enable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+ int otherend = pdev->xdev->otherend_id;
+ int status;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: enable MSI\n", pci_name(dev));
+
+ status = pci_enable_msi(dev);
+
+ if (status) {
+ printk(KERN_ERR "error enable msi for guest %x status %x\n",
+ otherend, status);
+ op->value = 0;
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* The value the guest needs is actually the IDT vector, not the
+ * the local domain's IRQ number. */
+
+ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: MSI: %d\n", pci_name(dev),
+ op->value);
+
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 0;
+
+ return 0;
+}
+
+int pciback_disable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: disable MSI\n", pci_name(dev));
+ pci_disable_msi(dev);
+
+ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: MSI: %d\n", pci_name(dev),
+ op->value);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
+
+int pciback_enable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+ int i, result;
+ struct msix_entry *entries;
+
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: enable MSI-X\n", pci_name(dev));
+ if (op->value > SH_INFO_MAX_VEC)
+ return -EINVAL;
+
+ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
+ if (entries == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < op->value; i++) {
+ entries[i].entry = op->msix_entries[i].entry;
+ entries[i].vector = op->msix_entries[i].vector;
+ }
+
+ result = pci_enable_msix(dev, entries, op->value);
+
+ if (result == 0) {
+ for (i = 0; i < op->value; i++) {
+ op->msix_entries[i].entry = entries[i].entry;
+ if (entries[i].vector)
+ op->msix_entries[i].vector =
+ xen_pirq_from_irq(entries[i].vector);
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: " \
+ "MSI-X[%d]: %d\n",
+ pci_name(dev), i,
+ op->msix_entries[i].vector);
+ }
+ } else {
+ printk(KERN_WARNING "pciback: %s: failed to enable MSI-X: err %d!\n",
+ pci_name(dev), result);
+ }
+ kfree(entries);
+
+ op->value = result;
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 0;
+
+ return result;
+}
+
+int pciback_disable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op)
+{
+ struct pciback_dev_data *dev_data;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: disable MSI-X\n",
+ pci_name(dev));
+ pci_disable_msix(dev);
+
+ /*
+ * SR-IOV devices (which don't have any legacy IRQ) have
+ * an undefined IRQ value of zero.
+ */
+ op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: MSI-X: %d\n", pci_name(dev),
+ op->value);
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ return 0;
+}
+
diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c
new file mode 100644
index 0000000..0442616
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_pm.c
@@ -0,0 +1,113 @@
+/*
+ * PCI Backend - Configuration space overlay for power management
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+ void *data)
+{
+ int err;
+ u16 real_value;
+
+ err = pci_read_config_word(dev, offset, &real_value);
+ if (err)
+ goto out;
+
+ *value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+out:
+ return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
+ * Can't allow driver domain to enable PMEs - they're shared */
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
+ void *data)
+{
+ int err;
+ u16 old_value;
+ pci_power_t new_state, old_state;
+
+ err = pci_read_config_word(dev, offset, &old_value);
+ if (err)
+ goto out;
+
+ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
+ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
+ new_value &= PM_OK_BITS;
+ if ((old_value & PM_OK_BITS) != new_value) {
+ new_value = (old_value & ~PM_OK_BITS) | new_value;
+ err = pci_write_config_word(dev, offset, new_value);
+ if (err)
+ goto out;
+ }
+
+ /* Let pci core handle the power management change */
+ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
+ err = pci_set_power_state(dev, new_state);
+ if (err) {
+ err = PCIBIOS_SET_FAILED;
+ goto out;
+ }
+
+ out:
+ return err;
+}
+
+/* Ensure PMEs are disabled */
+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
+ int err;
+ u16 value;
+
+ err = pci_read_config_word(dev, offset, &value);
+ if (err)
+ goto out;
+
+ if (value & PCI_PM_CTRL_PME_ENABLE) {
+ value &= ~PCI_PM_CTRL_PME_ENABLE;
+ err = pci_write_config_word(dev, offset, value);
+ }
+
+out:
+ return ERR_PTR(err);
+}
+
+static const struct config_field caplist_pm[] = {
+ {
+ .offset = PCI_PM_PMC,
+ .size = 2,
+ .u.w.read = pm_caps_read,
+ },
+ {
+ .offset = PCI_PM_CTRL,
+ .size = 2,
+ .init = pm_ctrl_init,
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = pm_ctrl_write,
+ },
+ {
+ .offset = PCI_PM_PPB_EXTENSIONS,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ .offset = PCI_PM_DATA_REGISTER,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {}
+};
+
+struct pciback_config_capability pciback_config_capability_pm = {
+ .capability = PCI_CAP_ID_PM,
+ .fields = caplist_pm,
+};
diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c
new file mode 100644
index 0000000..e7b4d66
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_vpd.c
@@ -0,0 +1,40 @@
+/*
+ * PCI Backend - Configuration space overlay for Vital Product Data
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
+ void *data)
+{
+ /* Disallow writes to the vital product data */
+ if (value & PCI_VPD_ADDR_F)
+ return PCIBIOS_SET_FAILED;
+ else
+ return pci_write_config_word(dev, offset, value);
+}
+
+static const struct config_field caplist_vpd[] = {
+ {
+ .offset = PCI_VPD_ADDR,
+ .size = 2,
+ .u.w.read = pciback_read_config_word,
+ .u.w.write = vpd_address_write,
+ },
+ {
+ .offset = PCI_VPD_DATA,
+ .size = 4,
+ .u.dw.read = pciback_read_config_dword,
+ .u.dw.write = NULL,
+ },
+ {}
+};
+
+struct pciback_config_capability pciback_config_capability_vpd = {
+ .capability = PCI_CAP_ID_VPD,
+ .fields = caplist_vpd,
+};
diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c
new file mode 100644
index 0000000..22ad0f5
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_header.c
@@ -0,0 +1,385 @@
+/*
+ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+struct pci_bar_info {
+ u32 val;
+ u32 len_val;
+ int which;
+};
+
+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
+
+static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
+ int i;
+ int ret;
+
+ ret = pciback_read_config_word(dev, offset, value, data);
+ if (!atomic_read(&dev->enable_cnt))
+ return ret;
+
+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+ if (dev->resource[i].flags & IORESOURCE_IO)
+ *value |= PCI_COMMAND_IO;
+ if (dev->resource[i].flags & IORESOURCE_MEM)
+ *value |= PCI_COMMAND_MEMORY;
+ }
+
+ return ret;
+}
+
+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
+ struct pciback_dev_data *dev_data;
+ int err;
+
+ dev_data = pci_get_drvdata(dev);
+ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: enable\n",
+ pci_name(dev));
+ err = pci_enable_device(dev);
+ if (err)
+ return err;
+ if (dev_data)
+ dev_data->enable_intx = 1;
+ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: disable\n",
+ pci_name(dev));
+ pci_disable_device(dev);
+ if (dev_data)
+ dev_data->enable_intx = 0;
+ }
+
+ if (!dev->is_busmaster && is_master_cmd(value)) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG "pciback: %s: set bus master\n",
+ pci_name(dev));
+ pci_set_master(dev);
+ }
+
+ if (value & PCI_COMMAND_INVALIDATE) {
+ if (unlikely(verbose_request))
+ printk(KERN_DEBUG
+ "pciback: %s: enable memory-write-invalidate\n",
+ pci_name(dev));
+ err = pci_set_mwi(dev);
+ if (err) {
+ printk(KERN_WARNING
+ "pciback: %s: cannot enable "
+ "memory-write-invalidate (%d)\n",
+ pci_name(dev), err);
+ value &= ~PCI_COMMAND_INVALIDATE;
+ }
+ }
+
+ return pci_write_config_word(dev, offset, value);
+}
+
+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~PCI_ROM_ADDRESS_ENABLE)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ /* Do we need to support enabling/disabling the rom address here? */
+
+ return 0;
+}
+
+/* For the BARs, only allow writes which write ~0 or
+ * the correct resource information
+ * (Needed for when the driver probes the resource usage)
+ */
+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ /* A write to obtain the length must happen as a 32-bit write.
+ * This does not (yet) support writing individual bytes
+ */
+ if (value == ~0)
+ bar->which = 1;
+ else {
+ u32 tmpval;
+ pci_read_config_dword(dev, offset, &tmpval);
+ if (tmpval != bar->val && value == bar->val) {
+ /* Allow restoration of bar value. */
+ pci_write_config_dword(dev, offset, bar->val);
+ }
+ bar->which = 0;
+ }
+
+ return 0;
+}
+
+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ if (unlikely(!bar)) {
+ printk(KERN_WARNING "pciback: driver data not found for %s\n",
+ pci_name(dev));
+ return XEN_PCI_ERR_op_failed;
+ }
+
+ *value = bar->which ? bar->len_val : bar->val;
+
+ return 0;
+}
+
+static inline void read_dev_bar(struct pci_dev *dev,
+ struct pci_bar_info *bar_info, int offset,
+ u32 len_mask)
+{
+ int pos;
+ struct resource *res = dev->resource;
+
+ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
+ pos = PCI_ROM_RESOURCE;
+ else {
+ pos = (offset - PCI_BASE_ADDRESS_0) / 4;
+ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
+ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
+ (PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_64))) {
+ bar_info->val = res[pos - 1].start >> 32;
+ bar_info->len_val = res[pos - 1].end >> 32;
+ return;
+ }
+ }
+
+ bar_info->val = res[pos].start |
+ (res[pos].flags & PCI_REGION_FLAG_MASK);
+ bar_info->len_val = res[pos].end - res[pos].start + 1;
+}
+
+static void *bar_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~0);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void *rom_init(struct pci_dev *dev, int offset)
+{
+ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+ if (!bar)
+ return ERR_PTR(-ENOMEM);
+
+ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
+ bar->which = 0;
+
+ return bar;
+}
+
+static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
+ struct pci_bar_info *bar = data;
+
+ bar->which = 0;
+}
+
+static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
+ kfree(data);
+}
+
+static int pciback_read_vendor(struct pci_dev *dev, int offset,
+ u16 *value, void *data)
+{
+ *value = dev->vendor;
+
+ return 0;
+}
+
+static int pciback_read_device(struct pci_dev *dev, int offset,
+ u16 *value, void *data)
+{
+ *value = dev->device;
+
+ return 0;
+}
+
+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
+ void *data)
+{
+ *value = (u8) dev->irq;
+
+ return 0;
+}
+
+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
+ u8 cur_value;
+ int err;
+
+ err = pci_read_config_byte(dev, offset, &cur_value);
+ if (err)
+ goto out;
+
+ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
+ || value == PCI_BIST_START)
+ err = pci_write_config_byte(dev, offset, value);
+
+out:
+ return err;
+}
+
+static const struct config_field header_common[] = {
+ {
+ .offset = PCI_VENDOR_ID,
+ .size = 2,
+ .u.w.read = pciback_read_vendor,
+ },
+ {
+ .offset = PCI_DEVICE_ID,
+ .size = 2,
+ .u.w.read = pciback_read_device,
+ },
+ {
+ .offset = PCI_COMMAND,
+ .size = 2,
+ .u.w.read = command_read,
+ .u.w.write = command_write,
+ },
+ {
+ .offset = PCI_INTERRUPT_LINE,
+ .size = 1,
+ .u.b.read = interrupt_read,
+ },
+ {
+ .offset = PCI_INTERRUPT_PIN,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ /* Any side effects of letting driver domain control cache line? */
+ .offset = PCI_CACHE_LINE_SIZE,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ .u.b.write = pciback_write_config_byte,
+ },
+ {
+ .offset = PCI_LATENCY_TIMER,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ },
+ {
+ .offset = PCI_BIST,
+ .size = 1,
+ .u.b.read = pciback_read_config_byte,
+ .u.b.write = bist_write,
+ },
+ {}
+};
+
+#define CFG_FIELD_BAR(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = bar_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = bar_write, \
+ }
+
+#define CFG_FIELD_ROM(reg_offset) \
+ { \
+ .offset = reg_offset, \
+ .size = 4, \
+ .init = rom_init, \
+ .reset = bar_reset, \
+ .release = bar_release, \
+ .u.dw.read = bar_read, \
+ .u.dw.write = rom_write, \
+ }
+
+static const struct config_field header_0[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
+ {}
+};
+
+static const struct config_field header_1[] = {
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
+ {}
+};
+
+int pciback_config_header_add_fields(struct pci_dev *dev)
+{
+ int err;
+
+ err = pciback_config_add_fields(dev, header_common);
+ if (err)
+ goto out;
+
+ switch (dev->hdr_type) {
+ case PCI_HEADER_TYPE_NORMAL:
+ err = pciback_config_add_fields(dev, header_0);
+ break;
+
+ case PCI_HEADER_TYPE_BRIDGE:
+ err = pciback_config_add_fields(dev, header_1);
+ break;
+
+ default:
+ err = -EINVAL;
+ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
+ pci_name(dev), dev->hdr_type);
+ break;
+ }
+
+out:
+ return err;
+}
diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c
new file mode 100644
index 0000000..45c31fb
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.c
@@ -0,0 +1,140 @@
+/*
+ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(pciback_quirks);
+
+static inline const struct pci_device_id *
+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+ (id->device == PCI_ANY_ID || id->device == dev->device) &&
+ (id->subvendor == PCI_ANY_ID ||
+ id->subvendor == dev->subsystem_vendor) &&
+ (id->subdevice == PCI_ANY_ID ||
+ id->subdevice == dev->subsystem_device) &&
+ !((id->class ^ dev->class) & id->class_mask))
+ return id;
+ return NULL;
+}
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *tmp_quirk;
+
+ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
+ if (match_one_device(&tmp_quirk->devid, dev) != NULL)
+ goto out;
+ tmp_quirk = NULL;
+ printk(KERN_DEBUG
+ "quirk didn't match any device pciback knows about\n");
+out:
+ return tmp_quirk;
+}
+
+static inline void register_quirk(struct pciback_config_quirk *quirk)
+{
+ list_add_tail(&quirk->quirks_list, &pciback_quirks);
+}
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+ int ret = 0;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+ struct config_field_entry *cfg_entry;
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ if (OFFSET(cfg_entry) == reg) {
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field)
+{
+ int err = 0;
+
+ switch (field->size) {
+ case 1:
+ field->u.b.read = pciback_read_config_byte;
+ field->u.b.write = pciback_write_config_byte;
+ break;
+ case 2:
+ field->u.w.read = pciback_read_config_word;
+ field->u.w.write = pciback_write_config_word;
+ break;
+ case 4:
+ field->u.dw.read = pciback_read_config_dword;
+ field->u.dw.write = pciback_write_config_dword;
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ pciback_config_add_field(dev, field);
+
+out:
+ return err;
+}
+
+int pciback_config_quirks_init(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+ if (!quirk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ quirk->devid.vendor = dev->vendor;
+ quirk->devid.device = dev->device;
+ quirk->devid.subvendor = dev->subsystem_vendor;
+ quirk->devid.subdevice = dev->subsystem_device;
+ quirk->devid.class = 0;
+ quirk->devid.class_mask = 0;
+ quirk->devid.driver_data = 0UL;
+
+ quirk->pdev = dev;
+
+ register_quirk(quirk);
+out:
+ return ret;
+}
+
+void pciback_config_field_free(struct config_field *field)
+{
+ kfree(field);
+}
+
+int pciback_config_quirk_release(struct pci_dev *dev)
+{
+ struct pciback_config_quirk *quirk;
+ int ret = 0;
+
+ quirk = pciback_find_quirk(dev);
+ if (!quirk) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ list_del(&quirk->quirks_list);
+ kfree(quirk);
+
+out:
+ return ret;
+}
diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h
new file mode 100644
index 0000000..acd0e1a
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.h
@@ -0,0 +1,35 @@
+/*
+ * PCI Backend - Data structures for special overlays for broken devices.
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_quirk {
+ struct list_head quirks_list;
+ struct pci_device_id devid;
+ struct pci_dev *pdev;
+};
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+ *field);
+
+int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
+int pciback_config_quirks_init(struct pci_dev *dev);
+
+void pciback_config_field_free(struct config_field *field);
+
+int pciback_config_quirk_release(struct pci_dev *dev);
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
+#endif
diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c
new file mode 100644
index 0000000..5a7e4cc
--- /dev/null
+++ b/drivers/xen/pciback/controller.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ * Alex Williamson <alex.williamson@hp.com>
+ *
+ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
+ * controllers. Devices under the same PCI controller are exposed on the
+ * same virtual domain:bus. Within a bus, device slots are virtualized
+ * to compact the bus.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_MAX_BUSSES 255
+#define PCI_MAX_SLOTS 32
+
+struct controller_dev_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+ unsigned int devfn;
+};
+
+struct controller_list_entry {
+ struct list_head list;
+ struct pci_controller *controller;
+ unsigned int domain;
+ unsigned int bus;
+ unsigned int next_devfn;
+ struct list_head dev_list;
+};
+
+struct controller_dev_data {
+ struct list_head list;
+ unsigned int next_domain;
+ unsigned int next_bus;
+ spinlock_t lock;
+};
+
+struct walk_info {
+ struct pciback_device *pdev;
+ int resource_count;
+ int root_num;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ struct pci_dev *dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->domain != domain ||
+ cntrl_entry->bus != bus)
+ continue;
+
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if (devfn == dev_entry->devfn) {
+ dev = dev_entry->dev;
+ goto found;
+ }
+ }
+ }
+found:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
+ unsigned long flags;
+ int ret = 0, found = 0;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ /* Look to see if we already have a domain:bus for this controller */
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->controller == dev_controller) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
+ if (!cntrl_entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cntrl_entry->controller = dev_controller;
+ cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
+
+ cntrl_entry->domain = dev_data->next_domain;
+ cntrl_entry->bus = dev_data->next_bus++;
+ if (dev_data->next_bus > PCI_MAX_BUSSES) {
+ dev_data->next_domain++;
+ dev_data->next_bus = 0;
+ }
+
+ INIT_LIST_HEAD(&cntrl_entry->dev_list);
+
+ list_add_tail(&cntrl_entry->list, &dev_data->list);
+ }
+
+ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
+ /*
+ * While it seems unlikely, this can actually happen if
+ * a controller has P2P bridges under it.
+ */
+ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
+ "is full, no room to export %04x:%02x:%02x.%x",
+ cntrl_entry->domain, cntrl_entry->bus,
+ pci_domain_nr(dev->bus), dev->bus->number,
+ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
+ if (!dev_entry) {
+ if (list_empty(&cntrl_entry->dev_list)) {
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dev_entry->dev = dev;
+ dev_entry->devfn = cntrl_entry->next_devfn;
+
+ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
+
+ cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
+
+out:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ /* TODO: Publish virtual domain:bus:slot.func here. */
+
+ return ret;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry;
+ struct controller_dev_entry *dev_entry = NULL;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ if (cntrl_entry->controller != PCI_CONTROLLER(dev))
+ continue;
+
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if (dev_entry->dev == dev) {
+ found_dev = dev_entry->dev;
+ break;
+ }
+ }
+ }
+
+ if (!found_dev) {
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+ return;
+ }
+
+ list_del(&dev_entry->list);
+ kfree(dev_entry);
+
+ if (list_empty(&cntrl_entry->dev_list)) {
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ struct controller_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_data->lock);
+
+ INIT_LIST_HEAD(&dev_data->list);
+
+ /* Starting domain:bus numbers */
+ dev_data->next_domain = 0;
+ dev_data->next_bus = 0;
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
+static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
+{
+ struct walk_info *info = data;
+ struct acpi_resource_address64 addr;
+ acpi_status status;
+ int i, len, err;
+ char str[32], tmp[3];
+ unsigned char *ptr, *buf;
+
+ status = acpi_resource_to_address64(res, &addr);
+
+ /* Do we care about this range? Let's check. */
+ if (!ACPI_SUCCESS(status) ||
+ !(addr.resource_type == ACPI_MEMORY_RANGE ||
+ addr.resource_type == ACPI_IO_RANGE) ||
+ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
+ return AE_OK;
+
+ /*
+ * Furthermore, we really only care to tell the guest about
+ * address ranges that require address translation of some sort.
+ */
+ if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
+ addr.info.mem.translation) &&
+ !(addr.resource_type == ACPI_IO_RANGE &&
+ addr.info.io.translation))
+ return AE_OK;
+
+ /* Store the resource in xenbus for the guest */
+ len = snprintf(str, sizeof(str), "root-%d-resource-%d",
+ info->root_num, info->resource_count);
+ if (unlikely(len >= (sizeof(str) - 1)))
+ return AE_OK;
+
+ buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
+ if (!buf)
+ return AE_OK;
+
+ /* Clean out resource_source */
+ res->data.address64.resource_source.index = 0xFF;
+ res->data.address64.resource_source.string_length = 0;
+ res->data.address64.resource_source.string_ptr = NULL;
+
+ ptr = (unsigned char *)res;
+
+ /* Turn the acpi_resource into an ASCII byte stream */
+ for (i = 0; i < sizeof(*res); i++) {
+ snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
+ strncat(buf, tmp, 2);
+ }
+
+ err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
+ str, "%s", buf);
+
+ if (!err)
+ info->resource_count++;
+
+ kfree(buf);
+
+ return AE_OK;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_root_cb)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry;
+ int i, root_num, len, err = 0;
+ unsigned int domain, bus;
+ char str[64];
+ struct walk_info info;
+
+ spin_lock(&dev_data->lock);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ /* First publish all the domain:bus info */
+ err = publish_root_cb(pdev, cntrl_entry->domain,
+ cntrl_entry->bus);
+ if (err)
+ goto out;
+
+ /*
+ * Now figure out which root-%d this belongs to
+ * so we can associate resources with it.
+ */
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", &root_num);
+
+ if (err != 1)
+ goto out;
+
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ str, "%x:%x", &domain, &bus);
+ if (err != 2)
+ goto out;
+
+ /* Is this the one we just published? */
+ if (domain == cntrl_entry->domain &&
+ bus == cntrl_entry->bus)
+ break;
+ }
+
+ if (i == root_num)
+ goto out;
+
+ info.pdev = pdev;
+ info.resource_count = 0;
+ info.root_num = i;
+
+ /* Let ACPI do the heavy lifting on decoding resources */
+ acpi_walk_resources(cntrl_entry->controller->acpi_handle,
+ METHOD_NAME__CRS, write_xenbus_resource,
+ &info);
+
+ /* No resouces. OK. On to the next one */
+ if (!info.resource_count)
+ continue;
+
+ /* Store the number of resources we wrote for this root-%d */
+ len = snprintf(str, sizeof(str), "root-%d-resources", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%d", info.resource_count);
+ if (err)
+ goto out;
+ }
+
+ /* Finally, write some magic to synchronize with the guest. */
+ len = snprintf(str, sizeof(str), "root-resource-magic");
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%lx", (sizeof(struct acpi_resource) *2) + 1);
+
+out:
+ spin_unlock(&dev_data->lock);
+
+ return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_list_entry *cntrl_entry, *c;
+ struct controller_dev_entry *dev_entry, *d;
+
+ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
+ list_for_each_entry_safe(dev_entry, d,
+ &cntrl_entry->dev_list, list) {
+ list_del(&dev_entry->list);
+ pcistub_put_pci_dev(dev_entry->dev);
+ kfree(dev_entry);
+ }
+ list_del(&cntrl_entry->list);
+ kfree(cntrl_entry);
+ }
+
+ kfree(dev_data);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus, unsigned int *devfn)
+{
+ struct controller_dev_data *dev_data = pdev->pci_dev_data;
+ struct controller_dev_entry *dev_entry;
+ struct controller_list_entry *cntrl_entry;
+ unsigned long flags;
+ int found = 0;
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+ if ((dev_entry->dev->bus->number ==
+ pcidev->bus->number) &&
+ (dev_entry->dev->devfn ==
+ pcidev->devfn) &&
+ (pci_domain_nr(dev_entry->dev->bus) ==
+ pci_domain_nr(pcidev->bus))) {
+ found = 1;
+ *domain = cntrl_entry->domain;
+ *bus = cntrl_entry->bus;
+ *devfn = dev_entry->devfn;
+ goto out;
+ }
+ }
+ }
+out:
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+ return found;
+
+}
+
diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c
new file mode 100644
index 0000000..5386bebf
--- /dev/null
+++ b/drivers/xen/pciback/passthrough.c
@@ -0,0 +1,178 @@
+/*
+ * PCI Backend - Provides restricted access to the real PCI bus topology
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+struct passthrough_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list;
+ spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ struct pci_dev *dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
+ && bus == (unsigned int)dev_entry->dev->bus->number
+ && devfn == dev_entry->dev->devfn) {
+ dev = dev_entry->dev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry;
+ unsigned long flags;
+ unsigned int domain, bus, devfn;
+ int err;
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry)
+ return -ENOMEM;
+ dev_entry->dev = dev;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+ list_add_tail(&dev_entry->list, &dev_data->dev_list);
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ /* Publish this device. */
+ domain = (unsigned int)pci_domain_nr(dev->bus);
+ bus = (unsigned int)dev->bus->number;
+ devfn = dev->devfn;
+ err = publish_cb(pdev, domain, bus, devfn, devid);
+
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_data->lock, flags);
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ if (dev_entry->dev == dev) {
+ list_del(&dev_entry->list);
+ found_dev = dev_entry->dev;
+ kfree(dev_entry);
+ }
+ }
+
+ spin_unlock_irqrestore(&dev_data->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ struct passthrough_dev_data *dev_data;
+
+ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ spin_lock_init(&dev_data->lock);
+
+ INIT_LIST_HEAD(&dev_data->dev_list);
+
+ pdev->pci_dev_data = dev_data;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_root_cb)
+{
+ int err = 0;
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *e;
+ struct pci_dev *dev;
+ int found;
+ unsigned int domain, bus;
+
+ spin_lock(&dev_data->lock);
+
+ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+ /* Only publish this device as a root if none of its
+ * parent bridges are exported
+ */
+ found = 0;
+ dev = dev_entry->dev->bus->self;
+ for (; !found && dev != NULL; dev = dev->bus->self) {
+ list_for_each_entry(e, &dev_data->dev_list, list) {
+ if (dev == e->dev) {
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
+ bus = (unsigned int)dev_entry->dev->bus->number;
+
+ if (!found) {
+ err = publish_root_cb(pdev, domain, bus);
+ if (err)
+ break;
+ }
+ }
+
+ spin_unlock(&dev_data->lock);
+
+ return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+ struct pci_dev_entry *dev_entry, *t;
+
+ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+ list_del(&dev_entry->list);
+ pcistub_put_pci_dev(dev_entry->dev);
+ kfree(dev_entry);
+ }
+
+ kfree(dev_data);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+
+{
+ *domain = pci_domain_nr(pcidev->bus);
+ *bus = pcidev->bus->number;
+ *devfn = pcidev->devfn;
+ return 1;
+}
diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c
new file mode 100644
index 0000000..c4d1071
--- /dev/null
+++ b/drivers/xen/pciback/pci_stub.c
@@ -0,0 +1,1371 @@
+/*
+ * PCI Stub Driver - Grabs devices in backend to be exported later
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/pci.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/hypervisor.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+#define DRV_NAME "pciback"
+
+static char *pci_devs_to_hide;
+wait_queue_head_t aer_wait_queue;
+/*Add sem for sync AER handling and pciback remove/reconfigue ops,
+* We want to avoid in middle of AER ops, pciback devices is being removed
+*/
+static DECLARE_RWSEM(pcistub_sem);
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
+struct pcistub_device_id {
+ struct list_head slot_list;
+ int domain;
+ unsigned char bus;
+ unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+ struct kref kref;
+ struct list_head dev_list;
+ spinlock_t lock;
+
+ struct pci_dev *dev;
+ struct pciback_device *pdev;/* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices;
+static LIST_HEAD(seized_devices);
+
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+ if (!psdev)
+ return NULL;
+
+ psdev->dev = pci_dev_get(dev);
+ if (!psdev->dev) {
+ kfree(psdev);
+ return NULL;
+ }
+
+ kref_init(&psdev->kref);
+ spin_lock_init(&psdev->lock);
+
+ return psdev;
+}
+
+/* Don't call this directly as it's called by pcistub_device_put */
+static void pcistub_device_release(struct kref *kref)
+{
+ struct pcistub_device *psdev;
+
+ psdev = container_of(kref, struct pcistub_device, kref);
+
+ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
+ xen_unregister_device_domain_owner(psdev->dev);
+
+ /* Clean-up the device */
+ pciback_reset_device(psdev->dev);
+ pciback_config_free_dyn_fields(psdev->dev);
+ pciback_config_free_dev(psdev->dev);
+ kfree(pci_get_drvdata(psdev->dev));
+ pci_set_drvdata(psdev->dev, NULL);
+
+ pci_dev_put(psdev->dev);
+
+ kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+ kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+ kref_put(&psdev->kref, pcistub_device_release);
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+ pcistub_device_get(psdev);
+ goto out;
+ }
+ }
+
+ /* didn't find it */
+ psdev = NULL;
+
+out:
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return psdev;
+}
+
+static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
+ struct pcistub_device *psdev)
+{
+ struct pci_dev *pci_dev = NULL;
+ unsigned long flags;
+
+ pcistub_device_get(psdev);
+
+ spin_lock_irqsave(&psdev->lock, flags);
+ if (!psdev->pdev) {
+ psdev->pdev = pdev;
+ pci_dev = psdev->dev;
+ }
+ spin_unlock_irqrestore(&psdev->lock, flags);
+
+ if (!pci_dev)
+ pcistub_device_put(psdev);
+
+ return pci_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+ int domain, int bus,
+ int slot, int func)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev != NULL
+ && domain == pci_domain_nr(psdev->dev->bus)
+ && bus == psdev->dev->bus->number
+ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+ struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return found_dev;
+}
+
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /*hold this lock for avoiding breaking link between
+ * pcistub and pciback when AER is in processing
+ */
+ down_write(&pcistub_sem);
+ /* Cleanup our device
+ * (so it's ready for the next domain)
+ */
+ pciback_reset_device(found_psdev->dev);
+ pciback_config_free_dyn_fields(found_psdev->dev);
+ pciback_config_reset_dev(found_psdev->dev);
+
+ spin_lock_irqsave(&found_psdev->lock, flags);
+ found_psdev->pdev = NULL;
+ spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+ pcistub_device_put(found_psdev);
+ up_write(&pcistub_sem);
+}
+
+static int __devinit pcistub_match_one(struct pci_dev *dev,
+ struct pcistub_device_id *pdev_id)
+{
+ /* Match the specified device by domain, bus, slot, func and also if
+ * any of the device's parent bridges match.
+ */
+ for (; dev != NULL; dev = dev->bus->self) {
+ if (pci_domain_nr(dev->bus) == pdev_id->domain
+ && dev->bus->number == pdev_id->bus
+ && dev->devfn == pdev_id->devfn)
+ return 1;
+
+ /* Sometimes topmost bridge links to itself. */
+ if (dev == dev->bus->self)
+ break;
+ }
+
+ return 0;
+}
+
+static int __devinit pcistub_match(struct pci_dev *dev)
+{
+ struct pcistub_device_id *pdev_id;
+ unsigned long flags;
+ int found = 0;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
+ if (pcistub_match_one(dev, pdev_id)) {
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return found;
+}
+
+static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
+ struct pciback_dev_data *dev_data;
+ int err = 0;
+
+ dev_dbg(&dev->dev, "initializing...\n");
+
+ /* The PCI backend is not intended to be a module (or to work with
+ * removable PCI devices (yet). If it were, pciback_config_free()
+ * would need to be called somewhere to free the memory allocated
+ * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+ */
+ dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]")
+ + strlen(pci_name(dev)) + 1, GFP_ATOMIC);
+ if (!dev_data) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pci_set_drvdata(dev, dev_data);
+
+ /*
+ * Setup name for fake IRQ handler. It will only be enabled
+ * once the device is turned on by the guest.
+ */
+ sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
+
+ dev_dbg(&dev->dev, "initializing config\n");
+
+ init_waitqueue_head(&aer_wait_queue);
+ err = pciback_config_init_dev(dev);
+ if (err)
+ goto out;
+
+ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
+ * must do this here because pcibios_enable_device may specify
+ * the pci device's true irq (and possibly its other resources)
+ * if they differ from what's in the configuration space.
+ * This makes the assumption that the device's resources won't
+ * change after this point (otherwise this code may break!)
+ */
+ dev_dbg(&dev->dev, "enabling device\n");
+ err = pci_enable_device(dev);
+ if (err)
+ goto config_release;
+
+ /* Now disable the device (this also ensures some private device
+ * data is setup before we export)
+ */
+ dev_dbg(&dev->dev, "reset device\n");
+ pciback_reset_device(dev);
+
+ return 0;
+
+config_release:
+ pciback_config_free_dev(dev);
+
+out:
+ pci_set_drvdata(dev, NULL);
+ kfree(dev_data);
+ return err;
+}
+
+/*
+ * Because some initialization still happens on
+ * devices during fs_initcall, we need to defer
+ * full initialization of our devices until
+ * device_initcall.
+ */
+static int __init pcistub_init_devices_late(void)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ pr_debug("pciback: pcistub_init_devices_late\n");
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ while (!list_empty(&seized_devices)) {
+ psdev = container_of(seized_devices.next,
+ struct pcistub_device, dev_list);
+ list_del(&psdev->dev_list);
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ err = pcistub_init_device(psdev->dev);
+ if (err) {
+ dev_err(&psdev->dev->dev,
+ "error %d initializing device\n", err);
+ kfree(psdev);
+ psdev = NULL;
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (psdev)
+ list_add_tail(&psdev->dev_list, &pcistub_devices);
+ }
+
+ initialize_devices = 1;
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ return 0;
+}
+
+static int __devinit pcistub_seize(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ unsigned long flags;
+ int err = 0;
+
+ psdev = pcistub_device_alloc(dev);
+ if (!psdev)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (initialize_devices) {
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* don't want irqs disabled when calling pcistub_init_device */
+ err = pcistub_init_device(psdev->dev);
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ if (!err)
+ list_add(&psdev->dev_list, &pcistub_devices);
+ } else {
+ dev_dbg(&dev->dev, "deferring initialization\n");
+ list_add(&psdev->dev_list, &seized_devices);
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (err)
+ pcistub_device_put(psdev);
+
+ return err;
+}
+
+static int __devinit pcistub_probe(struct pci_dev *dev,
+ const struct pci_device_id *id)
+{
+ int err = 0;
+
+ dev_dbg(&dev->dev, "probing...\n");
+
+ if (pcistub_match(dev)) {
+
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
+ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+ dev_err(&dev->dev, "can't export pci devices that "
+ "don't have a normal (0) or bridge (1) "
+ "header type!\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ dev_info(&dev->dev, "seizing device\n");
+ err = pcistub_seize(dev);
+ } else
+ /* Didn't find the device */
+ err = -ENODEV;
+
+out:
+ return err;
+}
+
+static void pcistub_remove(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev, *found_psdev = NULL;
+ unsigned long flags;
+
+ dev_dbg(&dev->dev, "removing\n");
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+ pciback_config_quirk_release(dev);
+
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (psdev->dev == dev) {
+ found_psdev = psdev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ if (found_psdev) {
+ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
+ found_psdev->pdev);
+
+ if (found_psdev->pdev) {
+ printk(KERN_WARNING "pciback: ****** removing device "
+ "%s while still in-use! ******\n",
+ pci_name(found_psdev->dev));
+ printk(KERN_WARNING "pciback: ****** driver domain may "
+ "still access this device's i/o resources!\n");
+ printk(KERN_WARNING "pciback: ****** shutdown driver "
+ "domain before binding device\n");
+ printk(KERN_WARNING "pciback: ****** to other drivers "
+ "or domains\n");
+
+ pciback_release_pci_dev(found_psdev->pdev,
+ found_psdev->dev);
+ }
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_del(&found_psdev->dev_list);
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+ /* the final put for releasing from the list */
+ pcistub_device_put(found_psdev);
+ }
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = {
+ {
+ .vendor = PCI_ANY_ID,
+ .device = PCI_ANY_ID,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ },
+ {0,},
+};
+
+#define PCI_NODENAME_MAX 40
+static void kill_domain_by_device(struct pcistub_device *psdev)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ char nodename[PCI_NODENAME_MAX];
+
+ if (!psdev)
+ dev_err(&psdev->dev->dev,
+ "device is NULL when do AER recovery/kill_domain\n");
+ snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
+ psdev->pdev->xdev->otherend_id);
+ nodename[strlen(nodename)] = '\0';
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ dev_err(&psdev->dev->dev,
+ "error %d when start xenbus transaction\n", err);
+ return;
+ }
+ /*PV AER handlers will set this flag*/
+ xenbus_printf(xbt, nodename, "aerState" , "aerfail");
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ dev_err(&psdev->dev->dev,
+ "error %d when end xenbus transaction\n", err);
+ return;
+ }
+}
+
+/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and
+ * backend need to have cooperation. In pciback, those steps will do similar
+ * jobs: send service request and waiting for front_end response.
+*/
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+ pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
+{
+ pci_ers_result_t res = result;
+ struct xen_pcie_aer_op *aer_op;
+ int ret;
+
+ /*with PV AER drivers*/
+ aer_op = &(psdev->pdev->sh_info->aer_op);
+ aer_op->cmd = aer_cmd ;
+ /*useful for error_detected callback*/
+ aer_op->err = state;
+ /*pcifront_end BDF*/
+ ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
+ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
+ if (!ret) {
+ dev_err(&psdev->dev->dev,
+ "pciback: failed to get pcifront device\n");
+ return PCI_ERS_RESULT_NONE;
+ }
+ wmb();
+
+ dev_dbg(&psdev->dev->dev,
+ "pciback: aer_op %x dom %x bus %x devfn %x\n",
+ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+ /*local flag to mark there's aer request, pciback callback will use this
+ * flag to judge whether we need to check pci-front give aer service
+ * ack signal
+ */
+ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+ /*It is possible that a pcifront conf_read_write ops request invokes
+ * the callback which cause the spurious execution of wake_up.
+ * Yet it is harmless and better than a spinlock here
+ */
+ set_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ wmb();
+ notify_remote_via_irq(psdev->pdev->evtchn_irq);
+
+ ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
+
+ if (!ret) {
+ if (test_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&psdev->dev->dev,
+ "pcifront aer process not responding!\n");
+ clear_bit(_XEN_PCIB_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags);
+ aer_op->err = PCI_ERS_RESULT_NONE;
+ return res;
+ }
+ }
+ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+ if (test_bit(_XEN_PCIF_active,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&psdev->dev->dev,
+ "schedule pci_conf service in pciback\n");
+ test_and_schedule_op(psdev->pdev);
+ }
+
+ res = (pci_ers_result_t)aer_op->err;
+ return res;
+}
+
+/*
+* pciback_slot_reset: it will send the slot_reset request to pcifront in case
+* of the device driver could provide this service, and then wait for pcifront
+* ack.
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, "pciback device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto release;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER slot_reset service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+
+}
+
+
+/*pciback_mmio_enabled: it will send the mmio_enabled request to pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_RECOVERED;
+ dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, "pciback device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ goto release;
+ }
+ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER mmio_enabled service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*pciback_error_detected: it will send the error_detected request to pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+* @error: the current PCI connection state
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
+ pci_channel_state_t error)
+{
+ struct pcistub_device *psdev;
+ pci_ers_result_t result;
+
+ result = PCI_ERS_RESULT_CAN_RECOVER;
+ dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, "pciback device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ /*Guest owns the device yet no aer handler regiested, kill guest*/
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+ if (result == PCI_ERS_RESULT_NONE ||
+ result == PCI_ERS_RESULT_DISCONNECT) {
+ dev_dbg(&dev->dev,
+ "No AER error_detected service or disconnected!\n");
+ kill_domain_by_device(psdev);
+ }
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return result;
+}
+
+/*pciback_error_resume: it will send the error_resume request to pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+*/
+
+static void pciback_error_resume(struct pci_dev *dev)
+{
+ struct pcistub_device *psdev;
+
+ dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
+ dev->bus->number, dev->devfn);
+
+ down_write(&pcistub_sem);
+ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+ dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ PCI_FUNC(dev->devfn));
+
+ if (!psdev || !psdev->pdev) {
+ dev_err(&dev->dev,
+ "pciback device is not found/assigned\n");
+ goto end;
+ }
+
+ if (!psdev->pdev->sh_info) {
+ dev_err(&dev->dev, "pciback device is not connected or owned"
+ " by HVM, kill it\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+
+ if (!test_bit(_XEN_PCIB_AERHANDLER,
+ (unsigned long *)&psdev->pdev->sh_info->flags)) {
+ dev_err(&dev->dev,
+ "guest with no AER driver should have been killed\n");
+ kill_domain_by_device(psdev);
+ goto release;
+ }
+ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+ PCI_ERS_RESULT_RECOVERED);
+release:
+ pcistub_device_put(psdev);
+end:
+ up_write(&pcistub_sem);
+ return;
+}
+
+/*add pciback AER handling*/
+static struct pci_error_handlers pciback_error_handler = {
+ .error_detected = pciback_error_detected,
+ .mmio_enabled = pciback_mmio_enabled,
+ .slot_reset = pciback_slot_reset,
+ .resume = pciback_error_resume,
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */
+
+static struct pci_driver pciback_pci_driver = {
+ .name = DRV_NAME,
+ .id_table = pcistub_ids,
+ .probe = pcistub_probe,
+ .remove = pcistub_remove,
+ .err_handler = &pciback_error_handler,
+};
+
+static inline int str_to_slot(const char *buf, int *domain, int *bus,
+ int *slot, int *func)
+{
+ int err;
+
+ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
+ if (err == 4)
+ return 0;
+ else if (err < 0)
+ return -EINVAL;
+
+ /* try again without domain */
+ *domain = 0;
+ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
+ if (err == 3)
+ return 0;
+
+ return -EINVAL;
+}
+
+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
+ *slot, int *func, int *reg, int *size, int *mask)
+{
+ int err;
+
+ err =
+ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
+ func, reg, size, mask);
+ if (err == 7)
+ return 0;
+ return -EINVAL;
+}
+
+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id;
+ unsigned long flags;
+
+ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+ if (!pci_dev_id)
+ return -ENOMEM;
+
+ pci_dev_id->domain = domain;
+ pci_dev_id->bus = bus;
+ pci_dev_id->devfn = PCI_DEVFN(slot, func);
+
+ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
+ domain, bus, slot, func);
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return 0;
+}
+
+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
+ struct pcistub_device_id *pci_dev_id, *t;
+ int devfn = PCI_DEVFN(slot, func);
+ int err = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
+ slot_list) {
+ if (pci_dev_id->domain == domain
+ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
+ /* Don't break; here because it's possible the same
+ * slot could be in the list more than once
+ */
+ list_del(&pci_dev_id->slot_list);
+ kfree(pci_dev_id);
+
+ err = 0;
+
+ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
+ "seize list\n", domain, bus, slot, func);
+ }
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return err;
+}
+
+static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
+ int size, int mask)
+{
+ int err = 0;
+ struct pcistub_device *psdev;
+ struct pci_dev *dev;
+ struct config_field *field;
+
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev || !psdev->dev) {
+ err = -ENODEV;
+ goto out;
+ }
+ dev = psdev->dev;
+
+ field = kzalloc(sizeof(*field), GFP_ATOMIC);
+ if (!field) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ field->offset = reg;
+ field->size = size;
+ field->mask = mask;
+ field->init = NULL;
+ field->reset = NULL;
+ field->release = NULL;
+ field->clean = pciback_config_field_free;
+
+ err = pciback_config_quirks_add_field(dev, field);
+ if (err)
+ kfree(field);
+out:
+ return err;
+}
+
+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
+
+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ err = pcistub_device_id_remove(domain, bus, slot, func);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+
+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device_id *pci_dev_id;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+ if (count >= PAGE_SIZE)
+ break;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%04x:%02x:%02x.%01x\n",
+ pci_dev_id->domain, pci_dev_id->bus,
+ PCI_SLOT(pci_dev_id->devfn),
+ PCI_FUNC(pci_dev_id->devfn));
+ }
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+
+DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
+static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count,
+ "%s:%s:%sing:%ld\n",
+ pci_name(psdev->dev),
+ dev_data->isr_on ? "on" : "off",
+ dev_data->ack_intr ? "ack" : "not ack",
+ dev_data->handled);
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+
+DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
+
+static ssize_t pcistub_irq_handler_switch(struct device_driver *drv,
+ const char *buf,
+ size_t count)
+{
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ int domain, bus, slot, func;
+ int err = -ENOENT;
+
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+
+ psdev = pcistub_device_find(domain, bus, slot, func);
+
+ if (!psdev)
+ goto out;
+
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data)
+ goto out;
+
+ dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n",
+ dev_data->irq_name, dev_data->isr_on,
+ !dev_data->isr_on);
+
+ dev_data->isr_on = !(dev_data->isr_on);
+ if (dev_data->isr_on)
+ dev_data->ack_intr = 1;
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch);
+
+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func, reg, size, mask;
+ int err;
+
+ err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
+ &mask);
+ if (err)
+ goto out;
+
+ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
+
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
+ int count = 0;
+ unsigned long flags;
+ struct pciback_config_quirk *quirk;
+ struct pciback_dev_data *dev_data;
+ const struct config_field *field;
+ const struct config_field_entry *cfg_entry;
+
+ spin_lock_irqsave(&device_ids_lock, flags);
+ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
+ quirk->pdev->bus->number,
+ PCI_SLOT(quirk->pdev->devfn),
+ PCI_FUNC(quirk->pdev->devfn),
+ quirk->devid.vendor, quirk->devid.device,
+ quirk->devid.subvendor,
+ quirk->devid.subdevice);
+
+ dev_data = pci_get_drvdata(quirk->pdev);
+
+ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+ field = cfg_entry->field;
+ if (count >= PAGE_SIZE)
+ goto out;
+
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "\t\t%08x:%01x:%08x\n",
+ cfg_entry->base_offset +
+ field->offset, field->size,
+ field->mask);
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&device_ids_lock, flags);
+
+ return count;
+}
+
+DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
+
+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
+ size_t count)
+{
+ int domain, bus, slot, func;
+ int err;
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ err = str_to_slot(buf, &domain, &bus, &slot, &func);
+ if (err)
+ goto out;
+ psdev = pcistub_device_find(domain, bus, slot, func);
+ if (!psdev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (!psdev->dev) {
+ err = -ENODEV;
+ goto release;
+ }
+ dev_data = pci_get_drvdata(psdev->dev);
+ /* the driver data for a device should never be null at this point */
+ if (!dev_data) {
+ err = -ENXIO;
+ goto release;
+ }
+ if (!dev_data->permissive) {
+ dev_data->permissive = 1;
+ /* Let user know that what they're doing could be unsafe */
+ dev_warn(&psdev->dev->dev, "enabling permissive mode "
+ "configuration space accesses!\n");
+ dev_warn(&psdev->dev->dev,
+ "permissive mode is potentially unsafe!\n");
+ }
+release:
+ pcistub_device_put(psdev);
+out:
+ if (!err)
+ err = count;
+ return err;
+}
+
+static ssize_t permissive_show(struct device_driver *drv, char *buf)
+{
+ struct pcistub_device *psdev;
+ struct pciback_dev_data *dev_data;
+ size_t count = 0;
+ unsigned long flags;
+ spin_lock_irqsave(&pcistub_devices_lock, flags);
+ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+ if (count >= PAGE_SIZE)
+ break;
+ if (!psdev->dev)
+ continue;
+ dev_data = pci_get_drvdata(psdev->dev);
+ if (!dev_data || !dev_data->permissive)
+ continue;
+ count +=
+ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+ pci_name(psdev->dev));
+ }
+ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+ return count;
+}
+
+DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
+
+static void pcistub_exit(void)
+{
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
+ driver_remove_file(&pciback_pci_driver.driver,
+ &driver_attr_remove_slot);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
+ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
+ driver_remove_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handlers);
+ driver_remove_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handler_state);
+ pci_unregister_driver(&pciback_pci_driver);
+}
+
+static int __init pcistub_init(void)
+{
+ int pos = 0;
+ int err = 0;
+ int domain, bus, slot, func;
+ int parsed;
+
+ if (pci_devs_to_hide && *pci_devs_to_hide) {
+ do {
+ parsed = 0;
+
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x:%x.%x) %n",
+ &domain, &bus, &slot, &func, &parsed);
+ if (err != 4) {
+ domain = 0;
+ err = sscanf(pci_devs_to_hide + pos,
+ " (%x:%x.%x) %n",
+ &bus, &slot, &func, &parsed);
+ if (err != 3)
+ goto parse_error;
+ }
+
+ err = pcistub_device_id_add(domain, bus, slot, func);
+ if (err)
+ goto out;
+
+ /* if parsed<=0, we've reached the end of the string */
+ pos += parsed;
+ } while (parsed > 0 && pci_devs_to_hide[pos]);
+ }
+
+ /* If we're the first PCI Device Driver to register, we're the
+ * first one to get offered PCI devices as they become
+ * available (and thus we can be the first to grab them)
+ */
+ err = pci_register_driver(&pciback_pci_driver);
+ if (err < 0)
+ goto out;
+
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_new_slot);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_remove_slot);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_slots);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_quirks);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_permissive);
+
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handlers);
+ if (!err)
+ err = driver_create_file(&pciback_pci_driver.driver,
+ &driver_attr_irq_handler_state);
+ if (err)
+ pcistub_exit();
+
+out:
+ return err;
+
+parse_error:
+ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
+ pci_devs_to_hide + pos);
+ return -EINVAL;
+}
+
+#ifndef MODULE
+/*
+ * fs_initcall happens before device_initcall
+ * so pciback *should* get called first (b/c we
+ * want to suck up any device before other drivers
+ * get a chance by being the first pci device
+ * driver to register)
+ */
+fs_initcall(pcistub_init);
+#endif
+
+static int __init pciback_init(void)
+{
+ int err;
+
+ if (!xen_initial_domain())
+ return -ENODEV;
+
+ err = pciback_config_init();
+ if (err)
+ return err;
+
+#ifdef MODULE
+ err = pcistub_init();
+ if (err < 0)
+ return err;
+#endif
+
+ pcistub_init_devices_late();
+ err = pciback_xenbus_register();
+ if (err)
+ pcistub_exit();
+
+ return err;
+}
+
+static void __exit pciback_cleanup(void)
+{
+ pciback_xenbus_unregister();
+ pcistub_exit();
+}
+
+module_init(pciback_init);
+module_exit(pciback_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h
new file mode 100644
index 0000000..5c14020
--- /dev/null
+++ b/drivers/xen/pciback/pciback.h
@@ -0,0 +1,142 @@
+/*
+ * PCI Backend Common Data Structures & Function Declarations
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIBACK_H__
+#define __XEN_PCIBACK_H__
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <xen/xenbus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/atomic.h>
+#include <xen/interface/io/pciif.h>
+
+struct pci_dev_entry {
+ struct list_head list;
+ struct pci_dev *dev;
+};
+
+#define _PDEVF_op_active (0)
+#define PDEVF_op_active (1<<(_PDEVF_op_active))
+#define _PCIB_op_pending (1)
+#define PCIB_op_pending (1<<(_PCIB_op_pending))
+
+struct pciback_device {
+ void *pci_dev_data;
+ spinlock_t dev_lock;
+
+ struct xenbus_device *xdev;
+
+ struct xenbus_watch be_watch;
+ u8 be_watching;
+
+ int evtchn_irq;
+
+ struct xen_pci_sharedinfo *sh_info;
+
+ unsigned long flags;
+
+ struct work_struct op_work;
+};
+
+struct pciback_dev_data {
+ struct list_head config_fields;
+ unsigned int permissive:1;
+ unsigned int warned_on_write:1;
+ unsigned int enable_intx:1;
+ unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
+ unsigned int ack_intr:1; /* .. and ACK-ing */
+ unsigned long handled;
+ unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
+ char irq_name[0]; /* pciback[000:04:00.0] */
+};
+
+/* Used by XenBus and pciback_ops.c */
+extern wait_queue_head_t aer_wait_queue;
+extern struct workqueue_struct *pciback_wq;
+/* Used by pcistub.c and conf_space_quirks.c */
+extern struct list_head pciback_quirks;
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+ int domain, int bus,
+ int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+ struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void pciback_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int pciback_config_init(void);
+int pciback_config_init_dev(struct pci_dev *dev);
+void pciback_config_free_dyn_fields(struct pci_dev *dev);
+void pciback_config_reset_dev(struct pci_dev *dev);
+void pciback_config_free_dev(struct pci_dev *dev);
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+ u32 *ret_val);
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus);
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb);
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn);
+
+/**
+* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in pciback
+* before sending aer request to pcifront, so that guest could identify
+* device, coopearte with pciback to finish aer recovery job if device driver
+* has the capability
+*/
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn);
+int pciback_init_devices(struct pciback_device *pdev);
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb cb);
+void pciback_release_devices(struct pciback_device *pdev);
+
+/* Handles events from front-end */
+irqreturn_t pciback_handle_event(int irq, void *dev_id);
+void pciback_do_op(struct work_struct *data);
+
+int pciback_xenbus_register(void);
+void pciback_xenbus_unregister(void);
+
+#ifdef CONFIG_PCI_MSI
+int pciback_enable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msi(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+
+
+int pciback_enable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msix(struct pciback_device *pdev,
+ struct pci_dev *dev, struct xen_pci_op *op);
+#endif
+extern int verbose_request;
+
+void test_and_schedule_op(struct pciback_device *pdev);
+#endif
+
+/* Handles shared IRQs that can to device domain and control domain. */
+void pciback_irq_handler(struct pci_dev *dev, int reset);
+irqreturn_t pciback_guest_interrupt(int irq, void *dev_id);
diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
new file mode 100644
index 0000000..28a2a55
--- /dev/null
+++ b/drivers/xen/pciback/pciback_ops.c
@@ -0,0 +1,248 @@
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/bitops.h>
+#include <xen/events.h>
+#include <linux/sched.h>
+#include "pciback.h"
+
+int verbose_request;
+module_param(verbose_request, int, 0644);
+
+/* Ensure a device is has the fake IRQ handler "turned on/off" and is
+ * ready to be exported. This MUST be run after pciback_reset_device
+ * which does the actual PCI device enable/disable.
+ */
+void pciback_control_isr(struct pci_dev *dev, int reset)
+{
+ struct pciback_dev_data *dev_data;
+ int rc;
+ int enable = 0;
+
+ dev_data = pci_get_drvdata(dev);
+ if (!dev_data)
+ return;
+
+ /* We don't deal with bridges */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return;
+
+ if (reset) {
+ dev_data->enable_intx = 0;
+ dev_data->ack_intr = 0;
+ }
+ enable = dev_data->enable_intx;
+
+ /* Asked to disable, but ISR isn't runnig */
+ if (!enable && !dev_data->isr_on)
+ return;
+
+ /* Squirrel away the IRQs in the dev_data. We need this
+ * b/c when device transitions to MSI, the dev->irq is
+ * overwritten with the MSI vector.
+ */
+ if (enable)
+ dev_data->irq = dev->irq;
+
+ /*
+ * SR-IOV devices in all use MSI-X and have no legacy
+ * interrupts, so inhibit creating a fake IRQ handler for them.
+ */
+ if (dev_data->irq == 0)
+ goto out;
+
+ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
+ dev_data->irq_name,
+ dev_data->irq,
+ pci_is_enabled(dev) ? "on" : "off",
+ dev->msi_enabled ? "MSI" : "",
+ dev->msix_enabled ? "MSI/X" : "",
+ dev_data->isr_on ? "enable" : "disable",
+ enable ? "enable" : "disable");
+
+ if (enable) {
+ rc = request_irq(dev_data->irq,
+ pciback_guest_interrupt, IRQF_SHARED,
+ dev_data->irq_name, dev);
+ if (rc) {
+ dev_err(&dev->dev, "%s: failed to install fake IRQ " \
+ "handler for IRQ %d! (rc:%d)\n",
+ dev_data->irq_name, dev_data->irq, rc);
+ goto out;
+ }
+ } else {
+ free_irq(dev_data->irq, dev);
+ dev_data->irq = 0;
+ }
+ dev_data->isr_on = enable;
+ dev_data->ack_intr = enable;
+out:
+ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
+ dev_data->irq_name,
+ dev_data->irq,
+ pci_is_enabled(dev) ? "on" : "off",
+ dev->msi_enabled ? "MSI" : "",
+ dev->msix_enabled ? "MSI/X" : "",
+ enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
+ (dev_data->isr_on ? "failed to disable" : "disabled"));
+}
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see pciback_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void pciback_reset_device(struct pci_dev *dev)
+{
+ u16 cmd;
+
+ pciback_control_isr(dev, 1 /* reset device */);
+
+ /* Disable devices (but not bridges) */
+ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+#ifdef CONFIG_PCI_MSI
+ /* The guest could have been abruptly killed without
+ * disabling MSI/MSI-X interrupts.*/
+ if (dev->msix_enabled)
+ pci_disable_msix(dev);
+ if (dev->msi_enabled)
+ pci_disable_msi(dev);
+#endif
+ pci_disable_device(dev);
+
+ pci_write_config_word(dev, PCI_COMMAND, 0);
+
+ dev->is_busmaster = 0;
+ } else {
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+ if (cmd & (PCI_COMMAND_INVALIDATE)) {
+ cmd &= ~(PCI_COMMAND_INVALIDATE);
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+ dev->is_busmaster = 0;
+ }
+ }
+}
+/*
+* Now the same evtchn is used for both pcifront conf_read_write request
+* as well as pcie aer front end ack. We use a new work_queue to schedule
+* pciback conf_read_write service for avoiding confict with aer_core
+* do_recovery job which also use the system default work_queue
+*/
+void test_and_schedule_op(struct pciback_device *pdev)
+{
+ /* Check that frontend is requesting an operation and that we are not
+ * already processing a request */
+ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
+ queue_work(pciback_wq, &pdev->op_work);
+ }
+ /*_XEN_PCIB_active should have been cleared by pcifront. And also make
+ sure pciback is waiting for ack by checking _PCIB_op_pending*/
+ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+ && test_bit(_PCIB_op_pending, &pdev->flags)) {
+ wake_up(&aer_wait_queue);
+ }
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct pciback_device as a parameter */
+
+void pciback_do_op(struct work_struct *data)
+{
+ struct pciback_device *pdev =
+ container_of(data, struct pciback_device, op_work);
+ struct pci_dev *dev;
+ struct pciback_dev_data *dev_data = NULL;
+ struct xen_pci_op *op = &pdev->sh_info->op;
+ int test_intx = 0;
+
+ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+ if (dev == NULL)
+ op->err = XEN_PCI_ERR_dev_not_found;
+ else {
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ test_intx = dev_data->enable_intx;
+ switch (op->cmd) {
+ case XEN_PCI_OP_conf_read:
+ op->err = pciback_config_read(dev,
+ op->offset, op->size, &op->value);
+ break;
+ case XEN_PCI_OP_conf_write:
+ op->err = pciback_config_write(dev,
+ op->offset, op->size, op->value);
+ break;
+#ifdef CONFIG_PCI_MSI
+ case XEN_PCI_OP_enable_msi:
+ op->err = pciback_enable_msi(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_disable_msi:
+ op->err = pciback_disable_msi(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_enable_msix:
+ op->err = pciback_enable_msix(pdev, dev, op);
+ break;
+ case XEN_PCI_OP_disable_msix:
+ op->err = pciback_disable_msix(pdev, dev, op);
+ break;
+#endif
+ default:
+ op->err = XEN_PCI_ERR_not_implemented;
+ break;
+ }
+ }
+ if (!op->err && dev && dev_data) {
+ /* Transition detected */
+ if ((dev_data->enable_intx != test_intx))
+ pciback_control_isr(dev, 0 /* no reset */);
+ }
+ /* Tell the driver domain that we're done. */
+ wmb();
+ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+ notify_remote_via_irq(pdev->evtchn_irq);
+
+ /* Mark that we're done. */
+ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
+ clear_bit(_PDEVF_op_active, &pdev->flags);
+ smp_mb__after_clear_bit(); /* /before/ final check for work */
+
+ /* Check to see if the driver domain tried to start another request in
+ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
+ */
+ test_and_schedule_op(pdev);
+}
+
+irqreturn_t pciback_handle_event(int irq, void *dev_id)
+{
+ struct pciback_device *pdev = dev_id;
+
+ test_and_schedule_op(pdev);
+
+ return IRQ_HANDLED;
+}
+irqreturn_t pciback_guest_interrupt(int irq, void *dev_id)
+{
+ struct pci_dev *dev = (struct pci_dev *)dev_id;
+ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
+ if (dev_data->isr_on && dev_data->ack_intr) {
+ dev_data->handled++;
+ if ((dev_data->handled % 1000) == 0) {
+ if (xen_test_irq_shared(irq)) {
+ printk(KERN_INFO "%s IRQ line is not shared "
+ "with other domains. Turning ISR off\n",
+ dev_data->irq_name);
+ dev_data->ack_intr = 0;
+ }
+ }
+ return IRQ_HANDLED;
+ }
+ return IRQ_NONE;
+}
diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c
new file mode 100644
index 0000000..efb922d
--- /dev/null
+++ b/drivers/xen/pciback/slot.c
@@ -0,0 +1,191 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
+ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+/* There are at most 32 slots in a pci bus. */
+#define PCI_SLOT_MAX 32
+
+#define PCI_BUS_NBR 2
+
+struct slot_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
+ spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct pci_dev *dev = NULL;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if (domain != 0 || PCI_FUNC(devfn) != 0)
+ return NULL;
+
+ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
+ return NULL;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ int err = 0, slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Can't export bridges on the virtual PCI bus");
+ goto out;
+ }
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+ /* Assign to a new slot on the virtual PCI bus */
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (slot_dev->slots[bus][slot] == NULL) {
+ printk(KERN_INFO
+ "pciback: slot: %s: assign to virtual "
+ "slot %d, bus %d\n",
+ pci_name(dev), slot, bus);
+ slot_dev->slots[bus][slot] = dev;
+ goto unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "No more space on root virtual PCI bus");
+
+unlock:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ /* Publish this device. */
+ if (!err)
+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
+
+out:
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (slot_dev->slots[bus][slot] == dev) {
+ slot_dev->slots[bus][slot] = NULL;
+ found_dev = dev;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev;
+
+ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
+ if (!slot_dev)
+ return -ENOMEM;
+
+ spin_lock_init(&slot_dev->lock);
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+ slot_dev->slots[bus][slot] = NULL;
+
+ pdev->pci_dev_data = slot_dev;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_cb)
+{
+ /* The Virtual PCI bus has only one root */
+ return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ int slot, bus;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *dev;
+
+ for (bus = 0; bus < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ dev = slot_dev->slots[bus][slot];
+ if (dev != NULL)
+ pcistub_put_pci_dev(dev);
+ }
+
+ kfree(slot_dev);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+{
+ int slot, busnr;
+ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+ struct pci_dev *dev;
+ int found = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&slot_dev->lock, flags);
+
+ for (busnr = 0; busnr < PCI_BUS_NBR; bus++)
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ dev = slot_dev->slots[busnr][slot];
+ if (dev && dev->bus->number == pcidev->bus->number
+ && dev->devfn == pcidev->devfn
+ && pci_domain_nr(dev->bus) ==
+ pci_domain_nr(pcidev->bus)) {
+ found = 1;
+ *domain = 0;
+ *bus = busnr;
+ *devfn = PCI_DEVFN(slot, 0);
+ goto out;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&slot_dev->lock, flags);
+ return found;
+
+}
diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
new file mode 100644
index 0000000..2857ab8
--- /dev/null
+++ b/drivers/xen/pciback/vpci.c
@@ -0,0 +1,244 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ * to the frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_SLOT_MAX 32
+
+struct vpci_dev_data {
+ /* Access to dev_list must be protected by lock */
+ struct list_head dev_list[PCI_SLOT_MAX];
+ spinlock_t lock;
+};
+
+static inline struct list_head *list_first(struct list_head *head)
+{
+ return head->next;
+}
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn)
+{
+ struct pci_dev_entry *entry;
+ struct pci_dev *dev = NULL;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if (domain != 0 || bus != 0)
+ return NULL;
+
+ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ list_for_each_entry(entry,
+ &vpci_dev->dev_list[PCI_SLOT(devfn)],
+ list) {
+ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
+ dev = entry->dev;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+ }
+ return dev;
+}
+
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
+ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
+ return 1;
+
+ return 0;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+ int devid, publish_pci_dev_cb publish_cb)
+{
+ int err = 0, slot, func = -1;
+ struct pci_dev_entry *t, *dev_entry;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+
+ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+ err = -EFAULT;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Can't export bridges on the virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+ if (!dev_entry) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error adding entry to virtual PCI bus");
+ goto out;
+ }
+
+ dev_entry->dev = dev;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ /* Keep multi-function devices together on the virtual PCI bus */
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (!list_empty(&vpci_dev->dev_list[slot])) {
+ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+ struct pci_dev_entry, list);
+
+ if (match_slot(dev, t->dev)) {
+ pr_info("pciback: vpci: %s: "
+ "assign to virtual slot %d func %d\n",
+ pci_name(dev), slot,
+ PCI_FUNC(dev->devfn));
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+ }
+
+ /* Assign to a new slot on the virtual PCI bus */
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ if (list_empty(&vpci_dev->dev_list[slot])) {
+ printk(KERN_INFO
+ "pciback: vpci: %s: assign to virtual slot %d\n",
+ pci_name(dev), slot);
+ list_add_tail(&dev_entry->list,
+ &vpci_dev->dev_list[slot]);
+ func = PCI_FUNC(dev->devfn);
+ goto unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "No more space on root virtual PCI bus");
+
+unlock:
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+ /* Publish this device. */
+ if (!err)
+ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+
+out:
+ return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ struct pci_dev *found_dev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e, *tmp;
+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+ list) {
+ if (e->dev == dev) {
+ list_del(&e->list);
+ found_dev = e->dev;
+ kfree(e);
+ goto out;
+ }
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+ if (found_dev)
+ pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev;
+
+ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
+ if (!vpci_dev)
+ return -ENOMEM;
+
+ spin_lock_init(&vpci_dev->lock);
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+
+ pdev->pci_dev_data = vpci_dev;
+
+ return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+ publish_pci_root_cb publish_cb)
+{
+ /* The Virtual PCI bus has only one root */
+ return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+ int slot;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ struct pci_dev_entry *e, *tmp;
+ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+ list) {
+ list_del(&e->list);
+ pcistub_put_pci_dev(e->dev);
+ kfree(e);
+ }
+ }
+
+ kfree(vpci_dev);
+ pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+ struct pciback_device *pdev,
+ unsigned int *domain, unsigned int *bus,
+ unsigned int *devfn)
+{
+ struct pci_dev_entry *entry;
+ struct pci_dev *dev = NULL;
+ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+ unsigned long flags;
+ int found = 0, slot;
+
+ spin_lock_irqsave(&vpci_dev->lock, flags);
+ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+ list_for_each_entry(entry,
+ &vpci_dev->dev_list[slot],
+ list) {
+ dev = entry->dev;
+ if (dev && dev->bus->number == pcidev->bus->number
+ && pci_domain_nr(dev->bus) ==
+ pci_domain_nr(pcidev->bus)
+ && dev->devfn == pcidev->devfn) {
+ found = 1;
+ *domain = 0;
+ *bus = 0;
+ *devfn = PCI_DEVFN(slot,
+ PCI_FUNC(pcidev->devfn));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&vpci_dev->lock, flags);
+ return found;
+}
diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c
new file mode 100644
index 0000000..70030c4
--- /dev/null
+++ b/drivers/xen/pciback/xenbus.c
@@ -0,0 +1,726 @@
+/*
+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <linux/workqueue.h>
+#include "pciback.h"
+
+#define INVALID_EVTCHN_IRQ (-1)
+struct workqueue_struct *pciback_wq;
+
+static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
+{
+ struct pciback_device *pdev;
+
+ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
+ if (pdev == NULL)
+ goto out;
+ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+ pdev->xdev = xdev;
+ dev_set_drvdata(&xdev->dev, pdev);
+
+ spin_lock_init(&pdev->dev_lock);
+
+ pdev->sh_info = NULL;
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ pdev->be_watching = 0;
+
+ INIT_WORK(&pdev->op_work, pciback_do_op);
+
+ if (pciback_init_devices(pdev)) {
+ kfree(pdev);
+ pdev = NULL;
+ }
+out:
+ return pdev;
+}
+
+static void pciback_disconnect(struct pciback_device *pdev)
+{
+ spin_lock(&pdev->dev_lock);
+
+ /* Ensure the guest can't trigger our handler before removing devices */
+ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+ }
+ spin_unlock(&pdev->dev_lock);
+
+ /* If the driver domain started an op, make sure we complete it
+ * before releasing the shared memory */
+
+ /* Note, the workqueue does not use spinlocks at all.*/
+ flush_workqueue(pciback_wq);
+
+ spin_lock(&pdev->dev_lock);
+ if (pdev->sh_info != NULL) {
+ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+ pdev->sh_info = NULL;
+ }
+ spin_unlock(&pdev->dev_lock);
+
+}
+
+static void free_pdev(struct pciback_device *pdev)
+{
+ if (pdev->be_watching) {
+ unregister_xenbus_watch(&pdev->be_watch);
+ pdev->be_watching = 0;
+ }
+
+ pciback_disconnect(pdev);
+
+ pciback_release_devices(pdev);
+
+ dev_set_drvdata(&pdev->xdev->dev, NULL);
+ pdev->xdev = NULL;
+
+ kfree(pdev);
+}
+
+static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
+ int remote_evtchn)
+{
+ int err = 0;
+ void *vaddr;
+
+ dev_dbg(&pdev->xdev->dev,
+ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+ gnt_ref, remote_evtchn);
+
+ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error mapping other domain page in ours.");
+ goto out;
+ }
+
+ spin_lock(&pdev->dev_lock);
+ pdev->sh_info = vaddr;
+ spin_unlock(&pdev->dev_lock);
+
+ err = bind_interdomain_evtchn_to_irqhandler(
+ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
+ 0, "pciback", pdev);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error binding event channel to IRQ");
+ goto out;
+ }
+
+ spin_lock(&pdev->dev_lock);
+ pdev->evtchn_irq = err;
+ spin_unlock(&pdev->dev_lock);
+ err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+ return err;
+}
+
+static int pciback_attach(struct pciback_device *pdev)
+{
+ int err = 0;
+ int gnt_ref, remote_evtchn;
+ char *magic = NULL;
+
+
+ /* Make sure we only do this setup once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitialised)
+ goto out;
+
+ /* Wait for frontend to state that it has published the configuration */
+ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+ XenbusStateInitialised)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+ "pci-op-ref", "%u", &gnt_ref,
+ "event-channel", "%u", &remote_evtchn,
+ "magic", NULL, &magic, NULL);
+ if (err) {
+ /* If configuration didn't get read correctly, wait longer */
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading configuration from frontend");
+ goto out;
+ }
+
+ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+ xenbus_dev_fatal(pdev->xdev, -EFAULT,
+ "version mismatch (%s/%s) with pcifront - "
+ "halting pciback",
+ magic, XEN_PCI_MAGIC);
+ goto out;
+ }
+
+ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
+ if (err)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to connected state!");
+
+ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
+out:
+
+ kfree(magic);
+
+ return err;
+}
+
+static int pciback_publish_pci_dev(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus,
+ unsigned int devfn, unsigned int devid)
+{
+ int err;
+ int len;
+ char str[64];
+
+ len = snprintf(str, sizeof(str), "vdev-%d", devid);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x:%02x.%02x", domain, bus,
+ PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+out:
+ return err;
+}
+
+static int pciback_export_device(struct pciback_device *pdev,
+ int domain, int bus, int slot, int func,
+ int devid)
+{
+ struct pci_dev *dev;
+ int err = 0;
+
+ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
+ if (!dev) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%01x)! "
+ "perhaps already in-use?",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
+ if (err)
+ goto out;
+
+ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
+ if (xen_register_device_domain_owner(dev,
+ pdev->xdev->otherend_id) != 0) {
+ dev_err(&dev->dev, "device has been assigned to another " \
+ "domain! Over-writting the ownership, but beware.\n");
+ xen_unregister_device_domain_owner(dev);
+ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
+ }
+
+ /* TODO: It'd be nice to export a bridge and have all of its children
+ * get exported with it. This may be best done in xend (which will
+ * have to calculate resource usage anyway) but we probably want to
+ * put something in here to ensure that if a bridge gets given to a
+ * driver domain, that all devices under that bridge are not given
+ * to other driver domains (as he who controls the bridge can disable
+ * it and stop the other devices from working).
+ */
+out:
+ return err;
+}
+
+static int pciback_remove_device(struct pciback_device *pdev,
+ int domain, int bus, int slot, int func)
+{
+ int err = 0;
+ struct pci_dev *dev;
+
+ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
+ domain, bus, slot, func);
+
+ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
+ if (!dev) {
+ err = -EINVAL;
+ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
+ "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
+ domain, bus, slot, func);
+ goto out;
+ }
+
+ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
+ xen_unregister_device_domain_owner(dev);
+
+ pciback_release_pci_dev(pdev, dev);
+
+out:
+ return err;
+}
+
+static int pciback_publish_pci_root(struct pciback_device *pdev,
+ unsigned int domain, unsigned int bus)
+{
+ unsigned int d, b;
+ int i, root_num, len, err;
+ char str[64];
+
+ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", &root_num);
+ if (err == 0 || err == -ENOENT)
+ root_num = 0;
+ else if (err < 0)
+ goto out;
+
+ /* Verify that we haven't already published this pci root */
+ for (i = 0; i < root_num; i++) {
+ len = snprintf(str, sizeof(str), "root-%d", i);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ str, "%x:%x", &d, &b);
+ if (err < 0)
+ goto out;
+ if (err != 2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (d == domain && b == bus) {
+ err = 0;
+ goto out;
+ }
+ }
+
+ len = snprintf(str, sizeof(str), "root-%d", root_num);
+ if (unlikely(len >= (sizeof(str) - 1))) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
+ root_num, domain, bus);
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+ "%04x:%02x", domain, bus);
+ if (err)
+ goto out;
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ "root_num", "%d", (root_num + 1));
+
+out:
+ return err;
+}
+
+static int pciback_reconfigure(struct pciback_device *pdev)
+{
+ int err = 0;
+ int num_devs;
+ int domain, bus, slot, func;
+ int substate;
+ int i, len;
+ char state_str[64];
+ char dev_str[64];
+
+
+ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
+
+ /* Make sure we only reconfigure once */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateReconfiguring)
+ goto out;
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(len >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", &substate);
+ if (err != 1)
+ substate = XenbusStateUnknown;
+
+ switch (substate) {
+ case XenbusStateInitialising:
+ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_export_device(pdev, domain, bus, slot,
+ func, i);
+ if (err)
+ goto out;
+
+ /* Publish pci roots. */
+ err = pciback_publish_pci_roots(pdev,
+ pciback_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error while publish PCI root"
+ "buses for frontend");
+ goto out;
+ }
+
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+ state_str, "%d",
+ XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching substate of "
+ "dev-%d\n", i);
+ goto out;
+ }
+ break;
+
+ case XenbusStateClosing:
+ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
+ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
+ if (unlikely(len >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while "
+ "reading configuration");
+ goto out;
+ }
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+ dev_str, "%x:%x:%x.%x",
+ &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device "
+ "configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_remove_device(pdev, domain, bus, slot,
+ func);
+ if (err)
+ goto out;
+
+ /* TODO: If at some point we implement support for pci
+ * root hot-remove on pcifront side, we'll need to
+ * remove unnecessary xenstore nodes of pci roots here.
+ */
+
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to reconfigured state!");
+ goto out;
+ }
+
+out:
+ return 0;
+}
+
+static void pciback_frontend_changed(struct xenbus_device *xdev,
+ enum xenbus_state fe_state)
+{
+ struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
+
+ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
+ switch (fe_state) {
+ case XenbusStateInitialised:
+ pciback_attach(pdev);
+ break;
+
+ case XenbusStateReconfiguring:
+ pciback_reconfigure(pdev);
+ break;
+
+ case XenbusStateConnected:
+ /* pcifront switched its state from reconfiguring to connected.
+ * Then switch to connected state.
+ */
+ xenbus_switch_state(xdev, XenbusStateConnected);
+ break;
+
+ case XenbusStateClosing:
+ pciback_disconnect(pdev);
+ xenbus_switch_state(xdev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ pciback_disconnect(pdev);
+ xenbus_switch_state(xdev, XenbusStateClosed);
+ if (xenbus_dev_is_online(xdev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
+ device_unregister(&xdev->dev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int pciback_setup_backend(struct pciback_device *pdev)
+{
+ /* Get configuration from xend (if available now) */
+ int domain, bus, slot, func;
+ int err = 0;
+ int i, num_devs;
+ char dev_str[64];
+ char state_str[64];
+
+ /* It's possible we could get the call to setup twice, so make sure
+ * we're not already connected.
+ */
+ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+ XenbusStateInitWait)
+ goto out;
+
+ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+ &num_devs);
+ if (err != 1) {
+ if (err >= 0)
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading number of devices");
+ goto out;
+ }
+
+ for (i = 0; i < num_devs; i++) {
+ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+ if (unlikely(l >= (sizeof(dev_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+
+ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
+ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+ if (err < 0) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error reading device configuration");
+ goto out;
+ }
+ if (err != 4) {
+ err = -EINVAL;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error parsing pci device "
+ "configuration");
+ goto out;
+ }
+
+ err = pciback_export_device(pdev, domain, bus, slot, func, i);
+ if (err)
+ goto out;
+
+ /* Switch substate of this device. */
+ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
+ if (unlikely(l >= (sizeof(state_str) - 1))) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(pdev->xdev, err,
+ "String overflow while reading "
+ "configuration");
+ goto out;
+ }
+ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
+ "%d", XenbusStateInitialised);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
+ "substate of dev-%d\n", i);
+ goto out;
+ }
+ }
+
+ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
+ if (err) {
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error while publish PCI root buses "
+ "for frontend");
+ goto out;
+ }
+
+ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+ if (err)
+ xenbus_dev_fatal(pdev->xdev, err,
+ "Error switching to initialised state!");
+
+out:
+ if (!err)
+ /* see if pcifront is already configured (if not, we'll wait) */
+ pciback_attach(pdev);
+
+ return err;
+}
+
+static void pciback_be_watch(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ struct pciback_device *pdev =
+ container_of(watch, struct pciback_device, be_watch);
+
+ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
+ case XenbusStateInitWait:
+ pciback_setup_backend(pdev);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int pciback_xenbus_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err = 0;
+ struct pciback_device *pdev = alloc_pdev(dev);
+
+ if (pdev == NULL) {
+ err = -ENOMEM;
+ xenbus_dev_fatal(dev, err,
+ "Error allocating pciback_device struct");
+ goto out;
+ }
+
+ /* wait for xend to configure us */
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto out;
+
+ /* watch the backend node for backend configuration information */
+ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
+ pciback_be_watch);
+ if (err)
+ goto out;
+
+ pdev->be_watching = 1;
+
+ /* We need to force a call to our callback here in case
+ * xend already configured us!
+ */
+ pciback_be_watch(&pdev->be_watch, NULL, 0);
+
+out:
+ return err;
+}
+
+static int pciback_xenbus_remove(struct xenbus_device *dev)
+{
+ struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
+
+ if (pdev != NULL)
+ free_pdev(pdev);
+
+ return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+ {"pci"},
+ {""},
+};
+
+static struct xenbus_driver xenbus_pciback_driver = {
+ .name = "pciback",
+ .owner = THIS_MODULE,
+ .ids = xenpci_ids,
+ .probe = pciback_xenbus_probe,
+ .remove = pciback_xenbus_remove,
+ .otherend_changed = pciback_frontend_changed,
+};
+
+int __init pciback_xenbus_register(void)
+{
+ pciback_wq = create_workqueue("pciback_workqueue");
+ if (!pciback_wq) {
+ printk(KERN_ERR "%s: create"
+ "pciback_workqueue failed\n", __func__);
+ return -EFAULT;
+ }
+ return xenbus_register_backend(&xenbus_pciback_driver);
+}
+
+void __exit pciback_xenbus_unregister(void)
+{
+ destroy_workqueue(pciback_wq);
+ xenbus_unregister_driver(&xenbus_pciback_driver);
+}
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index afbe041..319dd0a 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -156,9 +156,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
if (ret)
goto out;
xenbus_probe(NULL);
- ret = xen_setup_shutdown_event();
- if (ret)
- goto out;
return 0;
out:
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 1da8af6..2024a74 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -50,13 +50,15 @@ struct ttm_backend_func {
* @pages: Array of pointers to ttm pages.
* @dummy_read_page: Page to be used instead of NULL pages in the
* array @pages.
+ * @dma_addrs: Array of DMA (bus) address of the ttm pages.
*
* Populate the backend with ttm pages. Depending on the backend,
* it may or may not copy the @pages array.
*/
int (*populate) (struct ttm_backend *backend,
unsigned long num_pages, struct page **pages,
- struct page *dummy_read_page);
+ struct page *dummy_read_page,
+ dma_addr_t *dma_addrs);
/**
* struct ttm_backend_func member clear
*
@@ -149,6 +151,7 @@ enum ttm_caching_state {
* @swap_storage: Pointer to shmem struct file for swap storage.
* @caching_state: The current caching state of the pages.
* @state: The current binding state of the pages.
+ * @dma_address: The DMA (bus) addresses of the pages (if TTM_PAGE_FLAG_DMA32)
*
* This is a structure holding the pages, caching- and aperture binding
* status for a buffer object that isn't backed by fixed (VRAM / AGP)
@@ -173,6 +176,8 @@ struct ttm_tt {
tt_unbound,
tt_unpopulated,
} state;
+ dma_addr_t *dma_address;
+ struct device *dev;
};
#define TTM_MEMTYPE_FLAG_FIXED (1 << 0) /* Fixed (on-card) PCI memory */
@@ -547,6 +552,7 @@ struct ttm_bo_device {
struct list_head device_list;
struct ttm_bo_global *glob;
struct ttm_bo_driver *driver;
+ struct device *dev;
rwlock_t vm_lock;
struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
spinlock_t fence_lock;
@@ -787,6 +793,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
* @file_page_offset: Offset into the device address space that is available
* for buffer data. This ensures compatibility with other users of the
* address space.
+ * @need_dma32: Allocate pages under 4GB
+ * @dev: 'struct device' of the PCI device.
*
* Initializes a struct ttm_bo_device:
* Returns:
@@ -795,7 +803,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
extern int ttm_bo_device_init(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob,
struct ttm_bo_driver *driver,
- uint64_t file_page_offset, bool need_dma32);
+ uint64_t file_page_offset, bool need_dma32,
+ struct device *dev);
/**
* ttm_bo_unmap_virtual
diff --git a/include/drm/ttm/ttm_page_alloc.h b/include/drm/ttm/ttm_page_alloc.h
index 1168214..ccb6b7a 100644
--- a/include/drm/ttm/ttm_page_alloc.h
+++ b/include/drm/ttm/ttm_page_alloc.h
@@ -36,11 +36,15 @@
* @flags: ttm flags for page allocation.
* @cstate: ttm caching state for the page.
* @count: number of pages to allocate.
+ * @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropiate DMA accounting.
*/
int ttm_get_pages(struct list_head *pages,
int flags,
enum ttm_caching_state cstate,
- unsigned count);
+ unsigned count,
+ dma_addr_t *dma_address,
+ struct device *dev);
/**
* Put linked list of pages to pool.
*
@@ -49,11 +53,15 @@ int ttm_get_pages(struct list_head *pages,
* count.
* @flags: ttm flags for page allocation.
* @cstate: ttm caching state.
+ * @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropiate DMA accounting.
*/
void ttm_put_pages(struct list_head *pages,
unsigned page_count,
int flags,
- enum ttm_caching_state cstate);
+ enum ttm_caching_state cstate,
+ dma_addr_t *dma_address,
+ struct device *dev);
/**
* Initialize pool allocator.
*/
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d42..d746da1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -55,7 +55,7 @@
* Used by threaded interrupts which need to keep the
* irq line disabled until the threaded handler has been run.
* IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
- *
+ * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
*/
#define IRQF_DISABLED 0x00000020
#define IRQF_SAMPLE_RANDOM 0x00000040
@@ -67,6 +67,7 @@
#define IRQF_IRQPOLL 0x00001000
#define IRQF_ONESHOT 0x00002000
#define IRQF_NO_SUSPEND 0x00004000
+#define IRQF_FORCE_RESUME 0x00008000
#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND)
diff --git a/include/xen/blkif.h b/include/xen/blkif.h
new file mode 100644
index 0000000..ab79426
--- /dev/null
+++ b/include/xen/blkif.h
@@ -0,0 +1,122 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Not a real protocol. Used to generate ring structs which contain
+ * the elements common to all protocols only. This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places. */
+struct blkif_common_request {
+ char dummy;
+};
+struct blkif_common_response {
+ char dummy;
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_32_response {
+ uint64_t id; /* copied from request */
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t __attribute__((__aligned__(8))) id;
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_64_response {
+ uint64_t __attribute__((__aligned__(8))) id;
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+union blkif_back_rings {
+ struct blkif_back_ring native;
+ struct blkif_common_back_ring common;
+ struct blkif_x86_32_back_ring x86_32;
+ struct blkif_x86_64_back_ring x86_64;
+};
+
+enum blkif_protocol {
+ BLKIF_PROTOCOL_NATIVE = 1,
+ BLKIF_PROTOCOL_X86_32 = 2,
+ BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->u.rw.sector_number = src->sector_number;
+ barrier();
+ if (n > dst->nr_segments)
+ n = dst->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->u.rw.seg[i] = src->seg[i];
+}
+
+static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ dst->operation = src->operation;
+ dst->nr_segments = src->nr_segments;
+ dst->handle = src->handle;
+ dst->id = src->id;
+ dst->u.rw.sector_number = src->sector_number;
+ barrier();
+ if (n > dst->nr_segments)
+ n = dst->nr_segments;
+ for (i = 0; i < n; i++)
+ dst->u.rw.seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
diff --git a/include/xen/events.h b/include/xen/events.h
index 00f53dd..a0c8185 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -23,6 +23,12 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
unsigned long irqflags,
const char *devname,
void *dev_id);
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+ unsigned int remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
/*
* Common unbind function for all event sources. Takes IRQ to unbind from.
@@ -75,11 +81,10 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);
#ifdef CONFIG_PCI_MSI
-/* Allocate an irq and a pirq to be used with MSIs. */
-#define XEN_ALLOC_PIRQ (1 << 0)
-#define XEN_ALLOC_IRQ (1 << 1)
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_mask);
-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc);
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ int pirq, int vector, const char *name,
+ domid_t domid);
#endif
/* De-allocates the above mentioned physical interrupt. */
@@ -94,4 +99,10 @@ int xen_gsi_from_irq(unsigned pirq);
/* Return irq from pirq */
int xen_irq_from_pirq(unsigned pirq);
+/* Return the pirq allocated to the irq. */
+int xen_pirq_from_irq(unsigned irq);
+
+/* Determine whether to ignore this IRQ if it is passed to a guest. */
+int xen_test_irq_shared(int irq);
+
#endif /* _XEN_EVENTS_H */
diff --git a/include/xen/gntalloc.h b/include/xen/gntalloc.h
new file mode 100644
index 0000000..76bd580
--- /dev/null
+++ b/include/xen/gntalloc.h
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * gntalloc.h
+ *
+ * Interface to /dev/xen/gntalloc.
+ *
+ * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov>
+ *
+ * This file is in the public domain.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTALLOC_H__
+#define __LINUX_PUBLIC_GNTALLOC_H__
+
+/*
+ * Allocates a new page and creates a new grant reference.
+ */
+#define IOCTL_GNTALLOC_ALLOC_GREF \
+_IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref))
+struct ioctl_gntalloc_alloc_gref {
+ /* IN parameters */
+ /* The ID of the domain to be given access to the grants. */
+ uint16_t domid;
+ /* Flags for this mapping */
+ uint16_t flags;
+ /* Number of pages to map */
+ uint32_t count;
+ /* OUT parameters */
+ /* The offset to be used on a subsequent call to mmap(). */
+ uint64_t index;
+ /* The grant references of the newly created grant, one per page */
+ /* Variable size, depending on count */
+ uint32_t gref_ids[1];
+};
+
+#define GNTALLOC_FLAG_WRITABLE 1
+
+/*
+ * Deallocates the grant reference, allowing the associated page to be freed if
+ * no other domains are using it.
+ */
+#define IOCTL_GNTALLOC_DEALLOC_GREF \
+_IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref))
+struct ioctl_gntalloc_dealloc_gref {
+ /* IN parameters */
+ /* The offset returned in the map operation */
+ uint64_t index;
+ /* Number of references to unmap */
+ uint32_t count;
+};
+
+/*
+ * Sets up an unmap notification within the page, so that the other side can do
+ * cleanup if this side crashes. Required to implement cross-domain robust
+ * mutexes or close notification on communication channels.
+ *
+ * Each mapped page only supports one notification; multiple calls referring to
+ * the same page overwrite the previous notification. You must clear the
+ * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
+ * to occur.
+ */
+#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \
+_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntalloc_unmap_notify))
+struct ioctl_gntalloc_unmap_notify {
+ /* IN parameters */
+ /* Offset in the file descriptor for a byte within the page (same as
+ * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
+ * be cleared. Otherwise, it can be any byte in the page whose
+ * notification we are adjusting.
+ */
+ uint64_t index;
+ /* Action(s) to take on unmap */
+ uint32_t action;
+ /* Event channel to notify */
+ uint32_t event_channel_port;
+};
+
+/* Clear (set to zero) the byte specified by index */
+#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
+/* Send an interrupt on the indicated event channel */
+#define UNMAP_NOTIFY_SEND_EVENT 0x2
+
+#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
index eb23f41..5304bd3 100644
--- a/include/xen/gntdev.h
+++ b/include/xen/gntdev.h
@@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants {
uint32_t count;
};
+/*
+ * Sets up an unmap notification within the page, so that the other side can do
+ * cleanup if this side crashes. Required to implement cross-domain robust
+ * mutexes or close notification on communication channels.
+ *
+ * Each mapped page only supports one notification; multiple calls referring to
+ * the same page overwrite the previous notification. You must clear the
+ * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
+ * to occur.
+ */
+#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \
+_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify))
+struct ioctl_gntdev_unmap_notify {
+ /* IN parameters */
+ /* Offset in the file descriptor for a byte within the page (same as
+ * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
+ * be cleared. Otherwise, it can be any byte in the page whose
+ * notification we are adjusting.
+ */
+ uint64_t index;
+ /* Action(s) to take on unmap */
+ uint32_t action;
+ /* Event channel to notify */
+ uint32_t event_channel_port;
+};
+
+/* Clear (set to zero) the byte specified by index */
+#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
+/* Send an interrupt on the indicated event channel */
+#define UNMAP_NOTIFY_SEND_EVENT 0x2
+
#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c2d1fa4..61e523a 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -51,11 +51,7 @@ typedef uint64_t blkif_sector_t;
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
-struct blkif_request {
- uint8_t operation; /* BLKIF_OP_??? */
- uint8_t nr_segments; /* number of segments */
- blkif_vdev_t handle; /* only for read/write requests */
- uint64_t id; /* private guest value, echoed in resp */
+struct blkif_request_rw {
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment {
grant_ref_t gref; /* reference to I/O buffer frame */
@@ -65,6 +61,16 @@ struct blkif_request {
} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
+struct blkif_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ union {
+ struct blkif_request_rw rw;
+ } u;
+};
+
struct blkif_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
@@ -91,4 +97,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
#define VDISK_REMOVABLE 0x2
#define VDISK_READONLY 0x4
+/* Xen-defined major numbers for virtual disks, they look strangely
+ * familiar */
+#define XEN_IDE0_MAJOR 3
+#define XEN_IDE1_MAJOR 22
+#define XEN_SCSI_DISK0_MAJOR 8
+#define XEN_SCSI_DISK1_MAJOR 65
+#define XEN_SCSI_DISK2_MAJOR 66
+#define XEN_SCSI_DISK3_MAJOR 67
+#define XEN_SCSI_DISK4_MAJOR 68
+#define XEN_SCSI_DISK5_MAJOR 69
+#define XEN_SCSI_DISK6_MAJOR 70
+#define XEN_SCSI_DISK7_MAJOR 71
+#define XEN_SCSI_DISK8_MAJOR 128
+#define XEN_SCSI_DISK9_MAJOR 129
+#define XEN_SCSI_DISK10_MAJOR 130
+#define XEN_SCSI_DISK11_MAJOR 131
+#define XEN_SCSI_DISK12_MAJOR 132
+#define XEN_SCSI_DISK13_MAJOR 133
+#define XEN_SCSI_DISK14_MAJOR 134
+#define XEN_SCSI_DISK15_MAJOR 135
+
#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h
index 518481c..cb94668 100644
--- a/include/xen/interface/io/netif.h
+++ b/include/xen/interface/io/netif.h
@@ -22,50 +22,50 @@
/*
* This is the 'wire' format for packets:
- * Request 1: netif_tx_request -- NETTXF_* (any flags)
- * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
- * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_MORE)
- * Request 4: netif_tx_request -- NETTXF_more_data
- * Request 5: netif_tx_request -- NETTXF_more_data
+ * Request 1: xen_netif_tx_request -- XEN_NETTXF_* (any flags)
+ * [Request 2: xen_netif_extra_info] (only if request 1 has XEN_NETTXF_extra_info)
+ * [Request 3: xen_netif_extra_info] (only if request 2 has XEN_NETIF_EXTRA_MORE)
+ * Request 4: xen_netif_tx_request -- XEN_NETTXF_more_data
+ * Request 5: xen_netif_tx_request -- XEN_NETTXF_more_data
* ...
- * Request N: netif_tx_request -- 0
+ * Request N: xen_netif_tx_request -- 0
*/
/* Protocol checksum field is blank in the packet (hardware offload)? */
-#define _NETTXF_csum_blank (0)
-#define NETTXF_csum_blank (1U<<_NETTXF_csum_blank)
+#define _XEN_NETTXF_csum_blank (0)
+#define XEN_NETTXF_csum_blank (1U<<_XEN_NETTXF_csum_blank)
/* Packet data has been validated against protocol checksum. */
-#define _NETTXF_data_validated (1)
-#define NETTXF_data_validated (1U<<_NETTXF_data_validated)
+#define _XEN_NETTXF_data_validated (1)
+#define XEN_NETTXF_data_validated (1U<<_XEN_NETTXF_data_validated)
/* Packet continues in the next request descriptor. */
-#define _NETTXF_more_data (2)
-#define NETTXF_more_data (1U<<_NETTXF_more_data)
+#define _XEN_NETTXF_more_data (2)
+#define XEN_NETTXF_more_data (1U<<_XEN_NETTXF_more_data)
/* Packet to be followed by extra descriptor(s). */
-#define _NETTXF_extra_info (3)
-#define NETTXF_extra_info (1U<<_NETTXF_extra_info)
+#define _XEN_NETTXF_extra_info (3)
+#define XEN_NETTXF_extra_info (1U<<_XEN_NETTXF_extra_info)
struct xen_netif_tx_request {
grant_ref_t gref; /* Reference to buffer page */
uint16_t offset; /* Offset within buffer page */
- uint16_t flags; /* NETTXF_* */
+ uint16_t flags; /* XEN_NETTXF_* */
uint16_t id; /* Echoed in response message. */
uint16_t size; /* Packet size in bytes. */
};
-/* Types of netif_extra_info descriptors. */
-#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
-#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
-#define XEN_NETIF_EXTRA_TYPE_MAX (2)
+/* Types of xen_netif_extra_info descriptors. */
+#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
+#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
+#define XEN_NETIF_EXTRA_TYPE_MAX (2)
-/* netif_extra_info flags. */
-#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
-#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
+/* xen_netif_extra_info flags. */
+#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
+#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
/* GSO types - only TCPv4 currently supported. */
-#define XEN_NETIF_GSO_TYPE_TCPV4 (1)
+#define XEN_NETIF_GSO_TYPE_TCPV4 (1)
/*
* This structure needs to fit within both netif_tx_request and
@@ -107,7 +107,7 @@ struct xen_netif_extra_info {
struct xen_netif_tx_response {
uint16_t id;
- int16_t status; /* NETIF_RSP_* */
+ int16_t status; /* XEN_NETIF_RSP_* */
};
struct xen_netif_rx_request {
@@ -116,25 +116,29 @@ struct xen_netif_rx_request {
};
/* Packet data has been validated against protocol checksum. */
-#define _NETRXF_data_validated (0)
-#define NETRXF_data_validated (1U<<_NETRXF_data_validated)
+#define _XEN_NETRXF_data_validated (0)
+#define XEN_NETRXF_data_validated (1U<<_XEN_NETRXF_data_validated)
/* Protocol checksum field is blank in the packet (hardware offload)? */
-#define _NETRXF_csum_blank (1)
-#define NETRXF_csum_blank (1U<<_NETRXF_csum_blank)
+#define _XEN_NETRXF_csum_blank (1)
+#define XEN_NETRXF_csum_blank (1U<<_XEN_NETRXF_csum_blank)
/* Packet continues in the next request descriptor. */
-#define _NETRXF_more_data (2)
-#define NETRXF_more_data (1U<<_NETRXF_more_data)
+#define _XEN_NETRXF_more_data (2)
+#define XEN_NETRXF_more_data (1U<<_XEN_NETRXF_more_data)
/* Packet to be followed by extra descriptor(s). */
-#define _NETRXF_extra_info (3)
-#define NETRXF_extra_info (1U<<_NETRXF_extra_info)
+#define _XEN_NETRXF_extra_info (3)
+#define XEN_NETRXF_extra_info (1U<<_XEN_NETRXF_extra_info)
+
+/* GSO Prefix descriptor. */
+#define _XEN_NETRXF_gso_prefix (4)
+#define XEN_NETRXF_gso_prefix (1U<<_XEN_NETRXF_gso_prefix)
struct xen_netif_rx_response {
uint16_t id;
uint16_t offset; /* Offset in page of start of received packet */
- uint16_t flags; /* NETRXF_* */
+ uint16_t flags; /* XEN_NETRXF_* */
int16_t status; /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
};
@@ -149,10 +153,10 @@ DEFINE_RING_TYPES(xen_netif_rx,
struct xen_netif_rx_request,
struct xen_netif_rx_response);
-#define NETIF_RSP_DROPPED -2
-#define NETIF_RSP_ERROR -1
-#define NETIF_RSP_OKAY 0
-/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
-#define NETIF_RSP_NULL 1
+#define XEN_NETIF_RSP_DROPPED -2
+#define XEN_NETIF_RSP_ERROR -1
+#define XEN_NETIF_RSP_OKAY 0
+/* No response: used for auxiliary requests (e.g., xen_netif_extra_info). */
+#define XEN_NETIF_RSP_NULL 1
#endif
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 2befa3e..b33257b 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -30,7 +30,7 @@
#define __HYPERVISOR_stack_switch 3
#define __HYPERVISOR_set_callbacks 4
#define __HYPERVISOR_fpu_taskswitch 5
-#define __HYPERVISOR_sched_op 6
+#define __HYPERVISOR_sched_op_compat 6
#define __HYPERVISOR_dom0_op 7
#define __HYPERVISOR_set_debugreg 8
#define __HYPERVISOR_get_debugreg 9
@@ -52,7 +52,7 @@
#define __HYPERVISOR_mmuext_op 26
#define __HYPERVISOR_acm_op 27
#define __HYPERVISOR_nmi_op 28
-#define __HYPERVISOR_sched_op_new 29
+#define __HYPERVISOR_sched_op 29
#define __HYPERVISOR_callback_op 30
#define __HYPERVISOR_xenoprof_op 31
#define __HYPERVISOR_event_channel_op 32
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 98b9215..03c85d7 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -5,9 +5,9 @@
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
-void xen_pre_suspend(void);
-void xen_post_suspend(int suspend_cancelled);
-void xen_hvm_post_suspend(int suspend_cancelled);
+void xen_arch_pre_suspend(void);
+void xen_arch_post_suspend(int suspend_cancelled);
+void xen_arch_hvm_post_suspend(int suspend_cancelled);
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9033c1c..2782bac 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -282,8 +282,17 @@ EXPORT_SYMBOL(disable_irq);
void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
{
- if (resume)
+ if (resume) {
+ if (!(desc->status & IRQ_SUSPENDED)) {
+ if (!desc->action)
+ return;
+ if (!(desc->action->flags & IRQF_FORCE_RESUME))
+ return;
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+ }
desc->status &= ~IRQ_SUSPENDED;
+ }
switch (desc->depth) {
case 0:
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d8..d6bfb89 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
- if (!(desc->status & IRQ_SUSPENDED))
- continue;
-
raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, true);
raw_spin_unlock_irqrestore(&desc->lock, flags);