kernel/xen.pcifront.next-2.6.38.patch

diff --git a/arch/ia64/include/asm/xen/hypercall.h b/arch/ia64/include/asm/xen/hypercall.h
index 96fc623..ed28bcd 100644
--- a/arch/ia64/include/asm/xen/hypercall.h
+++ b/arch/ia64/include/asm/xen/hypercall.h
@@ -107,7 +107,7 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2,
 static inline int
 xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg)
 {
-	return _hypercall2(int, sched_op_new, cmd, arg);
+	return _hypercall2(int, sched_op, cmd, arg);
 }

 static inline long
diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c
index fd66b04..419c862 100644
--- a/arch/ia64/xen/suspend.c
+++ b/arch/ia64/xen/suspend.c
@@ -37,19 +37,14 @@ xen_mm_unpin_all(void)
 	/* nothing */
 }

-void xen_pre_device_suspend(void)
-{
-	/* nothing */
-}
-
 void
-xen_pre_suspend()
+xen_arch_pre_suspend()
 {
 	/* nothing */
 }

 void
-xen_post_suspend(int suspend_cancelled)
+xen_arch_post_suspend(int suspend_cancelled)
 {
 	if (suspend_cancelled)
 		return;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a3c28ae..8508bfe 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
 static inline int
 HYPERVISOR_sched_op(int cmd, void *arg)
 {
-	return _hypercall2(int, sched_op_new, cmd, arg);
+	return _hypercall2(int, sched_op, cmd, arg);
 }

 static inline long
@@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
 #endif

 static inline int
-HYPERVISOR_suspend(unsigned long srec)
+HYPERVISOR_suspend(unsigned long start_info_mfn)
 {
-	return _hypercall3(int, sched_op, SCHEDOP_shutdown,
-			   SHUTDOWN_suspend, srec);
+	struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
+
+	/*
+	 * For a PV guest the tools require that the start_info mfn be
+	 * present in rdx/edx when the hypercall is made. Per the
+	 * hypercall calling convention this is the third hypercall
+	 * argument, which is start_info_mfn here.
+	 */
+	return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
 }

 static inline int
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f25bdf2..64a619d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -29,8 +29,10 @@ typedef struct xpaddr {

 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
 #define INVALID_P2M_ENTRY	(~0UL)
-#define FOREIGN_FRAME_BIT	(1UL<<31)
+#define FOREIGN_FRAME_BIT	(1UL<<(BITS_PER_LONG-1))
+#define IDENTITY_FRAME_BIT	(1UL<<(BITS_PER_LONG-2))
 #define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)
+#define IDENTITY_FRAME(m)	((m) | IDENTITY_FRAME_BIT)

 /* Maximum amount of memory we can handle in a domain in pages */
 #define MAX_DOMAIN_PAGES						\
@@ -41,12 +43,19 @@ extern unsigned int   machine_to_phys_order;

 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern unsigned long set_phys_range_identity(unsigned long pfn_s,
+					     unsigned long pfn_e);

-extern int m2p_add_override(unsigned long mfn, struct page *page);
-extern int m2p_remove_override(struct page *page);
+extern int m2p_add_override(unsigned long mfn, struct page *page,
+			    bool clear_pte);
+extern int m2p_remove_override(struct page *page, bool clear_pte);
 extern struct page *m2p_find_override(unsigned long mfn);
 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);

+#ifdef CONFIG_XEN_DEBUG_FS
+extern int p2m_dump_show(struct seq_file *m, void *v);
+#endif
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
 	unsigned long mfn;
@@ -57,7 +66,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
 	mfn = get_phys_to_machine(pfn);

 	if (mfn != INVALID_P2M_ENTRY)
-		mfn &= ~FOREIGN_FRAME_BIT;
+		mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);

 	return mfn;
 }
@@ -73,25 +82,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
 static inline unsigned long mfn_to_pfn(unsigned long mfn)
 {
 	unsigned long pfn;
+	int ret = 0;

 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return mfn;

+	if (unlikely((mfn >> machine_to_phys_order) != 0)) {
+		pfn = ~0;
+		goto try_override;
+	}
 	pfn = 0;
 	/*
 	 * The array access can fail (e.g., device space beyond end of RAM).
 	 * In such cases it doesn't matter what we return (we return garbage),
 	 * but we must handle the fault without crashing!
 	 */
-	__get_user(pfn, &machine_to_phys_mapping[mfn]);
-
-	/*
-	 * If this appears to be a foreign mfn (because the pfn
-	 * doesn't map back to the mfn), then check the local override
-	 * table to see if there's a better pfn to use.
+	ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+try_override:
+	/* ret might be < 0 if there are no entries in the m2p for mfn */
+	if (ret < 0)
+		pfn = ~0;
+	else if (get_phys_to_machine(pfn) != mfn)
+		/*
+		 * If this appears to be a foreign mfn (because the pfn
+		 * doesn't map back to the mfn), then check the local override
+		 * table to see if there's a better pfn to use.
+		 *
+		 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
+		 */
+		pfn = m2p_find_override_pfn(mfn, ~0);
+
+	/*
+	 * pfn is ~0 if there are no entries in the m2p for mfn or if the
+	 * entry doesn't map back to the mfn and m2p_override doesn't have a
+	 * valid entry for it.
 	 */
-	if (get_phys_to_machine(pfn) != mfn)
-		pfn = m2p_find_override_pfn(mfn, pfn);
+	if (pfn == ~0 &&
+			get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
+		pfn = mfn;

 	return pfn;
 }
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 2329b3e..4fbda9a 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -15,10 +15,26 @@ static inline int pci_xen_hvm_init(void)
 #endif
 #if defined(CONFIG_XEN_DOM0)
 void __init xen_setup_pirqs(void);
+int xen_find_device_domain_owner(struct pci_dev *dev);
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
+int xen_unregister_device_domain_owner(struct pci_dev *dev);
 #else
 static inline void __init xen_setup_pirqs(void)
 {
 }
+static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+	return -1;
+}
+static inline int xen_register_device_domain_owner(struct pci_dev *dev,
+						   uint16_t domain)
+{
+	return -1;
+}
+static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+	return -1;
+}
 #endif

 #if defined(CONFIG_PCI_MSI)
@@ -27,16 +43,16 @@ static inline void __init xen_setup_pirqs(void)
  * its own functions.
  */
 struct xen_pci_frontend_ops {
-	int (*enable_msi)(struct pci_dev *dev, int **vectors);
+	int (*enable_msi)(struct pci_dev *dev, int vectors[]);
 	void (*disable_msi)(struct pci_dev *dev);
-	int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+	int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
 	void (*disable_msix)(struct pci_dev *dev);
 };

 extern struct xen_pci_frontend_ops *xen_pci_frontend;

 static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
-					      int **vectors)
+					      int vectors[])
 {
 	if (xen_pci_frontend && xen_pci_frontend->enable_msi)
 		return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +64,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
 			xen_pci_frontend->disable_msi(dev);
 }
 static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
-					       int **vectors, int nvec)
+					       int vectors[], int nvec)
 {
 	if (xen_pci_frontend && xen_pci_frontend->enable_msix)
 		return xen_pci_frontend->enable_msix(dev, vectors, nvec);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42a..66637bd 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -283,6 +283,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem && !start) {
 		pud_t *pud;
 		pmd_t *pmd;
+		unsigned long addr;
+		u64 size, memblock_addr;

 		mmu_cr4_features = read_cr4();

@@ -291,11 +293,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		 * located on different 2M pages. cleanup_highmap(), however,
 		 * can only consider _end when it runs, so destroy any
 		 * mappings beyond _brk_end here.
+		 * Respect memblock reserved regions.
 		 */
 		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
 		pmd = pmd_offset(pud, _brk_end - 1);
-		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-			pmd_clear(pmd);
+		addr = (_brk_end + PMD_SIZE - 1) & PMD_MASK;
+		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) {
+			memblock_addr = memblock_x86_find_in_range_size(__pa(addr),
+					&size, PMD_SIZE);
+			if (memblock_addr == (u64) __pa(addr) && size >= PMD_SIZE)
+				pmd_clear(pmd);
+			addr += PMD_SIZE;
+		}
 	}
 #endif
 	__flush_tlb_all();
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a0..309c0a0 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -20,7 +20,8 @@
 #include <asm/xen/pci.h>

 #ifdef CONFIG_ACPI
-static int xen_hvm_register_pirq(u32 gsi, int triggering)
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+				 int trigger, int polarity)
 {
 	int rc, irq;
 	struct physdev_map_pirq map_irq;
@@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
 		return -1;
 	}

-	if (triggering == ACPI_EDGE_SENSITIVE) {
+	if (trigger == ACPI_EDGE_SENSITIVE) {
 		shareable = 0;
 		name = "ioapic-edge";
 	} else {
@@ -55,12 +56,6 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)

 	return irq;
 }
-
-static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
-				 int trigger, int polarity)
-{
-	return xen_hvm_register_pirq(gsi, trigger);
-}
 #endif

 #if defined(CONFIG_PCI_MSI)
@@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,

 static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	int irq, pirq, ret = 0;
+	int irq, pirq;
 	struct msi_desc *msidesc;
 	struct msi_msg msg;

@@ -99,39 +94,33 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		__read_msi_msg(msidesc, &msg);
 		pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
 			((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
-		if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
-			xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-					"msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
-			if (irq < 0)
+		if (msg.data != XEN_PIRQ_MSI_DATA ||
+		    xen_irq_from_pirq(pirq) < 0) {
+			pirq = xen_allocate_pirq_msi(dev, msidesc);
+			if (pirq < 0)
 				goto error;
-			ret = set_irq_msi(irq, msidesc);
-			if (ret < 0)
-				goto error_while;
-			printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
-					" pirq=%d\n", irq, pirq);
-			return 0;
+			xen_msi_compose_msg(dev, pirq, &msg);
+			__write_msi_msg(msidesc, &msg);
+			dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
+		} else {
+			dev_dbg(&dev->dev,
+				"xen: msi already bound to pirq=%d\n", pirq);
 		}
-		xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-				"msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ));
-		if (irq < 0 || pirq < 0)
+		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
+					       (type == PCI_CAP_ID_MSIX) ?
+					       "msi-x" : "msi",
+					       DOMID_SELF);
+		if (irq < 0)
 			goto error;
-		printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
-		xen_msi_compose_msg(dev, pirq, &msg);
-		ret = set_irq_msi(irq, msidesc);
-		if (ret < 0)
-			goto error_while;
-		write_msi_msg(irq, &msg);
+		dev_dbg(&dev->dev,
+			"xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
 	}
 	return 0;

-error_while:
-	unbind_from_irqhandler(irq, NULL);
 error:
-	if (ret == -ENODEV)
-		dev_err(&dev->dev, "Xen PCI frontend has not registered" \
-				" MSI/MSI-X support!\n");
-
-	return ret;
+	dev_err(&dev->dev,
+		"Xen PCI frontend has not registered MSI/MSI-X support!\n");
+	return -ENODEV;
 }

 /*
@@ -150,35 +139,27 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		return -ENOMEM;

 	if (type == PCI_CAP_ID_MSIX)
-		ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
+		ret = xen_pci_frontend_enable_msix(dev, v, nvec);
 	else
-		ret = xen_pci_frontend_enable_msi(dev, &v);
+		ret = xen_pci_frontend_enable_msi(dev, v);
 	if (ret)
 		goto error;
 	i = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_allocate_pirq(v[i], 0, /* not sharable */
-			(type == PCI_CAP_ID_MSIX) ?
-			"pcifront-msi-x" : "pcifront-msi");
-		if (irq < 0) {
-			ret = -1;
+		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
+					       (type == PCI_CAP_ID_MSIX) ?
+					       "pcifront-msi-x" :
+					       "pcifront-msi",
+						DOMID_SELF);
+		if (irq < 0)
 			goto free;
-		}
-
-		ret = set_irq_msi(irq, msidesc);
-		if (ret)
-			goto error_while;
 		i++;
 	}
 	kfree(v);
 	return 0;

-error_while:
-	unbind_from_irqhandler(irq, NULL);
 error:
-	if (ret == -ENODEV)
-		dev_err(&dev->dev, "Xen PCI frontend has not registered" \
-			" MSI/MSI-X support!\n");
+	dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
 free:
 	kfree(v);
 	return ret;
@@ -193,6 +174,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
 		xen_pci_frontend_disable_msix(dev);
 	else
 		xen_pci_frontend_disable_msi(dev);
+
+	/* Free the IRQ's and the msidesc using the generic code. */
+	default_teardown_msi_irqs(dev);
 }

 static void xen_teardown_msi_irq(unsigned int irq)
@@ -200,47 +184,91 @@ static void xen_teardown_msi_irq(unsigned int irq)
 	xen_destroy_irq(irq);
 }

+#ifdef CONFIG_XEN_DOM0
 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	int irq, ret;
+	int ret = 0;
 	struct msi_desc *msidesc;

 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_create_msi_irq(dev, msidesc, type);
-		if (irq < 0)
-			return -1;
+		struct physdev_map_pirq map_irq;
+		domid_t domid;

-		ret = set_irq_msi(irq, msidesc);
-		if (ret)
-			goto error;
-	}
-	return 0;
+		domid = ret = xen_find_device_domain_owner(dev);
+		/* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
+		 * hence check ret value for < 0. */
+		if (ret < 0)
+			domid = DOMID_SELF;

-error:
-	xen_destroy_irq(irq);
+		memset(&map_irq, 0, sizeof(map_irq));
+		map_irq.domid = domid;
+		map_irq.type = MAP_PIRQ_TYPE_MSI;
+		map_irq.index = -1;
+		map_irq.pirq = -1;
+		map_irq.bus = dev->bus->number;
+		map_irq.devfn = dev->devfn;
+
+		if (type == PCI_CAP_ID_MSIX) {
+			int pos;
+			u32 table_offset, bir;
+
+			pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+			pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
+					      &table_offset);
+			bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
+			map_irq.table_base = pci_resource_start(dev, bir);
+			map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+		}
+
+		ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+		if (ret) {
+			dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
+				 ret, domid);
+			goto out;
+		}
+
+		ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
+					       map_irq.pirq, map_irq.index,
+					       (type == PCI_CAP_ID_MSIX) ?
+					       "msi-x" : "msi",
+						domid);
+		if (ret < 0)
+			goto out;
+	}
+	ret = 0;
+out:
 	return ret;
 }
 #endif
+#endif

 static int xen_pcifront_enable_irq(struct pci_dev *dev)
 {
 	int rc;
 	int share = 1;
+	u8 gsi;

-	dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
-
-	if (dev->irq < 0)
-		return -EINVAL;
+	rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+	if (rc < 0) {
+		dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+			 rc);
+		return rc;
+	}

-	if (dev->irq < NR_IRQS_LEGACY)
+	if (gsi < NR_IRQS_LEGACY)
 		share = 0;

-	rc = xen_allocate_pirq(dev->irq, share, "pcifront");
+	rc = xen_allocate_pirq(gsi, share, "pcifront");
 	if (rc < 0) {
-		dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
-			 dev->irq, rc);
+		dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n",
+			 gsi, rc);
 		return rc;
 	}
+
+	dev->irq = rc;
+	dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
 	return 0;
 }

@@ -427,3 +455,76 @@ void __init xen_setup_pirqs(void)
 	}
 }
 #endif
+
+struct xen_device_domain_owner {
+	domid_t domain;
+	struct pci_dev *dev;
+	struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_domain_list_spinlock);
+static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
+
+static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
+{
+	struct xen_device_domain_owner *owner;
+
+	list_for_each_entry(owner, &dev_domain_list, list) {
+		if (owner->dev == dev)
+			return owner;
+	}
+	return NULL;
+}
+
+int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+	struct xen_device_domain_owner *owner;
+	int domain = -ENODEV;
+
+	spin_lock(&dev_domain_list_spinlock);
+	owner = find_device(dev);
+	if (owner)
+		domain = owner->domain;
+	spin_unlock(&dev_domain_list_spinlock);
+	return domain;
+}
+EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
+
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
+{
+	struct xen_device_domain_owner *owner;
+
+	owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
+	if (!owner)
+		return -ENODEV;
+
+	spin_lock(&dev_domain_list_spinlock);
+	if (find_device(dev)) {
+		spin_unlock(&dev_domain_list_spinlock);
+		kfree(owner);
+		return -EEXIST;
+	}
+	owner->domain = domain;
+	owner->dev = dev;
+	list_add_tail(&owner->list, &dev_domain_list);
+	spin_unlock(&dev_domain_list_spinlock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
+
+int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+	struct xen_device_domain_owner *owner;
+
+	spin_lock(&dev_domain_list_spinlock);
+	owner = find_device(dev);
+	if (!owner) {
+		spin_unlock(&dev_domain_list_spinlock);
+		return -ENODEV;
+	}
+	list_del(&owner->list);
+	spin_unlock(&dev_domain_list_spinlock);
+	kfree(owner);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892..e4343fe 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -48,3 +48,11 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
+config XEN_DEBUG
+	bool "Enable Xen debug checks"
+	depends on XEN
+	default n
+	help
+	  Enable various WARN_ON checks in the Xen MMU code.
+	  Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542ef..49dbd78 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1284,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor)

 	xen_setup_features();

-	pv_info = xen_info;
-	pv_info.kernel_rpl = 0;
+	pv_info.name = "Xen HVM";

 	xen_domain_type = XEN_HVM_DOMAIN;

 	return 0;
 }

-void xen_hvm_init_shared_info(void)
+void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
 	struct xen_add_to_physmap xatp;
@@ -1331,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_UP_PREPARE:
 		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+		if (xen_have_vector_callback)
+			xen_init_lock_cpu(cpu);
 		break;
 	default:
 		break;
@@ -1355,6 +1356,7 @@ static void __init xen_hvm_guest_init(void)

 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
+	xen_hvm_smp_init();
 	register_cpu_notifier(&xen_hvm_cpu_notifier);
 	xen_unplug_emulated_devices();
 	have_vcpu_info_placement = 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61..0c376a2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/seq_file.h>

 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -416,8 +417,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 	if (val & _PAGE_PRESENT) {
 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 		pteval_t flags = val & PTE_FLAGS_MASK;
-		unsigned long mfn = pfn_to_mfn(pfn);
+		unsigned long mfn;

+		if (!xen_feature(XENFEAT_auto_translated_physmap))
+			mfn = get_phys_to_machine(pfn);
+		else
+			mfn = pfn;
 		/*
 		 * If there's no mfn for the pfn, then just create an
 		 * empty non-present pte.  Unfortunately this loses
@@ -427,8 +432,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 			mfn = 0;
 			flags = 0;
+		} else {
+			/*
+			 * Paramount to do this test _after_ the
+			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+			 * IDENTITY_FRAME_BIT resolves to true.
+			 */
+			mfn &= ~FOREIGN_FRAME_BIT;
+			if (mfn & IDENTITY_FRAME_BIT) {
+				mfn &= ~IDENTITY_FRAME_BIT;
+				flags |= _PAGE_IOMAP;
+			}
 		}
-
 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 	}

@@ -532,6 +547,41 @@ pte_t xen_make_pte(pteval_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

+#ifdef CONFIG_XEN_DEBUG
+pte_t xen_make_pte_debug(pteval_t pte)
+{
+	phys_addr_t addr = (pte & PTE_PFN_MASK);
+	phys_addr_t other_addr;
+	bool io_page = false;
+	pte_t _pte;
+
+	if (pte & _PAGE_IOMAP)
+		io_page = true;
+
+	_pte = xen_make_pte(pte);
+
+	if (!addr)
+		return _pte;
+
+	if (io_page &&
+	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+		other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
+		WARN(addr != other_addr,
+			"0x%lx is using VM_IO, but it is 0x%lx!\n",
+			(unsigned long)addr, (unsigned long)other_addr);
+	} else {
+		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
+		other_addr = (_pte.pte & PTE_PFN_MASK);
+		WARN((addr == other_addr) && (!io_page) && (!iomap_set),
+			"0x%lx is missing VM_IO (and wasn't fixed)!\n",
+			(unsigned long)addr);
+	}
+
+	return _pte;
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
+#endif
+
 pgd_t xen_make_pgd(pgdval_t pgd)
 {
 	pgd = pte_pfn_to_mfn(pgd);
@@ -1942,6 +1992,9 @@ __init void xen_ident_map_ISA(void)

 static __init void xen_post_allocator_init(void)
 {
+#ifdef CONFIG_XEN_DEBUG
+	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
+#endif
 	pv_mmu_ops.set_pte = xen_set_pte;
 	pv_mmu_ops.set_pmd = xen_set_pmd;
 	pv_mmu_ops.set_pud = xen_set_pud;
@@ -2074,7 +2127,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
 			in_frames[i] = virt_to_mfn(vaddr);

 		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
-		set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);

 		if (out_frames)
 			out_frames[i] = virt_to_pfn(vaddr);
@@ -2353,6 +2406,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);

 #ifdef CONFIG_XEN_DEBUG_FS

+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+	.open		= p2m_dump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static struct dentry *d_mmu_debug;

 static int __init xen_mmu_debugfs(void)
@@ -2408,6 +2473,7 @@ static int __init xen_mmu_debugfs(void)
 	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
 			   &mmu_stats.prot_commit_batched);

+	debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
 	return 0;
 }
 fs_initcall(xen_mmu_debugfs);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index fd12d7c..dd5e735 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -30,6 +30,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/sched.h>
+#include <linux/seq_file.h>

 #include <asm/cache.h>
 #include <asm/setup.h>
@@ -59,9 +60,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);

+static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+
 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));

+/* We might hit two boundary violations at the start and end, at max each
+ * boundary violation will require three middle nodes. */
+RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
+
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
 	BUG_ON(pfn >= MAX_P2M_PFN);
@@ -136,7 +143,7 @@ static void p2m_init(unsigned long *p2m)
  * - After resume we're called from within stop_machine, but the mfn
  *   tree should alreay be completely allocated.
  */
-void xen_build_mfn_list_list(void)
+void __ref xen_build_mfn_list_list(void)
 {
 	unsigned long pfn;

@@ -221,6 +228,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
 	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
 	p2m_top_init(p2m_top);

+	p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_init(p2m_identity);
+
 	/*
 	 * The domain builder gives us a pre-constructed p2m array in
 	 * mfn_list for all the pages initially given to us, so we just
@@ -266,6 +276,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
 	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);

+	/*
+	 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
+	 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
+	 * would be wrong.
+	 */
+	if (p2m_top[topidx][mididx] == p2m_identity)
+		return IDENTITY_FRAME(pfn);
+
 	return p2m_top[topidx][mididx][idx];
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -335,9 +353,11 @@ static bool alloc_p2m(unsigned long pfn)
 			p2m_top_mfn_p[topidx] = mid_mfn;
 	}

-	if (p2m_top[topidx][mididx] == p2m_missing) {
+	if (p2m_top[topidx][mididx] == p2m_identity ||
+	    p2m_top[topidx][mididx] == p2m_missing) {
 		/* p2m leaf page is missing */
 		unsigned long *p2m;
+		unsigned long *p2m_orig = p2m_top[topidx][mididx];

 		p2m = alloc_p2m_page();
 		if (!p2m)
@@ -345,7 +365,7 @@ static bool alloc_p2m(unsigned long pfn)

 		p2m_init(p2m);

-		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+		if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
 			free_p2m_page(p2m);
 		else
 			mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -354,11 +374,91 @@ static bool alloc_p2m(unsigned long pfn)
 	return true;
 }

+bool __early_alloc_p2m(unsigned long pfn)
+{
+	unsigned topidx, mididx, idx;
+
+	topidx = p2m_top_index(pfn);
+	mididx = p2m_mid_index(pfn);
+	idx = p2m_index(pfn);
+
+	/* Pfff.. No boundary cross-over, lets get out. */
+	if (!idx)
+		return false;
+
+	WARN(p2m_top[topidx][mididx] == p2m_identity,
+		"P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
+		topidx, mididx);
+
+	/*
+	 * Could be done by xen_build_dynamic_phys_to_machine..
+	 */
+	if (p2m_top[topidx][mididx] != p2m_missing)
+		return false;
+
+	/* Boundary cross-over for the edges: */
+	if (idx) {
+		unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+		p2m_init(p2m);
+
+		p2m_top[topidx][mididx] = p2m;
+
+	}
+	return idx != 0;
+}
+unsigned long set_phys_range_identity(unsigned long pfn_s,
+				      unsigned long pfn_e)
+{
+	unsigned long pfn;
+
+	if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
+		return 0;
+
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return pfn_e - pfn_s;
+
+	if (pfn_s > pfn_e)
+		return 0;
+
+	for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
+		pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
+		pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
+	{
+		unsigned topidx = p2m_top_index(pfn);
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+			p2m_mid_init(mid);
+
+			p2m_top[topidx] = mid;
+		}
+	}
+
+	__early_alloc_p2m(pfn_s);
+	__early_alloc_p2m(pfn_e);
+
+	for (pfn = pfn_s; pfn < pfn_e; pfn++)
+		if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
+			break;
+
+	if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+		"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
+		(pfn_e - pfn_s) - (pfn - pfn_s)))
+		printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+
+	return pfn - pfn_s;
+}
+
 /* Try to install p2m mapping; fail if intermediate bits missing */
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
 	unsigned topidx, mididx, idx;

+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return true;
+	}
 	if (unlikely(pfn >= MAX_P2M_PFN)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
 		return true;
@@ -368,6 +468,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);

+	/* For sparse holes were the p2m leaf has real PFN along with
+	 * PCI holes, stick in the PFN as the MFN value.
+	 */
+	if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+		if (p2m_top[topidx][mididx] == p2m_identity)
+			return true;
+
+		/* Swap over from MISSING to IDENTITY if needed. */
+		if (p2m_top[topidx][mididx] == p2m_missing) {
+			WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
+				p2m_identity) != p2m_missing);
+			return true;
+		}
+	}
+
 	if (p2m_top[topidx][mididx] == p2m_missing)
 		return mfn == INVALID_P2M_ENTRY;

@@ -378,11 +493,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)

 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return true;
-	}
-
 	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
 		if (!alloc_p2m(pfn))
 			return false;
@@ -417,11 +527,11 @@ static unsigned long mfn_hash(unsigned long mfn)
 }

 /* Add an MFN override for a particular page */
-int m2p_add_override(unsigned long mfn, struct page *page)
+int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
 {
 	unsigned long flags;
 	unsigned long pfn;
-	unsigned long address;
+	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;

@@ -429,7 +539,6 @@ int m2p_add_override(unsigned long mfn, struct page *page)
 	if (!PageHighMem(page)) {
 		address = (unsigned long)__va(pfn << PAGE_SHIFT);
 		ptep = lookup_address(address, &level);
-
 		if (WARN(ptep == NULL || level != PG_LEVEL_4K,
 					"m2p_add_override: pfn %lx not mapped", pfn))
 			return -EINVAL;
@@ -439,10 +548,9 @@ int m2p_add_override(unsigned long mfn, struct page *page)
 	page->index = pfn_to_mfn(pfn);

 	__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
-	if (!PageHighMem(page))
+	if (clear_pte && !PageHighMem(page))
 		/* Just zap old mapping for now */
 		pte_clear(&init_mm, address, ptep);
-
 	spin_lock_irqsave(&m2p_override_lock, flags);
 	list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]);
 	spin_unlock_irqrestore(&m2p_override_lock, flags);
@@ -450,12 +558,12 @@ int m2p_add_override(unsigned long mfn, struct page *page)
 	return 0;
 }

-int m2p_remove_override(struct page *page)
+int m2p_remove_override(struct page *page, bool clear_pte)
 {
 	unsigned long flags;
 	unsigned long mfn;
 	unsigned long pfn;
-	unsigned long address;
+	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;

@@ -478,7 +586,7 @@ int m2p_remove_override(struct page *page)
 	spin_unlock_irqrestore(&m2p_override_lock, flags);
 	__set_phys_to_machine(pfn, page->index);

-	if (!PageHighMem(page))
+	if (clear_pte && !PageHighMem(page))
 		set_pte_at(&init_mm, address, ptep,
 				pfn_pte(pfn, PAGE_KERNEL));
 		/* No tlb flush necessary because the caller already
@@ -520,3 +628,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+int p2m_dump_show(struct seq_file *m, void *v)
+{
+	static const char * const level_name[] = { "top", "middle",
+						"entry", "abnormal" };
+	static const char * const type_name[] = { "identity", "missing",
+						"pfn", "abnormal"};
+#define TYPE_IDENTITY 0
+#define TYPE_MISSING 1
+#define TYPE_PFN 2
+#define TYPE_UNKNOWN 3
+	unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
+	unsigned int uninitialized_var(prev_level);
+	unsigned int uninitialized_var(prev_type);
+
+	if (!p2m_top)
+		return 0;
+
+	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
+		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
+		unsigned idx = p2m_index(pfn);
+		unsigned lvl, type;
+
+		lvl = 4;
+		type = TYPE_UNKNOWN;
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			lvl = 0; type = TYPE_MISSING;
+		} else if (p2m_top[topidx] == NULL) {
+			lvl = 0; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == NULL) {
+			lvl = 1; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == p2m_identity) {
+			lvl = 1; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx] == p2m_missing) {
+			lvl = 1; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == 0) {
+			lvl = 2; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
+			lvl = 2; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
+			lvl = 2; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == pfn) {
+			lvl = 2; type = TYPE_PFN;
+		} else if (p2m_top[topidx][mididx][idx] != pfn) {
+			lvl = 2; type = TYPE_PFN;
+		}
+		if (pfn == 0) {
+			prev_level = lvl;
+			prev_type = type;
+		}
+		if (pfn == MAX_DOMAIN_PAGES-1) {
+			lvl = 3;
+			type = TYPE_UNKNOWN;
+		}
+		if (prev_type != type) {
+			seq_printf(m, " [0x%lx->0x%lx] %s\n",
+				prev_pfn_type, pfn, type_name[prev_type]);
+			prev_pfn_type = pfn;
+			prev_type = type;
+		}
+		if (prev_level != lvl) {
+			seq_printf(m, " [0x%lx->0x%lx] level %s\n",
+				prev_pfn_level, pfn, level_name[prev_level]);
+			prev_pfn_level = pfn;
+			prev_level = lvl;
+		}
+	}
+	return 0;
+#undef TYPE_IDENTITY
+#undef TYPE_MISSING
+#undef TYPE_PFN
+#undef TYPE_UNKNOWN
+}
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a5..edeaff2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;

 static __init void xen_add_extra_mem(unsigned long pages)
 {
+	unsigned long pfn;
+
 	u64 size = (u64)pages * PAGE_SIZE;
 	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;

@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
 	xen_extra_mem_size += size;

 	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+
+	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }

 static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
 		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
 		     start, end, ret);
 		if (ret == 1) {
-			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 			len++;
 		}
 	}
@@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
 	return released;
 }

+static unsigned long __init xen_set_identity(const struct e820entry *list,
+					     ssize_t map_size)
+{
+	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
+	phys_addr_t start_pci = last;
+	const struct e820entry *entry;
+	unsigned long identity = 0;
+	int i;
+
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		phys_addr_t start = entry->addr;
+		phys_addr_t end = start + entry->size;
+
+		if (start < last)
+			start = last;
+
+		if (end <= start)
+			continue;
+
+		/* Skip over the 1MB region. */
+		if (last > end)
+			continue;
+
+		if (entry->type == E820_RAM) {
+			if (start > start_pci)
+				identity += set_phys_range_identity(
+						PFN_UP(start_pci), PFN_DOWN(start));
+
+			/* Without saving 'last' we would gooble RAM too
+			 * at the end of the loop. */
+			last = end;
+			start_pci = end;
+			continue;
+		}
+		start_pci = min(start, start_pci);
+		last = end;
+	}
+	if (last > start_pci)
+		identity += set_phys_range_identity(
+					PFN_UP(start_pci), PFN_DOWN(last));
+	return identity;
+}
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
 	static struct e820entry map[E820MAX] __initdata;
+	static struct e820entry map_raw[E820MAX] __initdata;

 	unsigned long max_pfn = xen_start_info->nr_pages;
 	unsigned long long mem_end;
@@ -151,6 +199,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long extra_pages = 0;
 	unsigned long extra_limit;
+	unsigned long identity_pages = 0;
 	int i;
 	int op;

@@ -176,6 +225,7 @@ char * __init xen_memory_setup(void)
 	}
 	BUG_ON(rc);

+	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
 	xen_extra_mem_start = mem_end;
 	for (i = 0; i < memmap.nr_entries; i++) {
@@ -194,6 +244,14 @@ char * __init xen_memory_setup(void)
 			end -= delta;

 			extra_pages += PFN_DOWN(delta);
+			/*
+			 * Set RAM below 4GB that is not for us to be unusable.
+			 * This prevents "System RAM" address space from being
+			 * used as potential resource for I/O address (happens
+			 * when 'allocate_resource' is called).
+			 */
+			if (delta && end < 0x100000000UL)
+				e820_add_region(end, delta, E820_UNUSABLE);
 		}

 		if (map[i].size > 0 && end > xen_extra_mem_start)
@@ -251,6 +309,13 @@ char * __init xen_memory_setup(void)

 	xen_add_extra_mem(extra_pages);

+	/*
+	 * Set P2M for all non-RAM pages and E820 gaps to be identity
+	 * type PFNs. We supply it with the non-sanitized version
+	 * of the E820.
+	 */
+	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
+	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
 	return "Xen";
 }

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c79..3061244 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
 	xen_fill_possible_map();
 	xen_init_spinlocks();
 }
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+	native_smp_prepare_cpus(max_cpus);
+	WARN_ON(xen_smp_intr_init(0));
+
+	if (!xen_have_vector_callback)
+		return;
+	xen_init_lock_cpu(0);
+	xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+	int rc;
+	rc = native_cpu_up(cpu);
+	WARN_ON (xen_smp_intr_init(cpu));
+	return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+	native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+	smp_ops.cpu_up = xen_hvm_cpu_up;
+	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a..45329c8 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
 #include "xen-ops.h"
 #include "mmu.h"

-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
 {
 	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
 	xen_start_info->console.domU.mfn =
@@ -26,8 +26,9 @@ void xen_pre_suspend(void)
 		BUG();
 }

-void xen_hvm_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
+#ifdef CONFIG_XEN_PVHVM
 	int cpu;
 	xen_hvm_init_shared_info();
 	xen_callback_vector();
@@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled)
 			xen_setup_runstate_info(cpu);
 		}
 	}
+#endif
 }

-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();

diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 067759e..2e2d370 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -397,7 +397,9 @@ void xen_setup_timer(int cpu)
 		name = "<timer kasprintf failed>";

 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
+				      IRQF_DISABLED|IRQF_PERCPU|
+				      IRQF_NOBALANCING|IRQF_TIMER|
+				      IRQF_FORCE_RESUME,
 				      name, NULL);

 	evt = &per_cpu(xen_clock_events, cpu);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf9..3112f55 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);

 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);

 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
 #endif

 #ifdef CONFIG_PARAVIRT_SPINLOCKS
diff --git a/block/blk-core.c b/block/blk-core.c
index 2f4002f..77836fc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -455,6 +455,7 @@ void blk_put_queue(struct request_queue *q)
 {
 	kobject_put(&q->kobj);
 }
+EXPORT_SYMBOL_GPL(blk_put_queue);

 void blk_cleanup_queue(struct request_queue *q)
 {
@@ -662,6 +663,7 @@ int blk_get_queue(struct request_queue *q)

 	return 1;
 }
+EXPORT_SYMBOL_GPL(blk_get_queue);

 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d7aa39e..9cb8668 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -120,6 +120,10 @@ static DEFINE_SPINLOCK(minor_lock);
 #define EXTENDED (1<<EXT_SHIFT)
 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)

 #define DEV_NAME	"xvd"	/* name in /dev */

@@ -281,7 +285,7 @@ static int blkif_queue_request(struct request *req)
 	info->shadow[id].request = req;

 	ring_req->id = id;
-	ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
+	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
 	ring_req->handle = info->handle;

 	ring_req->operation = rq_data_dir(req) ?
@@ -317,7 +321,7 @@ static int blkif_queue_request(struct request *req)
 				rq_data_dir(req) );

 		info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
-		ring_req->seg[i] =
+		ring_req->u.rw.seg[i] =
 				(struct blkif_request_segment) {
 					.gref       = ref,
 					.first_sect = fsect,
@@ -434,6 +438,65 @@ static void xlvbd_flush(struct blkfront_info *info)
 	       info->feature_flush ? "enabled" : "disabled");
 }

+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+	int major;
+	major = BLKIF_MAJOR(vdevice);
+	*minor = BLKIF_MINOR(vdevice);
+	switch (major) {
+		case XEN_IDE0_MAJOR:
+			*offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = ((*minor / 64) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_IDE1_MAJOR:
+			*offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK0_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK1_MAJOR:
+		case XEN_SCSI_DISK2_MAJOR:
+		case XEN_SCSI_DISK3_MAJOR:
+		case XEN_SCSI_DISK4_MAJOR:
+		case XEN_SCSI_DISK5_MAJOR:
+		case XEN_SCSI_DISK6_MAJOR:
+		case XEN_SCSI_DISK7_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) +
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK8_MAJOR:
+		case XEN_SCSI_DISK9_MAJOR:
+		case XEN_SCSI_DISK10_MAJOR:
+		case XEN_SCSI_DISK11_MAJOR:
+		case XEN_SCSI_DISK12_MAJOR:
+		case XEN_SCSI_DISK13_MAJOR:
+		case XEN_SCSI_DISK14_MAJOR:
+		case XEN_SCSI_DISK15_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) +
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XENVBD_MAJOR:
+			*offset = *minor / PARTS_PER_DISK;
+			break;
+		default:
+			printk(KERN_WARNING "blkfront: your disk configuration is "
+					"incorrect, please use an xvd device instead\n");
+			return -ENODEV;
+	}
+	return 0;
+}

 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
@@ -441,7 +504,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
-	int err = -ENODEV;
+	int err;
 	unsigned int offset;
 	int minor;
 	int nr_parts;
@@ -456,12 +519,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	}

 	if (!VDEV_IS_EXTENDED(info->vdevice)) {
-		minor = BLKIF_MINOR(info->vdevice);
-		nr_parts = PARTS_PER_DISK;
+		err = xen_translate_vdev(info->vdevice, &minor, &offset);
+		if (err)
+			return err;
+ 		nr_parts = PARTS_PER_DISK;
 	} else {
 		minor = BLKIF_MINOR_EXT(info->vdevice);
 		nr_parts = PARTS_PER_EXT_DISK;
+		offset = minor / nr_parts;
+		if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
+			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+					"emulated IDE disks,\n\t choose an xvd device name"
+					"from xvde on\n", info->vdevice);
 	}
+	err = -ENODEV;

 	if ((minor % nr_parts) == 0)
 		nr_minors = nr_parts;
@@ -475,8 +546,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	if (gd == NULL)
 		goto release;

-	offset = minor / nr_parts;
-
 	if (nr_minors > 1) {
 		if (offset < 26)
 			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
@@ -615,7 +684,7 @@ static void blkif_completion(struct blk_shadow *s)
 {
 	int i;
 	for (i = 0; i < s->req.nr_segments; i++)
-		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+		gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
 }

 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -932,7 +1001,7 @@ static int blkif_recover(struct blkfront_info *info)
 		/* Rewrite any grant references invalidated by susp/resume. */
 		for (j = 0; j < req->nr_segments; j++)
 			gnttab_grant_foreign_access_ref(
-				req->seg[j].gref,
+				req->u.rw.seg[j].gref,
 				info->xbdev->otherend_id,
 				pfn_to_mfn(info->shadow[req->id].frame[j]),
 				rq_data_dir(info->shadow[req->id].request));
diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c
index 26347b7..3706156 100644
--- a/drivers/gpu/drm/nouveau/nouveau_mem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_mem.c
@@ -412,7 +412,8 @@ nouveau_mem_vram_init(struct drm_device *dev)
 	ret = ttm_bo_device_init(&dev_priv->ttm.bdev,
 				 dev_priv->ttm.bo_global_ref.ref.object,
 				 &nouveau_bo_driver, DRM_FILE_PAGE_OFFSET,
-				 dma_bits <= 32 ? true : false);
+				 dma_bits <= 32 ? true : false,
+				 dev->dev);
 	if (ret) {
 		NV_ERROR(dev, "Error initialising bo driver: %d\n", ret);
 		return ret;
diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
index 9a250eb..07b1151 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
@@ -12,6 +12,7 @@ struct nouveau_sgdma_be {
 	struct drm_device *dev;

 	dma_addr_t *pages;
+	bool *ttm_alloced;
 	unsigned nr_pages;

 	u64 offset;
@@ -20,7 +21,8 @@ struct nouveau_sgdma_be {

 static int
 nouveau_sgdma_populate(struct ttm_backend *be, unsigned long num_pages,
-		       struct page **pages, struct page *dummy_read_page)
+		       struct page **pages, struct page *dummy_read_page,
+		       dma_addr_t *dma_addrs)
 {
 	struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)be;
 	struct drm_device *dev = nvbe->dev;
@@ -34,15 +36,25 @@ nouveau_sgdma_populate(struct ttm_backend *be, unsigned long num_pages,
 	if (!nvbe->pages)
 		return -ENOMEM;

+	nvbe->ttm_alloced = kmalloc(sizeof(bool) * num_pages, GFP_KERNEL);
+	if (!nvbe->ttm_alloced)
+		return -ENOMEM;
+
 	nvbe->nr_pages = 0;
 	while (num_pages--) {
-		nvbe->pages[nvbe->nr_pages] =
-			pci_map_page(dev->pdev, pages[nvbe->nr_pages], 0,
+		if (dma_addrs[nvbe->nr_pages] != DMA_ERROR_CODE) {
+			nvbe->pages[nvbe->nr_pages] =
+					dma_addrs[nvbe->nr_pages];
+		 	nvbe->ttm_alloced[nvbe->nr_pages] = true;
+		} else {
+			nvbe->pages[nvbe->nr_pages] =
+				pci_map_page(dev->pdev, pages[nvbe->nr_pages], 0,
 				     PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
-		if (pci_dma_mapping_error(dev->pdev,
-					  nvbe->pages[nvbe->nr_pages])) {
-			be->func->clear(be);
-			return -EFAULT;
+			if (pci_dma_mapping_error(dev->pdev,
+						  nvbe->pages[nvbe->nr_pages])) {
+				be->func->clear(be);
+				return -EFAULT;
+			}
 		}

 		nvbe->nr_pages++;
@@ -65,11 +77,14 @@ nouveau_sgdma_clear(struct ttm_backend *be)
 			be->func->unbind(be);

 		while (nvbe->nr_pages--) {
-			pci_unmap_page(dev->pdev, nvbe->pages[nvbe->nr_pages],
+			if (!nvbe->ttm_alloced[nvbe->nr_pages])
+				pci_unmap_page(dev->pdev, nvbe->pages[nvbe->nr_pages],
 				       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
 		}
 		kfree(nvbe->pages);
+		kfree(nvbe->ttm_alloced);
 		nvbe->pages = NULL;
+		nvbe->ttm_alloced = NULL;
 		nvbe->nr_pages = 0;
 	}
 }
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index d270b3f..f643133 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -3048,9 +3048,6 @@ int evergreen_init(struct radeon_device *rdev)
 {
 	int r;

-	r = radeon_dummy_page_init(rdev);
-	if (r)
-		return r;
 	/* This don't do much */
 	r = radeon_gem_init(rdev);
 	if (r)
@@ -3162,7 +3159,6 @@ void evergreen_fini(struct radeon_device *rdev)
 	radeon_atombios_fini(rdev);
 	kfree(rdev->bios);
 	rdev->bios = NULL;
-	radeon_dummy_page_fini(rdev);
 }

 static void evergreen_pcie_gen2_enable(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index de88624..36efc45 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2509,9 +2509,6 @@ int r600_init(struct radeon_device *rdev)
 {
 	int r;

-	r = radeon_dummy_page_init(rdev);
-	if (r)
-		return r;
 	if (r600_debugfs_mc_info_init(rdev)) {
 		DRM_ERROR("Failed to register debugfs file for mc !\n");
 	}
@@ -2625,7 +2622,6 @@ void r600_fini(struct radeon_device *rdev)
 	radeon_atombios_fini(rdev);
 	kfree(rdev->bios);
 	rdev->bios = NULL;
-	radeon_dummy_page_fini(rdev);
 }


diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 56c48b6..c5955d3 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -319,6 +319,7 @@ struct radeon_gart {
 	union radeon_gart_table		table;
 	struct page			**pages;
 	dma_addr_t			*pages_addr;
+	bool				*ttm_alloced;
 	bool				ready;
 };

@@ -331,7 +332,8 @@ void radeon_gart_fini(struct radeon_device *rdev);
 void radeon_gart_unbind(struct radeon_device *rdev, unsigned offset,
 			int pages);
 int radeon_gart_bind(struct radeon_device *rdev, unsigned offset,
-		     int pages, struct page **pagelist);
+		     int pages, struct page **pagelist,
+		     dma_addr_t *dma_addr);


 /*
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index 6501611..de4a86f 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -149,8 +149,9 @@ void radeon_gart_unbind(struct radeon_device *rdev, unsigned offset,
 	p = t / (PAGE_SIZE / RADEON_GPU_PAGE_SIZE);
 	for (i = 0; i < pages; i++, p++) {
 		if (rdev->gart.pages[p]) {
-			pci_unmap_page(rdev->pdev, rdev->gart.pages_addr[p],
-				       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+			if (!rdev->gart.ttm_alloced[p])
+				pci_unmap_page(rdev->pdev, rdev->gart.pages_addr[p],
+				       		PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
 			rdev->gart.pages[p] = NULL;
 			rdev->gart.pages_addr[p] = rdev->dummy_page.addr;
 			page_base = rdev->gart.pages_addr[p];
@@ -165,7 +166,7 @@ void radeon_gart_unbind(struct radeon_device *rdev, unsigned offset,
 }

 int radeon_gart_bind(struct radeon_device *rdev, unsigned offset,
-		     int pages, struct page **pagelist)
+		     int pages, struct page **pagelist, dma_addr_t *dma_addr)
 {
 	unsigned t;
 	unsigned p;
@@ -180,15 +181,22 @@ int radeon_gart_bind(struct radeon_device *rdev, unsigned offset,
 	p = t / (PAGE_SIZE / RADEON_GPU_PAGE_SIZE);

 	for (i = 0; i < pages; i++, p++) {
-		/* we need to support large memory configurations */
-		/* assume that unbind have already been call on the range */
-		rdev->gart.pages_addr[p] = pci_map_page(rdev->pdev, pagelist[i],
+		/* On TTM path, we only use the DMA API if TTM_PAGE_FLAG_DMA32
+		 * is requested. */
+		if (dma_addr[i] != DMA_ERROR_CODE) {
+			rdev->gart.ttm_alloced[p] = true;
+			rdev->gart.pages_addr[p] = dma_addr[i];
+		} else {
+			/* we need to support large memory configurations */
+			/* assume that unbind have already been call on the range */
+			rdev->gart.pages_addr[p] = pci_map_page(rdev->pdev, pagelist[i],
 							0, PAGE_SIZE,
 							PCI_DMA_BIDIRECTIONAL);
-		if (pci_dma_mapping_error(rdev->pdev, rdev->gart.pages_addr[p])) {
-			/* FIXME: failed to map page (return -ENOMEM?) */
-			radeon_gart_unbind(rdev, offset, pages);
-			return -ENOMEM;
+			if (pci_dma_mapping_error(rdev->pdev, rdev->gart.pages_addr[p])) {
+				/* FIXME: failed to map page (return -ENOMEM?) */
+				radeon_gart_unbind(rdev, offset, pages);
+				return -ENOMEM;
+			}
 		}
 		rdev->gart.pages[p] = pagelist[i];
 		page_base = rdev->gart.pages_addr[p];
@@ -251,6 +259,12 @@ int radeon_gart_init(struct radeon_device *rdev)
 		radeon_gart_fini(rdev);
 		return -ENOMEM;
 	}
+	rdev->gart.ttm_alloced = kzalloc(sizeof(bool) *
+					 rdev->gart.num_cpu_pages, GFP_KERNEL);
+	if (rdev->gart.ttm_alloced == NULL) {
+		radeon_gart_fini(rdev);
+		return -ENOMEM;
+	}
 	/* set GART entry to point to the dummy page by default */
 	for (i = 0; i < rdev->gart.num_cpu_pages; i++) {
 		rdev->gart.pages_addr[i] = rdev->dummy_page.addr;
@@ -267,6 +281,9 @@ void radeon_gart_fini(struct radeon_device *rdev)
 	rdev->gart.ready = false;
 	kfree(rdev->gart.pages);
 	kfree(rdev->gart.pages_addr);
+	kfree(rdev->gart.ttm_alloced);
 	rdev->gart.pages = NULL;
 	rdev->gart.pages_addr = NULL;
+	rdev->gart.ttm_alloced = NULL;
+	radeon_dummy_page_fini(rdev);
 }
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index e5b2cf1..371890c 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -517,7 +517,8 @@ int radeon_ttm_init(struct radeon_device *rdev)
 	r = ttm_bo_device_init(&rdev->mman.bdev,
 			       rdev->mman.bo_global_ref.ref.object,
 			       &radeon_bo_driver, DRM_FILE_PAGE_OFFSET,
-			       rdev->need_dma32);
+			       rdev->need_dma32,
+			       rdev->dev);
 	if (r) {
 		DRM_ERROR("failed initializing buffer object driver(%d).\n", r);
 		return r;
@@ -647,6 +648,7 @@ struct radeon_ttm_backend {
 	unsigned long			num_pages;
 	struct page			**pages;
 	struct page			*dummy_read_page;
+	dma_addr_t			*dma_addrs;
 	bool				populated;
 	bool				bound;
 	unsigned			offset;
@@ -655,12 +657,14 @@ struct radeon_ttm_backend {
 static int radeon_ttm_backend_populate(struct ttm_backend *backend,
 				       unsigned long num_pages,
 				       struct page **pages,
-				       struct page *dummy_read_page)
+				       struct page *dummy_read_page,
+				       dma_addr_t *dma_addrs)
 {
 	struct radeon_ttm_backend *gtt;

 	gtt = container_of(backend, struct radeon_ttm_backend, backend);
 	gtt->pages = pages;
+	gtt->dma_addrs = dma_addrs;
 	gtt->num_pages = num_pages;
 	gtt->dummy_read_page = dummy_read_page;
 	gtt->populated = true;
@@ -673,6 +677,7 @@ static void radeon_ttm_backend_clear(struct ttm_backend *backend)

 	gtt = container_of(backend, struct radeon_ttm_backend, backend);
 	gtt->pages = NULL;
+	gtt->dma_addrs = NULL;
 	gtt->num_pages = 0;
 	gtt->dummy_read_page = NULL;
 	gtt->populated = false;
@@ -693,7 +698,7 @@ static int radeon_ttm_backend_bind(struct ttm_backend *backend,
 		     gtt->num_pages, bo_mem, backend);
 	}
 	r = radeon_gart_bind(gtt->rdev, gtt->offset,
-			     gtt->num_pages, gtt->pages);
+			     gtt->num_pages, gtt->pages, gtt->dma_addrs);
 	if (r) {
 		DRM_ERROR("failed to bind %lu pages at 0x%08X\n",
 			  gtt->num_pages, gtt->offset);
diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c
index d8ba676..6a312e6 100644
--- a/drivers/gpu/drm/radeon/rv770.c
+++ b/drivers/gpu/drm/radeon/rv770.c
@@ -1256,9 +1256,6 @@ int rv770_init(struct radeon_device *rdev)
 {
 	int r;

-	r = radeon_dummy_page_init(rdev);
-	if (r)
-		return r;
 	/* This don't do much */
 	r = radeon_gem_init(rdev);
 	if (r)
@@ -1373,7 +1370,6 @@ void rv770_fini(struct radeon_device *rdev)
 	radeon_atombios_fini(rdev);
 	kfree(rdev->bios);
 	rdev->bios = NULL;
-	radeon_dummy_page_fini(rdev);
 }

 static void rv770_pcie_gen2_enable(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c b/drivers/gpu/drm/ttm/ttm_agp_backend.c
index f999e36..1c4a72f 100644
--- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
+++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
@@ -47,7 +47,8 @@ struct ttm_agp_backend {

 static int ttm_agp_populate(struct ttm_backend *backend,
 			    unsigned long num_pages, struct page **pages,
-			    struct page *dummy_read_page)
+			    struct page *dummy_read_page,
+			    dma_addr_t *dma_addrs)
 {
 	struct ttm_agp_backend *agp_be =
 	    container_of(backend, struct ttm_agp_backend, backend);
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index af61fc2..278a2d3 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1526,12 +1526,14 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
 		       struct ttm_bo_global *glob,
 		       struct ttm_bo_driver *driver,
 		       uint64_t file_page_offset,
-		       bool need_dma32)
+		       bool need_dma32,
+		       struct device *dev)
 {
 	int ret = -EINVAL;

 	rwlock_init(&bdev->vm_lock);
 	bdev->driver = driver;
+	bdev->dev = dev;

 	memset(bdev->man, 0, sizeof(bdev->man));

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index b1e02ff..35849db 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -38,6 +38,7 @@
 #include <linux/mm.h>
 #include <linux/seq_file.h> /* for seq_printf */
 #include <linux/slab.h>
+#include <linux/dma-mapping.h>

 #include <asm/atomic.h>

@@ -662,7 +663,8 @@ out:
  * cached pages.
  */
 int ttm_get_pages(struct list_head *pages, int flags,
-		enum ttm_caching_state cstate, unsigned count)
+		  enum ttm_caching_state cstate, unsigned count,
+		  dma_addr_t *dma_address, struct device *dev)
 {
 	struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
 	struct page *p = NULL;
@@ -681,14 +683,22 @@ int ttm_get_pages(struct list_head *pages, int flags,
 			gfp_flags |= GFP_HIGHUSER;

 		for (r = 0; r < count; ++r) {
-			p = alloc_page(gfp_flags);
+			if ((flags & TTM_PAGE_FLAG_DMA32) && dma_address) {
+				void *addr;
+				addr = dma_alloc_coherent(dev, PAGE_SIZE,
+							  &dma_address[r],
+							  gfp_flags);
+				if (addr == NULL)
+					return -ENOMEM;
+				p = virt_to_page(addr);
+			} else
+				p = alloc_page(gfp_flags);
 			if (!p) {

 				printk(KERN_ERR TTM_PFX
 				       "Unable to allocate page.");
 				return -ENOMEM;
 			}
-
 			list_add(&p->lru, pages);
 		}
 		return 0;
@@ -720,7 +730,7 @@ int ttm_get_pages(struct list_head *pages, int flags,
 			printk(KERN_ERR TTM_PFX
 			       "Failed to allocate extra pages "
 			       "for large request.");
-			ttm_put_pages(pages, 0, flags, cstate);
+			ttm_put_pages(pages, 0, flags, cstate, NULL, NULL);
 			return r;
 		}
 	}
@@ -731,17 +741,30 @@ int ttm_get_pages(struct list_head *pages, int flags,

 /* Put all pages in pages list to correct pool to wait for reuse */
 void ttm_put_pages(struct list_head *pages, unsigned page_count, int flags,
-		enum ttm_caching_state cstate)
+		   enum ttm_caching_state cstate, dma_addr_t *dma_address,
+		   struct device *dev)
 {
 	unsigned long irq_flags;
 	struct ttm_page_pool *pool = ttm_get_pool(flags, cstate);
 	struct page *p, *tmp;
+	unsigned r;

 	if (pool == NULL) {
 		/* No pool for this memory type so free the pages */

+		r = page_count-1;
 		list_for_each_entry_safe(p, tmp, pages, lru) {
-			__free_page(p);
+			if ((flags & TTM_PAGE_FLAG_DMA32) && dma_address) {
+				void *addr = page_address(p);
+				WARN_ON(!addr || !dma_address[r]);
+				if (addr)
+					dma_free_coherent(dev, PAGE_SIZE,
+							  addr,
+							  dma_address[r]);
+				dma_address[r] = 0;
+			} else
+				__free_page(p);
+			r--;
 		}
 		/* Make the pages list empty */
 		INIT_LIST_HEAD(pages);
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index af789dc..354f9d9 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -49,12 +49,16 @@ static int ttm_tt_swapin(struct ttm_tt *ttm);
 static void ttm_tt_alloc_page_directory(struct ttm_tt *ttm)
 {
 	ttm->pages = drm_calloc_large(ttm->num_pages, sizeof(*ttm->pages));
+	ttm->dma_address = drm_calloc_large(ttm->num_pages,
+					    sizeof(*ttm->dma_address));
 }

 static void ttm_tt_free_page_directory(struct ttm_tt *ttm)
 {
 	drm_free_large(ttm->pages);
 	ttm->pages = NULL;
+	drm_free_large(ttm->dma_address);
+	ttm->dma_address = NULL;
 }

 static void ttm_tt_free_user_pages(struct ttm_tt *ttm)
@@ -105,7 +109,8 @@ static struct page *__ttm_tt_get_page(struct ttm_tt *ttm, int index)

 		INIT_LIST_HEAD(&h);

-		ret = ttm_get_pages(&h, ttm->page_flags, ttm->caching_state, 1);
+		ret = ttm_get_pages(&h, ttm->page_flags, ttm->caching_state, 1,
+				    &ttm->dma_address[index], ttm->dev);

 		if (ret != 0)
 			return NULL;
@@ -164,7 +169,7 @@ int ttm_tt_populate(struct ttm_tt *ttm)
 	}

 	be->func->populate(be, ttm->num_pages, ttm->pages,
-			   ttm->dummy_read_page);
+			   ttm->dummy_read_page, ttm->dma_address);
 	ttm->state = tt_unbound;
 	return 0;
 }
@@ -298,7 +303,8 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm)
 			count++;
 		}
 	}
-	ttm_put_pages(&h, count, ttm->page_flags, ttm->caching_state);
+	ttm_put_pages(&h, count, ttm->page_flags, ttm->caching_state,
+		      ttm->dma_address, ttm->dev);
 	ttm->state = tt_unpopulated;
 	ttm->first_himem_page = ttm->num_pages;
 	ttm->last_lomem_page = -1;
@@ -391,6 +397,7 @@ struct ttm_tt *ttm_tt_create(struct ttm_bo_device *bdev, unsigned long size,
 	ttm->last_lomem_page = -1;
 	ttm->caching_state = tt_cached;
 	ttm->page_flags = page_flags;
+	ttm->dev = bdev->dev;

 	ttm->dummy_read_page = dummy_read_page;

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c
index 80bc37b..87e43e0 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c
@@ -102,7 +102,8 @@ struct vmw_ttm_backend {

 static int vmw_ttm_populate(struct ttm_backend *backend,
 			    unsigned long num_pages, struct page **pages,
-			    struct page *dummy_read_page)
+			    struct page *dummy_read_page,
+			    dma_addr_t *dma_addrs)
 {
 	struct vmw_ttm_backend *vmw_be =
 	    container_of(backend, struct vmw_ttm_backend, backend);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 10ca97e..803d979 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -322,11 +322,11 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
 	ttm_lock_set_kill(&dev_priv->fbdev_master.lock, false, SIGTERM);
 	dev_priv->active_master = &dev_priv->fbdev_master;

-
 	ret = ttm_bo_device_init(&dev_priv->bdev,
 				 dev_priv->bo_global_ref.ref.object,
 				 &vmw_bo_driver, VMWGFX_FILE_PAGE_OFFSET,
-				 false);
+				 false,
+				 dev->dev);
 	if (unlikely(ret != 0)) {
 		DRM_ERROR("Failed initializing TTM buffer object driver.\n");
 		goto out_err1;
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0382332..1826d5d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2966,12 +2966,38 @@ config XEN_NETDEV_FRONTEND
 	select XEN_XENBUS_FRONTEND
 	default y
 	help
-	  The network device frontend driver allows the kernel to
-	  access network devices exported exported by a virtual
-	  machine containing a physical network device driver. The
-	  frontend driver is intended for unprivileged guest domains;
-	  if you are compiling a kernel for a Xen guest, you almost
-	  certainly want to enable this.
+	  This driver provides support for Xen paravirtual network
+	  devices exported by a Xen network driver domain (often
+	  domain 0).
+
+	  The corresponding Linux backend driver is enabled by the
+	  CONFIG_XEN_NETDEV_BACKEND option.
+
+	  If you are compiling a kernel for use as Xen guest, you
+	  should say Y here. To compile this driver as a module, chose
+	  M here: the module will be called xen-netfront.
+
+config XEN_NETDEV_BACKEND
+	tristate "Xen backend network device"
+	depends on XEN_BACKEND
+	help
+	  This driver allows the kernel to act as a Xen network driver
+	  domain which exports paravirtual network devices to other
+	  Xen domains. These devices can be accessed by any operating
+	  system that implements a compatible front end.
+
+	  The corresponding Linux frontend driver is enabled by the
+	  CONFIG_XEN_NETDEV_FRONTEND configuration option.
+
+	  The backend driver presents a standard network device
+	  endpoint for each paravirtual network device to the driver
+	  domain network stack. These can then be bridged or routed
+	  etc in order to provide full network connectivity.
+
+	  If you are compiling a kernel to run in a Xen network driver
+	  domain (often this is domain 0) you should say Y here. To
+	  compile this driver as a module, chose M here: the module
+	  will be called xen-netback.

 config ISERIES_VETH
 	tristate "iSeries Virtual Ethernet driver support"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index b90738d..145dfd7 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -171,6 +171,7 @@ obj-$(CONFIG_SLIP) += slip.o
 obj-$(CONFIG_SLHC) += slhc.o

 obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
+obj-$(CONFIG_XEN_NETDEV_BACKEND) += xen-netback/

 obj-$(CONFIG_DUMMY) += dummy.o
 obj-$(CONFIG_IFB) += ifb.o
diff --git a/drivers/net/xen-netback/Makefile b/drivers/net/xen-netback/Makefile
new file mode 100644
index 0000000..e346e81
--- /dev/null
+++ b/drivers/net/xen-netback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
+
+xen-netback-y := netback.o xenbus.o interface.o
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
new file mode 100644
index 0000000..21f4c0c
--- /dev/null
+++ b/drivers/net/xen-netback/common.h
@@ -0,0 +1,162 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_NETBACK__COMMON_H__
+#define __XEN_NETBACK__COMMON_H__
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/io.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include <xen/interface/io/netif.h>
+#include <xen/interface/grant_table.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+
+struct xen_netbk;
+
+struct xenvif {
+	/* Unique identifier for this interface. */
+	domid_t          domid;
+	unsigned int     handle;
+
+	/* Reference to netback processing backend. */
+	struct xen_netbk *netbk;
+
+	u8               fe_dev_addr[6];
+
+	/* Physical parameters of the comms window. */
+	grant_handle_t   tx_shmem_handle;
+	grant_ref_t      tx_shmem_ref;
+	grant_handle_t   rx_shmem_handle;
+	grant_ref_t      rx_shmem_ref;
+	unsigned int     irq;
+
+	/* List of frontends to notify after a batch of frames sent. */
+	struct list_head notify_list;
+
+	/* The shared rings and indexes. */
+	struct xen_netif_tx_back_ring tx;
+	struct xen_netif_rx_back_ring rx;
+	struct vm_struct *tx_comms_area;
+	struct vm_struct *rx_comms_area;
+
+	/* Flags that must not be set in dev->features */
+	int features_disabled;
+
+	/* Frontend feature information. */
+	u8 can_sg:1;
+	u8 gso:1;
+	u8 gso_prefix:1;
+	u8 csum:1;
+
+	/* Internal feature information. */
+	u8 can_queue:1;	    /* can queue packets for receiver? */
+
+	/*
+	 * Allow xenvif_start_xmit() to peek ahead in the rx request
+	 * ring.  This is a prediction of what rx_req_cons will be
+	 * once all queued skbs are put on the ring.
+	 */
+	RING_IDX rx_req_cons_peek;
+
+	/* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+	unsigned long   credit_bytes;
+	unsigned long   credit_usec;
+	unsigned long   remaining_credit;
+	struct timer_list credit_timeout;
+
+	/* Statistics */
+	int rx_gso_checksum_fixup;
+
+	/* Miscellaneous private stuff. */
+	struct list_head schedule_list;
+	atomic_t         refcnt;
+	struct net_device *dev;
+	struct net_device_stats stats;
+
+	wait_queue_head_t waiting_to_free;
+};
+
+#define XEN_NETIF_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
+#define XEN_NETIF_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
+
+struct xenvif *xenvif_alloc(struct device *parent,
+			    domid_t domid,
+			    unsigned int handle);
+
+int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
+		   unsigned long rx_ring_ref, unsigned int evtchn);
+void xenvif_disconnect(struct xenvif *vif);
+
+void xenvif_get(struct xenvif *vif);
+void xenvif_put(struct xenvif *vif);
+
+int xenvif_xenbus_init(void);
+
+int xenvif_schedulable(struct xenvif *vif);
+
+int xen_netbk_rx_ring_full(struct xenvif *vif);
+
+int xen_netbk_must_stop_queue(struct xenvif *vif);
+
+/* (Un)Map communication rings. */
+void xen_netbk_unmap_frontend_rings(struct xenvif *vif);
+int xen_netbk_map_frontend_rings(struct xenvif *vif,
+				 grant_ref_t tx_ring_ref,
+				 grant_ref_t rx_ring_ref);
+
+/* (De)Register a xenvif with the netback backend. */
+void xen_netbk_add_xenvif(struct xenvif *vif);
+void xen_netbk_remove_xenvif(struct xenvif *vif);
+
+/* (De)Schedule backend processing for a xenvif */
+void xen_netbk_schedule_xenvif(struct xenvif *vif);
+void xen_netbk_deschedule_xenvif(struct xenvif *vif);
+
+/* Check for SKBs from frontend and schedule backend processing */
+void xen_netbk_check_rx_xenvif(struct xenvif *vif);
+/* Receive an SKB from the frontend */
+void xenvif_receive_skb(struct xenvif *vif, struct sk_buff *skb);
+
+/* Queue an SKB for transmission to the frontend */
+void xen_netbk_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb);
+/* Notify xenvif that ring now has space to send an skb to the frontend */
+void xenvif_notify_tx_completion(struct xenvif *vif);
+
+/* Returns number of ring slots required to send an skb to the frontend */
+unsigned int xen_netbk_count_skb_slots(struct xenvif *vif, struct sk_buff *skb);
+
+#endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
new file mode 100644
index 0000000..1614ba5
--- /dev/null
+++ b/drivers/net/xen-netback/interface.c
@@ -0,0 +1,424 @@
+/*
+ * Network-device interface management.
+ *
+ * Copyright (c) 2004-2005, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+
+#include <xen/events.h>
+#include <asm/xen/hypercall.h>
+
+#define XENVIF_QUEUE_LENGTH 32
+
+void xenvif_get(struct xenvif *vif)
+{
+	atomic_inc(&vif->refcnt);
+}
+
+void xenvif_put(struct xenvif *vif)
+{
+	if (atomic_dec_and_test(&vif->refcnt))
+		wake_up(&vif->waiting_to_free);
+}
+
+int xenvif_schedulable(struct xenvif *vif)
+{
+	return netif_running(vif->dev) && netif_carrier_ok(vif->dev);
+}
+
+static int xenvif_rx_schedulable(struct xenvif *vif)
+{
+	return xenvif_schedulable(vif) && !xen_netbk_rx_ring_full(vif);
+}
+
+static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
+{
+	struct xenvif *vif = dev_id;
+
+	if (vif->netbk == NULL)
+		return IRQ_NONE;
+
+	xen_netbk_schedule_xenvif(vif);
+
+	if (xenvif_rx_schedulable(vif))
+		netif_wake_queue(vif->dev);
+
+	return IRQ_HANDLED;
+}
+
+static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct xenvif *vif = netdev_priv(dev);
+
+	BUG_ON(skb->dev != dev);
+
+	if (vif->netbk == NULL)
+		goto drop;
+
+	/* Drop the packet if the target domain has no receive buffers. */
+	if (!xenvif_rx_schedulable(vif))
+		goto drop;
+
+	/* Reserve ring slots for the worst-case number of fragments. */
+	vif->rx_req_cons_peek += xen_netbk_count_skb_slots(vif, skb);
+	xenvif_get(vif);
+
+	if (vif->can_queue && xen_netbk_must_stop_queue(vif))
+		netif_stop_queue(dev);
+
+	xen_netbk_queue_tx_skb(vif, skb);
+
+	return NETDEV_TX_OK;
+
+ drop:
+	vif->stats.tx_dropped++;
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+void xenvif_receive_skb(struct xenvif *vif, struct sk_buff *skb)
+{
+	netif_rx_ni(skb);
+	vif->dev->last_rx = jiffies;
+}
+
+void xenvif_notify_tx_completion(struct xenvif *vif)
+{
+	if (netif_queue_stopped(vif->dev) && xenvif_rx_schedulable(vif))
+		netif_wake_queue(vif->dev);
+}
+
+static struct net_device_stats *xenvif_get_stats(struct net_device *dev)
+{
+	struct xenvif *vif = netdev_priv(dev);
+	return &vif->stats;
+}
+
+static void xenvif_up(struct xenvif *vif)
+{
+	xen_netbk_add_xenvif(vif);
+	enable_irq(vif->irq);
+	xen_netbk_check_rx_xenvif(vif);
+}
+
+static void xenvif_down(struct xenvif *vif)
+{
+	disable_irq(vif->irq);
+	xen_netbk_deschedule_xenvif(vif);
+	xen_netbk_remove_xenvif(vif);
+}
+
+static int xenvif_open(struct net_device *dev)
+{
+	struct xenvif *vif = netdev_priv(dev);
+	if (netif_carrier_ok(dev))
+		xenvif_up(vif);
+	netif_start_queue(dev);
+	return 0;
+}
+
+static int xenvif_close(struct net_device *dev)
+{
+	struct xenvif *vif = netdev_priv(dev);
+	if (netif_carrier_ok(dev))
+		xenvif_down(vif);
+	netif_stop_queue(dev);
+	return 0;
+}
+
+static int xenvif_change_mtu(struct net_device *dev, int mtu)
+{
+	struct xenvif *vif = netdev_priv(dev);
+	int max = vif->can_sg ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+	if (mtu > max)
+		return -EINVAL;
+	dev->mtu = mtu;
+	return 0;
+}
+
+static void xenvif_set_features(struct xenvif *vif)
+{
+	struct net_device *dev = vif->dev;
+	int features = dev->features;
+
+	if (vif->can_sg)
+		features |= NETIF_F_SG;
+	if (vif->gso || vif->gso_prefix)
+		features |= NETIF_F_TSO;
+	if (vif->csum)
+		features |= NETIF_F_IP_CSUM;
+
+	features &= ~(vif->features_disabled);
+
+	if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN)
+		dev->mtu = ETH_DATA_LEN;
+
+	dev->features = features;
+}
+
+static int xenvif_set_tx_csum(struct net_device *dev, u32 data)
+{
+	struct xenvif *vif = netdev_priv(dev);
+	if (data) {
+		if (!vif->csum)
+			return -EOPNOTSUPP;
+		vif->features_disabled &= ~NETIF_F_IP_CSUM;
+	} else {
+		vif->features_disabled |= NETIF_F_IP_CSUM;
+	}
+
+	xenvif_set_features(vif);
+	return 0;
+}
+
+static int xenvif_set_sg(struct net_device *dev, u32 data)
+{
+	struct xenvif *vif = netdev_priv(dev);
+	if (data) {
+		if (!vif->can_sg)
+			return -EOPNOTSUPP;
+		vif->features_disabled &= ~NETIF_F_SG;
+	} else {
+		vif->features_disabled |= NETIF_F_SG;
+	}
+
+	xenvif_set_features(vif);
+	return 0;
+}
+
+static int xenvif_set_tso(struct net_device *dev, u32 data)
+{
+	struct xenvif *vif = netdev_priv(dev);
+	if (data) {
+		if (!vif->gso && !vif->gso_prefix)
+			return -EOPNOTSUPP;
+		vif->features_disabled &= ~NETIF_F_TSO;
+	} else {
+		vif->features_disabled |= NETIF_F_TSO;
+	}
+
+	xenvif_set_features(vif);
+	return 0;
+}
+
+static const struct xenvif_stat {
+	char name[ETH_GSTRING_LEN];
+	u16 offset;
+} xenvif_stats[] = {
+	{
+		"rx_gso_checksum_fixup",
+		offsetof(struct xenvif, rx_gso_checksum_fixup)
+	},
+};
+
+static int xenvif_get_sset_count(struct net_device *dev, int string_set)
+{
+	switch (string_set) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(xenvif_stats);
+	default:
+		return -EINVAL;
+	}
+}
+
+static void xenvif_get_ethtool_stats(struct net_device *dev,
+				     struct ethtool_stats *stats, u64 * data)
+{
+	void *vif = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(xenvif_stats); i++)
+		data[i] = *(int *)(vif + xenvif_stats[i].offset);
+}
+
+static void xenvif_get_strings(struct net_device *dev, u32 stringset, u8 * data)
+{
+	int i;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < ARRAY_SIZE(xenvif_stats); i++)
+			memcpy(data + i * ETH_GSTRING_LEN,
+			       xenvif_stats[i].name, ETH_GSTRING_LEN);
+		break;
+	}
+}
+
+static struct ethtool_ops xenvif_ethtool_ops = {
+	.get_tx_csum	= ethtool_op_get_tx_csum,
+	.set_tx_csum	= xenvif_set_tx_csum,
+	.get_sg		= ethtool_op_get_sg,
+	.set_sg		= xenvif_set_sg,
+	.get_tso	= ethtool_op_get_tso,
+	.set_tso	= xenvif_set_tso,
+	.get_link	= ethtool_op_get_link,
+
+	.get_sset_count = xenvif_get_sset_count,
+	.get_ethtool_stats = xenvif_get_ethtool_stats,
+	.get_strings = xenvif_get_strings,
+};
+
+static struct net_device_ops xenvif_netdev_ops = {
+	.ndo_start_xmit	= xenvif_start_xmit,
+	.ndo_get_stats	= xenvif_get_stats,
+	.ndo_open	= xenvif_open,
+	.ndo_stop	= xenvif_close,
+	.ndo_change_mtu	= xenvif_change_mtu,
+};
+
+struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
+			    unsigned int handle)
+{
+	int err;
+	struct net_device *dev;
+	struct xenvif *vif;
+	char name[IFNAMSIZ] = {};
+
+	snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+	dev = alloc_netdev(sizeof(struct xenvif), name, ether_setup);
+	if (dev == NULL) {
+		pr_warn("Could not allocate netdev\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	SET_NETDEV_DEV(dev, parent);
+
+	vif = netdev_priv(dev);
+	vif->domid  = domid;
+	vif->handle = handle;
+	vif->netbk  = NULL;
+	vif->can_sg = 1;
+	vif->csum = 1;
+	atomic_set(&vif->refcnt, 1);
+	init_waitqueue_head(&vif->waiting_to_free);
+	vif->dev = dev;
+	INIT_LIST_HEAD(&vif->schedule_list);
+	INIT_LIST_HEAD(&vif->notify_list);
+
+	vif->credit_bytes = vif->remaining_credit = ~0UL;
+	vif->credit_usec  = 0UL;
+	init_timer(&vif->credit_timeout);
+	/* Initialize 'expires' now: it's used to track the credit window. */
+	vif->credit_timeout.expires = jiffies;
+
+	dev->netdev_ops	= &xenvif_netdev_ops;
+	xenvif_set_features(vif);
+	SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops);
+
+	dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
+
+	/*
+	 * Initialise a dummy MAC address. We choose the numerically
+	 * largest non-broadcast address to prevent the address getting
+	 * stolen by an Ethernet bridge for STP purposes.
+	 * (FE:FF:FF:FF:FF:FF)
+	 */
+	memset(dev->dev_addr, 0xFF, ETH_ALEN);
+	dev->dev_addr[0] &= ~0x01;
+
+	netif_carrier_off(dev);
+
+	err = register_netdev(dev);
+	if (err) {
+		netdev_warn(dev, "Could not register device: err=%d\n", err);
+		free_netdev(dev);
+		return ERR_PTR(err);
+	}
+
+	netdev_dbg(dev, "Successfully created xenvif\n");
+	return vif;
+}
+
+int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
+		   unsigned long rx_ring_ref, unsigned int evtchn)
+{
+	int err = -ENOMEM;
+
+	/* Already connected through? */
+	if (vif->irq)
+		return 0;
+
+	xenvif_set_features(vif);
+
+	err = xen_netbk_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
+	if (err < 0)
+		goto err;
+
+	err = bind_interdomain_evtchn_to_irqhandler(
+		vif->domid, evtchn, xenvif_interrupt, 0,
+		vif->dev->name, vif);
+	if (err < 0)
+		goto err_unmap;
+	vif->irq = err;
+	disable_irq(vif->irq);
+
+	xenvif_get(vif);
+
+	rtnl_lock();
+	netif_carrier_on(vif->dev);
+	if (netif_running(vif->dev))
+		xenvif_up(vif);
+	rtnl_unlock();
+
+	return 0;
+err_unmap:
+	xen_netbk_unmap_frontend_rings(vif);
+err:
+	return err;
+}
+
+void xenvif_disconnect(struct xenvif *vif)
+{
+	struct net_device *dev = vif->dev;
+	if (netif_carrier_ok(dev)) {
+		rtnl_lock();
+		netif_carrier_off(dev); /* discard queued packets */
+		if (netif_running(dev))
+			xenvif_down(vif);
+		rtnl_unlock();
+		xenvif_put(vif);
+	}
+
+	atomic_dec(&vif->refcnt);
+	wait_event(vif->waiting_to_free, atomic_read(&vif->refcnt) == 0);
+
+	del_timer_sync(&vif->credit_timeout);
+
+	if (vif->irq)
+		unbind_from_irqhandler(vif->irq, vif);
+
+	unregister_netdev(vif->dev);
+
+	xen_netbk_unmap_frontend_rings(vif);
+
+	free_netdev(vif->dev);
+}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
new file mode 100644
index 0000000..c2669b8
--- /dev/null
+++ b/drivers/net/xen-netback/netback.c
@@ -0,0 +1,1745 @@
+/*
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ *  drivers/net/xen-netfront.c
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#include <linux/kthread.h>
+#include <linux/if_vlan.h>
+#include <linux/udp.h>
+
+#include <net/tcp.h>
+
+#include <xen/events.h>
+#include <xen/interface/memory.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+struct pending_tx_info {
+	struct xen_netif_tx_request req;
+	struct xenvif *vif;
+};
+typedef unsigned int pending_ring_idx_t;
+
+struct netbk_rx_meta {
+	int id;
+	int size;
+	int gso_size;
+};
+
+#define MAX_PENDING_REQS 256
+
+#define MAX_BUFFER_OFFSET PAGE_SIZE
+
+/* extra field used in struct page */
+union page_ext {
+	struct {
+#if BITS_PER_LONG < 64
+#define IDX_WIDTH   8
+#define GROUP_WIDTH (BITS_PER_LONG - IDX_WIDTH)
+		unsigned int group:GROUP_WIDTH;
+		unsigned int idx:IDX_WIDTH;
+#else
+		unsigned int group, idx;
+#endif
+	} e;
+	void *mapping;
+};
+
+struct xen_netbk {
+	wait_queue_head_t wq;
+	struct task_struct *task;
+
+	struct sk_buff_head rx_queue;
+	struct sk_buff_head tx_queue;
+
+	struct timer_list net_timer;
+
+	struct page *mmap_pages[MAX_PENDING_REQS];
+
+	pending_ring_idx_t pending_prod;
+	pending_ring_idx_t pending_cons;
+	struct list_head net_schedule_list;
+
+	/* Protect the net_schedule_list in netif. */
+	spinlock_t net_schedule_list_lock;
+
+	atomic_t netfront_count;
+
+	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
+
+	u16 pending_ring[MAX_PENDING_REQS];
+
+	/*
+	 * Given MAX_BUFFER_OFFSET of 4096 the worst case is that each
+	 * head/fragment page uses 2 copy operations because it
+	 * straddles two buffers in the frontend.
+	 */
+	struct gnttab_copy grant_copy_op[2*XEN_NETIF_RX_RING_SIZE];
+	struct netbk_rx_meta meta[2*XEN_NETIF_RX_RING_SIZE];
+};
+
+static struct xen_netbk *xen_netbk;
+static int xen_netbk_group_nr;
+
+void xen_netbk_add_xenvif(struct xenvif *vif)
+{
+	int i;
+	int min_netfront_count;
+	int min_group = 0;
+	struct xen_netbk *netbk;
+
+	min_netfront_count = atomic_read(&xen_netbk[0].netfront_count);
+	for (i = 0; i < xen_netbk_group_nr; i++) {
+		int netfront_count = atomic_read(&xen_netbk[i].netfront_count);
+		if (netfront_count < min_netfront_count) {
+			min_group = i;
+			min_netfront_count = netfront_count;
+		}
+	}
+
+	netbk = &xen_netbk[min_group];
+
+	vif->netbk = netbk;
+	atomic_inc(&netbk->netfront_count);
+}
+
+void xen_netbk_remove_xenvif(struct xenvif *vif)
+{
+	struct xen_netbk *netbk = vif->netbk;
+	vif->netbk = NULL;
+	atomic_dec(&netbk->netfront_count);
+}
+
+static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx);
+static void make_tx_response(struct xenvif *vif,
+			     struct xen_netif_tx_request *txp,
+			     s8       st);
+static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
+					     u16      id,
+					     s8       st,
+					     u16      offset,
+					     u16      size,
+					     u16      flags);
+
+static inline unsigned long idx_to_pfn(struct xen_netbk *netbk,
+				       unsigned int idx)
+{
+	return page_to_pfn(netbk->mmap_pages[idx]);
+}
+
+static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk,
+					 unsigned int idx)
+{
+	return (unsigned long)pfn_to_kaddr(idx_to_pfn(netbk, idx));
+}
+
+/* extra field used in struct page */
+static inline void set_page_ext(struct page *pg, struct xen_netbk *netbk,
+				unsigned int idx)
+{
+	unsigned int group = netbk - xen_netbk;
+	union page_ext ext = { .e = { .group = group + 1, .idx = idx } };
+
+	BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping));
+	pg->mapping = ext.mapping;
+}
+
+static int get_page_ext(struct page *pg,
+			unsigned int *pgroup, unsigned int *pidx)
+{
+	union page_ext ext = { .mapping = pg->mapping };
+	struct xen_netbk *netbk;
+	unsigned int group, idx;
+
+	group = ext.e.group - 1;
+
+	if (group < 0 || group >= xen_netbk_group_nr)
+		return 0;
+
+	netbk = &xen_netbk[group];
+
+	idx = ext.e.idx;
+
+	if ((idx < 0) || (idx >= MAX_PENDING_REQS))
+		return 0;
+
+	if (netbk->mmap_pages[idx] != pg)
+		return 0;
+
+	*pgroup = group;
+	*pidx = idx;
+
+	return 1;
+}
+
+/*
+ * This is the amount of packet we copy rather than map, so that the
+ * guest can't fiddle with the contents of the headers while we do
+ * packet processing on them (netfilter, routing, etc).
+ */
+#define PKT_PROT_LEN    (ETH_HLEN + \
+			 VLAN_HLEN + \
+			 sizeof(struct iphdr) + MAX_IPOPTLEN + \
+			 sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
+
+static inline pending_ring_idx_t pending_index(unsigned i)
+{
+	return i & (MAX_PENDING_REQS-1);
+}
+
+static inline pending_ring_idx_t nr_pending_reqs(struct xen_netbk *netbk)
+{
+	return MAX_PENDING_REQS -
+		netbk->pending_prod + netbk->pending_cons;
+}
+
+static void xen_netbk_kick_thread(struct xen_netbk *netbk)
+{
+	wake_up(&netbk->wq);
+}
+
+static int max_required_rx_slots(struct xenvif *vif)
+{
+	int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
+
+	if (vif->can_sg || vif->gso || vif->gso_prefix)
+		max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
+
+	return max;
+}
+
+int xen_netbk_rx_ring_full(struct xenvif *vif)
+{
+	RING_IDX peek   = vif->rx_req_cons_peek;
+	RING_IDX needed = max_required_rx_slots(vif);
+
+	return ((vif->rx.sring->req_prod - peek) < needed) ||
+	       ((vif->rx.rsp_prod_pvt + XEN_NETIF_RX_RING_SIZE - peek) < needed);
+}
+
+int xen_netbk_must_stop_queue(struct xenvif *vif)
+{
+	if (!xen_netbk_rx_ring_full(vif))
+		return 0;
+
+	vif->rx.sring->req_event = vif->rx_req_cons_peek +
+		max_required_rx_slots(vif);
+	mb(); /* request notification /then/ check the queue */
+
+	return xen_netbk_rx_ring_full(vif);
+}
+
+/*
+ * Returns true if we should start a new receive buffer instead of
+ * adding 'size' bytes to a buffer which currently contains 'offset'
+ * bytes.
+ */
+static bool start_new_rx_buffer(int offset, unsigned long size, int head)
+{
+	/* simple case: we have completely filled the current buffer. */
+	if (offset == MAX_BUFFER_OFFSET)
+		return true;
+
+	/*
+	 * complex case: start a fresh buffer if the current frag
+	 * would overflow the current buffer but only if:
+	 *     (i)   this frag would fit completely in the next buffer
+	 * and (ii)  there is already some data in the current buffer
+	 * and (iii) this is not the head buffer.
+	 *
+	 * Where:
+	 * - (i) stops us splitting a frag into two copies
+	 *   unless the frag is too large for a single buffer.
+	 * - (ii) stops us from leaving a buffer pointlessly empty.
+	 * - (iii) stops us leaving the first buffer
+	 *   empty. Strictly speaking this is already covered
+	 *   by (ii) but is explicitly checked because
+	 *   netfront relies on the first buffer being
+	 *   non-empty and can crash otherwise.
+	 *
+	 * This means we will effectively linearise small
+	 * frags but do not needlessly split large buffers
+	 * into multiple copies tend to give large frags their
+	 * own buffers as before.
+	 */
+	if ((offset + size > MAX_BUFFER_OFFSET) &&
+	    (size <= MAX_BUFFER_OFFSET) && offset && !head)
+		return true;
+
+	return false;
+}
+
+/*
+ * Figure out how many ring slots we're going to need to send @skb to
+ * the guest. This function is essentially a dry run of
+ * netbk_gop_frag_copy.
+ */
+unsigned int xen_netbk_count_skb_slots(struct xenvif *vif, struct sk_buff *skb)
+{
+	unsigned int count;
+	int i, copy_off;
+
+	count = DIV_ROUND_UP(
+			offset_in_page(skb->data)+skb_headlen(skb), PAGE_SIZE);
+
+	copy_off = skb_headlen(skb) % PAGE_SIZE;
+
+	if (skb_shinfo(skb)->gso_size)
+		count++;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		unsigned long size = skb_shinfo(skb)->frags[i].size;
+		unsigned long bytes;
+		while (size > 0) {
+			BUG_ON(copy_off > MAX_BUFFER_OFFSET);
+
+			if (start_new_rx_buffer(copy_off, size, 0)) {
+				count++;
+				copy_off = 0;
+			}
+
+			bytes = size;
+			if (copy_off + bytes > MAX_BUFFER_OFFSET)
+				bytes = MAX_BUFFER_OFFSET - copy_off;
+
+			copy_off += bytes;
+			size -= bytes;
+		}
+	}
+	return count;
+}
+
+struct netrx_pending_operations {
+	unsigned copy_prod, copy_cons;
+	unsigned meta_prod, meta_cons;
+	struct gnttab_copy *copy;
+	struct netbk_rx_meta *meta;
+	int copy_off;
+	grant_ref_t copy_gref;
+};
+
+static struct netbk_rx_meta *get_next_rx_buffer(struct xenvif *vif,
+						struct netrx_pending_operations *npo)
+{
+	struct netbk_rx_meta *meta;
+	struct xen_netif_rx_request *req;
+
+	req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
+
+	meta = npo->meta + npo->meta_prod++;
+	meta->gso_size = 0;
+	meta->size = 0;
+	meta->id = req->id;
+
+	npo->copy_off = 0;
+	npo->copy_gref = req->gref;
+
+	return meta;
+}
+
+/*
+ * Set up the grant operations for this fragment. If it's a flipping
+ * interface, we also set up the unmap request from here.
+ */
+static void netbk_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
+				struct netrx_pending_operations *npo,
+				struct page *page, unsigned long size,
+				unsigned long offset, int *head)
+{
+	struct gnttab_copy *copy_gop;
+	struct netbk_rx_meta *meta;
+	/*
+	 * These variables a used iff get_page_ext returns true,
+	 * in which case they are guaranteed to be initialized.
+	 */
+	unsigned int uninitialized_var(group), uninitialized_var(idx);
+	int foreign = get_page_ext(page, &group, &idx);
+	unsigned long bytes;
+
+	/* Data must not cross a page boundary. */
+	BUG_ON(size + offset > PAGE_SIZE);
+
+	meta = npo->meta + npo->meta_prod - 1;
+
+	while (size > 0) {
+		BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
+
+		if (start_new_rx_buffer(npo->copy_off, size, *head)) {
+			/*
+			 * Netfront requires there to be some data in the head
+			 * buffer.
+			 */
+			BUG_ON(*head);
+
+			meta = get_next_rx_buffer(vif, npo);
+		}
+
+		bytes = size;
+		if (npo->copy_off + bytes > MAX_BUFFER_OFFSET)
+			bytes = MAX_BUFFER_OFFSET - npo->copy_off;
+
+		copy_gop = npo->copy + npo->copy_prod++;
+		copy_gop->flags = GNTCOPY_dest_gref;
+		if (foreign) {
+			struct xen_netbk *netbk = &xen_netbk[group];
+			struct pending_tx_info *src_pend;
+
+			src_pend = &netbk->pending_tx_info[idx];
+
+			copy_gop->source.domid = src_pend->vif->domid;
+			copy_gop->source.u.ref = src_pend->req.gref;
+			copy_gop->flags |= GNTCOPY_source_gref;
+		} else {
+			void *vaddr = page_address(page);
+			copy_gop->source.domid = DOMID_SELF;
+			copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
+		}
+		copy_gop->source.offset = offset;
+		copy_gop->dest.domid = vif->domid;
+
+		copy_gop->dest.offset = npo->copy_off;
+		copy_gop->dest.u.ref = npo->copy_gref;
+		copy_gop->len = bytes;
+
+		npo->copy_off += bytes;
+		meta->size += bytes;
+
+		offset += bytes;
+		size -= bytes;
+
+		/* Leave a gap for the GSO descriptor. */
+		if (*head && skb_shinfo(skb)->gso_size && !vif->gso_prefix)
+			vif->rx.req_cons++;
+
+		*head = 0; /* There must be something in this buffer now. */
+
+	}
+}
+
+/*
+ * Prepare an SKB to be transmitted to the frontend.
+ *
+ * This function is responsible for allocating grant operations, meta
+ * structures, etc.
+ *
+ * It returns the number of meta structures consumed. The number of
+ * ring slots used is always equal to the number of meta slots used
+ * plus the number of GSO descriptors used. Currently, we use either
+ * zero GSO descriptors (for non-GSO packets) or one descriptor (for
+ * frontend-side LRO).
+ */
+static int netbk_gop_skb(struct sk_buff *skb,
+			 struct netrx_pending_operations *npo)
+{
+	struct xenvif *vif = netdev_priv(skb->dev);
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+	int i;
+	struct xen_netif_rx_request *req;
+	struct netbk_rx_meta *meta;
+	unsigned char *data;
+	int head = 1;
+	int old_meta_prod;
+
+	old_meta_prod = npo->meta_prod;
+
+	/* Set up a GSO prefix descriptor, if necessary */
+	if (skb_shinfo(skb)->gso_size && vif->gso_prefix) {
+		req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
+		meta = npo->meta + npo->meta_prod++;
+		meta->gso_size = skb_shinfo(skb)->gso_size;
+		meta->size = 0;
+		meta->id = req->id;
+	}
+
+	req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
+	meta = npo->meta + npo->meta_prod++;
+
+	if (!vif->gso_prefix)
+		meta->gso_size = skb_shinfo(skb)->gso_size;
+	else
+		meta->gso_size = 0;
+
+	meta->size = 0;
+	meta->id = req->id;
+	npo->copy_off = 0;
+	npo->copy_gref = req->gref;
+
+	data = skb->data;
+	while (data < skb_tail_pointer(skb)) {
+		unsigned int offset = offset_in_page(data);
+		unsigned int len = PAGE_SIZE - offset;
+
+		if (data + len > skb_tail_pointer(skb))
+			len = skb_tail_pointer(skb) - data;
+
+		netbk_gop_frag_copy(vif, skb, npo,
+				    virt_to_page(data), len, offset, &head);
+		data += len;
+	}
+
+	for (i = 0; i < nr_frags; i++) {
+		netbk_gop_frag_copy(vif, skb, npo,
+				    skb_shinfo(skb)->frags[i].page,
+				    skb_shinfo(skb)->frags[i].size,
+				    skb_shinfo(skb)->frags[i].page_offset,
+				    &head);
+	}
+
+	return npo->meta_prod - old_meta_prod;
+}
+
+/*
+ * This is a twin to netbk_gop_skb.  Assume that netbk_gop_skb was
+ * used to set up the operations on the top of
+ * netrx_pending_operations, which have since been done.  Check that
+ * they didn't give any errors and advance over them.
+ */
+static int netbk_check_gop(struct xenvif *vif, int nr_meta_slots,
+			   struct netrx_pending_operations *npo)
+{
+	struct gnttab_copy     *copy_op;
+	int status = XEN_NETIF_RSP_OKAY;
+	int i;
+
+	for (i = 0; i < nr_meta_slots; i++) {
+		copy_op = npo->copy + npo->copy_cons++;
+		if (copy_op->status != GNTST_okay) {
+			netdev_dbg(vif->dev,
+				   "Bad status %d from copy to DOM%d.\n",
+				   copy_op->status, vif->domid);
+			status = XEN_NETIF_RSP_ERROR;
+		}
+	}
+
+	return status;
+}
+
+static void netbk_add_frag_responses(struct xenvif *vif, int status,
+				     struct netbk_rx_meta *meta,
+				     int nr_meta_slots)
+{
+	int i;
+	unsigned long offset;
+
+	/* No fragments used */
+	if (nr_meta_slots <= 1)
+		return;
+
+	nr_meta_slots--;
+
+	for (i = 0; i < nr_meta_slots; i++) {
+		int flags;
+		if (i == nr_meta_slots - 1)
+			flags = 0;
+		else
+			flags = XEN_NETRXF_more_data;
+
+		offset = 0;
+		make_rx_response(vif, meta[i].id, status, offset,
+				 meta[i].size, flags);
+	}
+}
+
+struct skb_cb_overlay {
+	int meta_slots_used;
+};
+
+static void xen_netbk_rx_action(struct xen_netbk *netbk)
+{
+	struct xenvif *vif = NULL, *tmp;
+	s8 status;
+	u16 irq, flags;
+	struct xen_netif_rx_response *resp;
+	struct sk_buff_head rxq;
+	struct sk_buff *skb;
+	LIST_HEAD(notify);
+	int ret;
+	int nr_frags;
+	int count;
+	unsigned long offset;
+	struct skb_cb_overlay *sco;
+
+	struct netrx_pending_operations npo = {
+		.copy  = netbk->grant_copy_op,
+		.meta  = netbk->meta,
+	};
+
+	skb_queue_head_init(&rxq);
+
+	count = 0;
+
+	while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) {
+		vif = netdev_priv(skb->dev);
+		nr_frags = skb_shinfo(skb)->nr_frags;
+
+		sco = (struct skb_cb_overlay *)skb->cb;
+		sco->meta_slots_used = netbk_gop_skb(skb, &npo);
+
+		count += nr_frags + 1;
+
+		__skb_queue_tail(&rxq, skb);
+
+		/* Filled the batch queue? */
+		if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
+			break;
+	}
+
+	BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta));
+
+	if (!npo.copy_prod)
+		return;
+
+	BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op,
+					npo.copy_prod);
+	BUG_ON(ret != 0);
+
+	while ((skb = __skb_dequeue(&rxq)) != NULL) {
+		sco = (struct skb_cb_overlay *)skb->cb;
+
+		vif = netdev_priv(skb->dev);
+
+		if (netbk->meta[npo.meta_cons].gso_size && vif->gso_prefix) {
+			resp = RING_GET_RESPONSE(&vif->rx,
+						vif->rx.rsp_prod_pvt++);
+
+			resp->flags = XEN_NETRXF_gso_prefix | XEN_NETRXF_more_data;
+
+			resp->offset = netbk->meta[npo.meta_cons].gso_size;
+			resp->id = netbk->meta[npo.meta_cons].id;
+			resp->status = sco->meta_slots_used;
+
+			npo.meta_cons++;
+			sco->meta_slots_used--;
+		}
+
+
+		vif->stats.tx_bytes += skb->len;
+		vif->stats.tx_packets++;
+
+		status = netbk_check_gop(vif, sco->meta_slots_used, &npo);
+
+		if (sco->meta_slots_used == 1)
+			flags = 0;
+		else
+			flags = XEN_NETRXF_more_data;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+			flags |= XEN_NETRXF_csum_blank | XEN_NETRXF_data_validated;
+		else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+			/* remote but checksummed. */
+			flags |= XEN_NETRXF_data_validated;
+
+		offset = 0;
+		resp = make_rx_response(vif, netbk->meta[npo.meta_cons].id,
+					status, offset,
+					netbk->meta[npo.meta_cons].size,
+					flags);
+
+		if (netbk->meta[npo.meta_cons].gso_size && !vif->gso_prefix) {
+			struct xen_netif_extra_info *gso =
+				(struct xen_netif_extra_info *)
+				RING_GET_RESPONSE(&vif->rx,
+						  vif->rx.rsp_prod_pvt++);
+
+			resp->flags |= XEN_NETRXF_extra_info;
+
+			gso->u.gso.size = netbk->meta[npo.meta_cons].gso_size;
+			gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+			gso->u.gso.pad = 0;
+			gso->u.gso.features = 0;
+
+			gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+			gso->flags = 0;
+		}
+
+		netbk_add_frag_responses(vif, status,
+					 netbk->meta + npo.meta_cons + 1,
+					 sco->meta_slots_used);
+
+		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->rx, ret);
+		irq = vif->irq;
+		if (ret && list_empty(&vif->notify_list))
+			list_add_tail(&vif->notify_list, &notify);
+
+		xenvif_notify_tx_completion(vif);
+
+		xenvif_put(vif);
+		npo.meta_cons += sco->meta_slots_used;
+		dev_kfree_skb(skb);
+	}
+
+	list_for_each_entry_safe(vif, tmp, &notify, notify_list) {
+		notify_remote_via_irq(vif->irq);
+		list_del_init(&vif->notify_list);
+	}
+
+	/* More work to do? */
+	if (!skb_queue_empty(&netbk->rx_queue) &&
+			!timer_pending(&netbk->net_timer))
+		xen_netbk_kick_thread(netbk);
+}
+
+void xen_netbk_queue_tx_skb(struct xenvif *vif, struct sk_buff *skb)
+{
+	struct xen_netbk *netbk = vif->netbk;
+
+	skb_queue_tail(&netbk->rx_queue, skb);
+
+	xen_netbk_kick_thread(netbk);
+}
+
+static void xen_netbk_alarm(unsigned long data)
+{
+	struct xen_netbk *netbk = (struct xen_netbk *)data;
+	xen_netbk_kick_thread(netbk);
+}
+
+static int __on_net_schedule_list(struct xenvif *vif)
+{
+	return !list_empty(&vif->schedule_list);
+}
+
+/* Must be called with net_schedule_list_lock held */
+static void remove_from_net_schedule_list(struct xenvif *vif)
+{
+	if (likely(__on_net_schedule_list(vif))) {
+		list_del_init(&vif->schedule_list);
+		xenvif_put(vif);
+	}
+}
+
+static struct xenvif *poll_net_schedule_list(struct xen_netbk *netbk)
+{
+	struct xenvif *vif = NULL;
+
+	spin_lock_irq(&netbk->net_schedule_list_lock);
+	if (list_empty(&netbk->net_schedule_list))
+		goto out;
+
+	vif = list_first_entry(&netbk->net_schedule_list,
+			       struct xenvif, schedule_list);
+	if (!vif)
+		goto out;
+
+	xenvif_get(vif);
+
+	remove_from_net_schedule_list(vif);
+out:
+	spin_unlock_irq(&netbk->net_schedule_list_lock);
+	return vif;
+}
+
+void xen_netbk_schedule_xenvif(struct xenvif *vif)
+{
+	unsigned long flags;
+	struct xen_netbk *netbk = vif->netbk;
+
+	if (__on_net_schedule_list(vif))
+		goto kick;
+
+	spin_lock_irqsave(&netbk->net_schedule_list_lock, flags);
+	if (!__on_net_schedule_list(vif) &&
+	    likely(xenvif_schedulable(vif))) {
+		list_add_tail(&vif->schedule_list, &netbk->net_schedule_list);
+		xenvif_get(vif);
+	}
+	spin_unlock_irqrestore(&netbk->net_schedule_list_lock, flags);
+
+kick:
+	smp_mb();
+	if ((nr_pending_reqs(netbk) < (MAX_PENDING_REQS/2)) &&
+	    !list_empty(&netbk->net_schedule_list))
+		xen_netbk_kick_thread(netbk);
+}
+
+void xen_netbk_deschedule_xenvif(struct xenvif *vif)
+{
+	struct xen_netbk *netbk = vif->netbk;
+	spin_lock_irq(&netbk->net_schedule_list_lock);
+	remove_from_net_schedule_list(vif);
+	spin_unlock_irq(&netbk->net_schedule_list_lock);
+}
+
+void xen_netbk_check_rx_xenvif(struct xenvif *vif)
+{
+	int more_to_do;
+
+	RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, more_to_do);
+
+	if (more_to_do)
+		xen_netbk_schedule_xenvif(vif);
+}
+
+static void tx_add_credit(struct xenvif *vif)
+{
+	unsigned long max_burst, max_credit;
+
+	/*
+	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
+	 * Otherwise the interface can seize up due to insufficient credit.
+	 */
+	max_burst = RING_GET_REQUEST(&vif->tx, vif->tx.req_cons)->size;
+	max_burst = min(max_burst, 131072UL);
+	max_burst = max(max_burst, vif->credit_bytes);
+
+	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
+	max_credit = vif->remaining_credit + vif->credit_bytes;
+	if (max_credit < vif->remaining_credit)
+		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
+
+	vif->remaining_credit = min(max_credit, max_burst);
+}
+
+static void tx_credit_callback(unsigned long data)
+{
+	struct xenvif *vif = (struct xenvif *)data;
+	tx_add_credit(vif);
+	xen_netbk_check_rx_xenvif(vif);
+}
+
+static void netbk_tx_err(struct xenvif *vif,
+			 struct xen_netif_tx_request *txp, RING_IDX end)
+{
+	RING_IDX cons = vif->tx.req_cons;
+
+	do {
+		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+		if (cons >= end)
+			break;
+		txp = RING_GET_REQUEST(&vif->tx, cons++);
+	} while (1);
+	vif->tx.req_cons = cons;
+	xen_netbk_check_rx_xenvif(vif);
+	xenvif_put(vif);
+}
+
+static int netbk_count_requests(struct xenvif *vif,
+				struct xen_netif_tx_request *first,
+				struct xen_netif_tx_request *txp,
+				int work_to_do)
+{
+	RING_IDX cons = vif->tx.req_cons;
+	int frags = 0;
+
+	if (!(first->flags & XEN_NETTXF_more_data))
+		return 0;
+
+	do {
+		if (frags >= work_to_do) {
+			netdev_dbg(vif->dev, "Need more frags\n");
+			return -frags;
+		}
+
+		if (unlikely(frags >= MAX_SKB_FRAGS)) {
+			netdev_dbg(vif->dev, "Too many frags\n");
+			return -frags;
+		}
+
+		memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
+		       sizeof(*txp));
+		if (txp->size > first->size) {
+			netdev_dbg(vif->dev, "Frags galore\n");
+			return -frags;
+		}
+
+		first->size -= txp->size;
+		frags++;
+
+		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+			netdev_dbg(vif->dev, "txp->offset: %x, size: %u\n",
+				 txp->offset, txp->size);
+			return -frags;
+		}
+	} while ((txp++)->flags & XEN_NETTXF_more_data);
+	return frags;
+}
+
+static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
+					 struct sk_buff *skb,
+					 unsigned long pending_idx)
+{
+	struct page *page;
+	page = alloc_page(GFP_KERNEL|__GFP_COLD);
+	if (!page)
+		return NULL;
+	set_page_ext(page, netbk, pending_idx);
+	netbk->mmap_pages[pending_idx] = page;
+	return page;
+}
+
+static struct gnttab_copy *xen_netbk_get_requests(struct xen_netbk *netbk,
+						  struct xenvif *vif,
+						  struct sk_buff *skb,
+						  struct xen_netif_tx_request *txp,
+						  struct gnttab_copy *gop)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	skb_frag_t *frags = shinfo->frags;
+	unsigned long pending_idx = *((u16 *)skb->data);
+	int i, start;
+
+	/* Skip first skb fragment if it is on same page as header fragment. */
+	start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+	for (i = start; i < shinfo->nr_frags; i++, txp++) {
+		struct page *page;
+		pending_ring_idx_t index;
+		struct pending_tx_info *pending_tx_info =
+			netbk->pending_tx_info;
+
+		index = pending_index(netbk->pending_cons++);
+		pending_idx = netbk->pending_ring[index];
+		page = xen_netbk_alloc_page(netbk, skb, pending_idx);
+		if (!page)
+			return NULL;
+
+		netbk->mmap_pages[pending_idx] = page;
+
+		gop->source.u.ref = txp->gref;
+		gop->source.domid = vif->domid;
+		gop->source.offset = txp->offset;
+
+		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+		gop->dest.domid = DOMID_SELF;
+		gop->dest.offset = txp->offset;
+
+		gop->len = txp->size;
+		gop->flags = GNTCOPY_source_gref;
+
+		gop++;
+
+		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+		xenvif_get(vif);
+		pending_tx_info[pending_idx].vif = vif;
+		frags[i].page = (void *)pending_idx;
+	}
+
+	return gop;
+}
+
+static int xen_netbk_tx_check_gop(struct xen_netbk *netbk,
+				  struct sk_buff *skb,
+				  struct gnttab_copy **gopp)
+{
+	struct gnttab_copy *gop = *gopp;
+	int pending_idx = *((u16 *)skb->data);
+	struct pending_tx_info *pending_tx_info = netbk->pending_tx_info;
+	struct xenvif *vif = pending_tx_info[pending_idx].vif;
+	struct xen_netif_tx_request *txp;
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int nr_frags = shinfo->nr_frags;
+	int i, err, start;
+
+	/* Check status of header. */
+	err = gop->status;
+	if (unlikely(err)) {
+		pending_ring_idx_t index;
+		index = pending_index(netbk->pending_prod++);
+		txp = &pending_tx_info[pending_idx].req;
+		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+		netbk->pending_ring[index] = pending_idx;
+		xenvif_put(vif);
+	}
+
+	/* Skip first skb fragment if it is on same page as header fragment. */
+	start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+	for (i = start; i < nr_frags; i++) {
+		int j, newerr;
+		pending_ring_idx_t index;
+
+		pending_idx = (unsigned long)shinfo->frags[i].page;
+
+		/* Check error status: if okay then remember grant handle. */
+		newerr = (++gop)->status;
+		if (likely(!newerr)) {
+			/* Had a previous error? Invalidate this fragment. */
+			if (unlikely(err))
+				xen_netbk_idx_release(netbk, pending_idx);
+			continue;
+		}
+
+		/* Error on this fragment: respond to client with an error. */
+		txp = &netbk->pending_tx_info[pending_idx].req;
+		make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
+		index = pending_index(netbk->pending_prod++);
+		netbk->pending_ring[index] = pending_idx;
+		xenvif_put(vif);
+
+		/* Not the first error? Preceding frags already invalidated. */
+		if (err)
+			continue;
+
+		/* First error: invalidate header and preceding fragments. */
+		pending_idx = *((u16 *)skb->data);
+		xen_netbk_idx_release(netbk, pending_idx);
+		for (j = start; j < i; j++) {
+			pending_idx = (unsigned long)shinfo->frags[i].page;
+			xen_netbk_idx_release(netbk, pending_idx);
+		}
+
+		/* Remember the error: invalidate all subsequent fragments. */
+		err = newerr;
+	}
+
+	*gopp = gop + 1;
+	return err;
+}
+
+static void xen_netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int nr_frags = shinfo->nr_frags;
+	int i;
+
+	for (i = 0; i < nr_frags; i++) {
+		skb_frag_t *frag = shinfo->frags + i;
+		struct xen_netif_tx_request *txp;
+		unsigned long pending_idx;
+
+		pending_idx = (unsigned long)frag->page;
+
+		txp = &netbk->pending_tx_info[pending_idx].req;
+		frag->page = virt_to_page(idx_to_kaddr(netbk, pending_idx));
+		frag->size = txp->size;
+		frag->page_offset = txp->offset;
+
+		skb->len += txp->size;
+		skb->data_len += txp->size;
+		skb->truesize += txp->size;
+
+		/* Take an extra reference to offset xen_netbk_idx_release */
+		get_page(netbk->mmap_pages[pending_idx]);
+		xen_netbk_idx_release(netbk, pending_idx);
+	}
+}
+
+static int xen_netbk_get_extras(struct xenvif *vif,
+				struct xen_netif_extra_info *extras,
+				int work_to_do)
+{
+	struct xen_netif_extra_info extra;
+	RING_IDX cons = vif->tx.req_cons;
+
+	do {
+		if (unlikely(work_to_do-- <= 0)) {
+			netdev_dbg(vif->dev, "Missing extra info\n");
+			return -EBADR;
+		}
+
+		memcpy(&extra, RING_GET_REQUEST(&vif->tx, cons),
+		       sizeof(extra));
+		if (unlikely(!extra.type ||
+			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+			vif->tx.req_cons = ++cons;
+			netdev_dbg(vif->dev,
+				   "Invalid extra type: %d\n", extra.type);
+			return -EINVAL;
+		}
+
+		memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
+		vif->tx.req_cons = ++cons;
+	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+	return work_to_do;
+}
+
+static int netbk_set_skb_gso(struct xenvif *vif,
+			     struct sk_buff *skb,
+			     struct xen_netif_extra_info *gso)
+{
+	if (!gso->u.gso.size) {
+		netdev_dbg(vif->dev, "GSO size must not be zero.\n");
+		return -EINVAL;
+	}
+
+	/* Currently only TCPv4 S.O. is supported. */
+	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+		netdev_dbg(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
+		return -EINVAL;
+	}
+
+	skb_shinfo(skb)->gso_size = gso->u.gso.size;
+	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+	/* Header must be checked, and gso_segs computed. */
+	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+	skb_shinfo(skb)->gso_segs = 0;
+
+	return 0;
+}
+
+static int checksum_setup(struct xenvif *vif, struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	unsigned char *th;
+	int err = -EPROTO;
+	int recalculate_partial_csum = 0;
+
+	/*
+	 * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
+	 * peers can fail to set NETRXF_csum_blank when sending a GSO
+	 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
+	 * recalculate the partial checksum.
+	 */
+	if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
+		vif->rx_gso_checksum_fixup++;
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		recalculate_partial_csum = 1;
+	}
+
+	/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		goto out;
+
+	iph = (void *)skb->data;
+	th = skb->data + 4 * iph->ihl;
+	if (th >= skb_tail_pointer(skb))
+		goto out;
+
+	skb->csum_start = th - skb->head;
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		skb->csum_offset = offsetof(struct tcphdr, check);
+
+		if (recalculate_partial_csum) {
+			struct tcphdr *tcph = (struct tcphdr *)th;
+			tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+							 skb->len - iph->ihl*4,
+							 IPPROTO_TCP, 0);
+		}
+		break;
+	case IPPROTO_UDP:
+		skb->csum_offset = offsetof(struct udphdr, check);
+
+		if (recalculate_partial_csum) {
+			struct udphdr *udph = (struct udphdr *)th;
+			udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+							 skb->len - iph->ihl*4,
+							 IPPROTO_UDP, 0);
+		}
+		break;
+	default:
+		if (net_ratelimit())
+			netdev_err(vif->dev,
+				   "Attempting to checksum a non-TCP/UDP packet, dropping a protocol %d packet\n",
+				   iph->protocol);
+		goto out;
+	}
+
+	if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
+		goto out;
+
+	err = 0;
+
+out:
+	return err;
+}
+
+static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
+{
+	unsigned long now = jiffies;
+	unsigned long next_credit =
+		vif->credit_timeout.expires +
+		msecs_to_jiffies(vif->credit_usec / 1000);
+
+	/* Timer could already be pending in rare cases. */
+	if (timer_pending(&vif->credit_timeout))
+		return true;
+
+	/* Passed the point where we can replenish credit? */
+	if (time_after_eq(now, next_credit)) {
+		vif->credit_timeout.expires = now;
+		tx_add_credit(vif);
+	}
+
+	/* Still too big to send right now? Set a callback. */
+	if (size > vif->remaining_credit) {
+		vif->credit_timeout.data     =
+			(unsigned long)vif;
+		vif->credit_timeout.function =
+			tx_credit_callback;
+		mod_timer(&vif->credit_timeout,
+			  next_credit);
+
+		return true;
+	}
+
+	return false;
+}
+
+static unsigned xen_netbk_tx_build_gops(struct xen_netbk *netbk)
+{
+	struct gnttab_copy *gop = netbk->tx_copy_ops, *request_gop;
+	struct sk_buff *skb;
+	int ret;
+
+	while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+		!list_empty(&netbk->net_schedule_list)) {
+		struct xenvif *vif;
+		struct xen_netif_tx_request txreq;
+		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
+		struct page *page;
+		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
+		u16 pending_idx;
+		RING_IDX idx;
+		int work_to_do;
+		unsigned int data_len;
+		pending_ring_idx_t index;
+
+		/* Get a netif from the list with work to do. */
+		vif = poll_net_schedule_list(netbk);
+		if (!vif)
+			continue;
+
+		RING_FINAL_CHECK_FOR_REQUESTS(&vif->tx, work_to_do);
+		if (!work_to_do) {
+			xenvif_put(vif);
+			continue;
+		}
+
+		idx = vif->tx.req_cons;
+		rmb(); /* Ensure that we see the request before we copy it. */
+		memcpy(&txreq, RING_GET_REQUEST(&vif->tx, idx), sizeof(txreq));
+
+		/* Credit-based scheduling. */
+		if (txreq.size > vif->remaining_credit &&
+		    tx_credit_exceeded(vif, txreq.size)) {
+			xenvif_put(vif);
+			continue;
+		}
+
+		vif->remaining_credit -= txreq.size;
+
+		work_to_do--;
+		vif->tx.req_cons = ++idx;
+
+		memset(extras, 0, sizeof(extras));
+		if (txreq.flags & XEN_NETTXF_extra_info) {
+			work_to_do = xen_netbk_get_extras(vif, extras,
+							  work_to_do);
+			idx = vif->tx.req_cons;
+			if (unlikely(work_to_do < 0)) {
+				netbk_tx_err(vif, &txreq, idx);
+				continue;
+			}
+		}
+
+		ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do);
+		if (unlikely(ret < 0)) {
+			netbk_tx_err(vif, &txreq, idx - ret);
+			continue;
+		}
+		idx += ret;
+
+		if (unlikely(txreq.size < ETH_HLEN)) {
+			netdev_dbg(vif->dev,
+				   "Bad packet size: %d\n", txreq.size);
+			netbk_tx_err(vif, &txreq, idx);
+			continue;
+		}
+
+		/* No crossing a page as the payload mustn't fragment. */
+		if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
+			netdev_dbg(vif->dev,
+				   "txreq.offset: %x, size: %u, end: %lu\n",
+				   txreq.offset, txreq.size,
+				   (txreq.offset&~PAGE_MASK) + txreq.size);
+			netbk_tx_err(vif, &txreq, idx);
+			continue;
+		}
+
+		index = pending_index(netbk->pending_cons);
+		pending_idx = netbk->pending_ring[index];
+
+		data_len = (txreq.size > PKT_PROT_LEN &&
+			    ret < MAX_SKB_FRAGS) ?
+			PKT_PROT_LEN : txreq.size;
+
+		skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
+				GFP_ATOMIC | __GFP_NOWARN);
+		if (unlikely(skb == NULL)) {
+			netdev_dbg(vif->dev,
+				   "Can't allocate a skb in start_xmit.\n");
+			netbk_tx_err(vif, &txreq, idx);
+			break;
+		}
+
+		/* Packets passed to netif_rx() must have some headroom. */
+		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+			struct xen_netif_extra_info *gso;
+			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+			if (netbk_set_skb_gso(vif, skb, gso)) {
+				kfree_skb(skb);
+				netbk_tx_err(vif, &txreq, idx);
+				continue;
+			}
+		}
+
+		/* XXX could copy straight to head */
+		page = xen_netbk_alloc_page(netbk, skb, pending_idx);
+		if (!page) {
+			kfree_skb(skb);
+			netbk_tx_err(vif, &txreq, idx);
+			continue;
+		}
+
+		netbk->mmap_pages[pending_idx] = page;
+
+		gop->source.u.ref = txreq.gref;
+		gop->source.domid = vif->domid;
+		gop->source.offset = txreq.offset;
+
+		gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+		gop->dest.domid = DOMID_SELF;
+		gop->dest.offset = txreq.offset;
+
+		gop->len = txreq.size;
+		gop->flags = GNTCOPY_source_gref;
+
+		gop++;
+
+		memcpy(&netbk->pending_tx_info[pending_idx].req,
+		       &txreq, sizeof(txreq));
+		netbk->pending_tx_info[pending_idx].vif = vif;
+		*((u16 *)skb->data) = pending_idx;
+
+		__skb_put(skb, data_len);
+
+		skb_shinfo(skb)->nr_frags = ret;
+		if (data_len < txreq.size) {
+			skb_shinfo(skb)->nr_frags++;
+			skb_shinfo(skb)->frags[0].page =
+				(void *)(unsigned long)pending_idx;
+		} else {
+			/* Discriminate from any valid pending_idx value. */
+			skb_shinfo(skb)->frags[0].page = (void *)~0UL;
+		}
+
+		__skb_queue_tail(&netbk->tx_queue, skb);
+
+		netbk->pending_cons++;
+
+		request_gop = xen_netbk_get_requests(netbk, vif,
+						     skb, txfrags, gop);
+		if (request_gop == NULL) {
+			kfree_skb(skb);
+			netbk_tx_err(vif, &txreq, idx);
+			continue;
+		}
+		gop = request_gop;
+
+		vif->tx.req_cons = idx;
+		xen_netbk_check_rx_xenvif(vif);
+
+		if ((gop-netbk->tx_copy_ops) >= ARRAY_SIZE(netbk->tx_copy_ops))
+			break;
+	}
+
+	return gop - netbk->tx_copy_ops;
+}
+
+static void xen_netbk_tx_submit(struct xen_netbk *netbk)
+{
+	struct gnttab_copy *gop = netbk->tx_copy_ops;
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) {
+		struct xen_netif_tx_request *txp;
+		struct xenvif *vif;
+		u16 pending_idx;
+		unsigned data_len;
+
+		pending_idx = *((u16 *)skb->data);
+		vif = netbk->pending_tx_info[pending_idx].vif;
+		txp = &netbk->pending_tx_info[pending_idx].req;
+
+		/* Check the remap error code. */
+		if (unlikely(xen_netbk_tx_check_gop(netbk, skb, &gop))) {
+			netdev_dbg(vif->dev, "netback grant failed.\n");
+			skb_shinfo(skb)->nr_frags = 0;
+			kfree_skb(skb);
+			continue;
+		}
+
+		data_len = skb->len;
+		memcpy(skb->data,
+		       (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset),
+		       data_len);
+		if (data_len < txp->size) {
+			/* Append the packet payload as a fragment. */
+			txp->offset += data_len;
+			txp->size -= data_len;
+		} else {
+			/* Schedule a response immediately. */
+			xen_netbk_idx_release(netbk, pending_idx);
+		}
+
+		if (txp->flags & XEN_NETTXF_csum_blank)
+			skb->ip_summed = CHECKSUM_PARTIAL;
+		else if (txp->flags & XEN_NETTXF_data_validated)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		xen_netbk_fill_frags(netbk, skb);
+
+		/*
+		 * If the initial fragment was < PKT_PROT_LEN then
+		 * pull through some bytes from the other fragments to
+		 * increase the linear region to PKT_PROT_LEN bytes.
+		 */
+		if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
+			int target = min_t(int, skb->len, PKT_PROT_LEN);
+			__pskb_pull_tail(skb, target - skb_headlen(skb));
+		}
+
+		skb->dev      = vif->dev;
+		skb->protocol = eth_type_trans(skb, skb->dev);
+
+		if (checksum_setup(vif, skb)) {
+			netdev_dbg(vif->dev,
+				   "Can't setup checksum in net_tx_action\n");
+			kfree_skb(skb);
+			continue;
+		}
+
+		vif->stats.rx_bytes += skb->len;
+		vif->stats.rx_packets++;
+
+		xenvif_receive_skb(vif, skb);
+	}
+}
+
+/* Called after netfront has transmitted */
+static void xen_netbk_tx_action(struct xen_netbk *netbk)
+{
+	unsigned nr_gops;
+	int ret;
+
+	nr_gops = xen_netbk_tx_build_gops(netbk);
+
+	if (nr_gops == 0)
+		return;
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_copy,
+					netbk->tx_copy_ops, nr_gops);
+	BUG_ON(ret);
+
+	xen_netbk_tx_submit(netbk);
+
+}
+
+static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx)
+{
+	struct xenvif *vif;
+	struct pending_tx_info *pending_tx_info;
+	pending_ring_idx_t index;
+
+	/* Already complete? */
+	if (netbk->mmap_pages[pending_idx] == NULL)
+		return;
+
+	pending_tx_info = &netbk->pending_tx_info[pending_idx];
+
+	vif = pending_tx_info->vif;
+
+	make_tx_response(vif, &pending_tx_info->req, XEN_NETIF_RSP_OKAY);
+
+	index = pending_index(netbk->pending_prod++);
+	netbk->pending_ring[index] = pending_idx;
+
+	xenvif_put(vif);
+
+	netbk->mmap_pages[pending_idx]->mapping = 0;
+	put_page(netbk->mmap_pages[pending_idx]);
+	netbk->mmap_pages[pending_idx] = NULL;
+}
+
+static void make_tx_response(struct xenvif *vif,
+			     struct xen_netif_tx_request *txp,
+			     s8       st)
+{
+	RING_IDX i = vif->tx.rsp_prod_pvt;
+	struct xen_netif_tx_response *resp;
+	int notify;
+
+	resp = RING_GET_RESPONSE(&vif->tx, i);
+	resp->id     = txp->id;
+	resp->status = st;
+
+	if (txp->flags & XEN_NETTXF_extra_info)
+		RING_GET_RESPONSE(&vif->tx, ++i)->status = XEN_NETIF_RSP_NULL;
+
+	vif->tx.rsp_prod_pvt = ++i;
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->tx, notify);
+	if (notify)
+		notify_remote_via_irq(vif->irq);
+}
+
+static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
+					     u16      id,
+					     s8       st,
+					     u16      offset,
+					     u16      size,
+					     u16      flags)
+{
+	RING_IDX i = vif->rx.rsp_prod_pvt;
+	struct xen_netif_rx_response *resp;
+
+	resp = RING_GET_RESPONSE(&vif->rx, i);
+	resp->offset     = offset;
+	resp->flags      = flags;
+	resp->id         = id;
+	resp->status     = (s16)size;
+	if (st < 0)
+		resp->status = (s16)st;
+
+	vif->rx.rsp_prod_pvt = ++i;
+
+	return resp;
+}
+
+static inline int rx_work_todo(struct xen_netbk *netbk)
+{
+	return !skb_queue_empty(&netbk->rx_queue);
+}
+
+static inline int tx_work_todo(struct xen_netbk *netbk)
+{
+
+	if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+			!list_empty(&netbk->net_schedule_list))
+		return 1;
+
+	return 0;
+}
+
+static int xen_netbk_kthread(void *data)
+{
+	struct xen_netbk *netbk = data;
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(netbk->wq,
+				rx_work_todo(netbk) ||
+				tx_work_todo(netbk) ||
+				kthread_should_stop());
+		cond_resched();
+
+		if (kthread_should_stop())
+			break;
+
+		if (rx_work_todo(netbk))
+			xen_netbk_rx_action(netbk);
+
+		if (tx_work_todo(netbk))
+			xen_netbk_tx_action(netbk);
+	}
+
+	return 0;
+}
+
+void xen_netbk_unmap_frontend_rings(struct xenvif *vif)
+{
+	struct gnttab_unmap_grant_ref op;
+
+	if (vif->tx.sring) {
+		gnttab_set_unmap_op(&op, (unsigned long)vif->tx_comms_area->addr,
+				    GNTMAP_host_map, vif->tx_shmem_handle);
+
+		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+			BUG();
+	}
+
+	if (vif->rx.sring) {
+		gnttab_set_unmap_op(&op, (unsigned long)vif->rx_comms_area->addr,
+				    GNTMAP_host_map, vif->rx_shmem_handle);
+
+		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+			BUG();
+	}
+	if (vif->rx_comms_area)
+		free_vm_area(vif->rx_comms_area);
+	if (vif->tx_comms_area)
+		free_vm_area(vif->tx_comms_area);
+}
+
+int xen_netbk_map_frontend_rings(struct xenvif *vif,
+				 grant_ref_t tx_ring_ref,
+				 grant_ref_t rx_ring_ref)
+{
+	struct gnttab_map_grant_ref op;
+	struct xen_netif_tx_sring *txs;
+	struct xen_netif_rx_sring *rxs;
+
+	int err = -ENOMEM;
+
+	vif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
+	if (vif->tx_comms_area == NULL)
+		goto err;
+
+	vif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
+	if (vif->rx_comms_area == NULL)
+		goto err;
+
+	gnttab_set_map_op(&op, (unsigned long)vif->tx_comms_area->addr,
+			  GNTMAP_host_map, tx_ring_ref, vif->domid);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+		BUG();
+
+	if (op.status) {
+		netdev_warn(vif->dev,
+			    "failed to map tx ring. err=%d status=%d\n",
+			    err, op.status);
+		err = op.status;
+		goto err;
+	}
+
+	vif->tx_shmem_ref    = tx_ring_ref;
+	vif->tx_shmem_handle = op.handle;
+
+	txs = (struct xen_netif_tx_sring *)vif->tx_comms_area->addr;
+	BACK_RING_INIT(&vif->tx, txs, PAGE_SIZE);
+
+	gnttab_set_map_op(&op, (unsigned long)vif->rx_comms_area->addr,
+			  GNTMAP_host_map, rx_ring_ref, vif->domid);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+		BUG();
+
+	if (op.status) {
+		netdev_warn(vif->dev,
+			    "failed to map rx ring. err=%d status=%d\n",
+			    err, op.status);
+		err = op.status;
+		goto err;
+	}
+
+	vif->rx_shmem_ref     = rx_ring_ref;
+	vif->rx_shmem_handle  = op.handle;
+	vif->rx_req_cons_peek = 0;
+
+	rxs = (struct xen_netif_rx_sring *)vif->rx_comms_area->addr;
+	BACK_RING_INIT(&vif->rx, rxs, PAGE_SIZE);
+
+	return 0;
+
+err:
+	xen_netbk_unmap_frontend_rings(vif);
+	return err;
+}
+
+static int __init netback_init(void)
+{
+	int i;
+	int rc = 0;
+	int group;
+
+	if (!xen_pv_domain())
+		return -ENODEV;
+
+	xen_netbk_group_nr = num_online_cpus();
+	xen_netbk = vzalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr);
+	if (!xen_netbk) {
+		printk(KERN_ALERT "%s: out of memory\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (group = 0; group < xen_netbk_group_nr; group++) {
+		struct xen_netbk *netbk = &xen_netbk[group];
+		skb_queue_head_init(&netbk->rx_queue);
+		skb_queue_head_init(&netbk->tx_queue);
+
+		init_timer(&netbk->net_timer);
+		netbk->net_timer.data = (unsigned long)netbk;
+		netbk->net_timer.function = xen_netbk_alarm;
+
+		netbk->pending_cons = 0;
+		netbk->pending_prod = MAX_PENDING_REQS;
+		for (i = 0; i < MAX_PENDING_REQS; i++)
+			netbk->pending_ring[i] = i;
+
+		init_waitqueue_head(&netbk->wq);
+		netbk->task = kthread_create(xen_netbk_kthread,
+					     (void *)netbk,
+					     "netback/%u", group);
+
+		if (IS_ERR(netbk->task)) {
+			printk(KERN_ALERT "kthread_run() fails at netback\n");
+			del_timer(&netbk->net_timer);
+			rc = PTR_ERR(netbk->task);
+			goto failed_init;
+		}
+
+		kthread_bind(netbk->task, group);
+
+		INIT_LIST_HEAD(&netbk->net_schedule_list);
+
+		spin_lock_init(&netbk->net_schedule_list_lock);
+
+		atomic_set(&netbk->netfront_count, 0);
+
+		wake_up_process(netbk->task);
+	}
+
+	rc = xenvif_xenbus_init();
+	if (rc)
+		goto failed_init;
+
+	return 0;
+
+failed_init:
+	while (--group >= 0) {
+		struct xen_netbk *netbk = &xen_netbk[group];
+		for (i = 0; i < MAX_PENDING_REQS; i++) {
+			if (netbk->mmap_pages[i])
+				__free_page(netbk->mmap_pages[i]);
+		}
+		del_timer(&netbk->net_timer);
+		kthread_stop(netbk->task);
+	}
+	vfree(xen_netbk);
+	return rc;
+
+}
+
+module_init(netback_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
new file mode 100644
index 0000000..22b8c35
--- /dev/null
+++ b/drivers/net/xen-netback/xenbus.c
@@ -0,0 +1,490 @@
+/*
+ * Xenbus code for netif backend
+ *
+ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include "common.h"
+
+struct backend_info {
+	struct xenbus_device *dev;
+	struct xenvif *vif;
+	enum xenbus_state frontend_state;
+	struct xenbus_watch hotplug_status_watch;
+	int have_hotplug_status_watch:1;
+};
+
+static int connect_rings(struct backend_info *);
+static void connect(struct backend_info *);
+static void backend_create_xenvif(struct backend_info *be);
+static void unregister_hotplug_status_watch(struct backend_info *be);
+
+static int netback_remove(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	unregister_hotplug_status_watch(be);
+	if (be->vif) {
+		kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
+		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
+		xenvif_disconnect(be->vif);
+		be->vif = NULL;
+	}
+	kfree(be);
+	dev_set_drvdata(&dev->dev, NULL);
+	return 0;
+}
+
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and switch to InitWait.
+ */
+static int netback_probe(struct xenbus_device *dev,
+			 const struct xenbus_device_id *id)
+{
+	const char *message;
+	struct xenbus_transaction xbt;
+	int err;
+	int sg;
+	struct backend_info *be = kzalloc(sizeof(struct backend_info),
+					  GFP_KERNEL);
+	if (!be) {
+		xenbus_dev_fatal(dev, -ENOMEM,
+				 "allocating backend structure");
+		return -ENOMEM;
+	}
+
+	be->dev = dev;
+	dev_set_drvdata(&dev->dev, be);
+
+	sg = 1;
+
+	do {
+		err = xenbus_transaction_start(&xbt);
+		if (err) {
+			xenbus_dev_fatal(dev, err, "starting transaction");
+			goto fail;
+		}
+
+		err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
+		if (err) {
+			message = "writing feature-sg";
+			goto abort_transaction;
+		}
+
+		err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
+				    "%d", sg);
+		if (err) {
+			message = "writing feature-gso-tcpv4";
+			goto abort_transaction;
+		}
+
+		/* We support rx-copy path. */
+		err = xenbus_printf(xbt, dev->nodename,
+				    "feature-rx-copy", "%d", 1);
+		if (err) {
+			message = "writing feature-rx-copy";
+			goto abort_transaction;
+		}
+
+		/*
+		 * We don't support rx-flip path (except old guests who don't
+		 * grok this feature flag).
+		 */
+		err = xenbus_printf(xbt, dev->nodename,
+				    "feature-rx-flip", "%d", 0);
+		if (err) {
+			message = "writing feature-rx-flip";
+			goto abort_transaction;
+		}
+
+		err = xenbus_transaction_end(xbt, 0);
+	} while (err == -EAGAIN);
+
+	if (err) {
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto fail;
+	}
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto fail;
+
+	/* This kicks hotplug scripts, so do it immediately. */
+	backend_create_xenvif(be);
+
+	return 0;
+
+abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	xenbus_dev_fatal(dev, err, "%s", message);
+fail:
+	pr_debug("failed");
+	netback_remove(dev);
+	return err;
+}
+
+
+/*
+ * Handle the creation of the hotplug script environment.  We add the script
+ * and vif variables to the environment, for the benefit of the vif-* hotplug
+ * scripts.
+ */
+static int netback_uevent(struct xenbus_device *xdev,
+			  struct kobj_uevent_env *env)
+{
+	struct backend_info *be = dev_get_drvdata(&xdev->dev);
+	char *val;
+
+	val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
+	if (IS_ERR(val)) {
+		int err = PTR_ERR(val);
+		xenbus_dev_fatal(xdev, err, "reading script");
+		return err;
+	} else {
+		if (add_uevent_var(env, "script=%s", val)) {
+			kfree(val);
+			return -ENOMEM;
+		}
+		kfree(val);
+	}
+
+	if (!be || !be->vif)
+		return 0;
+
+	return add_uevent_var(env, "vif=%s", be->vif->dev->name);
+}
+
+
+static void backend_create_xenvif(struct backend_info *be)
+{
+	int err;
+	long handle;
+	struct xenbus_device *dev = be->dev;
+
+	if (be->vif != NULL)
+		return;
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading handle");
+		return;
+	}
+
+	be->vif = xenvif_alloc(&dev->dev, dev->otherend_id, handle);
+	if (IS_ERR(be->vif)) {
+		err = PTR_ERR(be->vif);
+		be->vif = NULL;
+		xenbus_dev_fatal(dev, err, "creating interface");
+		return;
+	}
+
+	kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
+}
+
+
+static void disconnect_backend(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	if (be->vif) {
+		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
+		xenvif_disconnect(be->vif);
+		be->vif = NULL;
+	}
+}
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	pr_debug("frontend state %s", xenbus_strstate(frontend_state));
+
+	be->frontend_state = frontend_state;
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		if (dev->state == XenbusStateClosed) {
+			printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+			       __func__, dev->nodename);
+			xenbus_switch_state(dev, XenbusStateInitWait);
+		}
+		break;
+
+	case XenbusStateInitialised:
+		break;
+
+	case XenbusStateConnected:
+		if (dev->state == XenbusStateConnected)
+			break;
+		backend_create_xenvif(be);
+		if (be->vif)
+			connect(be);
+		break;
+
+	case XenbusStateClosing:
+		if (be->vif)
+			kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
+		disconnect_backend(dev);
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+}
+
+
+static void xen_net_read_rate(struct xenbus_device *dev,
+			      unsigned long *bytes, unsigned long *usec)
+{
+	char *s, *e;
+	unsigned long b, u;
+	char *ratestr;
+
+	/* Default to unlimited bandwidth. */
+	*bytes = ~0UL;
+	*usec = 0;
+
+	ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
+	if (IS_ERR(ratestr))
+		return;
+
+	s = ratestr;
+	b = simple_strtoul(s, &e, 10);
+	if ((s == e) || (*e != ','))
+		goto fail;
+
+	s = e + 1;
+	u = simple_strtoul(s, &e, 10);
+	if ((s == e) || (*e != '\0'))
+		goto fail;
+
+	*bytes = b;
+	*usec = u;
+
+	kfree(ratestr);
+	return;
+
+ fail:
+	pr_warn("Failed to parse network rate limit. Traffic unlimited.\n");
+	kfree(ratestr);
+}
+
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+	char *s, *e, *macstr;
+	int i;
+
+	macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+	if (IS_ERR(macstr))
+		return PTR_ERR(macstr);
+
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac[i] = simple_strtoul(s, &e, 16);
+		if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
+			kfree(macstr);
+			return -ENOENT;
+		}
+		s = e+1;
+	}
+
+	kfree(macstr);
+	return 0;
+}
+
+static void unregister_hotplug_status_watch(struct backend_info *be)
+{
+	if (be->have_hotplug_status_watch) {
+		unregister_xenbus_watch(&be->hotplug_status_watch);
+		kfree(be->hotplug_status_watch.node);
+	}
+	be->have_hotplug_status_watch = 0;
+}
+
+static void hotplug_status_changed(struct xenbus_watch *watch,
+				   const char **vec,
+				   unsigned int vec_size)
+{
+	struct backend_info *be = container_of(watch,
+					       struct backend_info,
+					       hotplug_status_watch);
+	char *str;
+	unsigned int len;
+
+	str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
+	if (IS_ERR(str))
+		return;
+	if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
+		xenbus_switch_state(be->dev, XenbusStateConnected);
+		/* Not interested in this watch anymore. */
+		unregister_hotplug_status_watch(be);
+	}
+	kfree(str);
+}
+
+static void connect(struct backend_info *be)
+{
+	int err;
+	struct xenbus_device *dev = be->dev;
+
+	err = connect_rings(be);
+	if (err)
+		return;
+
+	err = xen_net_read_mac(dev, be->vif->fe_dev_addr);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
+		return;
+	}
+
+	xen_net_read_rate(dev, &be->vif->credit_bytes,
+			  &be->vif->credit_usec);
+	be->vif->remaining_credit = be->vif->credit_bytes;
+
+	unregister_hotplug_status_watch(be);
+	err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
+				   hotplug_status_changed,
+				   "%s/%s", dev->nodename, "hotplug-status");
+	if (err) {
+		/* Switch now, since we can't do a watch. */
+		xenbus_switch_state(dev, XenbusStateConnected);
+	} else {
+		be->have_hotplug_status_watch = 1;
+	}
+
+	netif_wake_queue(be->vif->dev);
+}
+
+
+static int connect_rings(struct backend_info *be)
+{
+	struct xenvif *vif = be->vif;
+	struct xenbus_device *dev = be->dev;
+	unsigned long tx_ring_ref, rx_ring_ref;
+	unsigned int evtchn, rx_copy;
+	int err;
+	int val;
+
+	err = xenbus_gather(XBT_NIL, dev->otherend,
+			    "tx-ring-ref", "%lu", &tx_ring_ref,
+			    "rx-ring-ref", "%lu", &rx_ring_ref,
+			    "event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "reading %s/ring-ref and event-channel",
+				 dev->otherend);
+		return err;
+	}
+
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
+			   &rx_copy);
+	if (err == -ENOENT) {
+		err = 0;
+		rx_copy = 0;
+	}
+	if (err < 0) {
+		xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
+				 dev->otherend);
+		return err;
+	}
+	if (!rx_copy)
+		return -EOPNOTSUPP;
+
+	if (vif->dev->tx_queue_len != 0) {
+		if (xenbus_scanf(XBT_NIL, dev->otherend,
+				 "feature-rx-notify", "%d", &val) < 0)
+			val = 0;
+		if (val)
+			vif->can_queue = 1;
+		else
+			/* Must be non-zero for pfifo_fast to work. */
+			vif->dev->tx_queue_len = 1;
+	}
+
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg",
+			 "%d", &val) < 0)
+		val = 0;
+	vif->can_sg = !!val;
+
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4",
+			 "%d", &val) < 0)
+		val = 0;
+	vif->gso = !!val;
+
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix",
+			 "%d", &val) < 0)
+		val = 0;
+	vif->gso_prefix = !!val;
+
+	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
+			 "%d", &val) < 0)
+		val = 0;
+	vif->csum = !val;
+
+	/* Map the shared frame, irq etc. */
+	err = xenvif_connect(vif, tx_ring_ref, rx_ring_ref, evtchn);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "mapping shared-frames %lu/%lu port %u",
+				 tx_ring_ref, rx_ring_ref, evtchn);
+		return err;
+	}
+	return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id netback_ids[] = {
+	{ "vif" },
+	{ "" }
+};
+
+
+static struct xenbus_driver netback = {
+	.name = "vif",
+	.owner = THIS_MODULE,
+	.ids = netback_ids,
+	.probe = netback_probe,
+	.remove = netback_remove,
+	.uevent = netback_uevent,
+	.otherend_changed = frontend_changed,
+};
+
+int xenvif_xenbus_init(void)
+{
+	return xenbus_register_backend(&netback);
+}
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index da1f121..a6ab973 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -359,7 +359,7 @@ static void xennet_tx_buf_gc(struct net_device *dev)
 			struct xen_netif_tx_response *txrsp;

 			txrsp = RING_GET_RESPONSE(&np->tx, cons);
-			if (txrsp->status == NETIF_RSP_NULL)
+			if (txrsp->status == XEN_NETIF_RSP_NULL)
 				continue;

 			id  = txrsp->id;
@@ -416,7 +416,7 @@ static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
 	   larger than a page), split it it into page-sized chunks. */
 	while (len > PAGE_SIZE - offset) {
 		tx->size = PAGE_SIZE - offset;
-		tx->flags |= NETTXF_more_data;
+		tx->flags |= XEN_NETTXF_more_data;
 		len -= tx->size;
 		data += tx->size;
 		offset = 0;
@@ -442,7 +442,7 @@ static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
 	for (i = 0; i < frags; i++) {
 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;

-		tx->flags |= NETTXF_more_data;
+		tx->flags |= XEN_NETTXF_more_data;

 		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
 		np->tx_skbs[id].skb = skb_get(skb);
@@ -517,10 +517,10 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx->flags = 0;
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		/* local packet? */
-		tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
+		tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated;
 	else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
 		/* remote but checksummed. */
-		tx->flags |= NETTXF_data_validated;
+		tx->flags |= XEN_NETTXF_data_validated;

 	if (skb_shinfo(skb)->gso_size) {
 		struct xen_netif_extra_info *gso;
@@ -531,7 +531,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (extra)
 			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
 		else
-			tx->flags |= NETTXF_extra_info;
+			tx->flags |= XEN_NETTXF_extra_info;

 		gso->u.gso.size = skb_shinfo(skb)->gso_size;
 		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
@@ -651,7 +651,7 @@ static int xennet_get_responses(struct netfront_info *np,
 	int err = 0;
 	unsigned long ret;

-	if (rx->flags & NETRXF_extra_info) {
+	if (rx->flags & XEN_NETRXF_extra_info) {
 		err = xennet_get_extras(np, extras, rp);
 		cons = np->rx.rsp_cons;
 	}
@@ -688,7 +688,7 @@ static int xennet_get_responses(struct netfront_info *np,
 		__skb_queue_tail(list, skb);

 next:
-		if (!(rx->flags & NETRXF_more_data))
+		if (!(rx->flags & XEN_NETRXF_more_data))
 			break;

 		if (cons + frags == rp) {
@@ -983,9 +983,9 @@ err:
 		skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
 		skb->len += skb->data_len;

-		if (rx->flags & NETRXF_csum_blank)
+		if (rx->flags & XEN_NETRXF_csum_blank)
 			skb->ip_summed = CHECKSUM_PARTIAL;
-		else if (rx->flags & NETRXF_data_validated)
+		else if (rx->flags & XEN_NETRXF_data_validated)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;

 		__skb_queue_tail(&rxq, skb);
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 3a5a6fc..492b7d8 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -243,7 +243,7 @@ struct pci_ops pcifront_bus_ops = {

 #ifdef CONFIG_PCI_MSI
 static int pci_frontend_enable_msix(struct pci_dev *dev,
-				    int **vector, int nvec)
+				    int vector[], int nvec)
 {
 	int err;
 	int i;
@@ -277,18 +277,24 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
 	if (likely(!err)) {
 		if (likely(!op.value)) {
 			/* we get the result */
-			for (i = 0; i < nvec; i++)
-				*(*vector+i) = op.msix_entries[i].vector;
-			return 0;
+			for (i = 0; i < nvec; i++) {
+				if (op.msix_entries[i].vector <= 0) {
+					dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
+						i, op.msix_entries[i].vector);
+					err = -EINVAL;
+					vector[i] = -1;
+					continue;
+				}
+				vector[i] = op.msix_entries[i].vector;
+			}
 		} else {
 			printk(KERN_DEBUG "enable msix get value %x\n",
 				op.value);
-			return op.value;
 		}
 	} else {
 		dev_err(&dev->dev, "enable msix get err %x\n", err);
-		return err;
 	}
+	return err;
 }

 static void pci_frontend_disable_msix(struct pci_dev *dev)
@@ -310,7 +316,7 @@ static void pci_frontend_disable_msix(struct pci_dev *dev)
 		dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
 }

-static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[])
 {
 	int err;
 	struct xen_pci_op op = {
@@ -324,7 +330,13 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)

 	err = do_pci_op(pdev, &op);
 	if (likely(!err)) {
-		*(*vector) = op.value;
+		vector[0] = op.value;
+		if (op.value <= 0) {
+			dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
+				op.value);
+			err = -EINVAL;
+			vector[0] = -1;
+		}
 	} else {
 		dev_err(&dev->dev, "pci frontend enable msi failed for dev "
 				    "%x:%x\n", op.bus, op.devfn);
@@ -733,8 +745,7 @@ static void free_pdev(struct pcifront_device *pdev)

 	pcifront_free_roots(pdev);

-	/*For PCIE_AER error handling job*/
-	flush_scheduled_work();
+	cancel_work_sync(&pdev->op_work);

 	if (pdev->irq >= 0)
 		unbind_from_irqhandler(pdev->irq, pdev);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 07bec09..e5ecae6 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -37,6 +37,79 @@ config XEN_BACKEND
 	  Support for backend device drivers that provide I/O services
 	  to other virtual machines.

+config XEN_BLKDEV_BACKEND
+	tristate "Block-device backend driver"
+	depends on XEN_BACKEND && BLOCK
+	help
+	  The block-device backend driver allows the kernel to export its
+	  block devices to other guests via a high-performance shared-memory
+	  interface.
+
+config XEN_PCIDEV_BACKEND
+	tristate "PCI-device backend driver"
+	depends on PCI
+	depends on XEN_BACKEND
+	help
+	  The PCI device backend driver allows the kernel to export arbitrary
+	  PCI devices to other guests. If you select this to be a module, you
+	  will need to make sure no other driver has bound to the device(s)
+	  you want to make visible to other guests.
+
+choice
+	prompt "PCI Backend Mode"
+	depends on XEN_PCIDEV_BACKEND
+	default XEN_PCIDEV_BACKEND_VPCI if !IA64
+	default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
+
+config XEN_PCIDEV_BACKEND_VPCI
+	bool "Virtual PCI"
+	---help---
+	  This PCI Backend hides the true PCI topology and makes the frontend
+	  think there is a single PCI bus with only the exported devices on it.
+	  For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
+	  second device at 02:1a.1 will be re-assigned to 00:01.1.
+
+config XEN_PCIDEV_BACKEND_PASS
+	bool "Passthrough"
+	---help---
+	  This PCI Backend provides a real view of the PCI topology to the
+	  frontend (for example, a device at 06:01.b will still appear at
+	  06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
+	  PCI devices to its driver domains. This may be required for drivers
+	  which depend on finding their hardward in certain bus/slot
+	  locations.
+
+config XEN_PCIDEV_BACKEND_SLOT
+	bool "Slot"
+	---help---
+	  This PCI Backend hides the true PCI topology and makes the frontend
+	  think there is a single PCI bus with only the exported devices on it.
+	  Contrary to the virtual PCI backend, a function becomes a new slot.
+	  For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
+	  second device at 02:1a.1 will be re-assigned to 00:01.0.
+
+config XEN_PCIDEV_BACKEND_CONTROLLER
+	bool "Controller"
+	depends on IA64
+	---help---
+	  This PCI backend virtualizes the PCI bus topology by providing a
+	  virtual bus per PCI root device.  Devices which are physically under
+	  the same root bus will appear on the same virtual bus.  For systems
+	  with complex I/O addressing, this is the only backend which supports
+	  extended I/O port spaces and MMIO translation offsets.  This backend
+	  also supports slot virtualization.  For example, a device at
+	  0000:01:02.1 will be re-assigned to 0000:00:00.0.  A second device
+	  at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
+	  re-assigned to 0000:00:01.0.  A third device at 0000:16:05.0 (under
+	  a different PCI root bus) will be re-assigned to 0000:01:00.0.
+
+endchoice
+
+config XEN_PCIDEV_BE_DEBUG
+	bool "PCI Backend Debugging"
+	depends on XEN_PCIDEV_BACKEND
+
+
 config XENFS
 	tristate "Xen filesystem"
 	default y
@@ -76,10 +149,20 @@ config XEN_XENBUS_FRONTEND
 config XEN_GNTDEV
 	tristate "userspace grant access device driver"
 	depends on XEN
+	default m
 	select MMU_NOTIFIER
 	help
 	  Allows userspace processes to use grants.

+config XEN_GRANT_DEV_ALLOC
+	tristate "User-space grant reference allocator driver"
+	depends on XEN
+	default m
+	help
+	  Allows userspace processes to create pages with access granted
+	  to other domains. This can be used to implement frontend drivers
+	  or as part of an inter-domain shared memory channel.
+
 config XEN_PLATFORM_PCI
 	tristate "xen platform pci device driver"
 	depends on XEN_PVHVM && PCI
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 5088cc2..c1cb873 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,7 +9,10 @@ obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
 obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
 obj-$(CONFIG_XEN_DEV_EVTCHN)	+= xen-evtchn.o
+obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= pciback/
+obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= blkback/
 obj-$(CONFIG_XEN_GNTDEV)	+= xen-gntdev.o
+obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)	+= xen-gntalloc.o
 obj-$(CONFIG_XENFS)		+= xenfs/
 obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
 obj-$(CONFIG_XEN_PLATFORM_PCI)	+= xen-platform-pci.o
@@ -18,5 +21,6 @@ obj-$(CONFIG_XEN_DOM0)		+= pci.o

 xen-evtchn-y			:= evtchn.o
 xen-gntdev-y				:= gntdev.o
+xen-gntalloc-y				:= gntalloc.o

 xen-platform-pci-y		:= platform-pci.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 43f9f02..718050a 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -232,7 +232,7 @@ static int increase_reservation(unsigned long nr_pages)
 		set_phys_to_machine(pfn, frame_list[i]);

 		/* Link back into the page tables if not highmem. */
-		if (pfn < max_low_pfn) {
+		if (!xen_hvm_domain() && pfn < max_low_pfn) {
 			int ret;
 			ret = HYPERVISOR_update_va_mapping(
 				(unsigned long)__va(pfn << PAGE_SHIFT),
@@ -280,7 +280,7 @@ static int decrease_reservation(unsigned long nr_pages)

 		scrub_page(page);

-		if (!PageHighMem(page)) {
+		if (!xen_hvm_domain() && !PageHighMem(page)) {
 			ret = HYPERVISOR_update_va_mapping(
 				(unsigned long)__va(pfn << PAGE_SHIFT),
 				__pte_ma(0), 0);
@@ -296,7 +296,7 @@ static int decrease_reservation(unsigned long nr_pages)
 	/* No more mappings: invalidate P2M and add to balloon. */
 	for (i = 0; i < nr_pages; i++) {
 		pfn = mfn_to_pfn(frame_list[i]);
-		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 		balloon_append(pfn_to_page(pfn));
 	}

@@ -392,15 +392,19 @@ static struct notifier_block xenstore_notifier;

 static int __init balloon_init(void)
 {
-	unsigned long pfn, extra_pfn_end;
+ 	unsigned long pfn, nr_pages, extra_pfn_end;
 	struct page *page;

-	if (!xen_pv_domain())
+	if (!xen_domain())
 		return -ENODEV;

 	pr_info("xen_balloon: Initialising balloon driver.\n");

-	balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
+ 	if (xen_pv_domain())
+ 		nr_pages = xen_start_info->nr_pages;
+ 	else
+ 		nr_pages = max_pfn;
+ 	balloon_stats.current_pages = min(nr_pages, max_pfn);
 	balloon_stats.target_pages  = balloon_stats.current_pages;
 	balloon_stats.balloon_low   = 0;
 	balloon_stats.balloon_high  = 0;
diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
new file mode 100644
index 0000000..f1ae1ff
--- /dev/null
+++ b/drivers/xen/blkback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+
+xen-blkback-y	:= blkback.o xenbus.o interface.o vbd.o
diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
new file mode 100644
index 0000000..15790ae
--- /dev/null
+++ b/drivers/xen/blkback/blkback.c
@@ -0,0 +1,708 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ *  arch/xen/drivers/blkif/frontend
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+
+#include <xen/events.h>
+#include <xen/page.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ *
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ *
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+	blkif_t       *blkif;
+	u64            id;
+	int            nr_pages;
+	atomic_t       pendcnt;
+	unsigned short operation;
+	int            status;
+	struct list_head free_list;
+} pending_req_t;
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+struct xen_blkbk {
+	pending_req_t	*pending_reqs;
+	struct list_head	pending_free;
+	spinlock_t		pending_free_lock;
+	wait_queue_head_t	pending_free_wq;
+	struct page		**pending_pages;
+	grant_handle_t		*pending_grant_handles;
+};
+
+static struct xen_blkbk *blkbk;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+	return (req - blkbk->pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+	unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
+	return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+#define pending_handle(_req, _seg) \
+	(blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 struct blkif_request *req,
+				 pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+			  unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+	pending_req_t *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
+	if (!list_empty(&blkbk->pending_free)) {
+		req = list_entry(blkbk->pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+	return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
+	was_empty = list_empty(&blkbk->pending_free);
+	list_add(&req->free_list, &blkbk->pending_free);
+	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&blkbk->pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+	if (blkif->plug == NULL)
+		return;
+	if (blkif->plug->unplug_fn)
+		blkif->plug->unplug_fn(blkif->plug);
+	blk_put_queue(blkif->plug);
+	blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q == blkif->plug)
+		return;
+	unplug_queue(blkif);
+	blk_get_queue(q);
+	blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int i, invcount = 0;
+	grant_handle_t handle;
+	int ret;
+
+	for (i = 0; i < req->nr_pages; i++) {
+		handle = pending_handle(req, i);
+		if (handle == BLKBACK_INVALID_HANDLE)
+			continue;
+		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+				    GNTMAP_host_map, handle);
+		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+		invcount++;
+	}
+
+	ret = HYPERVISOR_grant_table_op(
+		GNTTABOP_unmap_grant_ref, unmap, invcount);
+	BUG_ON(ret);
+	/* Note, we use invcount, so nr->pages, so we can't index
+	 * using vaddr(req, i). */
+	for (i = 0; i < invcount; i++) {
+		ret = m2p_remove_override(
+			virt_to_page(unmap[i].host_addr), false);
+		if (ret) {
+			printk(KERN_ALERT "Failed to remove M2P override for " \
+				"%lx\n", (unsigned long)unmap[i].host_addr);
+			continue;
+		}
+	}
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(blkif_t *blkif)
+{
+	printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d  |  br %4d\n",
+	       current->comm, blkif->st_oo_req,
+	       blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
+	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+	blkif->st_rd_req = 0;
+	blkif->st_wr_req = 0;
+	blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+	blkif_t *blkif = arg;
+	struct vbd *vbd = &blkif->vbd;
+
+	blkif_get(blkif);
+
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: started\n", current->comm);
+
+	while (!kthread_should_stop()) {
+		if (try_to_freeze())
+			continue;
+		if (unlikely(vbd->size != vbd_size(vbd)))
+			vbd_resize(blkif);
+
+		wait_event_interruptible(
+			blkif->wq,
+			blkif->waiting_reqs || kthread_should_stop());
+		wait_event_interruptible(
+			blkbk->pending_free_wq,
+			!list_empty(&blkbk->pending_free) || kthread_should_stop());
+
+		blkif->waiting_reqs = 0;
+		smp_mb(); /* clear flag *before* checking for work */
+
+		if (do_block_io_op(blkif))
+			blkif->waiting_reqs = 1;
+		unplug_queue(blkif);
+
+		if (log_stats && time_after(jiffies, blkif->st_print))
+			print_stats(blkif);
+	}
+
+	if (log_stats)
+		print_stats(blkif);
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+	blkif->xenblkd = NULL;
+	blkif_put(blkif);
+
+	return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int error)
+{
+	/* An error fails the entire request. */
+	if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+	    (error == -EOPNOTSUPP)) {
+		DPRINTK("blkback: write barrier op failed, not supported\n");
+		blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
+		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (error) {
+		DPRINTK("Buffer not up-to-date at end of operation, "
+			"error=%d\n", error);
+		pending_req->status = BLKIF_RSP_ERROR;
+	}
+
+	if (atomic_dec_and_test(&pending_req->pendcnt)) {
+		fast_flush_area(pending_req);
+		make_response(pending_req->blkif, pending_req->id,
+			      pending_req->operation, pending_req->status);
+		blkif_put(pending_req->blkif);
+		free_req(pending_req);
+	}
+}
+
+static void end_block_io_op(struct bio *bio, int error)
+{
+	__end_block_io_op(bio->bi_private, error);
+	bio_put(bio);
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+static void blkif_notify_work(blkif_t *blkif)
+{
+	blkif->waiting_reqs = 1;
+	wake_up(&blkif->wq);
+}
+
+irqreturn_t blkif_be_int(int irq, void *dev_id)
+{
+	blkif_notify_work(dev_id);
+	return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif)
+{
+	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	struct blkif_request req;
+	pending_req_t *pending_req;
+	RING_IDX rc, rp;
+	int more_to_do = 0;
+
+	rc = blk_rings->common.req_cons;
+	rp = blk_rings->common.sring->req_prod;
+	rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+	while (rc != rp) {
+
+		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+			break;
+
+		if (kthread_should_stop()) {
+			more_to_do = 1;
+			break;
+		}
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
+			more_to_do = 1;
+			break;
+		}
+
+		switch (blkif->blk_protocol) {
+		case BLKIF_PROTOCOL_NATIVE:
+			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+			break;
+		case BLKIF_PROTOCOL_X86_32:
+			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+			break;
+		case BLKIF_PROTOCOL_X86_64:
+			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+			break;
+		default:
+			BUG();
+		}
+		blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+		/* Apply all sanity checks to /private copy/ of request. */
+		barrier();
+
+		switch (req.operation) {
+		case BLKIF_OP_READ:
+			blkif->st_rd_req++;
+			dispatch_rw_block_io(blkif, &req, pending_req);
+			break;
+		case BLKIF_OP_WRITE_BARRIER:
+			blkif->st_br_req++;
+			/* fall through */
+		case BLKIF_OP_WRITE:
+			blkif->st_wr_req++;
+			dispatch_rw_block_io(blkif, &req, pending_req);
+			break;
+		default:
+			/* A good sign something is wrong: sleep for a while to
+			 * avoid excessive CPU consumption by a bad guest. */
+			msleep(1);
+			DPRINTK("error: unknown block io operation [%d]\n",
+				req.operation);
+			make_response(blkif, req.id, req.operation,
+				      BLKIF_RSP_ERROR);
+			free_req(pending_req);
+			break;
+		}
+
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+	}
+
+	return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 struct blkif_request *req,
+				 pending_req_t *pending_req)
+{
+	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct phys_req preq;
+	struct {
+		unsigned long buf; unsigned int nsec;
+	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int nseg;
+	struct bio *bio = NULL;
+	int ret, i;
+	int operation;
+
+	switch (req->operation) {
+	case BLKIF_OP_READ:
+		operation = READ;
+		break;
+	case BLKIF_OP_WRITE:
+		operation = WRITE;
+		break;
+	case BLKIF_OP_WRITE_BARRIER:
+		operation = REQ_FLUSH | REQ_FUA;
+		break;
+	default:
+		operation = 0; /* make gcc happy */
+		BUG();
+	}
+
+	/* Check that number of segments is sane. */
+	nseg = req->nr_segments;
+	if (unlikely(nseg == 0 && operation != (REQ_FLUSH | REQ_FUA)) ||
+	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+		DPRINTK("Bad number of segments in request (%d)\n", nseg);
+		goto fail_response;
+	}
+
+	preq.dev           = req->handle;
+	preq.sector_number = req->u.rw.sector_number;
+	preq.nr_sects      = 0;
+
+	pending_req->blkif     = blkif;
+	pending_req->id        = req->id;
+	pending_req->operation = req->operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_pages  = nseg;
+
+	for (i = 0; i < nseg; i++) {
+		uint32_t flags;
+
+		seg[i].nsec = req->u.rw.seg[i].last_sect -
+			req->u.rw.seg[i].first_sect + 1;
+
+		if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
+			goto fail_response;
+		preq.nr_sects += seg[i].nsec;
+
+		flags = GNTMAP_host_map;
+		if (operation != READ)
+			flags |= GNTMAP_readonly;
+		gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+				  req->u.rw.seg[i].gref, blkif->domid);
+	}
+
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+	BUG_ON(ret);
+
+	for (i = 0; i < nseg; i++) {
+		if (unlikely(map[i].status != 0)) {
+			DPRINTK("invalid buffer -- could not remap it\n");
+			map[i].handle = BLKBACK_INVALID_HANDLE;
+			ret |= 1;
+		}
+
+		pending_handle(pending_req, i) = map[i].handle;
+
+		if (ret)
+			continue;
+
+		ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
+			blkbk->pending_page(pending_req, i), false);
+		if (ret) {
+			printk(KERN_ALERT "Failed to install M2P override for"\
+				" %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret);
+			continue;
+		}
+
+		seg[i].buf  = map[i].dev_bus_addr |
+			(req->u.rw.seg[i].first_sect << 9);
+	}
+
+	if (ret)
+		goto fail_flush;
+
+	if (vbd_translate(&preq, blkif, operation) != 0) {
+		DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+			operation == READ ? "read" : "write",
+			preq.sector_number,
+			preq.sector_number + preq.nr_sects, preq.dev);
+		goto fail_flush;
+	}
+
+	plug_queue(blkif, preq.bdev);
+	atomic_set(&pending_req->pendcnt, 1);
+	blkif_get(blkif);
+
+	for (i = 0; i < nseg; i++) {
+		if (((int)preq.sector_number|(int)seg[i].nsec) &
+		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+			DPRINTK("Misaligned I/O request from domain %d",
+				blkif->domid);
+			goto fail_put_bio;
+		}
+
+		while ((bio == NULL) ||
+		       (bio_add_page(bio,
+				     blkbk->pending_page(pending_req, i),
+				     seg[i].nsec << 9,
+				     seg[i].buf & ~PAGE_MASK) == 0)) {
+			if (bio) {
+				atomic_inc(&pending_req->pendcnt);
+				submit_bio(operation, bio);
+			}
+
+			bio = bio_alloc(GFP_KERNEL, nseg-i);
+			if (unlikely(bio == NULL))
+				goto fail_put_bio;
+
+			bio->bi_bdev    = preq.bdev;
+			bio->bi_private = pending_req;
+			bio->bi_end_io  = end_block_io_op;
+			bio->bi_sector  = preq.sector_number;
+		}
+
+		preq.sector_number += seg[i].nsec;
+	}
+
+	if (!bio) {
+		BUG_ON(operation != (REQ_FLUSH | REQ_FUA));
+		bio = bio_alloc(GFP_KERNEL, 0);
+		if (unlikely(bio == NULL))
+			goto fail_put_bio;
+
+		bio->bi_bdev    = preq.bdev;
+		bio->bi_private = pending_req;
+		bio->bi_end_io  = end_block_io_op;
+		bio->bi_sector  = -1;
+	}
+
+	submit_bio(operation, bio);
+
+	if (operation == READ)
+		blkif->st_rd_sect += preq.nr_sects;
+	else if (operation == WRITE || operation == (REQ_FLUSH | REQ_FUA))
+		blkif->st_wr_sect += preq.nr_sects;
+
+	return;
+
+ fail_flush:
+	fast_flush_area(pending_req);
+ fail_response:
+	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	free_req(pending_req);
+	msleep(1); /* back off a bit */
+	return;
+
+ fail_put_bio:
+	__end_block_io_op(pending_req, -EINVAL);
+	if (bio)
+		bio_put(bio);
+	unplug_queue(blkif);
+	msleep(1); /* back off a bit */
+	return;
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, u64 id,
+			  unsigned short op, int st)
+{
+	struct blkif_response  resp;
+	unsigned long     flags;
+	union blkif_back_rings *blk_rings = &blkif->blk_rings;
+	int more_to_do = 0;
+	int notify;
+
+	resp.id        = id;
+	resp.operation = op;
+	resp.status    = st;
+
+	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+	/* Place on the response ring for the relevant domain. */
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	case BLKIF_PROTOCOL_X86_32:
+		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	case BLKIF_PROTOCOL_X86_64:
+		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+		       &resp, sizeof(resp));
+		break;
+	default:
+		BUG();
+	}
+	blk_rings->common.rsp_prod_pvt++;
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+	if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
+		/*
+		 * Tail check for pending requests. Allows frontend to avoid
+		 * notifications if requests are already in flight (lower
+		 * overheads and promotes batching).
+		 */
+		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+
+	} else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+		more_to_do = 1;
+	}
+
+	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+	if (more_to_do)
+		blkif_notify_work(blkif);
+	if (notify)
+		notify_remote_via_irq(blkif->irq);
+}
+
+static int __init blkif_init(void)
+{
+	int i, mmap_pages;
+	int rc = 0;
+
+	if (!xen_pv_domain())
+		return -ENODEV;
+
+	blkbk = (struct xen_blkbk *)vmalloc(sizeof(struct xen_blkbk));
+	if (!blkbk) {
+		printk(KERN_ALERT "%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+
+	mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	blkbk->pending_reqs          = kmalloc(sizeof(blkbk->pending_reqs[0]) *
+					blkif_reqs, GFP_KERNEL);
+	blkbk->pending_grant_handles = vzalloc(sizeof(blkbk->pending_grant_handles[0]) *
+					mmap_pages);
+	blkbk->pending_pages         = vzalloc(sizeof(blkbk->pending_pages[0]) * mmap_pages);
+
+	if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || !blkbk->pending_pages) {
+		rc = -ENOMEM;
+		goto out_of_memory;
+	}
+
+	for (i = 0; i < mmap_pages; i++) {
+		blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+		blkbk->pending_pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (blkbk->pending_pages[i] == NULL) {
+			rc = -ENOMEM;
+			goto out_of_memory;
+		}
+	}
+	rc = blkif_interface_init();
+	if (rc)
+		goto failed_init;
+
+	memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs));
+
+	INIT_LIST_HEAD(&blkbk->pending_free);
+	spin_lock_init(&blkbk->pending_free_lock);
+	init_waitqueue_head(&blkbk->pending_free_wq);
+
+	for (i = 0; i < blkif_reqs; i++)
+		list_add_tail(&blkbk->pending_reqs[i].free_list, &blkbk->pending_free);
+
+	rc = blkif_xenbus_init();
+	if (rc)
+		goto failed_init;
+
+	return 0;
+
+ out_of_memory:
+	printk(KERN_ERR "%s: out of memory\n", __func__);
+ failed_init:
+	kfree(blkbk->pending_reqs);
+	vfree(blkbk->pending_grant_handles);
+	for (i = 0; i < mmap_pages; i++) {
+		if (blkbk->pending_pages[i])
+			__free_page(blkbk->pending_pages[i]);
+	}
+	vfree(blkbk->pending_pages);
+	vfree(blkbk);
+	blkbk = NULL;
+	return rc;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
new file mode 100644
index 0000000..0f91830
--- /dev/null
+++ b/drivers/xen/blkback/common.h
@@ -0,0 +1,141 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+
+#define DPRINTK(_f, _a...)			\
+	pr_debug("(file=%s, line=%d) " _f,	\
+		 __FILE__ , __LINE__ , ## _a )
+
+struct vbd {
+	blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
+	unsigned char  readonly;    /* Non-zero -> read-only */
+	unsigned char  type;        /* VDISK_xxx */
+	u32            pdevice;     /* phys device that this vbd maps to */
+	struct block_device *bdev;
+	sector_t       size;        /* Cached size parameter */
+};
+
+struct backend_info;
+
+typedef struct blkif_st {
+	/* Unique identifier for this interface. */
+	domid_t           domid;
+	unsigned int      handle;
+	/* Physical parameters of the comms window. */
+	unsigned int      irq;
+	/* Comms information. */
+	enum blkif_protocol blk_protocol;
+	union blkif_back_rings blk_rings;
+	struct vm_struct *blk_ring_area;
+	/* The VBD attached to this interface. */
+	struct vbd        vbd;
+	/* Back pointer to the backend_info. */
+	struct backend_info *be;
+	/* Private fields. */
+	spinlock_t       blk_ring_lock;
+	atomic_t         refcnt;
+
+	wait_queue_head_t   wq;
+	struct task_struct  *xenblkd;
+	unsigned int        waiting_reqs;
+	struct request_queue     *plug;
+
+	/* statistics */
+	unsigned long       st_print;
+	int                 st_rd_req;
+	int                 st_wr_req;
+	int                 st_oo_req;
+	int                 st_br_req;
+	int                 st_rd_sect;
+	int                 st_wr_sect;
+
+	wait_queue_head_t waiting_to_free;
+
+	grant_handle_t shmem_handle;
+	grant_ref_t    shmem_ref;
+} blkif_t;
+
+blkif_t *blkif_alloc(domid_t domid);
+void blkif_disconnect(blkif_t *blkif);
+void blkif_free(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
+void vbd_resize(blkif_t *blkif);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)					\
+	do {						\
+		if (atomic_dec_and_test(&(_b)->refcnt))	\
+			wake_up(&(_b)->waiting_to_free);\
+	} while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
+	       unsigned minor, int readonly, int cdrom);
+void vbd_free(struct vbd *vbd);
+
+unsigned long long vbd_size(struct vbd *vbd);
+unsigned int vbd_info(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+struct phys_req {
+	unsigned short       dev;
+	unsigned short       nr_sects;
+	struct block_device *bdev;
+	blkif_sector_t       sector_number;
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
+
+int blkif_interface_init(void);
+
+int blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id);
+int blkif_schedule(void *arg);
+
+int blkback_barrier(struct xenbus_transaction xbt,
+		    struct backend_info *be, int state);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
new file mode 100644
index 0000000..e397a41
--- /dev/null
+++ b/drivers/xen/blkback/interface.c
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <linux/kthread.h>
+
+static struct kmem_cache *blkif_cachep;
+
+blkif_t *blkif_alloc(domid_t domid)
+{
+	blkif_t *blkif;
+
+	blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+	if (!blkif)
+		return ERR_PTR(-ENOMEM);
+
+	memset(blkif, 0, sizeof(*blkif));
+	blkif->domid = domid;
+	spin_lock_init(&blkif->blk_ring_lock);
+	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
+	init_waitqueue_head(&blkif->waiting_to_free);
+
+	return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
+{
+	struct gnttab_map_grant_ref op;
+
+	gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			  GNTMAP_host_map, shared_page, blkif->domid);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+		BUG();
+
+	if (op.status) {
+		DPRINTK(" Grant table operation failure !\n");
+		return op.status;
+	}
+
+	blkif->shmem_ref = shared_page;
+	blkif->shmem_handle = op.handle;
+
+	return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+	struct gnttab_unmap_grant_ref op;
+
+	gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
+			    GNTMAP_host_map, blkif->shmem_handle);
+
+	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+		BUG();
+}
+
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+	int err;
+
+	/* Already connected through? */
+	if (blkif->irq)
+		return 0;
+
+	if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
+		return -ENOMEM;
+
+	err = map_frontend_page(blkif, shared_page);
+	if (err) {
+		free_vm_area(blkif->blk_ring_area);
+		return err;
+	}
+
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+	{
+		struct blkif_sring *sring;
+		sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_32:
+	{
+		struct blkif_x86_32_sring *sring_x86_32;
+		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_64:
+	{
+		struct blkif_x86_64_sring *sring_x86_64;
+		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
+		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+		break;
+	}
+	default:
+		BUG();
+	}
+
+	err = bind_interdomain_evtchn_to_irqhandler(
+		blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+	if (err < 0)
+	{
+		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+		return err;
+	}
+	blkif->irq = err;
+
+	return 0;
+}
+
+void blkif_disconnect(blkif_t *blkif)
+{
+	if (blkif->xenblkd) {
+		kthread_stop(blkif->xenblkd);
+		blkif->xenblkd = NULL;
+	}
+
+	atomic_dec(&blkif->refcnt);
+	wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+	atomic_inc(&blkif->refcnt);
+
+	if (blkif->irq) {
+		unbind_from_irqhandler(blkif->irq, blkif);
+		blkif->irq = 0;
+	}
+
+	if (blkif->blk_rings.common.sring) {
+		unmap_frontend_page(blkif);
+		free_vm_area(blkif->blk_ring_area);
+		blkif->blk_rings.common.sring = NULL;
+	}
+}
+
+void blkif_free(blkif_t *blkif)
+{
+	if (!atomic_dec_and_test(&blkif->refcnt))
+		BUG();
+	kmem_cache_free(blkif_cachep, blkif);
+}
+
+int __init blkif_interface_init(void)
+{
+	blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
+					 0, 0, NULL);
+	if (!blkif_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
new file mode 100644
index 0000000..8c91a2f
--- /dev/null
+++ b/drivers/xen/blkback/vbd.c
@@ -0,0 +1,163 @@
+/******************************************************************************
+ * blkback/vbd.c
+ *
+ * Routines for managing virtual block devices (VBDs).
+ *
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define vbd_sz(_v)   ((_v)->bdev->bd_part ?				\
+		      (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
+
+unsigned long long vbd_size(struct vbd *vbd)
+{
+	return vbd_sz(vbd);
+}
+
+unsigned int vbd_info(struct vbd *vbd)
+{
+	return vbd->type | (vbd->readonly?VDISK_READONLY:0);
+}
+
+unsigned long vbd_secsize(struct vbd *vbd)
+{
+	return bdev_logical_block_size(vbd->bdev);
+}
+
+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
+	       unsigned minor, int readonly, int cdrom)
+{
+	struct vbd *vbd;
+	struct block_device *bdev;
+
+	vbd = &blkif->vbd;
+	vbd->handle   = handle;
+	vbd->readonly = readonly;
+	vbd->type     = 0;
+
+	vbd->pdevice  = MKDEV(major, minor);
+
+	bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
+				 FMODE_READ : FMODE_WRITE, NULL);
+
+	if (IS_ERR(bdev)) {
+		DPRINTK("vbd_creat: device %08x could not be opened.\n",
+			vbd->pdevice);
+		return -ENOENT;
+	}
+
+	vbd->bdev = bdev;
+	vbd->size = vbd_size(vbd);
+
+	if (vbd->bdev->bd_disk == NULL) {
+		DPRINTK("vbd_creat: device %08x doesn't exist.\n",
+			vbd->pdevice);
+		vbd_free(vbd);
+		return -ENOENT;
+	}
+
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+		vbd->type |= VDISK_CDROM;
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+		vbd->type |= VDISK_REMOVABLE;
+
+	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+		handle, blkif->domid);
+	return 0;
+}
+
+void vbd_free(struct vbd *vbd)
+{
+	if (vbd->bdev)
+		blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
+	vbd->bdev = NULL;
+}
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
+{
+	struct vbd *vbd = &blkif->vbd;
+	int rc = -EACCES;
+
+	if ((operation != READ) && vbd->readonly)
+		goto out;
+
+	if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
+		goto out;
+
+	req->dev  = vbd->pdevice;
+	req->bdev = vbd->bdev;
+	rc = 0;
+
+ out:
+	return rc;
+}
+
+void vbd_resize(blkif_t *blkif)
+{
+	struct vbd *vbd = &blkif->vbd;
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = blkback_xenbus(blkif->be);
+	unsigned long long new_size = vbd_size(vbd);
+
+	printk(KERN_INFO "VBD Resize: Domid: %d, Device: (%d, %d)\n",
+		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
+	printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size);
+	vbd->size = new_size;
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		printk(KERN_WARNING "Error starting transaction");
+		return;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
+			    vbd_size(vbd));
+	if (err) {
+		printk(KERN_WARNING "Error writing new size");
+		goto abort;
+	}
+	/*
+	 * Write the current state; we will use this to synchronize
+	 * the front-end. If the current state is "connected" the
+	 * front-end will get the new size information online.
+	 */
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+	if (err) {
+		printk(KERN_WARNING "Error writing the state");
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		printk(KERN_WARNING "Error ending transaction");
+abort:
+	xenbus_transaction_end(xbt, 1);
+}
diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
new file mode 100644
index 0000000..031bc3d
--- /dev/null
+++ b/drivers/xen/blkback/xenbus.c
@@ -0,0 +1,559 @@
+/*  Xenbus code for blkif backend
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+    Copyright (C) 2005 XenSource Ltd
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include "common.h"
+
+#undef DPRINTK
+#define DPRINTK(fmt, args...)				\
+	pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",	\
+		 __FUNCTION__, __LINE__, ##args)
+
+struct backend_info
+{
+	struct xenbus_device *dev;
+	blkif_t *blkif;
+	struct xenbus_watch backend_watch;
+	unsigned major;
+	unsigned minor;
+	char *mode;
+};
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+			    unsigned int);
+
+struct xenbus_device *blkback_xenbus(struct backend_info *be)
+{
+	return be->dev;
+}
+
+static int blkback_name(blkif_t *blkif, char *buf)
+{
+	char *devpath, *devname;
+	struct xenbus_device *dev = blkif->be->dev;
+
+	devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+	if (IS_ERR(devpath))
+		return PTR_ERR(devpath);
+
+	if ((devname = strstr(devpath, "/dev/")) != NULL)
+		devname += strlen("/dev/");
+	else
+		devname  = devpath;
+
+	snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
+	kfree(devpath);
+
+	return 0;
+}
+
+static void update_blkif_status(blkif_t *blkif)
+{
+	int err;
+	char name[TASK_COMM_LEN];
+
+	/* Not ready to connect? */
+	if (!blkif->irq || !blkif->vbd.bdev)
+		return;
+
+	/* Already connected? */
+	if (blkif->be->dev->state == XenbusStateConnected)
+		return;
+
+	/* Attempt to connect: exit if we fail to. */
+	connect(blkif->be);
+	if (blkif->be->dev->state != XenbusStateConnected)
+		return;
+
+	err = blkback_name(blkif, name);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+		return;
+	}
+
+	err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "block flush");
+		return;
+	}
+	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+	blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
+	if (IS_ERR(blkif->xenblkd)) {
+		err = PTR_ERR(blkif->xenblkd);
+		blkif->xenblkd = NULL;
+		xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+	}
+}
+
+
+/****************************************************************
+ *  sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW(name, format, args...)					\
+	static ssize_t show_##name(struct device *_dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		struct xenbus_device *dev = to_xenbus_device(_dev);	\
+		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+									\
+		return sprintf(buf, format, ##args);			\
+	}								\
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(br_req,  "%d\n", be->blkif->st_br_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+
+static struct attribute *vbdstat_attrs[] = {
+	&dev_attr_oo_req.attr,
+	&dev_attr_rd_req.attr,
+	&dev_attr_wr_req.attr,
+	&dev_attr_br_req.attr,
+	&dev_attr_rd_sect.attr,
+	&dev_attr_wr_sect.attr,
+	NULL
+};
+
+static struct attribute_group vbdstat_group = {
+	.name = "statistics",
+	.attrs = vbdstat_attrs,
+};
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+	int error;
+
+	error = device_create_file(&dev->dev, &dev_attr_physical_device);
+ 	if (error)
+		goto fail1;
+
+	error = device_create_file(&dev->dev, &dev_attr_mode);
+	if (error)
+		goto fail2;
+
+	error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
+	if (error)
+		goto fail3;
+
+	return 0;
+
+fail3:	sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+fail2:	device_remove_file(&dev->dev, &dev_attr_mode);
+fail1:	device_remove_file(&dev->dev, &dev_attr_physical_device);
+	return error;
+}
+
+void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+	sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+	device_remove_file(&dev->dev, &dev_attr_mode);
+	device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+static int blkback_remove(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	DPRINTK("");
+
+	if (be->major || be->minor)
+		xenvbd_sysfs_delif(dev);
+
+	if (be->backend_watch.node) {
+		unregister_xenbus_watch(&be->backend_watch);
+		kfree(be->backend_watch.node);
+		be->backend_watch.node = NULL;
+	}
+
+	if (be->blkif) {
+		blkif_disconnect(be->blkif);
+		vbd_free(&be->blkif->vbd);
+		blkif_free(be->blkif);
+		be->blkif = NULL;
+	}
+
+	kfree(be);
+	dev_set_drvdata(&dev->dev, NULL);
+	return 0;
+}
+
+int blkback_barrier(struct xenbus_transaction xbt,
+		    struct backend_info *be, int state)
+{
+	struct xenbus_device *dev = be->dev;
+	int err;
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+			    "%d", state);
+	if (err)
+		xenbus_dev_fatal(dev, err, "writing feature-barrier");
+
+	return err;
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers.  Switch to InitWait.
+ */
+static int blkback_probe(struct xenbus_device *dev,
+			 const struct xenbus_device_id *id)
+{
+	int err;
+	struct backend_info *be = kzalloc(sizeof(struct backend_info),
+					  GFP_KERNEL);
+	if (!be) {
+		xenbus_dev_fatal(dev, -ENOMEM,
+				 "allocating backend structure");
+		return -ENOMEM;
+	}
+	be->dev = dev;
+	dev_set_drvdata(&dev->dev, be);
+
+	be->blkif = blkif_alloc(dev->otherend_id);
+	if (IS_ERR(be->blkif)) {
+		err = PTR_ERR(be->blkif);
+		be->blkif = NULL;
+		xenbus_dev_fatal(dev, err, "creating block interface");
+		goto fail;
+	}
+
+	/* setup back pointer */
+	be->blkif->be = be;
+
+	err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
+				   "%s/%s", dev->nodename, "physical-device");
+	if (err)
+		goto fail;
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	DPRINTK("failed");
+	blkback_remove(dev);
+	return err;
+}
+
+
+/**
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node.  Read it and the mode node, and create a vbd.  If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	int err;
+	unsigned major;
+	unsigned minor;
+	struct backend_info *be
+		= container_of(watch, struct backend_info, backend_watch);
+	struct xenbus_device *dev = be->dev;
+	int cdrom = 0;
+	char *device_type;
+
+	DPRINTK("");
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+			   &major, &minor);
+	if (XENBUS_EXIST_ERR(err)) {
+		/* Since this watch will fire once immediately after it is
+		   registered, we expect this.  Ignore it, and wait for the
+		   hotplug scripts. */
+		return;
+	}
+	if (err != 2) {
+		xenbus_dev_fatal(dev, err, "reading physical-device");
+		return;
+	}
+
+	if ((be->major || be->minor) &&
+	    ((be->major != major) || (be->minor != minor))) {
+		printk(KERN_WARNING
+		       "blkback: changing physical device (from %x:%x to "
+		       "%x:%x) not supported.\n", be->major, be->minor,
+		       major, minor);
+		return;
+	}
+
+	be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+	if (IS_ERR(be->mode)) {
+		err = PTR_ERR(be->mode);
+		be->mode = NULL;
+		xenbus_dev_fatal(dev, err, "reading mode");
+		return;
+	}
+
+	device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+	if (!IS_ERR(device_type)) {
+		cdrom = strcmp(device_type, "cdrom") == 0;
+		kfree(device_type);
+	}
+
+	if (be->major == 0 && be->minor == 0) {
+		/* Front end dir is a number, which is used as the handle. */
+
+		char *p = strrchr(dev->otherend, '/') + 1;
+		long handle = simple_strtoul(p, NULL, 0);
+
+		be->major = major;
+		be->minor = minor;
+
+		err = vbd_create(be->blkif, handle, major, minor,
+				 (NULL == strchr(be->mode, 'w')), cdrom);
+		if (err) {
+			be->major = be->minor = 0;
+			xenbus_dev_fatal(dev, err, "creating vbd structure");
+			return;
+		}
+
+		err = xenvbd_sysfs_addif(dev);
+		if (err) {
+			vbd_free(&be->blkif->vbd);
+			be->major = be->minor = 0;
+			xenbus_dev_fatal(dev, err, "creating sysfs entries");
+			return;
+		}
+
+		/* We're potentially connected now */
+		update_blkif_status(be->blkif);
+	}
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+	int err;
+
+	DPRINTK("%s", xenbus_strstate(frontend_state));
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		if (dev->state == XenbusStateClosed) {
+			printk(KERN_INFO "%s: %s: prepare for reconnect\n",
+			       __FUNCTION__, dev->nodename);
+			xenbus_switch_state(dev, XenbusStateInitWait);
+		}
+		break;
+
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		/* Ensure we connect even when two watches fire in
+		   close successsion and we miss the intermediate value
+		   of frontend_state. */
+		if (dev->state == XenbusStateConnected)
+			break;
+
+		/* Enforce precondition before potential leak point.
+		 * blkif_disconnect() is idempotent.
+		 */
+		blkif_disconnect(be->blkif);
+
+		err = connect_ring(be);
+		if (err)
+			break;
+		update_blkif_status(be->blkif);
+		break;
+
+	case XenbusStateClosing:
+		blkif_disconnect(be->blkif);
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		/* implies blkif_disconnect() via blkback_remove() */
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+}
+
+
+/* ** Connection ** */
+
+
+/**
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = be->dev;
+
+	DPRINTK("%s", dev->otherend);
+
+	/* Supply the information about the device the frontend needs */
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		return;
+	}
+
+	err = blkback_barrier(xbt, be, 1);
+	if (err)
+		goto abort;
+
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+			    vbd_size(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sectors",
+				 dev->nodename);
+		goto abort;
+	}
+
+	/* FIXME: use a typename instead */
+	err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+			    vbd_info(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/info",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+			    vbd_secsize(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+				 dev->nodename);
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		xenbus_dev_fatal(dev, err, "ending transaction");
+
+	err = xenbus_switch_state(dev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(dev, err, "switching to Connected state",
+				 dev->nodename);
+
+	return;
+ abort:
+	xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	unsigned long ring_ref;
+	unsigned int evtchn;
+	char protocol[64] = "";
+	int err;
+
+	DPRINTK("%s", dev->otherend);
+
+	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
+			    "event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err,
+				 "reading %s/ring-ref and event-channel",
+				 dev->otherend);
+		return err;
+	}
+
+	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+			    "%63s", protocol, NULL);
+	if (err)
+		strcpy(protocol, "unspecified, assuming native");
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+	else {
+		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+		return -1;
+	}
+	printk(KERN_INFO
+	       "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
+	       ring_ref, evtchn, be->blkif->blk_protocol, protocol);
+
+	/* Map the shared frame, irq etc. */
+	err = blkif_map(be->blkif, ring_ref, evtchn);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+				 ring_ref, evtchn);
+		return err;
+	}
+
+	return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blkback_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+
+static struct xenbus_driver blkback = {
+	.name = "vbd",
+	.owner = THIS_MODULE,
+	.ids = blkback_ids,
+	.probe = blkback_probe,
+	.remove = blkback_remove,
+	.otherend_changed = frontend_changed
+};
+
+
+int blkif_xenbus_init(void)
+{
+	return xenbus_register_backend(&blkback);
+}
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 7468147..8b7fc9a6 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -97,6 +97,7 @@ struct irq_info
 			unsigned short gsi;
 			unsigned char vector;
 			unsigned char flags;
+			uint16_t domid;
 		} pirq;
 	} u;
 };
@@ -114,7 +115,7 @@ struct cpu_evtchn_s {
 static __initdata struct cpu_evtchn_s init_evtchn_mask = {
 	.bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
 };
-static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask;

 static inline unsigned long *cpu_evtchn_mask(int cpu)
 {
@@ -153,11 +154,13 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
 }

 static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq,
-				    unsigned short gsi, unsigned short vector)
+				    unsigned short gsi, unsigned short vector,
+				    domid_t domid)
 {
 	return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
 			.cpu = 0,
-			.u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } };
+			.u.pirq = { .pirq = pirq, .gsi = gsi,
+				    .vector = vector, .domid = domid } };
 }

 /*
@@ -277,7 +280,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)

 	BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-	cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
+	cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu));
 #endif

 	clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
@@ -294,7 +297,7 @@ static void init_evtchn_cpu_bindings(void)

 	/* By default all event channels notify CPU#0. */
 	for_each_irq_desc(i, desc) {
-		cpumask_copy(desc->affinity, cpumask_of(0));
+		cpumask_copy(desc->irq_data.affinity, cpumask_of(0));
 	}
 #endif

@@ -376,81 +379,69 @@ static void unmask_evtchn(int port)
 	put_cpu();
 }

-static int get_nr_hw_irqs(void)
+static int xen_allocate_irq_dynamic(void)
 {
-	int ret = 1;
+	int first = 0;
+	int irq;

 #ifdef CONFIG_X86_IO_APIC
-	ret = get_nr_irqs_gsi();
+	/*
+	 * For an HVM guest or domain 0 which see "real" (emulated or
+	 * actual repectively) GSIs we allocate dynamic IRQs
+	 * e.g. those corresponding to event channels or MSIs
+	 * etc. from the range above those "real" GSIs to avoid
+	 * collisions.
+	 */
+	if (xen_initial_domain() || xen_hvm_domain())
+		first = get_nr_irqs_gsi();
 #endif

-	return ret;
-}
+retry:
+	irq = irq_alloc_desc_from(first, -1);

-static int find_unbound_pirq(int type)
-{
-	int rc, i;
-	struct physdev_get_free_pirq op_get_free_pirq;
-	op_get_free_pirq.type = type;
+	if (irq == -ENOMEM && first > NR_IRQS_LEGACY) {
+		printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n");
+		first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY);
+		goto retry;
+	}

-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
-	if (!rc)
-		return op_get_free_pirq.pirq;
+	if (irq < 0)
+		panic("No available IRQ to bind to: increase nr_irqs!\n");

-	for (i = 0; i < nr_irqs; i++) {
-		if (pirq_to_irq[i] < 0)
-			return i;
-	}
-	return -1;
+	return irq;
 }

-static int find_unbound_irq(void)
+static int xen_allocate_irq_gsi(unsigned gsi)
 {
-	struct irq_data *data;
-	int irq, res;
-	int bottom = get_nr_hw_irqs();
-	int top = nr_irqs-1;
-
-	if (bottom == nr_irqs)
-		goto no_irqs;
+	int irq;

-	/* This loop starts from the top of IRQ space and goes down.
-	 * We need this b/c if we have a PCI device in a Xen PV guest
-	 * we do not have an IO-APIC (though the backend might have them)
-	 * mapped in. To not have a collision of physical IRQs with the Xen
-	 * event channels start at the top of the IRQ space for virtual IRQs.
+	/*
+	 * A PV guest has no concept of a GSI (since it has no ACPI
+	 * nor access to/knowledge of the physical APICs). Therefore
+	 * all IRQs are dynamically allocated from the entire IRQ
+	 * space.
 	 */
-	for (irq = top; irq > bottom; irq--) {
-		data = irq_get_irq_data(irq);
-		/* only 15->0 have init'd desc; handle irq > 16 */
-		if (!data)
-			break;
-		if (data->chip == &no_irq_chip)
-			break;
-		if (data->chip != &xen_dynamic_chip)
-			continue;
-		if (irq_info[irq].type == IRQT_UNBOUND)
-			return irq;
-	}
+	if (xen_pv_domain() && !xen_initial_domain())
+		return xen_allocate_irq_dynamic();

-	if (irq == bottom)
-		goto no_irqs;
+	/* Legacy IRQ descriptors are already allocated by the arch. */
+	if (gsi < NR_IRQS_LEGACY)
+		return gsi;

-	res = irq_alloc_desc_at(irq, -1);
-
-	if (WARN_ON(res != irq))
-		return -1;
+	irq = irq_alloc_desc_at(gsi, -1);
+	if (irq < 0)
+		panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq);

 	return irq;
-
-no_irqs:
-	panic("No available IRQ to bind to: increase nr_irqs!\n");
 }

-static bool identity_mapped_irq(unsigned irq)
+static void xen_free_irq(unsigned irq)
 {
-	/* identity map all the hardware irqs */
-	return irq < get_nr_hw_irqs();
+	/* Legacy IRQ descriptors are managed by the arch. */
+	if (irq < NR_IRQS_LEGACY)
+		return;
+
+	irq_free_desc(irq);
 }

 static void pirq_unmask_notify(int irq)
@@ -486,7 +477,7 @@ static bool probing_irq(int irq)
 	return desc && desc->action == NULL;
 }

-static unsigned int startup_pirq(unsigned int irq)
+static unsigned int __startup_pirq(unsigned int irq)
 {
 	struct evtchn_bind_pirq bind_pirq;
 	struct irq_info *info = info_for_irq(irq);
@@ -524,9 +515,15 @@ out:
 	return 0;
 }

-static void shutdown_pirq(unsigned int irq)
+static unsigned int startup_pirq(struct irq_data *data)
+{
+	return __startup_pirq(data->irq);
+}
+
+static void shutdown_pirq(struct irq_data *data)
 {
 	struct evtchn_close close;
+	unsigned int irq = data->irq;
 	struct irq_info *info = info_for_irq(irq);
 	int evtchn = evtchn_from_irq(irq);

@@ -546,20 +543,20 @@ static void shutdown_pirq(unsigned int irq)
 	info->evtchn = 0;
 }

-static void enable_pirq(unsigned int irq)
+static void enable_pirq(struct irq_data *data)
 {
-	startup_pirq(irq);
+	startup_pirq(data);
 }

-static void disable_pirq(unsigned int irq)
+static void disable_pirq(struct irq_data *data)
 {
 }

-static void ack_pirq(unsigned int irq)
+static void ack_pirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);

-	move_native_irq(irq);
+	move_native_irq(data->irq);

 	if (VALID_EVTCHN(evtchn)) {
 		mask_evtchn(evtchn);
@@ -567,23 +564,6 @@ static void ack_pirq(unsigned int irq)
 	}
 }

-static void end_pirq(unsigned int irq)
-{
-	int evtchn = evtchn_from_irq(irq);
-	struct irq_desc *desc = irq_to_desc(irq);
-
-	if (WARN_ON(!desc))
-		return;
-
-	if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
-	    (IRQ_DISABLED|IRQ_PENDING)) {
-		shutdown_pirq(irq);
-	} else if (VALID_EVTCHN(evtchn)) {
-		unmask_evtchn(evtchn);
-		pirq_unmask_notify(irq);
-	}
-}
-
 static int find_irq_by_gsi(unsigned gsi)
 {
 	int irq;
@@ -638,14 +618,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
 		goto out;	/* XXX need refcount? */
 	}

-	/* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
-	 * we are using the !xen_initial_domain() to drop in the function.*/
-	if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
-				xen_pv_domain())) {
-		irq = gsi;
-		irq_alloc_desc_at(irq, -1);
-	} else
-		irq = find_unbound_irq();
+	irq = xen_allocate_irq_gsi(gsi);

 	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
 				      handle_level_irq, name);
@@ -658,12 +631,12 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
 	 * this in the priv domain. */
 	if (xen_initial_domain() &&
 	    HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
-		irq_free_desc(irq);
+		xen_free_irq(irq);
 		irq = -ENOSPC;
 		goto out;
 	}

-	irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector);
+	irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector, DOMID_SELF);
 	irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
 	pirq_to_irq[pirq] = irq;

@@ -674,87 +647,47 @@ out:
 }

 #ifdef CONFIG_PCI_MSI
-#include <linux/msi.h>
-#include "../pci/msi.h"
-
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
 {
-	spin_lock(&irq_mapping_update_lock);
-
-	if (alloc & XEN_ALLOC_IRQ) {
-		*irq = find_unbound_irq();
-		if (*irq == -1)
-			goto out;
-	}
-
-	if (alloc & XEN_ALLOC_PIRQ) {
-		*pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
-		if (*pirq == -1)
-			goto out;
-	}
+	int rc;
+	struct physdev_get_free_pirq op_get_free_pirq;

-	set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
-				      handle_level_irq, name);
+	op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);

-	irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
-	pirq_to_irq[*pirq] = *irq;
+	WARN_ONCE(rc == -ENOSYS,
+		  "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");

-out:
-	spin_unlock(&irq_mapping_update_lock);
+	return rc ? -1 : op_get_free_pirq.pirq;
 }

-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+			     int pirq, int vector, const char *name,
+			     domid_t domid)
 {
-	int irq = -1;
-	struct physdev_map_pirq map_irq;
-	int rc;
-	int pos;
-	u32 table_offset, bir;
-
-	memset(&map_irq, 0, sizeof(map_irq));
-	map_irq.domid = DOMID_SELF;
-	map_irq.type = MAP_PIRQ_TYPE_MSI;
-	map_irq.index = -1;
-	map_irq.pirq = -1;
-	map_irq.bus = dev->bus->number;
-	map_irq.devfn = dev->devfn;
-
-	if (type == PCI_CAP_ID_MSIX) {
-		pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-
-		pci_read_config_dword(dev, msix_table_offset_reg(pos),
-					&table_offset);
-		bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
-
-		map_irq.table_base = pci_resource_start(dev, bir);
-		map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
-	}
+	int irq, ret;

 	spin_lock(&irq_mapping_update_lock);

-	irq = find_unbound_irq();
-
+	irq = xen_allocate_irq_dynamic();
 	if (irq == -1)
 		goto out;

-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
-	if (rc) {
-		printk(KERN_WARNING "xen map irq failed %d\n", rc);
-
-		irq_free_desc(irq);
-
-		irq = -1;
-		goto out;
-	}
-	irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
-
 	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
-			handle_level_irq,
-			(type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
+				      handle_level_irq, name);

+	irq_info[irq] = mk_pirq_info(0, pirq, 0, vector, domid);
+	pirq_to_irq[pirq] = irq;
+	ret = set_irq_msi(irq, msidesc);
+	if (ret < 0)
+		goto error_irq;
 out:
 	spin_unlock(&irq_mapping_update_lock);
 	return irq;
+error_irq:
+	spin_unlock(&irq_mapping_update_lock);
+	xen_free_irq(irq);
+	return -1;
 }
 #endif

@@ -773,17 +706,25 @@ int xen_destroy_irq(int irq)

 	if (xen_initial_domain()) {
 		unmap_irq.pirq = info->u.pirq.pirq;
-		unmap_irq.domid = DOMID_SELF;
+		unmap_irq.domid = info->u.pirq.domid;
 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
-		if (rc) {
+		/* If another domain quits without making the pci_disable_msix
+		 * call, the Xen hypervisor takes care of freeing the PIRQs
+		 * (free_domain_pirqs).
+		 */
+		if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF))
+			printk(KERN_INFO "domain %d does not have %d anymore\n",
+				info->u.pirq.domid, info->u.pirq.pirq);
+		else if (rc) {
 			printk(KERN_WARNING "unmap irq failed %d\n", rc);
 			goto out;
 		}
-		pirq_to_irq[info->u.pirq.pirq] = -1;
 	}
+	pirq_to_irq[info->u.pirq.pirq] = -1;
+
 	irq_info[irq] = mk_unbound_info();

-	irq_free_desc(irq);
+	xen_free_irq(irq);

 out:
 	spin_unlock(&irq_mapping_update_lock);
@@ -805,6 +746,12 @@ int xen_irq_from_pirq(unsigned pirq)
 	return pirq_to_irq[pirq];
 }

+
+int xen_pirq_from_irq(unsigned irq)
+{
+	return pirq_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
 int bind_evtchn_to_irq(unsigned int evtchn)
 {
 	int irq;
@@ -814,7 +761,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
 	irq = evtchn_to_irq[evtchn];

 	if (irq == -1) {
-		irq = find_unbound_irq();
+		irq = xen_allocate_irq_dynamic();

 		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
 					      handle_fasteoi_irq, "event");
@@ -839,7 +786,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 	irq = per_cpu(ipi_to_irq, cpu)[ipi];

 	if (irq == -1) {
-		irq = find_unbound_irq();
+		irq = xen_allocate_irq_dynamic();
 		if (irq < 0)
 			goto out;

@@ -864,6 +811,21 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 	return irq;
 }

+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+					  unsigned int remote_port)
+{
+	struct evtchn_bind_interdomain bind_interdomain;
+	int err;
+
+	bind_interdomain.remote_dom  = remote_domain;
+	bind_interdomain.remote_port = remote_port;
+
+	err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+					  &bind_interdomain);
+
+	return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
+}
+

 int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 {
@@ -875,7 +837,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 	irq = per_cpu(virq_to_irq, cpu)[virq];

 	if (irq == -1) {
-		irq = find_unbound_irq();
+		irq = xen_allocate_irq_dynamic();

 		set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
 					      handle_percpu_irq, "virq");
@@ -934,7 +896,7 @@ static void unbind_from_irq(unsigned int irq)
 	if (irq_info[irq].type != IRQT_UNBOUND) {
 		irq_info[irq] = mk_unbound_info();

-		irq_free_desc(irq);
+		xen_free_irq(irq);
 	}

 	spin_unlock(&irq_mapping_update_lock);
@@ -959,6 +921,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
 }
 EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);

+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+					  unsigned int remote_port,
+					  irq_handler_t handler,
+					  unsigned long irqflags,
+					  const char *devname,
+					  void *dev_id)
+{
+	int irq, retval;
+
+	irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
+	if (irq < 0)
+		return irq;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
 int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
 			    irq_handler_t handler,
 			    unsigned long irqflags, const char *devname, void *dev_id)
@@ -990,7 +975,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
 	if (irq < 0)
 		return irq;

-	irqflags |= IRQF_NO_SUSPEND;
+	irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME;
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
@@ -1234,11 +1219,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 	return 0;
 }

-static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
+			    bool force)
 {
 	unsigned tcpu = cpumask_first(dest);

-	return rebind_irq_to_cpu(irq, tcpu);
+	return rebind_irq_to_cpu(data->irq, tcpu);
 }

 int resend_irq_on_evtchn(unsigned int irq)
@@ -1257,35 +1243,35 @@ int resend_irq_on_evtchn(unsigned int irq)
 	return 1;
 }

-static void enable_dynirq(unsigned int irq)
+static void enable_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);

 	if (VALID_EVTCHN(evtchn))
 		unmask_evtchn(evtchn);
 }

-static void disable_dynirq(unsigned int irq)
+static void disable_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);

 	if (VALID_EVTCHN(evtchn))
 		mask_evtchn(evtchn);
 }

-static void ack_dynirq(unsigned int irq)
+static void ack_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);

-	move_masked_irq(irq);
+	move_masked_irq(data->irq);

 	if (VALID_EVTCHN(evtchn))
 		unmask_evtchn(evtchn);
 }

-static int retrigger_dynirq(unsigned int irq)
+static int retrigger_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);
 	struct shared_info *sh = HYPERVISOR_shared_info;
 	int ret = 0;

@@ -1334,7 +1320,7 @@ static void restore_cpu_pirqs(void)

 		printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);

-		startup_pirq(irq);
+		__startup_pirq(irq);
 	}
 }

@@ -1442,10 +1428,21 @@ void xen_poll_irq(int irq)
 	xen_poll_irq_timeout(irq, 0 /* no timeout */);
 }

+/* Check whether the IRQ line is shared with other guests. */
+int xen_test_irq_shared(int irq)
+{
+	struct irq_info *info = info_for_irq(irq);
+	struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq };
+
+	if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+		return 0;
+	return !(irq_status.flags & XENIRQSTAT_shared);
+}
+EXPORT_SYMBOL_GPL(xen_test_irq_shared);
+
 void xen_irq_resume(void)
 {
 	unsigned int cpu, irq, evtchn;
-	struct irq_desc *desc;

 	init_evtchn_cpu_bindings();

@@ -1465,66 +1462,48 @@ void xen_irq_resume(void)
 		restore_cpu_ipis(cpu);
 	}

-	/*
-	 * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
-	 * are not handled by the IRQ core.
-	 */
-	for_each_irq_desc(irq, desc) {
-		if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
-			continue;
-		if (desc->status & IRQ_DISABLED)
-			continue;
-
-		evtchn = evtchn_from_irq(irq);
-		if (evtchn == -1)
-			continue;
-
-		unmask_evtchn(evtchn);
-	}
-
 	restore_cpu_pirqs();
 }

 static struct irq_chip xen_dynamic_chip __read_mostly = {
-	.name		= "xen-dyn",
+	.name			= "xen-dyn",

-	.disable	= disable_dynirq,
-	.mask		= disable_dynirq,
-	.unmask		= enable_dynirq,
+	.irq_disable		= disable_dynirq,
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,

-	.eoi		= ack_dynirq,
-	.set_affinity	= set_affinity_irq,
-	.retrigger	= retrigger_dynirq,
+	.irq_eoi		= ack_dynirq,
+	.irq_set_affinity	= set_affinity_irq,
+	.irq_retrigger		= retrigger_dynirq,
 };

 static struct irq_chip xen_pirq_chip __read_mostly = {
-	.name		= "xen-pirq",
+	.name			= "xen-pirq",

-	.startup	= startup_pirq,
-	.shutdown	= shutdown_pirq,
+	.irq_startup		= startup_pirq,
+	.irq_shutdown		= shutdown_pirq,

-	.enable		= enable_pirq,
-	.unmask		= enable_pirq,
+	.irq_enable		= enable_pirq,
+	.irq_unmask		= enable_pirq,

-	.disable	= disable_pirq,
-	.mask		= disable_pirq,
+	.irq_disable		= disable_pirq,
+	.irq_mask		= disable_pirq,

-	.ack		= ack_pirq,
-	.end		= end_pirq,
+	.irq_ack		= ack_pirq,

-	.set_affinity	= set_affinity_irq,
+	.irq_set_affinity	= set_affinity_irq,

-	.retrigger	= retrigger_dynirq,
+	.irq_retrigger		= retrigger_dynirq,
 };

 static struct irq_chip xen_percpu_chip __read_mostly = {
-	.name		= "xen-percpu",
+	.name			= "xen-percpu",

-	.disable	= disable_dynirq,
-	.mask		= disable_dynirq,
-	.unmask		= enable_dynirq,
+	.irq_disable		= disable_dynirq,
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,

-	.ack		= ack_dynirq,
+	.irq_ack		= ack_dynirq,
 };

 int xen_set_callback_via(uint64_t via)
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
new file mode 100644
index 0000000..a7ffdfe
--- /dev/null
+++ b/drivers/xen/gntalloc.c
@@ -0,0 +1,545 @@
+/******************************************************************************
+ * gntalloc.c
+ *
+ * Device for creating grant references (in user-space) that may be shared
+ * with other domains.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ * This driver exists to allow userspace programs in Linux to allocate kernel
+ * memory that will later be shared with another domain.  Without this device,
+ * Linux userspace programs cannot create grant references.
+ *
+ * How this stuff works:
+ *   X -> granting a page to Y
+ *   Y -> mapping the grant from X
+ *
+ *   1. X uses the gntalloc device to allocate a page of kernel memory, P.
+ *   2. X creates an entry in the grant table that says domid(Y) can access P.
+ *      This is done without a hypercall unless the grant table needs expansion.
+ *   3. X gives the grant reference identifier, GREF, to Y.
+ *   4. Y maps the page, either directly into kernel memory for use in a backend
+ *      driver, or via a the gntdev device to map into the address space of an
+ *      application running in Y. This is the first point at which Xen does any
+ *      tracking of the page.
+ *   5. A program in X mmap()s a segment of the gntalloc device that corresponds
+ *      to the shared page, and can now communicate with Y over the shared page.
+ *
+ *
+ * NOTE TO USERSPACE LIBRARIES:
+ *   The grant allocation and mmap()ing are, naturally, two separate operations.
+ *   You set up the sharing by calling the create ioctl() and then the mmap().
+ *   Teardown requires munmap() and either close() or ioctl().
+ *
+ * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant
+ * reference, this device can be used to consume kernel memory by leaving grant
+ * references mapped by another domain when an application exits. Therefore,
+ * there is a global limit on the number of pages that can be allocated. When
+ * all references to the page are unmapped, it will be freed during the next
+ * grant operation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/gntalloc.h>
+#include <xen/events.h>
+
+static int limit = 1024;
+module_param(limit, int, 0644);
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by "
+		"the gntalloc device");
+
+static LIST_HEAD(gref_list);
+static DEFINE_SPINLOCK(gref_lock);
+static int gref_size;
+
+struct notify_info {
+	uint16_t pgoff:12;    /* Bits 0-11: Offset of the byte to clear */
+	uint16_t flags:2;     /* Bits 12-13: Unmap notification flags */
+	int event;            /* Port (event channel) to notify */
+};
+
+/* Metadata on a grant reference. */
+struct gntalloc_gref {
+	struct list_head next_gref;  /* list entry gref_list */
+	struct list_head next_file;  /* list entry file->list, if open */
+	struct page *page;	     /* The shared page */
+	uint64_t file_index;         /* File offset for mmap() */
+	unsigned int users;          /* Use count - when zero, waiting on Xen */
+	grant_ref_t gref_id;         /* The grant reference number */
+	struct notify_info notify;   /* Unmap notification */
+};
+
+struct gntalloc_file_private_data {
+	struct list_head list;
+	uint64_t index;
+};
+
+static void __del_gref(struct gntalloc_gref *gref);
+
+static void do_cleanup(void)
+{
+	struct gntalloc_gref *gref, *n;
+	list_for_each_entry_safe(gref, n, &gref_list, next_gref) {
+		if (!gref->users)
+			__del_gref(gref);
+	}
+}
+
+static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
+	uint32_t *gref_ids, struct gntalloc_file_private_data *priv)
+{
+	int i, rc, readonly;
+	LIST_HEAD(queue_gref);
+	LIST_HEAD(queue_file);
+	struct gntalloc_gref *gref;
+
+	readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE);
+	rc = -ENOMEM;
+	for (i = 0; i < op->count; i++) {
+		gref = kzalloc(sizeof(*gref), GFP_KERNEL);
+		if (!gref)
+			goto undo;
+		list_add_tail(&gref->next_gref, &queue_gref);
+		list_add_tail(&gref->next_file, &queue_file);
+		gref->users = 1;
+		gref->file_index = op->index + i * PAGE_SIZE;
+		gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!gref->page)
+			goto undo;
+
+		/* Grant foreign access to the page. */
+		gref->gref_id = gnttab_grant_foreign_access(op->domid,
+			pfn_to_mfn(page_to_pfn(gref->page)), readonly);
+		if (gref->gref_id < 0) {
+			rc = gref->gref_id;
+			goto undo;
+		}
+		gref_ids[i] = gref->gref_id;
+	}
+
+	/* Add to gref lists. */
+	spin_lock(&gref_lock);
+	list_splice_tail(&queue_gref, &gref_list);
+	list_splice_tail(&queue_file, &priv->list);
+	spin_unlock(&gref_lock);
+
+	return 0;
+
+undo:
+	spin_lock(&gref_lock);
+	gref_size -= (op->count - i);
+
+	list_for_each_entry(gref, &queue_file, next_file) {
+		/* __del_gref does not remove from queue_file */
+		__del_gref(gref);
+	}
+
+	/* It's possible for the target domain to map the just-allocated grant
+	 * references by blindly guessing their IDs; if this is done, then
+	 * __del_gref will leave them in the queue_gref list. They need to be
+	 * added to the global list so that we can free them when they are no
+	 * longer referenced.
+	 */
+	if (unlikely(!list_empty(&queue_gref)))
+		list_splice_tail(&queue_gref, &gref_list);
+	spin_unlock(&gref_lock);
+	return rc;
+}
+
+static void __del_gref(struct gntalloc_gref *gref)
+{
+	if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+		uint8_t *tmp = kmap(gref->page);
+		tmp[gref->notify.pgoff] = 0;
+		kunmap(gref->page);
+	}
+	if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT)
+		notify_remote_via_evtchn(gref->notify.event);
+
+	gref->notify.flags = 0;
+
+	if (gref->gref_id > 0) {
+		if (gnttab_query_foreign_access(gref->gref_id))
+			return;
+
+		if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
+			return;
+	}
+
+	gref_size--;
+	list_del(&gref->next_gref);
+
+	if (gref->page)
+		__free_page(gref->page);
+
+	kfree(gref);
+}
+
+/* finds contiguous grant references in a file, returns the first */
+static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv,
+		uint64_t index, uint32_t count)
+{
+	struct gntalloc_gref *rv = NULL, *gref;
+	list_for_each_entry(gref, &priv->list, next_file) {
+		if (gref->file_index == index && !rv)
+			rv = gref;
+		if (rv) {
+			if (gref->file_index != index)
+				return NULL;
+			index += PAGE_SIZE;
+			count--;
+			if (count == 0)
+				return rv;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * -------------------------------------
+ *  File operations.
+ * -------------------------------------
+ */
+static int gntalloc_open(struct inode *inode, struct file *filp)
+{
+	struct gntalloc_file_private_data *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		goto out_nomem;
+	INIT_LIST_HEAD(&priv->list);
+
+	filp->private_data = priv;
+
+	pr_debug("%s: priv %p\n", __func__, priv);
+
+	return 0;
+
+out_nomem:
+	return -ENOMEM;
+}
+
+static int gntalloc_release(struct inode *inode, struct file *filp)
+{
+	struct gntalloc_file_private_data *priv = filp->private_data;
+	struct gntalloc_gref *gref;
+
+	pr_debug("%s: priv %p\n", __func__, priv);
+
+	spin_lock(&gref_lock);
+	while (!list_empty(&priv->list)) {
+		gref = list_entry(priv->list.next,
+			struct gntalloc_gref, next_file);
+		list_del(&gref->next_file);
+		gref->users--;
+		if (gref->users == 0)
+			__del_gref(gref);
+	}
+	kfree(priv);
+	spin_unlock(&gref_lock);
+
+	return 0;
+}
+
+static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv,
+		struct ioctl_gntalloc_alloc_gref __user *arg)
+{
+	int rc = 0;
+	struct ioctl_gntalloc_alloc_gref op;
+	uint32_t *gref_ids;
+
+	pr_debug("%s: priv %p\n", __func__, priv);
+
+	if (copy_from_user(&op, arg, sizeof(op))) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	gref_ids = kzalloc(sizeof(gref_ids[0]) * op.count, GFP_TEMPORARY);
+	if (!gref_ids) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock(&gref_lock);
+	/* Clean up pages that were at zero (local) users but were still mapped
+	 * by remote domains. Since those pages count towards the limit that we
+	 * are about to enforce, removing them here is a good idea.
+	 */
+	do_cleanup();
+	if (gref_size + op.count > limit) {
+		spin_unlock(&gref_lock);
+		rc = -ENOSPC;
+		goto out_free;
+	}
+	gref_size += op.count;
+	op.index = priv->index;
+	priv->index += op.count * PAGE_SIZE;
+	spin_unlock(&gref_lock);
+
+	rc = add_grefs(&op, gref_ids, priv);
+	if (rc < 0)
+		goto out_free;
+
+	/* Once we finish add_grefs, it is unsafe to touch the new reference,
+	 * since it is possible for a concurrent ioctl to remove it (by guessing
+	 * its index). If the userspace application doesn't provide valid memory
+	 * to write the IDs to, then it will need to close the file in order to
+	 * release - which it will do by segfaulting when it tries to access the
+	 * IDs to close them.
+	 */
+	if (copy_to_user(arg, &op, sizeof(op))) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+	if (copy_to_user(arg->gref_ids, gref_ids,
+			sizeof(gref_ids[0]) * op.count)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+out_free:
+	kfree(gref_ids);
+out:
+	return rc;
+}
+
+static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv,
+		void __user *arg)
+{
+	int i, rc = 0;
+	struct ioctl_gntalloc_dealloc_gref op;
+	struct gntalloc_gref *gref, *n;
+
+	pr_debug("%s: priv %p\n", __func__, priv);
+
+	if (copy_from_user(&op, arg, sizeof(op))) {
+		rc = -EFAULT;
+		goto dealloc_grant_out;
+	}
+
+	spin_lock(&gref_lock);
+	gref = find_grefs(priv, op.index, op.count);
+	if (gref) {
+		/* Remove from the file list only, and decrease reference count.
+		 * The later call to do_cleanup() will remove from gref_list and
+		 * free the memory if the pages aren't mapped anywhere.
+		 */
+		for (i = 0; i < op.count; i++) {
+			n = list_entry(gref->next_file.next,
+				struct gntalloc_gref, next_file);
+			list_del(&gref->next_file);
+			gref->users--;
+			gref = n;
+		}
+	} else {
+		rc = -EINVAL;
+	}
+
+	do_cleanup();
+
+	spin_unlock(&gref_lock);
+dealloc_grant_out:
+	return rc;
+}
+
+static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv,
+		void __user *arg)
+{
+	struct ioctl_gntalloc_unmap_notify op;
+	struct gntalloc_gref *gref;
+	uint64_t index;
+	int pgoff;
+	int rc;
+
+	if (copy_from_user(&op, arg, sizeof(op)))
+		return -EFAULT;
+
+	index = op.index & ~(PAGE_SIZE - 1);
+	pgoff = op.index & (PAGE_SIZE - 1);
+
+	spin_lock(&gref_lock);
+
+	gref = find_grefs(priv, index, 1);
+	if (!gref) {
+		rc = -ENOENT;
+		goto unlock_out;
+	}
+
+	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) {
+		rc = -EINVAL;
+		goto unlock_out;
+	}
+
+	gref->notify.flags = op.action;
+	gref->notify.pgoff = pgoff;
+	gref->notify.event = op.event_channel_port;
+	rc = 0;
+ unlock_out:
+	spin_unlock(&gref_lock);
+	return rc;
+}
+
+static long gntalloc_ioctl(struct file *filp, unsigned int cmd,
+		unsigned long arg)
+{
+	struct gntalloc_file_private_data *priv = filp->private_data;
+
+	switch (cmd) {
+	case IOCTL_GNTALLOC_ALLOC_GREF:
+		return gntalloc_ioctl_alloc(priv, (void __user *)arg);
+
+	case IOCTL_GNTALLOC_DEALLOC_GREF:
+		return gntalloc_ioctl_dealloc(priv, (void __user *)arg);
+
+	case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY:
+		return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg);
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return 0;
+}
+
+static void gntalloc_vma_close(struct vm_area_struct *vma)
+{
+	struct gntalloc_gref *gref = vma->vm_private_data;
+	if (!gref)
+		return;
+
+	spin_lock(&gref_lock);
+	gref->users--;
+	if (gref->users == 0)
+		__del_gref(gref);
+	spin_unlock(&gref_lock);
+}
+
+static struct vm_operations_struct gntalloc_vmops = {
+	.close = gntalloc_vma_close,
+};
+
+static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct gntalloc_file_private_data *priv = filp->private_data;
+	struct gntalloc_gref *gref;
+	int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	int rv, i;
+
+	pr_debug("%s: priv %p, page %lu+%d\n", __func__,
+		       priv, vma->vm_pgoff, count);
+
+	if (!(vma->vm_flags & VM_SHARED)) {
+		printk(KERN_ERR "%s: Mapping must be shared.\n", __func__);
+		return -EINVAL;
+	}
+
+	spin_lock(&gref_lock);
+	gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count);
+	if (gref == NULL) {
+		rv = -ENOENT;
+		pr_debug("%s: Could not find grant reference",
+				__func__);
+		goto out_unlock;
+	}
+
+	vma->vm_private_data = gref;
+
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_DONTCOPY;
+	vma->vm_flags |= VM_PFNMAP | VM_PFN_AT_MMAP;
+
+	vma->vm_ops = &gntalloc_vmops;
+
+	for (i = 0; i < count; i++) {
+		gref->users++;
+		rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
+				gref->page);
+		if (rv)
+			goto out_unlock;
+
+		gref = list_entry(gref->next_file.next,
+				struct gntalloc_gref, next_file);
+	}
+	rv = 0;
+
+out_unlock:
+	spin_unlock(&gref_lock);
+	return rv;
+}
+
+static const struct file_operations gntalloc_fops = {
+	.owner = THIS_MODULE,
+	.open = gntalloc_open,
+	.release = gntalloc_release,
+	.unlocked_ioctl = gntalloc_ioctl,
+	.mmap = gntalloc_mmap
+};
+
+/*
+ * -------------------------------------
+ * Module creation/destruction.
+ * -------------------------------------
+ */
+static struct miscdevice gntalloc_miscdev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "xen/gntalloc",
+	.fops	= &gntalloc_fops,
+};
+
+static int __init gntalloc_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	err = misc_register(&gntalloc_miscdev);
+	if (err != 0) {
+		printk(KERN_ERR "Could not register misc gntalloc device\n");
+		return err;
+	}
+
+	pr_debug("Created grant allocation device at %d,%d\n",
+			MISC_MAJOR, gntalloc_miscdev.minor);
+
+	return 0;
+}
+
+static void __exit gntalloc_exit(void)
+{
+	misc_deregister(&gntalloc_miscdev);
+}
+
+module_init(gntalloc_init);
+module_exit(gntalloc_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, "
+		"Daniel De Graaf <dgdegra@tycho.nsa.gov>");
+MODULE_DESCRIPTION("User-space grant reference allocator driver");
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 1e31cdc..d43ff30 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -32,10 +32,12 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
+#include <linux/highmem.h>

 #include <xen/xen.h>
 #include <xen/grant_table.h>
 #include <xen/gntdev.h>
+#include <xen/events.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
@@ -45,35 +47,46 @@ MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
 	      "Gerd Hoffmann <kraxel@redhat.com>");
 MODULE_DESCRIPTION("User-space granted page access driver");

-static int limit = 1024;
+static int limit = 1024*1024;
 module_param(limit, int, 0644);
-MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at "
-		"once by a gntdev instance");
+MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
+		"the gntdev device");
+
+static atomic_t pages_mapped = ATOMIC_INIT(0);
+
+static int use_ptemod;

 struct gntdev_priv {
 	struct list_head maps;
-	uint32_t used;
-	uint32_t limit;
 	/* lock protects maps from concurrent changes */
 	spinlock_t lock;
 	struct mm_struct *mm;
 	struct mmu_notifier mn;
 };

+struct unmap_notify {
+	int flags;
+	/* Address relative to the start of the grant_map */
+	int addr;
+	int event;
+};
+
 struct grant_map {
 	struct list_head next;
-	struct gntdev_priv *priv;
 	struct vm_area_struct *vma;
 	int index;
 	int count;
 	int flags;
-	int is_mapped;
+	atomic_t users;
+	struct unmap_notify notify;
 	struct ioctl_gntdev_grant_ref *grants;
 	struct gnttab_map_grant_ref   *map_ops;
 	struct gnttab_unmap_grant_ref *unmap_ops;
 	struct page **pages;
 };

+static int unmap_grant_pages(struct grant_map *map, int offset, int pages);
+
 /* ------------------------------------------------------------------ */

 static void gntdev_print_maps(struct gntdev_priv *priv,
@@ -82,9 +95,7 @@ static void gntdev_print_maps(struct gntdev_priv *priv,
 #ifdef DEBUG
 	struct grant_map *map;

-	pr_debug("maps list (priv %p, usage %d/%d)\n",
-	       priv, priv->used, priv->limit);
-
+	pr_debug("%s: maps list (priv %p)\n", __func__, priv);
 	list_for_each_entry(map, &priv->maps, next)
 		pr_debug("  index %2d, count %2d %s\n",
 		       map->index, map->count,
@@ -115,14 +126,13 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
 		add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
 		if (add->pages[i] == NULL)
 			goto err;
+		add->map_ops[i].handle = -1;
+		add->unmap_ops[i].handle = -1;
 	}

 	add->index = 0;
 	add->count = count;
-	add->priv  = priv;
-
-	if (add->count + priv->used > priv->limit)
-		goto err;
+	atomic_set(&add->users, 1);

 	return add;

@@ -154,7 +164,6 @@ static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
 	list_add_tail(&add->next, &priv->maps);

 done:
-	priv->used += add->count;
 	gntdev_print_maps(priv, "[new]", add->index);
 }

@@ -166,57 +175,57 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv,
 	list_for_each_entry(map, &priv->maps, next) {
 		if (map->index != index)
 			continue;
-		if (map->count != count)
-			continue;
-		return map;
-	}
-	return NULL;
-}
-
-static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
-					       unsigned long vaddr)
-{
-	struct grant_map *map;
-
-	list_for_each_entry(map, &priv->maps, next) {
-		if (!map->vma)
-			continue;
-		if (vaddr < map->vma->vm_start)
-			continue;
-		if (vaddr >= map->vma->vm_end)
+		if (count && map->count != count)
 			continue;
 		return map;
 	}
 	return NULL;
 }

-static int gntdev_del_map(struct grant_map *map)
+static void gntdev_put_map(struct grant_map *map)
 {
 	int i;

-	if (map->vma)
-		return -EBUSY;
-	for (i = 0; i < map->count; i++)
-		if (map->unmap_ops[i].handle)
-			return -EBUSY;
+	if (!map)
+		return;

-	map->priv->used -= map->count;
-	list_del(&map->next);
-	return 0;
-}
+	if (!atomic_dec_and_test(&map->users))
+		return;

-static void gntdev_free_map(struct grant_map *map)
-{
-	int i;
+	atomic_sub(map->count, &pages_mapped);

-	if (!map)
-		return;
+	if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
+		notify_remote_via_evtchn(map->notify.event);
+	}
+
+	if (map->pages) {
+		if (!use_ptemod)
+			unmap_grant_pages(map, 0, map->count);

-	if (map->pages)
 		for (i = 0; i < map->count; i++) {
-			if (map->pages[i])
+			uint32_t check, *tmp;
+			if (!map->pages[i])
+				continue;
+			/* XXX When unmapping in an HVM domain, Xen will
+			 * sometimes end up mapping the GFN to an invalid MFN.
+			 * In this case, writes will be discarded and reads will
+			 * return all 0xFF bytes.  Leak these unusable GFNs
+			 * until Xen supports fixing their p2m mapping.
+			 *
+			 * Confirmed present in Xen 4.1-RC3 with HVM source
+			 */
+			tmp = kmap(map->pages[i]);
+			*tmp = 0xdeaddead;
+			mb();
+			check = *tmp;
+			kunmap(map->pages[i]);
+			if (check == 0xdeaddead)
 				__free_page(map->pages[i]);
+			else
+				pr_debug("Discard page %d=%ld\n", i,
+					page_to_pfn(map->pages[i]));
 		}
+	}
 	kfree(map->pages);
 	kfree(map->grants);
 	kfree(map->map_ops);
@@ -231,24 +240,39 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token,
 {
 	struct grant_map *map = data;
 	unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+	int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte;
 	u64 pte_maddr;

 	BUG_ON(pgnr >= map->count);
 	pte_maddr = arbitrary_virt_to_machine(pte).maddr;

-	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
-			  GNTMAP_contains_pte | map->flags,
+	gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags,
 			  map->grants[pgnr].ref,
 			  map->grants[pgnr].domid);
-	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
-			    GNTMAP_contains_pte | map->flags,
-			    0 /* handle */);
+	gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags,
+			    -1 /* handle */);
 	return 0;
 }

 static int map_grant_pages(struct grant_map *map)
 {
 	int i, err = 0;
+	phys_addr_t addr;
+
+	if (!use_ptemod) {
+		/* Note: it could already be mapped */
+		if (map->map_ops[0].handle != -1)
+			return 0;
+		for (i = 0; i < map->count; i++) {
+			addr = (phys_addr_t)
+				pfn_to_kaddr(page_to_pfn(map->pages[i]));
+			gnttab_set_map_op(&map->map_ops[i], addr, map->flags,
+				map->grants[i].ref,
+				map->grants[i].domid);
+			gnttab_set_unmap_op(&map->unmap_ops[i], addr,
+				map->flags, -1 /* handle */);
+		}
+	}

 	pr_debug("map %d+%d\n", map->index, map->count);
 	err = gnttab_map_refs(map->map_ops, map->pages, map->count);
@@ -258,28 +282,81 @@ static int map_grant_pages(struct grant_map *map)
 	for (i = 0; i < map->count; i++) {
 		if (map->map_ops[i].status)
 			err = -EINVAL;
-		map->unmap_ops[i].handle = map->map_ops[i].handle;
+		else {
+			BUG_ON(map->map_ops[i].handle == -1);
+			map->unmap_ops[i].handle = map->map_ops[i].handle;
+			pr_debug("map handle=%d\n", map->map_ops[i].handle);
+		}
 	}
 	return err;
 }

-static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+static int __unmap_grant_pages(struct grant_map *map, int offset, int pages)
 {
 	int i, err = 0;

-	pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
-	err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
+	if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
+		int pgno = (map->notify.addr >> PAGE_SHIFT);
+		if (pgno >= offset && pgno < offset + pages && use_ptemod) {
+			void __user *tmp = (void __user *)
+				map->vma->vm_start + map->notify.addr;
+			err = copy_to_user(tmp, &err, 1);
+			if (err)
+				return err;
+			map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+		} else if (pgno >= offset && pgno < offset + pages) {
+			uint8_t *tmp = kmap(map->pages[pgno]);
+			tmp[map->notify.addr & (PAGE_SIZE-1)] = 0;
+			kunmap(map->pages[pgno]);
+			map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE;
+		}
+	}
+
+	err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages + offset, pages);
 	if (err)
 		return err;

 	for (i = 0; i < pages; i++) {
 		if (map->unmap_ops[offset+i].status)
 			err = -EINVAL;
-		map->unmap_ops[offset+i].handle = 0;
+		pr_debug("unmap handle=%d st=%d\n",
+			map->unmap_ops[offset+i].handle,
+			map->unmap_ops[offset+i].status);
+		map->unmap_ops[offset+i].handle = -1;
 	}
 	return err;
 }

+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+	int range, err = 0;
+
+	pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages);
+
+	/* It is possible the requested range will have a "hole" where we
+	 * already unmapped some of the grants. Only unmap valid ranges.
+	 */
+	while (pages && !err) {
+		while (pages && map->unmap_ops[offset].handle == -1) {
+			offset++;
+			pages--;
+		}
+		range = 0;
+		while (range < pages) {
+			if (map->unmap_ops[offset+range].handle == -1) {
+				range--;
+				break;
+			}
+			range++;
+		}
+		err = __unmap_grant_pages(map, offset, range);
+		offset += range;
+		pages -= range;
+	}
+
+	return err;
+}
+
 /* ------------------------------------------------------------------ */

 static void gntdev_vma_close(struct vm_area_struct *vma)
@@ -287,22 +364,13 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
 	struct grant_map *map = vma->vm_private_data;

 	pr_debug("close %p\n", vma);
-	map->is_mapped = 0;
 	map->vma = NULL;
 	vma->vm_private_data = NULL;
-}
-
-static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n",
-			vmf->virtual_address, vmf->pgoff);
-	vmf->flags = VM_FAULT_ERROR;
-	return 0;
+	gntdev_put_map(map);
 }

 static struct vm_operations_struct gntdev_vmops = {
 	.close = gntdev_vma_close,
-	.fault = gntdev_vma_fault,
 };

 /* ------------------------------------------------------------------ */
@@ -320,8 +388,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
 	list_for_each_entry(map, &priv->maps, next) {
 		if (!map->vma)
 			continue;
-		if (!map->is_mapped)
-			continue;
 		if (map->vma->vm_start >= end)
 			continue;
 		if (map->vma->vm_end <= start)
@@ -386,16 +452,17 @@ static int gntdev_open(struct inode *inode, struct file *flip)

 	INIT_LIST_HEAD(&priv->maps);
 	spin_lock_init(&priv->lock);
-	priv->limit = limit;

-	priv->mm = get_task_mm(current);
-	if (!priv->mm) {
-		kfree(priv);
-		return -ENOMEM;
+	if (use_ptemod) {
+		priv->mm = get_task_mm(current);
+		if (!priv->mm) {
+			kfree(priv);
+			return -ENOMEM;
+		}
+		priv->mn.ops = &gntdev_mmu_ops;
+		ret = mmu_notifier_register(&priv->mn, priv->mm);
+		mmput(priv->mm);
 	}
-	priv->mn.ops = &gntdev_mmu_ops;
-	ret = mmu_notifier_register(&priv->mn, priv->mm);
-	mmput(priv->mm);

 	if (ret) {
 		kfree(priv);
@@ -412,21 +479,19 @@ static int gntdev_release(struct inode *inode, struct file *flip)
 {
 	struct gntdev_priv *priv = flip->private_data;
 	struct grant_map *map;
-	int err;

 	pr_debug("priv %p\n", priv);

 	spin_lock(&priv->lock);
 	while (!list_empty(&priv->maps)) {
 		map = list_entry(priv->maps.next, struct grant_map, next);
-		err = gntdev_del_map(map);
-		if (WARN_ON(err))
-			gntdev_free_map(map);
-
+		list_del(&map->next);
+		gntdev_put_map(map);
 	}
 	spin_unlock(&priv->lock);

-	mmu_notifier_unregister(&priv->mn, priv->mm);
+	if (use_ptemod)
+		mmu_notifier_unregister(&priv->mn, priv->mm);
 	kfree(priv);
 	return 0;
 }
@@ -443,16 +508,21 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
 	pr_debug("priv %p, add %d\n", priv, op.count);
 	if (unlikely(op.count <= 0))
 		return -EINVAL;
-	if (unlikely(op.count > priv->limit))
-		return -EINVAL;

 	err = -ENOMEM;
 	map = gntdev_alloc_map(priv, op.count);
 	if (!map)
 		return err;
+
+	if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) {
+		pr_debug("can't map: over limit\n");
+		gntdev_put_map(map);
+		return err;
+	}
+
 	if (copy_from_user(map->grants, &u->refs,
 			   sizeof(map->grants[0]) * op.count) != 0) {
-		gntdev_free_map(map);
+		gntdev_put_map(map);
 		return err;
 	}

@@ -461,13 +531,9 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
 	op.index = map->index << PAGE_SHIFT;
 	spin_unlock(&priv->lock);

-	if (copy_to_user(u, &op, sizeof(op)) != 0) {
-		spin_lock(&priv->lock);
-		gntdev_del_map(map);
-		spin_unlock(&priv->lock);
-		gntdev_free_map(map);
-		return err;
-	}
+	if (copy_to_user(u, &op, sizeof(op)) != 0)
+		return -EFAULT;
+
 	return 0;
 }

@@ -484,11 +550,12 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,

 	spin_lock(&priv->lock);
 	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
-	if (map)
-		err = gntdev_del_map(map);
+	if (map) {
+		list_del(&map->next);
+		gntdev_put_map(map);
+		err = 0;
+	}
 	spin_unlock(&priv->lock);
-	if (!err)
-		gntdev_free_map(map);
 	return err;
 }

@@ -496,43 +563,66 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
 					      struct ioctl_gntdev_get_offset_for_vaddr __user *u)
 {
 	struct ioctl_gntdev_get_offset_for_vaddr op;
+	struct vm_area_struct *vma;
 	struct grant_map *map;

 	if (copy_from_user(&op, u, sizeof(op)) != 0)
 		return -EFAULT;
 	pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);

-	spin_lock(&priv->lock);
-	map = gntdev_find_map_vaddr(priv, op.vaddr);
-	if (map == NULL ||
-	    map->vma->vm_start != op.vaddr) {
-		spin_unlock(&priv->lock);
+	vma = find_vma(current->mm, op.vaddr);
+	if (!vma || vma->vm_ops != &gntdev_vmops)
 		return -EINVAL;
-	}
+
+	map = vma->vm_private_data;
+	if (!map)
+		return -EINVAL;
+
 	op.offset = map->index << PAGE_SHIFT;
 	op.count = map->count;
-	spin_unlock(&priv->lock);

 	if (copy_to_user(u, &op, sizeof(op)) != 0)
 		return -EFAULT;
 	return 0;
 }

-static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
-					struct ioctl_gntdev_set_max_grants __user *u)
+static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
 {
-	struct ioctl_gntdev_set_max_grants op;
+	struct ioctl_gntdev_unmap_notify op;
+	struct grant_map *map;
+	int rc;

-	if (copy_from_user(&op, u, sizeof(op)) != 0)
+	if (copy_from_user(&op, u, sizeof(op)))
 		return -EFAULT;
-	pr_debug("priv %p, limit %d\n", priv, op.count);
-	if (op.count > limit)
-		return -E2BIG;
+
+	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
+		return -EINVAL;

 	spin_lock(&priv->lock);
-	priv->limit = op.count;
+
+	list_for_each_entry(map, &priv->maps, next) {
+		uint64_t begin = map->index << PAGE_SHIFT;
+		uint64_t end = (map->index + map->count) << PAGE_SHIFT;
+		if (op.index >= begin && op.index < end)
+			goto found;
+	}
+	rc = -ENOENT;
+	goto unlock_out;
+
+ found:
+	if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) &&
+			(map->flags & GNTMAP_readonly)) {
+		rc = -EINVAL;
+		goto unlock_out;
+	}
+
+	map->notify.flags = op.action;
+	map->notify.addr = op.index - (map->index << PAGE_SHIFT);
+	map->notify.event = op.event_channel_port;
+	rc = 0;
+ unlock_out:
 	spin_unlock(&priv->lock);
-	return 0;
+	return rc;
 }

 static long gntdev_ioctl(struct file *flip,
@@ -551,8 +641,8 @@ static long gntdev_ioctl(struct file *flip,
 	case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
 		return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);

-	case IOCTL_GNTDEV_SET_MAX_GRANTS:
-		return gntdev_ioctl_set_max_grants(priv, ptr);
+	case IOCTL_GNTDEV_SET_UNMAP_NOTIFY:
+		return gntdev_ioctl_notify(priv, ptr);

 	default:
 		pr_debug("priv %p, unknown cmd %x\n", priv, cmd);
@@ -568,7 +658,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 	int index = vma->vm_pgoff;
 	int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 	struct grant_map *map;
-	int err = -EINVAL;
+	int i, err = -EINVAL;

 	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
@@ -580,47 +670,70 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 	map = gntdev_find_map_index(priv, index, count);
 	if (!map)
 		goto unlock_out;
-	if (map->vma)
+	if (use_ptemod && map->vma)
 		goto unlock_out;
-	if (priv->mm != vma->vm_mm) {
+	if (use_ptemod && priv->mm != vma->vm_mm) {
 		printk(KERN_WARNING "Huh? Other mm?\n");
 		goto unlock_out;
 	}

+	atomic_inc(&map->users);
+
 	vma->vm_ops = &gntdev_vmops;

 	vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP;

 	vma->vm_private_data = map;
-	map->vma = vma;

-	map->flags = GNTMAP_host_map | GNTMAP_application_map;
-	if (!(vma->vm_flags & VM_WRITE))
-		map->flags |= GNTMAP_readonly;
+	if (use_ptemod)
+		map->vma = vma;
+
+	if (map->flags) {
+		if ((vma->vm_flags & VM_WRITE) &&
+				(map->flags & GNTMAP_readonly))
+			return -EINVAL;
+	} else {
+		map->flags = GNTMAP_host_map;
+		if (!(vma->vm_flags & VM_WRITE))
+			map->flags |= GNTMAP_readonly;
+	}

 	spin_unlock(&priv->lock);

-	err = apply_to_page_range(vma->vm_mm, vma->vm_start,
-				  vma->vm_end - vma->vm_start,
-				  find_grant_ptes, map);
-	if (err) {
-		printk(KERN_WARNING "find_grant_ptes() failure.\n");
-		return err;
+	if (use_ptemod) {
+		err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+					  vma->vm_end - vma->vm_start,
+					  find_grant_ptes, map);
+		if (err) {
+			printk(KERN_WARNING "find_grant_ptes() failure.\n");
+			goto out_put_map;
+		}
 	}

 	err = map_grant_pages(map);
-	if (err) {
-		printk(KERN_WARNING "map_grant_pages() failure.\n");
-		return err;
-	}
+	if (err)
+		goto out_put_map;

-	map->is_mapped = 1;
+	if (!use_ptemod) {
+		for (i = 0; i < count; i++) {
+			err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
+				map->pages[i]);
+			if (err)
+				goto out_put_map;
+		}
+	}

 	return 0;

 unlock_out:
 	spin_unlock(&priv->lock);
 	return err;
+
+out_put_map:
+	if (use_ptemod)
+		map->vma = NULL;
+	gntdev_put_map(map);
+	return err;
 }

 static const struct file_operations gntdev_fops = {
@@ -646,6 +759,8 @@ static int __init gntdev_init(void)
 	if (!xen_domain())
 		return -ENODEV;

+	use_ptemod = xen_pv_domain();
+
 	err = misc_register(&gntdev_miscdev);
 	if (err != 0) {
 		printk(KERN_ERR "Could not register gntdev device\n");
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 9ef54eb..1a9bc2b 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -458,14 +458,19 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
 	if (ret)
 		return ret;

+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return ret;
+
 	for (i = 0; i < count; i++) {
-		/* m2p override only supported for GNTMAP_contains_pte mappings */
-		if (!(map_ops[i].flags & GNTMAP_contains_pte))
-			continue;
-		pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+		if (map_ops[i].flags & GNTMAP_contains_pte) {
+			pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
 				(map_ops[i].host_addr & ~PAGE_MASK));
-		mfn = pte_mfn(*pte);
-		ret = m2p_add_override(mfn, pages[i]);
+			mfn = pte_mfn(*pte);
+		} else {
+			mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
+		}
+		ret = m2p_add_override(mfn, pages[i],
+				       map_ops[i].flags & GNTMAP_contains_pte);
 		if (ret)
 			return ret;
 	}
@@ -483,8 +488,13 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
 	if (ret)
 		return ret;

+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return ret;
+
 	for (i = 0; i < count; i++) {
-		ret = m2p_remove_override(pages[i]);
+		/* We do not have the means of checking if GNTMAP_contains_pte
+		 * is set. */
+		ret = m2p_remove_override(pages[i], true /* clear the PTE */);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2417727..ebb2928 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -34,42 +34,38 @@ enum shutdown_state {
 /* Ignore multiple shutdown requests. */
 static enum shutdown_state shutting_down = SHUTDOWN_INVALID;

-#ifdef CONFIG_PM_SLEEP
-static int xen_hvm_suspend(void *data)
-{
-	int err;
-	struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
-	int *cancelled = data;
-
-	BUG_ON(!irqs_disabled());
-
-	err = sysdev_suspend(PMSG_SUSPEND);
-	if (err) {
-		printk(KERN_ERR "xen_hvm_suspend: sysdev_suspend failed: %d\n",
-		       err);
-		return err;
-	}
-
-	*cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+struct suspend_info {
+	int cancelled;
+	unsigned long arg; /* extra hypercall argument */
+	void (*pre)(void);
+	void (*post)(int cancelled);
+};

-	xen_hvm_post_suspend(*cancelled);
+static void xen_hvm_post_suspend(int cancelled)
+{
+	xen_arch_hvm_post_suspend(cancelled);
 	gnttab_resume();
+}

-	if (!*cancelled) {
-		xen_irq_resume();
-		xen_console_resume();
-		xen_timer_resume();
-	}
-
-	sysdev_resume();
+static void xen_pre_suspend(void)
+{
+	xen_mm_pin_all();
+	gnttab_suspend();
+	xen_arch_pre_suspend();
+}

-	return 0;
+static void xen_post_suspend(int cancelled)
+{
+	xen_arch_post_suspend(cancelled);
+	gnttab_resume();
+	xen_mm_unpin_all();
 }

+#ifdef CONFIG_PM_SLEEP
 static int xen_suspend(void *data)
 {
+	struct suspend_info *si = data;
 	int err;
-	int *cancelled = data;

 	BUG_ON(!irqs_disabled());

@@ -80,22 +76,20 @@ static int xen_suspend(void *data)
 		return err;
 	}

-	xen_mm_pin_all();
-	gnttab_suspend();
-	xen_pre_suspend();
+	if (si->pre)
+		si->pre();

 	/*
 	 * This hypercall returns 1 if suspend was cancelled
 	 * or the domain was merely checkpointed, and 0 if it
 	 * is resuming in a new domain.
 	 */
-	*cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+	si->cancelled = HYPERVISOR_suspend(si->arg);

-	xen_post_suspend(*cancelled);
-	gnttab_resume();
-	xen_mm_unpin_all();
+	if (si->post)
+		si->post(si->cancelled);

-	if (!*cancelled) {
+	if (!si->cancelled) {
 		xen_irq_resume();
 		xen_console_resume();
 		xen_timer_resume();
@@ -109,7 +103,7 @@ static int xen_suspend(void *data)
 static void do_suspend(void)
 {
 	int err;
-	int cancelled = 1;
+	struct suspend_info si;

 	shutting_down = SHUTDOWN_SUSPEND;

@@ -139,20 +133,29 @@ static void do_suspend(void)
 		goto out_resume;
 	}

-	if (xen_hvm_domain())
-		err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
-	else
-		err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+	si.cancelled = 1;
+
+	if (xen_hvm_domain()) {
+		si.arg = 0UL;
+		si.pre = NULL;
+		si.post = &xen_hvm_post_suspend;
+	} else {
+		si.arg = virt_to_mfn(xen_start_info);
+		si.pre = &xen_pre_suspend;
+		si.post = &xen_post_suspend;
+	}
+
+	err = stop_machine(xen_suspend, &si, cpumask_of(0));

 	dpm_resume_noirq(PMSG_RESUME);

 	if (err) {
 		printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
-		cancelled = 1;
+		si.cancelled = 1;
 	}

 out_resume:
-	if (!cancelled) {
+	if (!si.cancelled) {
 		xen_arch_resume();
 		xs_resume();
 	} else
@@ -172,12 +175,39 @@ out:
 }
 #endif	/* CONFIG_PM_SLEEP */

+struct shutdown_handler {
+	const char *command;
+	void (*cb)(void);
+};
+
+static void do_poweroff(void)
+{
+	shutting_down = SHUTDOWN_POWEROFF;
+	orderly_poweroff(false);
+}
+
+static void do_reboot(void)
+{
+	shutting_down = SHUTDOWN_POWEROFF; /* ? */
+	ctrl_alt_del();
+}
+
 static void shutdown_handler(struct xenbus_watch *watch,
 			     const char **vec, unsigned int len)
 {
 	char *str;
 	struct xenbus_transaction xbt;
 	int err;
+	static struct shutdown_handler handlers[] = {
+		{ "poweroff",	do_poweroff },
+		{ "halt",	do_poweroff },
+		{ "reboot",	do_reboot   },
+#ifdef CONFIG_PM_SLEEP
+		{ "suspend",	do_suspend  },
+#endif
+		{NULL, NULL},
+	};
+	static struct shutdown_handler *handler;

 	if (shutting_down != SHUTDOWN_INVALID)
 		return;
@@ -194,7 +224,14 @@ static void shutdown_handler(struct xenbus_watch *watch,
 		return;
 	}

-	xenbus_write(xbt, "control", "shutdown", "");
+	for (handler = &handlers[0]; handler->command; handler++) {
+		if (strcmp(str, handler->command) == 0)
+			break;
+	}
+
+	/* Only acknowledge commands which we are prepared to handle. */
+	if (handler->cb)
+		xenbus_write(xbt, "control", "shutdown", "");

 	err = xenbus_transaction_end(xbt, 0);
 	if (err == -EAGAIN) {
@@ -202,17 +239,8 @@ static void shutdown_handler(struct xenbus_watch *watch,
 		goto again;
 	}

-	if (strcmp(str, "poweroff") == 0 ||
-	    strcmp(str, "halt") == 0) {
-		shutting_down = SHUTDOWN_POWEROFF;
-		orderly_poweroff(false);
-	} else if (strcmp(str, "reboot") == 0) {
-		shutting_down = SHUTDOWN_POWEROFF; /* ? */
-		ctrl_alt_del();
-#ifdef CONFIG_PM_SLEEP
-	} else if (strcmp(str, "suspend") == 0) {
-		do_suspend();
-#endif
+	if (handler->cb) {
+		handler->cb();
 	} else {
 		printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
 		shutting_down = SHUTDOWN_INVALID;
@@ -291,27 +319,18 @@ static int shutdown_event(struct notifier_block *notifier,
 	return NOTIFY_DONE;
 }

-static int __init __setup_shutdown_event(void)
-{
-	/* Delay initialization in the PV on HVM case */
-	if (xen_hvm_domain())
-		return 0;
-
-	if (!xen_pv_domain())
-		return -ENODEV;
-
-	return xen_setup_shutdown_event();
-}
-
 int xen_setup_shutdown_event(void)
 {
 	static struct notifier_block xenstore_notifier = {
 		.notifier_call = shutdown_event
 	};
+
+	if (!xen_domain())
+		return -ENODEV;
 	register_xenstore_notifier(&xenstore_notifier);

 	return 0;
 }
 EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);

-subsys_initcall(__setup_shutdown_event);
+subsys_initcall(xen_setup_shutdown_event);
diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile
new file mode 100644
index 0000000..38bc123
--- /dev/null
+++ b/drivers/xen/pciback/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+
+xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
+xen-pciback-y += conf_space.o conf_space_header.o \
+		 conf_space_capability.o \
+		 conf_space_capability_vpd.o \
+		 conf_space_capability_pm.o \
+		 conf_space_quirks.o
+xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
+xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
+
+ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c
new file mode 100644
index 0000000..eb6bba0
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.c
@@ -0,0 +1,435 @@
+/*
+ * PCI Backend - Functions for creating a virtual configuration space for
+ *               exported PCI Devices.
+ *               It's dangerous to allow PCI Driver Domains to change their
+ *               device's resources (memory, i/o ports, interrupts). We need to
+ *               restrict changes to certain PCI Configuration registers:
+ *               BARs, INTERRUPT_PIN, most registers in the header...
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+static int permissive;
+module_param(permissive, bool, 0644);
+
+#define DEFINE_PCI_CONFIG(op, size, type)			\
+int pciback_##op##_config_##size				\
+(struct pci_dev *dev, int offset, type value, void *data)	\
+{								\
+	return pci_##op##_config_##size(dev, offset, value);	\
+}
+
+DEFINE_PCI_CONFIG(read, byte, u8 *)
+DEFINE_PCI_CONFIG(read, word, u16 *)
+DEFINE_PCI_CONFIG(read, dword, u32 *)
+
+DEFINE_PCI_CONFIG(write, byte, u8)
+DEFINE_PCI_CONFIG(write, word, u16)
+DEFINE_PCI_CONFIG(write, dword, u32)
+
+static int conf_space_read(struct pci_dev *dev,
+			   const struct config_field_entry *entry,
+			   int offset, u32 *value)
+{
+	int ret = 0;
+	const struct config_field *field = entry->field;
+
+	*value = 0;
+
+	switch (field->size) {
+	case 1:
+		if (field->u.b.read)
+			ret = field->u.b.read(dev, offset, (u8 *) value,
+					      entry->data);
+		break;
+	case 2:
+		if (field->u.w.read)
+			ret = field->u.w.read(dev, offset, (u16 *) value,
+					      entry->data);
+		break;
+	case 4:
+		if (field->u.dw.read)
+			ret = field->u.dw.read(dev, offset, value, entry->data);
+		break;
+	}
+	return ret;
+}
+
+static int conf_space_write(struct pci_dev *dev,
+			    const struct config_field_entry *entry,
+			    int offset, u32 value)
+{
+	int ret = 0;
+	const struct config_field *field = entry->field;
+
+	switch (field->size) {
+	case 1:
+		if (field->u.b.write)
+			ret = field->u.b.write(dev, offset, (u8) value,
+					       entry->data);
+		break;
+	case 2:
+		if (field->u.w.write)
+			ret = field->u.w.write(dev, offset, (u16) value,
+					       entry->data);
+		break;
+	case 4:
+		if (field->u.dw.write)
+			ret = field->u.dw.write(dev, offset, value,
+						entry->data);
+		break;
+	}
+	return ret;
+}
+
+static inline u32 get_mask(int size)
+{
+	if (size == 1)
+		return 0xff;
+	else if (size == 2)
+		return 0xffff;
+	else
+		return 0xffffffff;
+}
+
+static inline int valid_request(int offset, int size)
+{
+	/* Validate request (no un-aligned requests) */
+	if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
+		return 1;
+	return 0;
+}
+
+static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
+			      int offset)
+{
+	if (offset >= 0) {
+		new_val_mask <<= (offset * 8);
+		new_val <<= (offset * 8);
+	} else {
+		new_val_mask >>= (offset * -8);
+		new_val >>= (offset * -8);
+	}
+	val = (val & ~new_val_mask) | (new_val & new_val_mask);
+
+	return val;
+}
+
+static int pcibios_err_to_errno(int err)
+{
+	switch (err) {
+	case PCIBIOS_SUCCESSFUL:
+		return XEN_PCI_ERR_success;
+	case PCIBIOS_DEVICE_NOT_FOUND:
+		return XEN_PCI_ERR_dev_not_found;
+	case PCIBIOS_BAD_REGISTER_NUMBER:
+		return XEN_PCI_ERR_invalid_offset;
+	case PCIBIOS_FUNC_NOT_SUPPORTED:
+		return XEN_PCI_ERR_not_implemented;
+	case PCIBIOS_SET_FAILED:
+		return XEN_PCI_ERR_access_denied;
+	}
+	return err;
+}
+
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+			u32 *ret_val)
+{
+	int err = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+	int req_start, req_end, field_start, field_end;
+	/* if read fails for any reason, return 0
+	 * (as if device didn't respond) */
+	u32 value = 0, tmp_val;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
+		       pci_name(dev), size, offset);
+
+	if (!valid_request(offset, size)) {
+		err = XEN_PCI_ERR_invalid_offset;
+		goto out;
+	}
+
+	/* Get the real value first, then modify as appropriate */
+	switch (size) {
+	case 1:
+		err = pci_read_config_byte(dev, offset, (u8 *) &value);
+		break;
+	case 2:
+		err = pci_read_config_word(dev, offset, (u16 *) &value);
+		break;
+	case 4:
+		err = pci_read_config_dword(dev, offset, &value);
+		break;
+	}
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		req_start = offset;
+		req_end = offset + size;
+		field_start = OFFSET(cfg_entry);
+		field_end = OFFSET(cfg_entry) + field->size;
+
+		if ((req_start >= field_start && req_start < field_end)
+		    || (req_end > field_start && req_end <= field_end)) {
+			err = conf_space_read(dev, cfg_entry, field_start,
+					      &tmp_val);
+			if (err)
+				goto out;
+
+			value = merge_value(value, tmp_val,
+					    get_mask(field->size),
+					    field_start - req_start);
+		}
+	}
+
+out:
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
+		       pci_name(dev), size, offset, value);
+
+	*ret_val = value;
+	return pcibios_err_to_errno(err);
+}
+
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
+{
+	int err = 0, handled = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+	u32 tmp_val;
+	int req_start, req_end, field_start, field_end;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG
+		       "pciback: %s: write request %d bytes at 0x%x = %x\n",
+		       pci_name(dev), size, offset, value);
+
+	if (!valid_request(offset, size))
+		return XEN_PCI_ERR_invalid_offset;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		req_start = offset;
+		req_end = offset + size;
+		field_start = OFFSET(cfg_entry);
+		field_end = OFFSET(cfg_entry) + field->size;
+
+		if ((req_start >= field_start && req_start < field_end)
+		    || (req_end > field_start && req_end <= field_end)) {
+			tmp_val = 0;
+
+			err = pciback_config_read(dev, field_start,
+						  field->size, &tmp_val);
+			if (err)
+				break;
+
+			tmp_val = merge_value(tmp_val, value, get_mask(size),
+					      req_start - field_start);
+
+			err = conf_space_write(dev, cfg_entry, field_start,
+					       tmp_val);
+
+			/* handled is set true here, but not every byte
+			 * may have been written! Properly detecting if
+			 * every byte is handled is unnecessary as the
+			 * flag is used to detect devices that need
+			 * special helpers to work correctly.
+			 */
+			handled = 1;
+		}
+	}
+
+	if (!handled && !err) {
+		/* By default, anything not specificially handled above is
+		 * read-only. The permissive flag changes this behavior so
+		 * that anything not specifically handled above is writable.
+		 * This means that some fields may still be read-only because
+		 * they have entries in the config_field list that intercept
+		 * the write and do nothing. */
+		if (dev_data->permissive || permissive) {
+			switch (size) {
+			case 1:
+				err = pci_write_config_byte(dev, offset,
+							    (u8) value);
+				break;
+			case 2:
+				err = pci_write_config_word(dev, offset,
+							    (u16) value);
+				break;
+			case 4:
+				err = pci_write_config_dword(dev, offset,
+							     (u32) value);
+				break;
+			}
+		} else if (!dev_data->warned_on_write) {
+			dev_data->warned_on_write = 1;
+			dev_warn(&dev->dev, "Driver tried to write to a "
+				 "read-only configuration space field at offset"
+				 " 0x%x, size %d. This may be harmless, but if "
+				 "you have problems with your device:\n"
+				 "1) see permissive attribute in sysfs\n"
+				 "2) report problems to the xen-devel "
+				 "mailing list along with details of your "
+				 "device obtained from lspci.\n", offset, size);
+		}
+	}
+
+	return pcibios_err_to_errno(err);
+}
+
+void pciback_config_free_dyn_fields(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry, *t;
+	const struct config_field *field;
+
+	dev_dbg(&dev->dev, "free-ing dynamically allocated virtual "
+			   "configuration space fields\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		if (field->clean) {
+			field->clean((struct config_field *)field);
+
+			kfree(cfg_entry->data);
+
+			list_del(&cfg_entry->list);
+			kfree(cfg_entry);
+		}
+
+	}
+}
+
+void pciback_config_reset_dev(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	const struct config_field_entry *cfg_entry;
+	const struct config_field *field;
+
+	dev_dbg(&dev->dev, "resetting virtual configuration space\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		field = cfg_entry->field;
+
+		if (field->reset)
+			field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
+	}
+}
+
+void pciback_config_free_dev(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry, *t;
+	const struct config_field *field;
+
+	dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
+	if (!dev_data)
+		return;
+
+	list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
+		list_del(&cfg_entry->list);
+
+		field = cfg_entry->field;
+
+		if (field->release)
+			field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
+
+		kfree(cfg_entry);
+	}
+}
+
+int pciback_config_add_field_offset(struct pci_dev *dev,
+				    const struct config_field *field,
+				    unsigned int base_offset)
+{
+	int err = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry;
+	void *tmp;
+
+	cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
+	if (!cfg_entry) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	cfg_entry->data = NULL;
+	cfg_entry->field = field;
+	cfg_entry->base_offset = base_offset;
+
+	/* silently ignore duplicate fields */
+	err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
+	if (err)
+		goto out;
+
+	if (field->init) {
+		tmp = field->init(dev, OFFSET(cfg_entry));
+
+		if (IS_ERR(tmp)) {
+			err = PTR_ERR(tmp);
+			goto out;
+		}
+
+		cfg_entry->data = tmp;
+	}
+
+	dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
+		OFFSET(cfg_entry));
+	list_add_tail(&cfg_entry->list, &dev_data->config_fields);
+
+out:
+	if (err)
+		kfree(cfg_entry);
+
+	return err;
+}
+
+/* This sets up the device's virtual configuration space to keep track of
+ * certain registers (like the base address registers (BARs) so that we can
+ * keep the client from manipulating them directly.
+ */
+int pciback_config_init_dev(struct pci_dev *dev)
+{
+	int err = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
+	dev_dbg(&dev->dev, "initializing virtual configuration space\n");
+
+	INIT_LIST_HEAD(&dev_data->config_fields);
+
+	err = pciback_config_header_add_fields(dev);
+	if (err)
+		goto out;
+
+	err = pciback_config_capability_add_fields(dev);
+	if (err)
+		goto out;
+
+	err = pciback_config_quirks_init(dev);
+
+out:
+	return err;
+}
+
+int pciback_config_init(void)
+{
+	return pciback_config_capability_init();
+}
diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
new file mode 100644
index 0000000..50ebef2
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.h
@@ -0,0 +1,126 @@
+/*
+ * PCI Backend - Common data structures for overriding the configuration space
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_H__
+#define __XEN_PCIBACK_CONF_SPACE_H__
+
+#include <linux/list.h>
+#include <linux/err.h>
+
+/* conf_field_init can return an errno in a ptr with ERR_PTR() */
+typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
+typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
+typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
+
+typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
+				 void *data);
+typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
+				void *data);
+typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
+				void *data);
+typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
+				void *data);
+typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
+			       void *data);
+typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
+			       void *data);
+
+/* These are the fields within the configuration space which we
+ * are interested in intercepting reads/writes to and changing their
+ * values.
+ */
+struct config_field {
+	unsigned int offset;
+	unsigned int size;
+	unsigned int mask;
+	conf_field_init init;
+	conf_field_reset reset;
+	conf_field_free release;
+	void (*clean) (struct config_field *field);
+	union {
+		struct {
+			conf_dword_write write;
+			conf_dword_read read;
+		} dw;
+		struct {
+			conf_word_write write;
+			conf_word_read read;
+		} w;
+		struct {
+			conf_byte_write write;
+			conf_byte_read read;
+		} b;
+	} u;
+	struct list_head list;
+};
+
+struct config_field_entry {
+	struct list_head list;
+	const struct config_field *field;
+	unsigned int base_offset;
+	void *data;
+};
+
+#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
+
+/* Add fields to a device - the add_fields macro expects to get a pointer to
+ * the first entry in an array (of which the ending is marked by size==0)
+ */
+int pciback_config_add_field_offset(struct pci_dev *dev,
+				    const struct config_field *field,
+				    unsigned int offset);
+
+static inline int pciback_config_add_field(struct pci_dev *dev,
+					   const struct config_field *field)
+{
+	return pciback_config_add_field_offset(dev, field, 0);
+}
+
+static inline int pciback_config_add_fields(struct pci_dev *dev,
+					    const struct config_field *field)
+{
+	int i, err = 0;
+	for (i = 0; field[i].size != 0; i++) {
+		err = pciback_config_add_field(dev, &field[i]);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
+					const struct config_field *field,
+					unsigned int offset)
+{
+	int i, err = 0;
+	for (i = 0; field[i].size != 0; i++) {
+		err = pciback_config_add_field_offset(dev, &field[i], offset);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/* Read/Write the real configuration space */
+int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
+			     void *data);
+int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value,
+			     void *data);
+int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
+			      void *data);
+int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
+			      void *data);
+int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
+			      void *data);
+int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
+			       void *data);
+
+int pciback_config_capability_init(void);
+
+int pciback_config_header_add_fields(struct pci_dev *dev);
+int pciback_config_capability_add_fields(struct pci_dev *dev);
+
+#endif				/* __XEN_PCIBACK_CONF_SPACE_H__ */
diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c
new file mode 100644
index 0000000..0ea84d6
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.c
@@ -0,0 +1,66 @@
+/*
+ * PCI Backend - Handles the virtual fields found on the capability lists
+ *               in the configuration space.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static LIST_HEAD(capabilities);
+
+static const struct config_field caplist_header[] = {
+	{
+	 .offset    = PCI_CAP_LIST_ID,
+	 .size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
+	 .u.w.read  = pciback_read_config_word,
+	 .u.w.write = NULL,
+	},
+	{}
+};
+
+static inline void register_capability(struct pciback_config_capability *cap)
+{
+	list_add_tail(&cap->cap_list, &capabilities);
+}
+
+int pciback_config_capability_add_fields(struct pci_dev *dev)
+{
+	int err = 0;
+	struct pciback_config_capability *cap;
+	int cap_offset;
+
+	list_for_each_entry(cap, &capabilities, cap_list) {
+		cap_offset = pci_find_capability(dev, cap->capability);
+		if (cap_offset) {
+			dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
+				cap->capability, cap_offset);
+
+			err = pciback_config_add_fields_offset(dev,
+							       caplist_header,
+							       cap_offset);
+			if (err)
+				goto out;
+			err = pciback_config_add_fields_offset(dev,
+							       cap->fields,
+							       cap_offset);
+			if (err)
+				goto out;
+		}
+	}
+
+out:
+	return err;
+}
+
+int pciback_config_capability_init(void)
+{
+	register_capability(&pciback_config_capability_vpd);
+	register_capability(&pciback_config_capability_pm);
+
+	return 0;
+}
diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
new file mode 100644
index 0000000..8da3ac4
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.h
@@ -0,0 +1,26 @@
+/*
+ * PCI Backend - Data structures for special overlays for structures on
+ *               the capability list.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
+#define __PCIBACK_CONFIG_CAPABILITY_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_capability {
+	struct list_head cap_list;
+
+	int capability;
+
+	/* If the device has the capability found above, add these fields */
+	const struct config_field *fields;
+};
+
+extern struct pciback_config_capability pciback_config_capability_vpd;
+extern struct pciback_config_capability pciback_config_capability_pm;
+
+#endif
diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
new file mode 100644
index 0000000..041e4aa
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_msi.c
@@ -0,0 +1,136 @@
+/*
+ * PCI Backend -- Configuration overlay for MSI capability
+ */
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+#include <xen/interface/io/pciif.h>
+#include <xen/events.h>
+#include "pciback.h"
+
+int pciback_enable_msi(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct pciback_dev_data *dev_data;
+	int otherend = pdev->xdev->otherend_id;
+	int status;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: enable MSI\n", pci_name(dev));
+
+	status = pci_enable_msi(dev);
+
+	if (status) {
+		printk(KERN_ERR "error enable msi for guest %x status %x\n",
+			otherend, status);
+		op->value = 0;
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	/* The value the guest needs is actually the IDT vector, not the
+	 * the local domain's IRQ number. */
+
+	op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: MSI: %d\n", pci_name(dev),
+			op->value);
+
+	dev_data = pci_get_drvdata(dev);
+	if (dev_data)
+		dev_data->ack_intr = 0;
+
+	return 0;
+}
+
+int pciback_disable_msi(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct pciback_dev_data *dev_data;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: disable MSI\n", pci_name(dev));
+	pci_disable_msi(dev);
+
+	op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: MSI: %d\n", pci_name(dev),
+			op->value);
+	dev_data = pci_get_drvdata(dev);
+	if (dev_data)
+		dev_data->ack_intr = 1;
+	return 0;
+}
+
+int pciback_enable_msix(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct pciback_dev_data *dev_data;
+	int i, result;
+	struct msix_entry *entries;
+
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: enable MSI-X\n", pci_name(dev));
+	if (op->value > SH_INFO_MAX_VEC)
+		return -EINVAL;
+
+	entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
+	if (entries == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < op->value; i++) {
+		entries[i].entry = op->msix_entries[i].entry;
+		entries[i].vector = op->msix_entries[i].vector;
+	}
+
+	result = pci_enable_msix(dev, entries, op->value);
+
+	if (result == 0) {
+		for (i = 0; i < op->value; i++) {
+			op->msix_entries[i].entry = entries[i].entry;
+			if (entries[i].vector)
+				op->msix_entries[i].vector =
+					xen_pirq_from_irq(entries[i].vector);
+				if (unlikely(verbose_request))
+					printk(KERN_DEBUG "pciback: %s: " \
+						"MSI-X[%d]: %d\n",
+						pci_name(dev), i,
+						op->msix_entries[i].vector);
+		}
+	} else {
+		printk(KERN_WARNING "pciback: %s: failed to enable MSI-X: err %d!\n",
+			pci_name(dev), result);
+	}
+	kfree(entries);
+
+	op->value = result;
+	dev_data = pci_get_drvdata(dev);
+	if (dev_data)
+		dev_data->ack_intr = 0;
+
+	return result;
+}
+
+int pciback_disable_msix(struct pciback_device *pdev,
+		struct pci_dev *dev, struct xen_pci_op *op)
+{
+	struct pciback_dev_data *dev_data;
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: disable MSI-X\n",
+			pci_name(dev));
+	pci_disable_msix(dev);
+
+	/*
+	 * SR-IOV devices (which don't have any legacy IRQ) have
+	 * an undefined IRQ value of zero.
+	 */
+	op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+	if (unlikely(verbose_request))
+		printk(KERN_DEBUG "pciback: %s: MSI-X: %d\n", pci_name(dev),
+			op->value);
+	dev_data = pci_get_drvdata(dev);
+	if (dev_data)
+		dev_data->ack_intr = 1;
+	return 0;
+}
+
diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c
new file mode 100644
index 0000000..0442616
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_pm.c
@@ -0,0 +1,113 @@
+/*
+ * PCI Backend - Configuration space overlay for power management
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
+			void *data)
+{
+	int err;
+	u16 real_value;
+
+	err = pci_read_config_word(dev, offset, &real_value);
+	if (err)
+		goto out;
+
+	*value = real_value & ~PCI_PM_CAP_PME_MASK;
+
+out:
+	return err;
+}
+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
+ * Can't allow driver domain to enable PMEs - they're shared */
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
+
+static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
+			 void *data)
+{
+	int err;
+	u16 old_value;
+	pci_power_t new_state, old_state;
+
+	err = pci_read_config_word(dev, offset, &old_value);
+	if (err)
+		goto out;
+
+	old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
+	new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
+
+	new_value &= PM_OK_BITS;
+	if ((old_value & PM_OK_BITS) != new_value) {
+		new_value = (old_value & ~PM_OK_BITS) | new_value;
+		err = pci_write_config_word(dev, offset, new_value);
+		if (err)
+			goto out;
+	}
+
+	/* Let pci core handle the power management change */
+	dev_dbg(&dev->dev, "set power state to %x\n", new_state);
+	err = pci_set_power_state(dev, new_state);
+	if (err) {
+		err = PCIBIOS_SET_FAILED;
+		goto out;
+	}
+
+ out:
+	return err;
+}
+
+/* Ensure PMEs are disabled */
+static void *pm_ctrl_init(struct pci_dev *dev, int offset)
+{
+	int err;
+	u16 value;
+
+	err = pci_read_config_word(dev, offset, &value);
+	if (err)
+		goto out;
+
+	if (value & PCI_PM_CTRL_PME_ENABLE) {
+		value &= ~PCI_PM_CTRL_PME_ENABLE;
+		err = pci_write_config_word(dev, offset, value);
+	}
+
+out:
+	return ERR_PTR(err);
+}
+
+static const struct config_field caplist_pm[] = {
+	{
+		.offset     = PCI_PM_PMC,
+		.size       = 2,
+		.u.w.read   = pm_caps_read,
+	},
+	{
+		.offset     = PCI_PM_CTRL,
+		.size       = 2,
+		.init       = pm_ctrl_init,
+		.u.w.read   = pciback_read_config_word,
+		.u.w.write  = pm_ctrl_write,
+	},
+	{
+		.offset     = PCI_PM_PPB_EXTENSIONS,
+		.size       = 1,
+		.u.b.read   = pciback_read_config_byte,
+	},
+	{
+		.offset     = PCI_PM_DATA_REGISTER,
+		.size       = 1,
+		.u.b.read   = pciback_read_config_byte,
+	},
+	{}
+};
+
+struct pciback_config_capability pciback_config_capability_pm = {
+	.capability = PCI_CAP_ID_PM,
+	.fields = caplist_pm,
+};
diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c
new file mode 100644
index 0000000..e7b4d66
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_vpd.c
@@ -0,0 +1,40 @@
+/*
+ * PCI Backend - Configuration space overlay for Vital Product Data
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/pci.h>
+#include "conf_space.h"
+#include "conf_space_capability.h"
+
+static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
+			     void *data)
+{
+	/* Disallow writes to the vital product data */
+	if (value & PCI_VPD_ADDR_F)
+		return PCIBIOS_SET_FAILED;
+	else
+		return pci_write_config_word(dev, offset, value);
+}
+
+static const struct config_field caplist_vpd[] = {
+	{
+	 .offset    = PCI_VPD_ADDR,
+	 .size      = 2,
+	 .u.w.read  = pciback_read_config_word,
+	 .u.w.write = vpd_address_write,
+	 },
+	{
+	 .offset     = PCI_VPD_DATA,
+	 .size       = 4,
+	 .u.dw.read  = pciback_read_config_dword,
+	 .u.dw.write = NULL,
+	 },
+	{}
+};
+
+struct pciback_config_capability pciback_config_capability_vpd = {
+	.capability = PCI_CAP_ID_VPD,
+	.fields = caplist_vpd,
+};
diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c
new file mode 100644
index 0000000..22ad0f5
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_header.c
@@ -0,0 +1,385 @@
+/*
+ * PCI Backend - Handles the virtual fields in the configuration space headers.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+
+struct pci_bar_info {
+	u32 val;
+	u32 len_val;
+	int which;
+};
+
+#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
+#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
+
+static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
+{
+	int i;
+	int ret;
+
+	ret = pciback_read_config_word(dev, offset, value, data);
+	if (!atomic_read(&dev->enable_cnt))
+		return ret;
+
+	for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+		if (dev->resource[i].flags & IORESOURCE_IO)
+			*value |= PCI_COMMAND_IO;
+		if (dev->resource[i].flags & IORESOURCE_MEM)
+			*value |= PCI_COMMAND_MEMORY;
+	}
+
+	return ret;
+}
+
+static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
+{
+	struct pciback_dev_data *dev_data;
+	int err;
+
+	dev_data = pci_get_drvdata(dev);
+	if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG "pciback: %s: enable\n",
+			       pci_name(dev));
+		err = pci_enable_device(dev);
+		if (err)
+			return err;
+		if (dev_data)
+			dev_data->enable_intx = 1;
+	} else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG "pciback: %s: disable\n",
+			       pci_name(dev));
+		pci_disable_device(dev);
+		if (dev_data)
+			dev_data->enable_intx = 0;
+	}
+
+	if (!dev->is_busmaster && is_master_cmd(value)) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG "pciback: %s: set bus master\n",
+			       pci_name(dev));
+		pci_set_master(dev);
+	}
+
+	if (value & PCI_COMMAND_INVALIDATE) {
+		if (unlikely(verbose_request))
+			printk(KERN_DEBUG
+			       "pciback: %s: enable memory-write-invalidate\n",
+			       pci_name(dev));
+		err = pci_set_mwi(dev);
+		if (err) {
+			printk(KERN_WARNING
+			       "pciback: %s: cannot enable "
+			       "memory-write-invalidate (%d)\n",
+			       pci_name(dev), err);
+			value &= ~PCI_COMMAND_INVALIDATE;
+		}
+	}
+
+	return pci_write_config_word(dev, offset, value);
+}
+
+static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+	struct pci_bar_info *bar = data;
+
+	if (unlikely(!bar)) {
+		printk(KERN_WARNING "pciback: driver data not found for %s\n",
+		       pci_name(dev));
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	/* A write to obtain the length must happen as a 32-bit write.
+	 * This does not (yet) support writing individual bytes
+	 */
+	if (value == ~PCI_ROM_ADDRESS_ENABLE)
+		bar->which = 1;
+	else {
+		u32 tmpval;
+		pci_read_config_dword(dev, offset, &tmpval);
+		if (tmpval != bar->val && value == bar->val) {
+			/* Allow restoration of bar value. */
+			pci_write_config_dword(dev, offset, bar->val);
+		}
+		bar->which = 0;
+	}
+
+	/* Do we need to support enabling/disabling the rom address here? */
+
+	return 0;
+}
+
+/* For the BARs, only allow writes which write ~0 or
+ * the correct resource information
+ * (Needed for when the driver probes the resource usage)
+ */
+static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
+{
+	struct pci_bar_info *bar = data;
+
+	if (unlikely(!bar)) {
+		printk(KERN_WARNING "pciback: driver data not found for %s\n",
+		       pci_name(dev));
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	/* A write to obtain the length must happen as a 32-bit write.
+	 * This does not (yet) support writing individual bytes
+	 */
+	if (value == ~0)
+		bar->which = 1;
+	else {
+		u32 tmpval;
+		pci_read_config_dword(dev, offset, &tmpval);
+		if (tmpval != bar->val && value == bar->val) {
+			/* Allow restoration of bar value. */
+			pci_write_config_dword(dev, offset, bar->val);
+		}
+		bar->which = 0;
+	}
+
+	return 0;
+}
+
+static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
+{
+	struct pci_bar_info *bar = data;
+
+	if (unlikely(!bar)) {
+		printk(KERN_WARNING "pciback: driver data not found for %s\n",
+		       pci_name(dev));
+		return XEN_PCI_ERR_op_failed;
+	}
+
+	*value = bar->which ? bar->len_val : bar->val;
+
+	return 0;
+}
+
+static inline void read_dev_bar(struct pci_dev *dev,
+				struct pci_bar_info *bar_info, int offset,
+				u32 len_mask)
+{
+	int	pos;
+	struct resource	*res = dev->resource;
+
+	if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
+		pos = PCI_ROM_RESOURCE;
+	else {
+		pos = (offset - PCI_BASE_ADDRESS_0) / 4;
+		if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
+				PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
+			   (PCI_BASE_ADDRESS_SPACE_MEMORY |
+				PCI_BASE_ADDRESS_MEM_TYPE_64))) {
+			bar_info->val = res[pos - 1].start >> 32;
+			bar_info->len_val = res[pos - 1].end >> 32;
+			return;
+		}
+	}
+
+	bar_info->val = res[pos].start |
+			(res[pos].flags & PCI_REGION_FLAG_MASK);
+	bar_info->len_val = res[pos].end - res[pos].start + 1;
+}
+
+static void *bar_init(struct pci_dev *dev, int offset)
+{
+	struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+	if (!bar)
+		return ERR_PTR(-ENOMEM);
+
+	read_dev_bar(dev, bar, offset, ~0);
+	bar->which = 0;
+
+	return bar;
+}
+
+static void *rom_init(struct pci_dev *dev, int offset)
+{
+	struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
+
+	if (!bar)
+		return ERR_PTR(-ENOMEM);
+
+	read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
+	bar->which = 0;
+
+	return bar;
+}
+
+static void bar_reset(struct pci_dev *dev, int offset, void *data)
+{
+	struct pci_bar_info *bar = data;
+
+	bar->which = 0;
+}
+
+static void bar_release(struct pci_dev *dev, int offset, void *data)
+{
+	kfree(data);
+}
+
+static int pciback_read_vendor(struct pci_dev *dev, int offset,
+			       u16 *value, void *data)
+{
+	*value = dev->vendor;
+
+	return 0;
+}
+
+static int pciback_read_device(struct pci_dev *dev, int offset,
+			       u16 *value, void *data)
+{
+	*value = dev->device;
+
+	return 0;
+}
+
+static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
+			  void *data)
+{
+	*value = (u8) dev->irq;
+
+	return 0;
+}
+
+static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
+{
+	u8 cur_value;
+	int err;
+
+	err = pci_read_config_byte(dev, offset, &cur_value);
+	if (err)
+		goto out;
+
+	if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
+	    || value == PCI_BIST_START)
+		err = pci_write_config_byte(dev, offset, value);
+
+out:
+	return err;
+}
+
+static const struct config_field header_common[] = {
+	{
+	 .offset    = PCI_VENDOR_ID,
+	 .size      = 2,
+	 .u.w.read  = pciback_read_vendor,
+	},
+	{
+	 .offset    = PCI_DEVICE_ID,
+	 .size      = 2,
+	 .u.w.read  = pciback_read_device,
+	},
+	{
+	 .offset    = PCI_COMMAND,
+	 .size      = 2,
+	 .u.w.read  = command_read,
+	 .u.w.write = command_write,
+	},
+	{
+	 .offset    = PCI_INTERRUPT_LINE,
+	 .size      = 1,
+	 .u.b.read  = interrupt_read,
+	},
+	{
+	 .offset    = PCI_INTERRUPT_PIN,
+	 .size      = 1,
+	 .u.b.read  = pciback_read_config_byte,
+	},
+	{
+	 /* Any side effects of letting driver domain control cache line? */
+	 .offset    = PCI_CACHE_LINE_SIZE,
+	 .size      = 1,
+	 .u.b.read  = pciback_read_config_byte,
+	 .u.b.write = pciback_write_config_byte,
+	},
+	{
+	 .offset    = PCI_LATENCY_TIMER,
+	 .size      = 1,
+	 .u.b.read  = pciback_read_config_byte,
+	},
+	{
+	 .offset    = PCI_BIST,
+	 .size      = 1,
+	 .u.b.read  = pciback_read_config_byte,
+	 .u.b.write = bist_write,
+	},
+	{}
+};
+
+#define CFG_FIELD_BAR(reg_offset)			\
+	{						\
+	.offset     = reg_offset,			\
+	.size       = 4,				\
+	.init       = bar_init,				\
+	.reset      = bar_reset,			\
+	.release    = bar_release,			\
+	.u.dw.read  = bar_read,				\
+	.u.dw.write = bar_write,			\
+	}
+
+#define CFG_FIELD_ROM(reg_offset)			\
+	{						\
+	.offset     = reg_offset,			\
+	.size       = 4,				\
+	.init       = rom_init,				\
+	.reset      = bar_reset,			\
+	.release    = bar_release,			\
+	.u.dw.read  = bar_read,				\
+	.u.dw.write = rom_write,			\
+	}
+
+static const struct config_field header_0[] = {
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
+	CFG_FIELD_ROM(PCI_ROM_ADDRESS),
+	{}
+};
+
+static const struct config_field header_1[] = {
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
+	CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
+	CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
+	{}
+};
+
+int pciback_config_header_add_fields(struct pci_dev *dev)
+{
+	int err;
+
+	err = pciback_config_add_fields(dev, header_common);
+	if (err)
+		goto out;
+
+	switch (dev->hdr_type) {
+	case PCI_HEADER_TYPE_NORMAL:
+		err = pciback_config_add_fields(dev, header_0);
+		break;
+
+	case PCI_HEADER_TYPE_BRIDGE:
+		err = pciback_config_add_fields(dev, header_1);
+		break;
+
+	default:
+		err = -EINVAL;
+		printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
+		       pci_name(dev), dev->hdr_type);
+		break;
+	}
+
+out:
+	return err;
+}
diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c
new file mode 100644
index 0000000..45c31fb
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.c
@@ -0,0 +1,140 @@
+/*
+ * PCI Backend - Handle special overlays for broken devices.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+LIST_HEAD(pciback_quirks);
+
+static inline const struct pci_device_id *
+match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
+{
+	if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
+	    (id->device == PCI_ANY_ID || id->device == dev->device) &&
+	    (id->subvendor == PCI_ANY_ID ||
+				id->subvendor == dev->subsystem_vendor) &&
+	    (id->subdevice == PCI_ANY_ID ||
+				id->subdevice == dev->subsystem_device) &&
+	    !((id->class ^ dev->class) & id->class_mask))
+		return id;
+	return NULL;
+}
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
+{
+	struct pciback_config_quirk *tmp_quirk;
+
+	list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
+		if (match_one_device(&tmp_quirk->devid, dev) != NULL)
+			goto out;
+	tmp_quirk = NULL;
+	printk(KERN_DEBUG
+	       "quirk didn't match any device pciback knows about\n");
+out:
+	return tmp_quirk;
+}
+
+static inline void register_quirk(struct pciback_config_quirk *quirk)
+{
+	list_add_tail(&quirk->quirks_list, &pciback_quirks);
+}
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
+{
+	int ret = 0;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+	struct config_field_entry *cfg_entry;
+
+	list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+		if (OFFSET(cfg_entry) == reg) {
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+				    *field)
+{
+	int err = 0;
+
+	switch (field->size) {
+	case 1:
+		field->u.b.read = pciback_read_config_byte;
+		field->u.b.write = pciback_write_config_byte;
+		break;
+	case 2:
+		field->u.w.read = pciback_read_config_word;
+		field->u.w.write = pciback_write_config_word;
+		break;
+	case 4:
+		field->u.dw.read = pciback_read_config_dword;
+		field->u.dw.write = pciback_write_config_dword;
+		break;
+	default:
+		err = -EINVAL;
+		goto out;
+	}
+
+	pciback_config_add_field(dev, field);
+
+out:
+	return err;
+}
+
+int pciback_config_quirks_init(struct pci_dev *dev)
+{
+	struct pciback_config_quirk *quirk;
+	int ret = 0;
+
+	quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
+	if (!quirk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	quirk->devid.vendor = dev->vendor;
+	quirk->devid.device = dev->device;
+	quirk->devid.subvendor = dev->subsystem_vendor;
+	quirk->devid.subdevice = dev->subsystem_device;
+	quirk->devid.class = 0;
+	quirk->devid.class_mask = 0;
+	quirk->devid.driver_data = 0UL;
+
+	quirk->pdev = dev;
+
+	register_quirk(quirk);
+out:
+	return ret;
+}
+
+void pciback_config_field_free(struct config_field *field)
+{
+	kfree(field);
+}
+
+int pciback_config_quirk_release(struct pci_dev *dev)
+{
+	struct pciback_config_quirk *quirk;
+	int ret = 0;
+
+	quirk = pciback_find_quirk(dev);
+	if (!quirk) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	list_del(&quirk->quirks_list);
+	kfree(quirk);
+
+out:
+	return ret;
+}
diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h
new file mode 100644
index 0000000..acd0e1a
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.h
@@ -0,0 +1,35 @@
+/*
+ * PCI Backend - Data structures for special overlays for broken devices.
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+
+#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
+
+#include <linux/pci.h>
+#include <linux/list.h>
+
+struct pciback_config_quirk {
+	struct list_head quirks_list;
+	struct pci_device_id devid;
+	struct pci_dev *pdev;
+};
+
+struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
+
+int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
+				    *field);
+
+int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
+
+int pciback_config_quirks_init(struct pci_dev *dev);
+
+void pciback_config_field_free(struct config_field *field);
+
+int pciback_config_quirk_release(struct pci_dev *dev);
+
+int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
+
+#endif
diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c
new file mode 100644
index 0000000..5a7e4cc
--- /dev/null
+++ b/drivers/xen/pciback/controller.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ *      Alex Williamson <alex.williamson@hp.com>
+ *
+ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
+ * controllers.  Devices under the same PCI controller are exposed on the
+ * same virtual domain:bus.  Within a bus, device slots are virtualized
+ * to compact the bus.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/acpi.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_MAX_BUSSES	255
+#define PCI_MAX_SLOTS	32
+
+struct controller_dev_entry {
+	struct list_head list;
+	struct pci_dev *dev;
+	unsigned int devfn;
+};
+
+struct controller_list_entry {
+	struct list_head list;
+	struct pci_controller *controller;
+	unsigned int domain;
+	unsigned int bus;
+	unsigned int next_devfn;
+	struct list_head dev_list;
+};
+
+struct controller_dev_data {
+	struct list_head list;
+	unsigned int next_domain;
+	unsigned int next_bus;
+	spinlock_t lock;
+};
+
+struct walk_info {
+	struct pciback_device *pdev;
+	int resource_count;
+	int root_num;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus,
+				    unsigned int devfn)
+{
+	struct controller_dev_data *dev_data = pdev->pci_dev_data;
+	struct controller_dev_entry *dev_entry;
+	struct controller_list_entry *cntrl_entry;
+	struct pci_dev *dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_data->lock, flags);
+
+	list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+		if (cntrl_entry->domain != domain ||
+		    cntrl_entry->bus != bus)
+			continue;
+
+		list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+			if (devfn == dev_entry->devfn) {
+				dev = dev_entry->dev;
+				goto found;
+			}
+		}
+	}
+found:
+	spin_unlock_irqrestore(&dev_data->lock, flags);
+
+	return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+			int devid, publish_pci_dev_cb publish_cb)
+{
+	struct controller_dev_data *dev_data = pdev->pci_dev_data;
+	struct controller_dev_entry *dev_entry;
+	struct controller_list_entry *cntrl_entry;
+	struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
+	unsigned long flags;
+	int ret = 0, found = 0;
+
+	spin_lock_irqsave(&dev_data->lock, flags);
+
+	/* Look to see if we already have a domain:bus for this controller */
+	list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+		if (cntrl_entry->controller == dev_controller) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
+		if (!cntrl_entry) {
+			ret =  -ENOMEM;
+			goto out;
+		}
+
+		cntrl_entry->controller = dev_controller;
+		cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
+
+		cntrl_entry->domain = dev_data->next_domain;
+		cntrl_entry->bus = dev_data->next_bus++;
+		if (dev_data->next_bus > PCI_MAX_BUSSES) {
+			dev_data->next_domain++;
+			dev_data->next_bus = 0;
+		}
+
+		INIT_LIST_HEAD(&cntrl_entry->dev_list);
+
+		list_add_tail(&cntrl_entry->list, &dev_data->list);
+	}
+
+	if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
+		/*
+		 * While it seems unlikely, this can actually happen if
+		 * a controller has P2P bridges under it.
+		 */
+		xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
+				 "is full, no room to export %04x:%02x:%02x.%x",
+				 cntrl_entry->domain, cntrl_entry->bus,
+				 pci_domain_nr(dev->bus), dev->bus->number,
+				 PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
+	if (!dev_entry) {
+		if (list_empty(&cntrl_entry->dev_list)) {
+			list_del(&cntrl_entry->list);
+			kfree(cntrl_entry);
+		}
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	dev_entry->dev = dev;
+	dev_entry->devfn = cntrl_entry->next_devfn;
+
+	list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
+
+	cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
+
+out:
+	spin_unlock_irqrestore(&dev_data->lock, flags);
+
+	/* TODO: Publish virtual domain:bus:slot.func here. */
+
+	return ret;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+	struct controller_dev_data *dev_data = pdev->pci_dev_data;
+	struct controller_list_entry *cntrl_entry;
+	struct controller_dev_entry *dev_entry = NULL;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_data->lock, flags);
+
+	list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+		if (cntrl_entry->controller != PCI_CONTROLLER(dev))
+			continue;
+
+		list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+			if (dev_entry->dev == dev) {
+				found_dev = dev_entry->dev;
+				break;
+			}
+		}
+	}
+
+	if (!found_dev) {
+		spin_unlock_irqrestore(&dev_data->lock, flags);
+		return;
+	}
+
+	list_del(&dev_entry->list);
+	kfree(dev_entry);
+
+	if (list_empty(&cntrl_entry->dev_list)) {
+		list_del(&cntrl_entry->list);
+		kfree(cntrl_entry);
+	}
+
+	spin_unlock_irqrestore(&dev_data->lock, flags);
+	pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+	struct controller_dev_data *dev_data;
+
+	dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+	if (!dev_data)
+		return -ENOMEM;
+
+	spin_lock_init(&dev_data->lock);
+
+	INIT_LIST_HEAD(&dev_data->list);
+
+	/* Starting domain:bus numbers */
+	dev_data->next_domain = 0;
+	dev_data->next_bus = 0;
+
+	pdev->pci_dev_data = dev_data;
+
+	return 0;
+}
+
+static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
+{
+	struct walk_info *info = data;
+	struct acpi_resource_address64 addr;
+	acpi_status status;
+	int i, len, err;
+	char str[32], tmp[3];
+	unsigned char *ptr, *buf;
+
+	status = acpi_resource_to_address64(res, &addr);
+
+	/* Do we care about this range?  Let's check. */
+	if (!ACPI_SUCCESS(status) ||
+	    !(addr.resource_type == ACPI_MEMORY_RANGE ||
+	      addr.resource_type == ACPI_IO_RANGE) ||
+	    !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
+		return AE_OK;
+
+	/*
+	 * Furthermore, we really only care to tell the guest about
+	 * address ranges that require address translation of some sort.
+	 */
+	if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
+	      addr.info.mem.translation) &&
+	    !(addr.resource_type == ACPI_IO_RANGE &&
+	      addr.info.io.translation))
+		return AE_OK;
+
+	/* Store the resource in xenbus for the guest */
+	len = snprintf(str, sizeof(str), "root-%d-resource-%d",
+		       info->root_num, info->resource_count);
+	if (unlikely(len >= (sizeof(str) - 1)))
+		return AE_OK;
+
+	buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
+	if (!buf)
+		return AE_OK;
+
+	/* Clean out resource_source */
+	res->data.address64.resource_source.index = 0xFF;
+	res->data.address64.resource_source.string_length = 0;
+	res->data.address64.resource_source.string_ptr = NULL;
+
+	ptr = (unsigned char *)res;
+
+	/* Turn the acpi_resource into an ASCII byte stream */
+	for (i = 0; i < sizeof(*res); i++) {
+		snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
+		strncat(buf, tmp, 2);
+	}
+
+	err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
+			    str, "%s", buf);
+
+	if (!err)
+		info->resource_count++;
+
+	kfree(buf);
+
+	return AE_OK;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+			      publish_pci_root_cb publish_root_cb)
+{
+	struct controller_dev_data *dev_data = pdev->pci_dev_data;
+	struct controller_list_entry *cntrl_entry;
+	int i, root_num, len, err = 0;
+	unsigned int domain, bus;
+	char str[64];
+	struct walk_info info;
+
+	spin_lock(&dev_data->lock);
+
+	list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+		/* First publish all the domain:bus info */
+		err = publish_root_cb(pdev, cntrl_entry->domain,
+				      cntrl_entry->bus);
+		if (err)
+			goto out;
+
+		/*
+		 * Now figure out which root-%d this belongs to
+		 * so we can associate resources with it.
+		 */
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+				   "root_num", "%d", &root_num);
+
+		if (err != 1)
+			goto out;
+
+		for (i = 0; i < root_num; i++) {
+			len = snprintf(str, sizeof(str), "root-%d", i);
+			if (unlikely(len >= (sizeof(str) - 1))) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+					   str, "%x:%x", &domain, &bus);
+			if (err != 2)
+				goto out;
+
+			/* Is this the one we just published? */
+			if (domain == cntrl_entry->domain &&
+			    bus == cntrl_entry->bus)
+				break;
+		}
+
+		if (i == root_num)
+			goto out;
+
+		info.pdev = pdev;
+		info.resource_count = 0;
+		info.root_num = i;
+
+		/* Let ACPI do the heavy lifting on decoding resources */
+		acpi_walk_resources(cntrl_entry->controller->acpi_handle,
+				    METHOD_NAME__CRS, write_xenbus_resource,
+				    &info);
+
+		/* No resouces.  OK.  On to the next one */
+		if (!info.resource_count)
+			continue;
+
+		/* Store the number of resources we wrote for this root-%d */
+		len = snprintf(str, sizeof(str), "root-%d-resources", i);
+		if (unlikely(len >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+				    "%d", info.resource_count);
+		if (err)
+			goto out;
+	}
+
+	/* Finally, write some magic to synchronize with the guest. */
+	len = snprintf(str, sizeof(str), "root-resource-magic");
+	if (unlikely(len >= (sizeof(str) - 1))) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+			    "%lx", (sizeof(struct acpi_resource) *2) + 1);
+
+out:
+	spin_unlock(&dev_data->lock);
+
+	return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+	struct controller_dev_data *dev_data = pdev->pci_dev_data;
+	struct controller_list_entry *cntrl_entry, *c;
+	struct controller_dev_entry *dev_entry, *d;
+
+	list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
+		list_for_each_entry_safe(dev_entry, d,
+					 &cntrl_entry->dev_list, list) {
+			list_del(&dev_entry->list);
+			pcistub_put_pci_dev(dev_entry->dev);
+			kfree(dev_entry);
+		}
+		list_del(&cntrl_entry->list);
+		kfree(cntrl_entry);
+	}
+
+	kfree(dev_data);
+	pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+		struct pciback_device *pdev,
+		unsigned int *domain, unsigned int *bus, unsigned int *devfn)
+{
+	struct controller_dev_data *dev_data = pdev->pci_dev_data;
+	struct controller_dev_entry *dev_entry;
+	struct controller_list_entry *cntrl_entry;
+	unsigned long flags;
+	int found = 0;
+	spin_lock_irqsave(&dev_data->lock, flags);
+
+	list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+		list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+			if ((dev_entry->dev->bus->number ==
+					pcidev->bus->number) &&
+				(dev_entry->dev->devfn ==
+					pcidev->devfn) &&
+				(pci_domain_nr(dev_entry->dev->bus) ==
+					pci_domain_nr(pcidev->bus))) {
+				found = 1;
+				*domain = cntrl_entry->domain;
+				*bus = cntrl_entry->bus;
+				*devfn = dev_entry->devfn;
+				goto out;
+			}
+		}
+	}
+out:
+	spin_unlock_irqrestore(&dev_data->lock, flags);
+	return found;
+
+}
+
diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c
new file mode 100644
index 0000000..5386bebf
--- /dev/null
+++ b/drivers/xen/pciback/passthrough.c
@@ -0,0 +1,178 @@
+/*
+ * PCI Backend - Provides restricted access to the real PCI bus topology
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+struct passthrough_dev_data {
+	/* Access to dev_list must be protected by lock */
+	struct list_head dev_list;
+	spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus,
+				    unsigned int devfn)
+{
+	struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+	struct pci_dev_entry *dev_entry;
+	struct pci_dev *dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_data->lock, flags);
+
+	list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+		if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
+		    && bus == (unsigned int)dev_entry->dev->bus->number
+		    && devfn == dev_entry->dev->devfn) {
+			dev = dev_entry->dev;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&dev_data->lock, flags);
+
+	return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+			int devid, publish_pci_dev_cb publish_cb)
+{
+	struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+	struct pci_dev_entry *dev_entry;
+	unsigned long flags;
+	unsigned int domain, bus, devfn;
+	int err;
+
+	dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+	if (!dev_entry)
+		return -ENOMEM;
+	dev_entry->dev = dev;
+
+	spin_lock_irqsave(&dev_data->lock, flags);
+	list_add_tail(&dev_entry->list, &dev_data->dev_list);
+	spin_unlock_irqrestore(&dev_data->lock, flags);
+
+	/* Publish this device. */
+	domain = (unsigned int)pci_domain_nr(dev->bus);
+	bus = (unsigned int)dev->bus->number;
+	devfn = dev->devfn;
+	err = publish_cb(pdev, domain, bus, devfn, devid);
+
+	return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+	struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+	struct pci_dev_entry *dev_entry, *t;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_data->lock, flags);
+
+	list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+		if (dev_entry->dev == dev) {
+			list_del(&dev_entry->list);
+			found_dev = dev_entry->dev;
+			kfree(dev_entry);
+		}
+	}
+
+	spin_unlock_irqrestore(&dev_data->lock, flags);
+
+	if (found_dev)
+		pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+	struct passthrough_dev_data *dev_data;
+
+	dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
+	if (!dev_data)
+		return -ENOMEM;
+
+	spin_lock_init(&dev_data->lock);
+
+	INIT_LIST_HEAD(&dev_data->dev_list);
+
+	pdev->pci_dev_data = dev_data;
+
+	return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+			      publish_pci_root_cb publish_root_cb)
+{
+	int err = 0;
+	struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+	struct pci_dev_entry *dev_entry, *e;
+	struct pci_dev *dev;
+	int found;
+	unsigned int domain, bus;
+
+	spin_lock(&dev_data->lock);
+
+	list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
+		/* Only publish this device as a root if none of its
+		 * parent bridges are exported
+		 */
+		found = 0;
+		dev = dev_entry->dev->bus->self;
+		for (; !found && dev != NULL; dev = dev->bus->self) {
+			list_for_each_entry(e, &dev_data->dev_list, list) {
+				if (dev == e->dev) {
+					found = 1;
+					break;
+				}
+			}
+		}
+
+		domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
+		bus = (unsigned int)dev_entry->dev->bus->number;
+
+		if (!found) {
+			err = publish_root_cb(pdev, domain, bus);
+			if (err)
+				break;
+		}
+	}
+
+	spin_unlock(&dev_data->lock);
+
+	return err;
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+	struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
+	struct pci_dev_entry *dev_entry, *t;
+
+	list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
+		list_del(&dev_entry->list);
+		pcistub_put_pci_dev(dev_entry->dev);
+		kfree(dev_entry);
+	}
+
+	kfree(dev_data);
+	pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+			     struct pciback_device *pdev,
+			     unsigned int *domain, unsigned int *bus,
+			     unsigned int *devfn)
+
+{
+	*domain = pci_domain_nr(pcidev->bus);
+	*bus = pcidev->bus->number;
+	*devfn = pcidev->devfn;
+	return 1;
+}
diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c
new file mode 100644
index 0000000..c4d1071
--- /dev/null
+++ b/drivers/xen/pciback/pci_stub.c
@@ -0,0 +1,1371 @@
+/*
+ * PCI Stub Driver - Grabs devices in backend to be exported later
+ *
+ * Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Chris Bookholt <hap10@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+#include <linux/pci.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/hypervisor.h>
+#include "pciback.h"
+#include "conf_space.h"
+#include "conf_space_quirks.h"
+
+#define DRV_NAME	"pciback"
+
+static char *pci_devs_to_hide;
+wait_queue_head_t aer_wait_queue;
+/*Add sem for sync AER handling and pciback remove/reconfigue ops,
+* We want to avoid in middle of AER ops, pciback devices is being removed
+*/
+static DECLARE_RWSEM(pcistub_sem);
+module_param_named(hide, pci_devs_to_hide, charp, 0444);
+
+struct pcistub_device_id {
+	struct list_head slot_list;
+	int domain;
+	unsigned char bus;
+	unsigned int devfn;
+};
+static LIST_HEAD(pcistub_device_ids);
+static DEFINE_SPINLOCK(device_ids_lock);
+
+struct pcistub_device {
+	struct kref kref;
+	struct list_head dev_list;
+	spinlock_t lock;
+
+	struct pci_dev *dev;
+	struct pciback_device *pdev;/* non-NULL if struct pci_dev is in use */
+};
+
+/* Access to pcistub_devices & seized_devices lists and the initialize_devices
+ * flag must be locked with pcistub_devices_lock
+ */
+static DEFINE_SPINLOCK(pcistub_devices_lock);
+static LIST_HEAD(pcistub_devices);
+
+/* wait for device_initcall before initializing our devices
+ * (see pcistub_init_devices_late)
+ */
+static int initialize_devices;
+static LIST_HEAD(seized_devices);
+
+static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+
+	dev_dbg(&dev->dev, "pcistub_device_alloc\n");
+
+	psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
+	if (!psdev)
+		return NULL;
+
+	psdev->dev = pci_dev_get(dev);
+	if (!psdev->dev) {
+		kfree(psdev);
+		return NULL;
+	}
+
+	kref_init(&psdev->kref);
+	spin_lock_init(&psdev->lock);
+
+	return psdev;
+}
+
+/* Don't call this directly as it's called by pcistub_device_put */
+static void pcistub_device_release(struct kref *kref)
+{
+	struct pcistub_device *psdev;
+
+	psdev = container_of(kref, struct pcistub_device, kref);
+
+	dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
+
+	xen_unregister_device_domain_owner(psdev->dev);
+
+	/* Clean-up the device */
+	pciback_reset_device(psdev->dev);
+	pciback_config_free_dyn_fields(psdev->dev);
+	pciback_config_free_dev(psdev->dev);
+	kfree(pci_get_drvdata(psdev->dev));
+	pci_set_drvdata(psdev->dev, NULL);
+
+	pci_dev_put(psdev->dev);
+
+	kfree(psdev);
+}
+
+static inline void pcistub_device_get(struct pcistub_device *psdev)
+{
+	kref_get(&psdev->kref);
+}
+
+static inline void pcistub_device_put(struct pcistub_device *psdev)
+{
+	kref_put(&psdev->kref, pcistub_device_release);
+}
+
+static struct pcistub_device *pcistub_device_find(int domain, int bus,
+						  int slot, int func)
+{
+	struct pcistub_device *psdev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev != NULL
+		    && domain == pci_domain_nr(psdev->dev->bus)
+		    && bus == psdev->dev->bus->number
+		    && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+			pcistub_device_get(psdev);
+			goto out;
+		}
+	}
+
+	/* didn't find it */
+	psdev = NULL;
+
+out:
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return psdev;
+}
+
+static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
+						  struct pcistub_device *psdev)
+{
+	struct pci_dev *pci_dev = NULL;
+	unsigned long flags;
+
+	pcistub_device_get(psdev);
+
+	spin_lock_irqsave(&psdev->lock, flags);
+	if (!psdev->pdev) {
+		psdev->pdev = pdev;
+		pci_dev = psdev->dev;
+	}
+	spin_unlock_irqrestore(&psdev->lock, flags);
+
+	if (!pci_dev)
+		pcistub_device_put(psdev);
+
+	return pci_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+					    int domain, int bus,
+					    int slot, int func)
+{
+	struct pcistub_device *psdev;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev != NULL
+		    && domain == pci_domain_nr(psdev->dev->bus)
+		    && bus == psdev->dev->bus->number
+		    && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
+			found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return found_dev;
+}
+
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+				    struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_dev = pcistub_device_get_pci_dev(pdev, psdev);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return found_dev;
+}
+
+void pcistub_put_pci_dev(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev, *found_psdev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_psdev = psdev;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	/*hold this lock for avoiding breaking link between
+	* pcistub and pciback when AER is in processing
+	*/
+	down_write(&pcistub_sem);
+	/* Cleanup our device
+	 * (so it's ready for the next domain)
+	 */
+	pciback_reset_device(found_psdev->dev);
+	pciback_config_free_dyn_fields(found_psdev->dev);
+	pciback_config_reset_dev(found_psdev->dev);
+
+	spin_lock_irqsave(&found_psdev->lock, flags);
+	found_psdev->pdev = NULL;
+	spin_unlock_irqrestore(&found_psdev->lock, flags);
+
+	pcistub_device_put(found_psdev);
+	up_write(&pcistub_sem);
+}
+
+static int __devinit pcistub_match_one(struct pci_dev *dev,
+				       struct pcistub_device_id *pdev_id)
+{
+	/* Match the specified device by domain, bus, slot, func and also if
+	 * any of the device's parent bridges match.
+	 */
+	for (; dev != NULL; dev = dev->bus->self) {
+		if (pci_domain_nr(dev->bus) == pdev_id->domain
+		    && dev->bus->number == pdev_id->bus
+		    && dev->devfn == pdev_id->devfn)
+			return 1;
+
+		/* Sometimes topmost bridge links to itself. */
+		if (dev == dev->bus->self)
+			break;
+	}
+
+	return 0;
+}
+
+static int __devinit pcistub_match(struct pci_dev *dev)
+{
+	struct pcistub_device_id *pdev_id;
+	unsigned long flags;
+	int found = 0;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
+		if (pcistub_match_one(dev, pdev_id)) {
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return found;
+}
+
+static int __devinit pcistub_init_device(struct pci_dev *dev)
+{
+	struct pciback_dev_data *dev_data;
+	int err = 0;
+
+	dev_dbg(&dev->dev, "initializing...\n");
+
+	/* The PCI backend is not intended to be a module (or to work with
+	 * removable PCI devices (yet). If it were, pciback_config_free()
+	 * would need to be called somewhere to free the memory allocated
+	 * here and then to call kfree(pci_get_drvdata(psdev->dev)).
+	 */
+	dev_data = kzalloc(sizeof(*dev_data) +  strlen(DRV_NAME "[]")
+				+ strlen(pci_name(dev)) + 1, GFP_ATOMIC);
+	if (!dev_data) {
+		err = -ENOMEM;
+		goto out;
+	}
+	pci_set_drvdata(dev, dev_data);
+
+	/*
+	 * Setup name for fake IRQ handler. It will only be enabled
+	 * once the device is turned on by the guest.
+	 */
+	sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
+
+	dev_dbg(&dev->dev, "initializing config\n");
+
+	init_waitqueue_head(&aer_wait_queue);
+	err = pciback_config_init_dev(dev);
+	if (err)
+		goto out;
+
+	/* HACK: Force device (& ACPI) to determine what IRQ it's on - we
+	 * must do this here because pcibios_enable_device may specify
+	 * the pci device's true irq (and possibly its other resources)
+	 * if they differ from what's in the configuration space.
+	 * This makes the assumption that the device's resources won't
+	 * change after this point (otherwise this code may break!)
+	 */
+	dev_dbg(&dev->dev, "enabling device\n");
+	err = pci_enable_device(dev);
+	if (err)
+		goto config_release;
+
+	/* Now disable the device (this also ensures some private device
+	 * data is setup before we export)
+	 */
+	dev_dbg(&dev->dev, "reset device\n");
+	pciback_reset_device(dev);
+
+	return 0;
+
+config_release:
+	pciback_config_free_dev(dev);
+
+out:
+	pci_set_drvdata(dev, NULL);
+	kfree(dev_data);
+	return err;
+}
+
+/*
+ * Because some initialization still happens on
+ * devices during fs_initcall, we need to defer
+ * full initialization of our devices until
+ * device_initcall.
+ */
+static int __init pcistub_init_devices_late(void)
+{
+	struct pcistub_device *psdev;
+	unsigned long flags;
+	int err = 0;
+
+	pr_debug("pciback: pcistub_init_devices_late\n");
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	while (!list_empty(&seized_devices)) {
+		psdev = container_of(seized_devices.next,
+				     struct pcistub_device, dev_list);
+		list_del(&psdev->dev_list);
+
+		spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+		err = pcistub_init_device(psdev->dev);
+		if (err) {
+			dev_err(&psdev->dev->dev,
+				"error %d initializing device\n", err);
+			kfree(psdev);
+			psdev = NULL;
+		}
+
+		spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+		if (psdev)
+			list_add_tail(&psdev->dev_list, &pcistub_devices);
+	}
+
+	initialize_devices = 1;
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	return 0;
+}
+
+static int __devinit pcistub_seize(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	unsigned long flags;
+	int err = 0;
+
+	psdev = pcistub_device_alloc(dev);
+	if (!psdev)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	if (initialize_devices) {
+		spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+		/* don't want irqs disabled when calling pcistub_init_device */
+		err = pcistub_init_device(psdev->dev);
+
+		spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+		if (!err)
+			list_add(&psdev->dev_list, &pcistub_devices);
+	} else {
+		dev_dbg(&dev->dev, "deferring initialization\n");
+		list_add(&psdev->dev_list, &seized_devices);
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	if (err)
+		pcistub_device_put(psdev);
+
+	return err;
+}
+
+static int __devinit pcistub_probe(struct pci_dev *dev,
+				   const struct pci_device_id *id)
+{
+	int err = 0;
+
+	dev_dbg(&dev->dev, "probing...\n");
+
+	if (pcistub_match(dev)) {
+
+		if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
+		    && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+			dev_err(&dev->dev, "can't export pci devices that "
+				"don't have a normal (0) or bridge (1) "
+				"header type!\n");
+			err = -ENODEV;
+			goto out;
+		}
+
+		dev_info(&dev->dev, "seizing device\n");
+		err = pcistub_seize(dev);
+	} else
+		/* Didn't find the device */
+		err = -ENODEV;
+
+out:
+	return err;
+}
+
+static void pcistub_remove(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev, *found_psdev = NULL;
+	unsigned long flags;
+
+	dev_dbg(&dev->dev, "removing\n");
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+
+	pciback_config_quirk_release(dev);
+
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (psdev->dev == dev) {
+			found_psdev = psdev;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+	if (found_psdev) {
+		dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
+			found_psdev->pdev);
+
+		if (found_psdev->pdev) {
+			printk(KERN_WARNING "pciback: ****** removing device "
+			       "%s while still in-use! ******\n",
+			       pci_name(found_psdev->dev));
+			printk(KERN_WARNING "pciback: ****** driver domain may "
+			       "still access this device's i/o resources!\n");
+			printk(KERN_WARNING "pciback: ****** shutdown driver "
+			       "domain before binding device\n");
+			printk(KERN_WARNING "pciback: ****** to other drivers "
+			       "or domains\n");
+
+			pciback_release_pci_dev(found_psdev->pdev,
+						found_psdev->dev);
+		}
+
+		spin_lock_irqsave(&pcistub_devices_lock, flags);
+		list_del(&found_psdev->dev_list);
+		spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+
+		/* the final put for releasing from the list */
+		pcistub_device_put(found_psdev);
+	}
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = {
+	{
+	 .vendor = PCI_ANY_ID,
+	 .device = PCI_ANY_ID,
+	 .subvendor = PCI_ANY_ID,
+	 .subdevice = PCI_ANY_ID,
+	 },
+	{0,},
+};
+
+#define PCI_NODENAME_MAX 40
+static void kill_domain_by_device(struct pcistub_device *psdev)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	char nodename[PCI_NODENAME_MAX];
+
+	if (!psdev)
+		dev_err(&psdev->dev->dev,
+			"device is NULL when do AER recovery/kill_domain\n");
+	snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
+		psdev->pdev->xdev->otherend_id);
+	nodename[strlen(nodename)] = '\0';
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		dev_err(&psdev->dev->dev,
+			"error %d when start xenbus transaction\n", err);
+		return;
+	}
+	/*PV AER handlers will set this flag*/
+	xenbus_printf(xbt, nodename, "aerState" , "aerfail");
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		dev_err(&psdev->dev->dev,
+			"error %d when end xenbus transaction\n", err);
+		return;
+	}
+}
+
+/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and
+ * backend need to have cooperation. In pciback, those steps will do similar
+ * jobs: send service request and waiting for front_end response.
+*/
+static pci_ers_result_t common_process(struct pcistub_device *psdev,
+		pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
+{
+	pci_ers_result_t res = result;
+	struct xen_pcie_aer_op *aer_op;
+	int ret;
+
+	/*with PV AER drivers*/
+	aer_op = &(psdev->pdev->sh_info->aer_op);
+	aer_op->cmd = aer_cmd ;
+	/*useful for error_detected callback*/
+	aer_op->err = state;
+	/*pcifront_end BDF*/
+	ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
+		&aer_op->domain, &aer_op->bus, &aer_op->devfn);
+	if (!ret) {
+		dev_err(&psdev->dev->dev,
+			"pciback: failed to get pcifront device\n");
+		return PCI_ERS_RESULT_NONE;
+	}
+	wmb();
+
+	dev_dbg(&psdev->dev->dev,
+			"pciback: aer_op %x dom %x bus %x devfn %x\n",
+			aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
+	/*local flag to mark there's aer request, pciback callback will use this
+	* flag to judge whether we need to check pci-front give aer service
+	* ack signal
+	*/
+	set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+	/*It is possible that a pcifront conf_read_write ops request invokes
+	* the callback which cause the spurious execution of wake_up.
+	* Yet it is harmless and better than a spinlock here
+	*/
+	set_bit(_XEN_PCIB_active,
+		(unsigned long *)&psdev->pdev->sh_info->flags);
+	wmb();
+	notify_remote_via_irq(psdev->pdev->evtchn_irq);
+
+	ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
+		(unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
+
+	if (!ret) {
+		if (test_bit(_XEN_PCIB_active,
+			(unsigned long *)&psdev->pdev->sh_info->flags)) {
+			dev_err(&psdev->dev->dev,
+				"pcifront aer process not responding!\n");
+			clear_bit(_XEN_PCIB_active,
+			  (unsigned long *)&psdev->pdev->sh_info->flags);
+			aer_op->err = PCI_ERS_RESULT_NONE;
+			return res;
+		}
+	}
+	clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
+
+	if (test_bit(_XEN_PCIF_active,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_dbg(&psdev->dev->dev,
+			"schedule pci_conf service in pciback\n");
+		test_and_schedule_op(psdev->pdev);
+	}
+
+	res = (pci_ers_result_t)aer_op->err;
+	return res;
+}
+
+/*
+* pciback_slot_reset: it will send the slot_reset request to  pcifront in case
+* of the device driver could provide this service, and then wait for pcifront
+* ack.
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_RECOVERED;
+	dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		goto release;
+	}
+	result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+		result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER slot_reset service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+	return result;
+
+}
+
+
+/*pciback_mmio_enabled: it will send the mmio_enabled request to  pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack
+* @dev: pointer to PCI devices
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_RECOVERED;
+	dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		goto release;
+	}
+	result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+		result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER mmio_enabled service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+	return result;
+}
+
+/*pciback_error_detected: it will send the error_detected request to  pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+* @error: the current PCI connection state
+* return value is used by aer_core do_recovery policy
+*/
+
+static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
+	pci_channel_state_t error)
+{
+	struct pcistub_device *psdev;
+	pci_ers_result_t result;
+
+	result = PCI_ERS_RESULT_CAN_RECOVER;
+	dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	/*Guest owns the device yet no aer handler regiested, kill guest*/
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+	result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
+
+	if (result == PCI_ERS_RESULT_NONE ||
+		result == PCI_ERS_RESULT_DISCONNECT) {
+		dev_dbg(&dev->dev,
+			"No AER error_detected service or disconnected!\n");
+		kill_domain_by_device(psdev);
+	}
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+	return result;
+}
+
+/*pciback_error_resume: it will send the error_resume request to  pcifront
+* in case of the device driver could provide this service, and then wait
+* for pcifront ack.
+* @dev: pointer to PCI devices
+*/
+
+static void pciback_error_resume(struct pci_dev *dev)
+{
+	struct pcistub_device *psdev;
+
+	dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
+		dev->bus->number, dev->devfn);
+
+	down_write(&pcistub_sem);
+	psdev = pcistub_device_find(pci_domain_nr(dev->bus),
+				dev->bus->number,
+				PCI_SLOT(dev->devfn),
+				PCI_FUNC(dev->devfn));
+
+	if (!psdev || !psdev->pdev) {
+		dev_err(&dev->dev,
+			"pciback device is not found/assigned\n");
+		goto end;
+	}
+
+	if (!psdev->pdev->sh_info) {
+		dev_err(&dev->dev, "pciback device is not connected or owned"
+			" by HVM, kill it\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+
+	if (!test_bit(_XEN_PCIB_AERHANDLER,
+		(unsigned long *)&psdev->pdev->sh_info->flags)) {
+		dev_err(&dev->dev,
+			"guest with no AER driver should have been killed\n");
+		kill_domain_by_device(psdev);
+		goto release;
+	}
+	common_process(psdev, 1, XEN_PCI_OP_aer_resume,
+		       PCI_ERS_RESULT_RECOVERED);
+release:
+	pcistub_device_put(psdev);
+end:
+	up_write(&pcistub_sem);
+	return;
+}
+
+/*add pciback AER handling*/
+static struct pci_error_handlers pciback_error_handler = {
+	.error_detected = pciback_error_detected,
+	.mmio_enabled = pciback_mmio_enabled,
+	.slot_reset = pciback_slot_reset,
+	.resume = pciback_error_resume,
+};
+
+/*
+ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
+ * for a normal device. I don't want it to be loaded automatically.
+ */
+
+static struct pci_driver pciback_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = pcistub_ids,
+	.probe = pcistub_probe,
+	.remove = pcistub_remove,
+	.err_handler = &pciback_error_handler,
+};
+
+static inline int str_to_slot(const char *buf, int *domain, int *bus,
+			      int *slot, int *func)
+{
+	int err;
+
+	err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
+	if (err == 4)
+		return 0;
+	else if (err < 0)
+		return -EINVAL;
+
+	/* try again without domain */
+	*domain = 0;
+	err = sscanf(buf, " %x:%x.%x", bus, slot, func);
+	if (err == 3)
+		return 0;
+
+	return -EINVAL;
+}
+
+static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
+			       *slot, int *func, int *reg, int *size, int *mask)
+{
+	int err;
+
+	err =
+	    sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
+		   func, reg, size, mask);
+	if (err == 7)
+		return 0;
+	return -EINVAL;
+}
+
+static int pcistub_device_id_add(int domain, int bus, int slot, int func)
+{
+	struct pcistub_device_id *pci_dev_id;
+	unsigned long flags;
+
+	pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
+	if (!pci_dev_id)
+		return -ENOMEM;
+
+	pci_dev_id->domain = domain;
+	pci_dev_id->bus = bus;
+	pci_dev_id->devfn = PCI_DEVFN(slot, func);
+
+	pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
+		 domain, bus, slot, func);
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return 0;
+}
+
+static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
+{
+	struct pcistub_device_id *pci_dev_id, *t;
+	int devfn = PCI_DEVFN(slot, func);
+	int err = -ENOENT;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
+				 slot_list) {
+		if (pci_dev_id->domain == domain
+		    && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
+			/* Don't break; here because it's possible the same
+			 * slot could be in the list more than once
+			 */
+			list_del(&pci_dev_id->slot_list);
+			kfree(pci_dev_id);
+
+			err = 0;
+
+			pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
+				 "seize list\n", domain, bus, slot, func);
+		}
+	}
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return err;
+}
+
+static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
+			   int size, int mask)
+{
+	int err = 0;
+	struct pcistub_device *psdev;
+	struct pci_dev *dev;
+	struct config_field *field;
+
+	psdev = pcistub_device_find(domain, bus, slot, func);
+	if (!psdev || !psdev->dev) {
+		err = -ENODEV;
+		goto out;
+	}
+	dev = psdev->dev;
+
+	field = kzalloc(sizeof(*field), GFP_ATOMIC);
+	if (!field) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	field->offset = reg;
+	field->size = size;
+	field->mask = mask;
+	field->init = NULL;
+	field->reset = NULL;
+	field->release = NULL;
+	field->clean = pciback_config_field_free;
+
+	err = pciback_config_quirks_add_field(dev, field);
+	if (err)
+		kfree(field);
+out:
+	return err;
+}
+
+static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
+				size_t count)
+{
+	int domain, bus, slot, func;
+	int err;
+
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		goto out;
+
+	err = pcistub_device_id_add(domain, bus, slot, func);
+
+out:
+	if (!err)
+		err = count;
+	return err;
+}
+
+DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
+
+static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
+				   size_t count)
+{
+	int domain, bus, slot, func;
+	int err;
+
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		goto out;
+
+	err = pcistub_device_id_remove(domain, bus, slot, func);
+
+out:
+	if (!err)
+		err = count;
+	return err;
+}
+
+DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
+
+static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
+{
+	struct pcistub_device_id *pci_dev_id;
+	size_t count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
+		if (count >= PAGE_SIZE)
+			break;
+
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				   "%04x:%02x:%02x.%01x\n",
+				   pci_dev_id->domain, pci_dev_id->bus,
+				   PCI_SLOT(pci_dev_id->devfn),
+				   PCI_FUNC(pci_dev_id->devfn));
+	}
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return count;
+}
+
+DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
+
+static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
+{
+	struct pcistub_device *psdev;
+	struct pciback_dev_data *dev_data;
+	size_t count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (count >= PAGE_SIZE)
+			break;
+		if (!psdev->dev)
+			continue;
+		dev_data = pci_get_drvdata(psdev->dev);
+		if (!dev_data)
+			continue;
+		count +=
+		    scnprintf(buf + count, PAGE_SIZE - count,
+			      "%s:%s:%sing:%ld\n",
+			      pci_name(psdev->dev),
+			      dev_data->isr_on ? "on" : "off",
+			      dev_data->ack_intr ? "ack" : "not ack",
+			      dev_data->handled);
+	}
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return count;
+}
+
+DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
+
+static ssize_t pcistub_irq_handler_switch(struct device_driver *drv,
+					  const char *buf,
+					  size_t count)
+{
+	struct pcistub_device *psdev;
+	struct pciback_dev_data *dev_data;
+	int domain, bus, slot, func;
+	int err = -ENOENT;
+
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		goto out;
+
+	psdev = pcistub_device_find(domain, bus, slot, func);
+
+	if (!psdev)
+		goto out;
+
+	dev_data = pci_get_drvdata(psdev->dev);
+	if (!dev_data)
+		goto out;
+
+	dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n",
+		dev_data->irq_name, dev_data->isr_on,
+		!dev_data->isr_on);
+
+	dev_data->isr_on = !(dev_data->isr_on);
+	if (dev_data->isr_on)
+		dev_data->ack_intr = 1;
+out:
+	if (!err)
+		err = count;
+	return err;
+}
+DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch);
+
+static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
+				 size_t count)
+{
+	int domain, bus, slot, func, reg, size, mask;
+	int err;
+
+	err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
+			   &mask);
+	if (err)
+		goto out;
+
+	err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
+
+out:
+	if (!err)
+		err = count;
+	return err;
+}
+
+static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
+{
+	int count = 0;
+	unsigned long flags;
+	struct pciback_config_quirk *quirk;
+	struct pciback_dev_data *dev_data;
+	const struct config_field *field;
+	const struct config_field_entry *cfg_entry;
+
+	spin_lock_irqsave(&device_ids_lock, flags);
+	list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
+		if (count >= PAGE_SIZE)
+			goto out;
+
+		count += scnprintf(buf + count, PAGE_SIZE - count,
+				   "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
+				   quirk->pdev->bus->number,
+				   PCI_SLOT(quirk->pdev->devfn),
+				   PCI_FUNC(quirk->pdev->devfn),
+				   quirk->devid.vendor, quirk->devid.device,
+				   quirk->devid.subvendor,
+				   quirk->devid.subdevice);
+
+		dev_data = pci_get_drvdata(quirk->pdev);
+
+		list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
+			field = cfg_entry->field;
+			if (count >= PAGE_SIZE)
+				goto out;
+
+			count += scnprintf(buf + count, PAGE_SIZE - count,
+					   "\t\t%08x:%01x:%08x\n",
+					   cfg_entry->base_offset +
+					   field->offset, field->size,
+					   field->mask);
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&device_ids_lock, flags);
+
+	return count;
+}
+
+DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
+
+static ssize_t permissive_add(struct device_driver *drv, const char *buf,
+			      size_t count)
+{
+	int domain, bus, slot, func;
+	int err;
+	struct pcistub_device *psdev;
+	struct pciback_dev_data *dev_data;
+	err = str_to_slot(buf, &domain, &bus, &slot, &func);
+	if (err)
+		goto out;
+	psdev = pcistub_device_find(domain, bus, slot, func);
+	if (!psdev) {
+		err = -ENODEV;
+		goto out;
+	}
+	if (!psdev->dev) {
+		err = -ENODEV;
+		goto release;
+	}
+	dev_data = pci_get_drvdata(psdev->dev);
+	/* the driver data for a device should never be null at this point */
+	if (!dev_data) {
+		err = -ENXIO;
+		goto release;
+	}
+	if (!dev_data->permissive) {
+		dev_data->permissive = 1;
+		/* Let user know that what they're doing could be unsafe */
+		dev_warn(&psdev->dev->dev, "enabling permissive mode "
+			 "configuration space accesses!\n");
+		dev_warn(&psdev->dev->dev,
+			 "permissive mode is potentially unsafe!\n");
+	}
+release:
+	pcistub_device_put(psdev);
+out:
+	if (!err)
+		err = count;
+	return err;
+}
+
+static ssize_t permissive_show(struct device_driver *drv, char *buf)
+{
+	struct pcistub_device *psdev;
+	struct pciback_dev_data *dev_data;
+	size_t count = 0;
+	unsigned long flags;
+	spin_lock_irqsave(&pcistub_devices_lock, flags);
+	list_for_each_entry(psdev, &pcistub_devices, dev_list) {
+		if (count >= PAGE_SIZE)
+			break;
+		if (!psdev->dev)
+			continue;
+		dev_data = pci_get_drvdata(psdev->dev);
+		if (!dev_data || !dev_data->permissive)
+			continue;
+		count +=
+		    scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+			      pci_name(psdev->dev));
+	}
+	spin_unlock_irqrestore(&pcistub_devices_lock, flags);
+	return count;
+}
+
+DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
+
+static void pcistub_exit(void)
+{
+	driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
+	driver_remove_file(&pciback_pci_driver.driver,
+			   &driver_attr_remove_slot);
+	driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
+	driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
+	driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
+	driver_remove_file(&pciback_pci_driver.driver,
+			   &driver_attr_irq_handlers);
+	driver_remove_file(&pciback_pci_driver.driver,
+			   &driver_attr_irq_handler_state);
+	pci_unregister_driver(&pciback_pci_driver);
+}
+
+static int __init pcistub_init(void)
+{
+	int pos = 0;
+	int err = 0;
+	int domain, bus, slot, func;
+	int parsed;
+
+	if (pci_devs_to_hide && *pci_devs_to_hide) {
+		do {
+			parsed = 0;
+
+			err = sscanf(pci_devs_to_hide + pos,
+				     " (%x:%x:%x.%x) %n",
+				     &domain, &bus, &slot, &func, &parsed);
+			if (err != 4) {
+				domain = 0;
+				err = sscanf(pci_devs_to_hide + pos,
+					     " (%x:%x.%x) %n",
+					     &bus, &slot, &func, &parsed);
+				if (err != 3)
+					goto parse_error;
+			}
+
+			err = pcistub_device_id_add(domain, bus, slot, func);
+			if (err)
+				goto out;
+
+			/* if parsed<=0, we've reached the end of the string */
+			pos += parsed;
+		} while (parsed > 0 && pci_devs_to_hide[pos]);
+	}
+
+	/* If we're the first PCI Device Driver to register, we're the
+	 * first one to get offered PCI devices as they become
+	 * available (and thus we can be the first to grab them)
+	 */
+	err = pci_register_driver(&pciback_pci_driver);
+	if (err < 0)
+		goto out;
+
+	err = driver_create_file(&pciback_pci_driver.driver,
+				 &driver_attr_new_slot);
+	if (!err)
+		err = driver_create_file(&pciback_pci_driver.driver,
+					 &driver_attr_remove_slot);
+	if (!err)
+		err = driver_create_file(&pciback_pci_driver.driver,
+					 &driver_attr_slots);
+	if (!err)
+		err = driver_create_file(&pciback_pci_driver.driver,
+					 &driver_attr_quirks);
+	if (!err)
+		err = driver_create_file(&pciback_pci_driver.driver,
+					 &driver_attr_permissive);
+
+	if (!err)
+		err = driver_create_file(&pciback_pci_driver.driver,
+					 &driver_attr_irq_handlers);
+	if (!err)
+		err = driver_create_file(&pciback_pci_driver.driver,
+					&driver_attr_irq_handler_state);
+	if (err)
+		pcistub_exit();
+
+out:
+	return err;
+
+parse_error:
+	printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
+	       pci_devs_to_hide + pos);
+	return -EINVAL;
+}
+
+#ifndef MODULE
+/*
+ * fs_initcall happens before device_initcall
+ * so pciback *should* get called first (b/c we
+ * want to suck up any device before other drivers
+ * get a chance by being the first pci device
+ * driver to register)
+ */
+fs_initcall(pcistub_init);
+#endif
+
+static int __init pciback_init(void)
+{
+	int err;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	err = pciback_config_init();
+	if (err)
+		return err;
+
+#ifdef MODULE
+	err = pcistub_init();
+	if (err < 0)
+		return err;
+#endif
+
+	pcistub_init_devices_late();
+	err = pciback_xenbus_register();
+	if (err)
+		pcistub_exit();
+
+	return err;
+}
+
+static void __exit pciback_cleanup(void)
+{
+	pciback_xenbus_unregister();
+	pcistub_exit();
+}
+
+module_init(pciback_init);
+module_exit(pciback_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h
new file mode 100644
index 0000000..5c14020
--- /dev/null
+++ b/drivers/xen/pciback/pciback.h
@@ -0,0 +1,142 @@
+/*
+ * PCI Backend Common Data Structures & Function Declarations
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIBACK_H__
+#define __XEN_PCIBACK_H__
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <xen/xenbus.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/atomic.h>
+#include <xen/interface/io/pciif.h>
+
+struct pci_dev_entry {
+	struct list_head list;
+	struct pci_dev *dev;
+};
+
+#define _PDEVF_op_active	(0)
+#define PDEVF_op_active		(1<<(_PDEVF_op_active))
+#define _PCIB_op_pending	(1)
+#define PCIB_op_pending		(1<<(_PCIB_op_pending))
+
+struct pciback_device {
+	void *pci_dev_data;
+	spinlock_t dev_lock;
+
+	struct xenbus_device *xdev;
+
+	struct xenbus_watch be_watch;
+	u8 be_watching;
+
+	int evtchn_irq;
+
+	struct xen_pci_sharedinfo *sh_info;
+
+	unsigned long flags;
+
+	struct work_struct op_work;
+};
+
+struct pciback_dev_data {
+	struct list_head config_fields;
+	unsigned int permissive:1;
+	unsigned int warned_on_write:1;
+	unsigned int enable_intx:1;
+	unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
+	unsigned int ack_intr:1; /* .. and ACK-ing */
+	unsigned long handled;
+	unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
+	char irq_name[0]; /* pciback[000:04:00.0] */
+};
+
+/* Used by XenBus and pciback_ops.c */
+extern wait_queue_head_t aer_wait_queue;
+extern struct workqueue_struct *pciback_wq;
+/* Used by pcistub.c and conf_space_quirks.c */
+extern struct list_head pciback_quirks;
+
+/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
+struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
+					    int domain, int bus,
+					    int slot, int func);
+struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
+				    struct pci_dev *dev);
+void pcistub_put_pci_dev(struct pci_dev *dev);
+
+/* Ensure a device is turned off or reset */
+void pciback_reset_device(struct pci_dev *pdev);
+
+/* Access a virtual configuration space for a PCI device */
+int pciback_config_init(void);
+int pciback_config_init_dev(struct pci_dev *dev);
+void pciback_config_free_dyn_fields(struct pci_dev *dev);
+void pciback_config_reset_dev(struct pci_dev *dev);
+void pciback_config_free_dev(struct pci_dev *dev);
+int pciback_config_read(struct pci_dev *dev, int offset, int size,
+			u32 *ret_val);
+int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
+
+/* Handle requests for specific devices from the frontend */
+typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
+				   unsigned int domain, unsigned int bus,
+				   unsigned int devfn, unsigned int devid);
+typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus);
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+			int devid, publish_pci_dev_cb publish_cb);
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus,
+				    unsigned int devfn);
+
+/**
+* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in pciback
+* before sending aer request to pcifront, so that guest could identify
+* device, coopearte with pciback to finish aer recovery job if device driver
+* has the capability
+*/
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+			     struct pciback_device *pdev,
+			     unsigned int *domain, unsigned int *bus,
+			     unsigned int *devfn);
+int pciback_init_devices(struct pciback_device *pdev);
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+			      publish_pci_root_cb cb);
+void pciback_release_devices(struct pciback_device *pdev);
+
+/* Handles events from front-end */
+irqreturn_t pciback_handle_event(int irq, void *dev_id);
+void pciback_do_op(struct work_struct *data);
+
+int pciback_xenbus_register(void);
+void pciback_xenbus_unregister(void);
+
+#ifdef CONFIG_PCI_MSI
+int pciback_enable_msi(struct pciback_device *pdev,
+			struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msi(struct pciback_device *pdev,
+			struct pci_dev *dev, struct xen_pci_op *op);
+
+
+int pciback_enable_msix(struct pciback_device *pdev,
+			struct pci_dev *dev, struct xen_pci_op *op);
+
+int pciback_disable_msix(struct pciback_device *pdev,
+			struct pci_dev *dev, struct xen_pci_op *op);
+#endif
+extern int verbose_request;
+
+void test_and_schedule_op(struct pciback_device *pdev);
+#endif
+
+/* Handles shared IRQs that can to device domain and control domain. */
+void pciback_irq_handler(struct pci_dev *dev, int reset);
+irqreturn_t pciback_guest_interrupt(int irq, void *dev_id);
diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
new file mode 100644
index 0000000..28a2a55
--- /dev/null
+++ b/drivers/xen/pciback/pciback_ops.c
@@ -0,0 +1,248 @@
+/*
+ * PCI Backend Operations - respond to PCI requests from Frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/bitops.h>
+#include <xen/events.h>
+#include <linux/sched.h>
+#include "pciback.h"
+
+int verbose_request;
+module_param(verbose_request, int, 0644);
+
+/* Ensure a device is has the fake IRQ handler "turned on/off" and is
+ * ready to be exported. This MUST be run after pciback_reset_device
+ * which does the actual PCI device enable/disable.
+ */
+void pciback_control_isr(struct pci_dev *dev, int reset)
+{
+	struct pciback_dev_data *dev_data;
+	int rc;
+	int enable = 0;
+
+	dev_data = pci_get_drvdata(dev);
+	if (!dev_data)
+		return;
+
+	/* We don't deal with bridges */
+	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+		return;
+
+	if (reset) {
+		dev_data->enable_intx = 0;
+		dev_data->ack_intr = 0;
+	}
+	enable =  dev_data->enable_intx;
+
+	/* Asked to disable, but ISR isn't runnig */
+	if (!enable && !dev_data->isr_on)
+		return;
+
+	/* Squirrel away the IRQs in the dev_data. We need this
+	 * b/c when device transitions to MSI, the dev->irq is
+	 * overwritten with the MSI vector.
+	 */
+	if (enable)
+		dev_data->irq = dev->irq;
+
+	/*
+	 * SR-IOV devices in all use MSI-X and have no legacy
+	 * interrupts, so inhibit creating a fake IRQ handler for them.
+	 */
+	if (dev_data->irq == 0)
+		goto out;
+
+	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
+		dev_data->irq_name,
+		dev_data->irq,
+		pci_is_enabled(dev) ? "on" : "off",
+		dev->msi_enabled ? "MSI" : "",
+		dev->msix_enabled ? "MSI/X" : "",
+		dev_data->isr_on ? "enable" : "disable",
+		enable ? "enable" : "disable");
+
+	if (enable) {
+		rc = request_irq(dev_data->irq,
+				pciback_guest_interrupt, IRQF_SHARED,
+				dev_data->irq_name, dev);
+		if (rc) {
+			dev_err(&dev->dev, "%s: failed to install fake IRQ " \
+				"handler for IRQ %d! (rc:%d)\n",
+				dev_data->irq_name, dev_data->irq, rc);
+			goto out;
+		}
+	} else {
+		free_irq(dev_data->irq, dev);
+		dev_data->irq = 0;
+	}
+	dev_data->isr_on = enable;
+	dev_data->ack_intr = enable;
+out:
+	dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
+		dev_data->irq_name,
+		dev_data->irq,
+		pci_is_enabled(dev) ? "on" : "off",
+		dev->msi_enabled ? "MSI" : "",
+		dev->msix_enabled ? "MSI/X" : "",
+		enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
+			(dev_data->isr_on ? "failed to disable" : "disabled"));
+}
+
+/* Ensure a device is "turned off" and ready to be exported.
+ * (Also see pciback_config_reset to ensure virtual configuration space is
+ * ready to be re-exported)
+ */
+void pciback_reset_device(struct pci_dev *dev)
+{
+	u16 cmd;
+
+	pciback_control_isr(dev, 1 /* reset device */);
+
+	/* Disable devices (but not bridges) */
+	if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+#ifdef CONFIG_PCI_MSI
+		/* The guest could have been abruptly killed without
+		 * disabling MSI/MSI-X interrupts.*/
+		if (dev->msix_enabled)
+			pci_disable_msix(dev);
+		if (dev->msi_enabled)
+			pci_disable_msi(dev);
+#endif
+		pci_disable_device(dev);
+
+		pci_write_config_word(dev, PCI_COMMAND, 0);
+
+		dev->is_busmaster = 0;
+	} else {
+		pci_read_config_word(dev, PCI_COMMAND, &cmd);
+		if (cmd & (PCI_COMMAND_INVALIDATE)) {
+			cmd &= ~(PCI_COMMAND_INVALIDATE);
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
+
+			dev->is_busmaster = 0;
+		}
+	}
+}
+/*
+* Now the same evtchn is used for both pcifront conf_read_write request
+* as well as pcie aer front end ack. We use a new work_queue to schedule
+* pciback conf_read_write service for avoiding confict with aer_core
+* do_recovery job which also use the system default work_queue
+*/
+void test_and_schedule_op(struct pciback_device *pdev)
+{
+	/* Check that frontend is requesting an operation and that we are not
+	 * already processing a request */
+	if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
+	    && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
+		queue_work(pciback_wq, &pdev->op_work);
+	}
+	/*_XEN_PCIB_active should have been cleared by pcifront. And also make
+	sure pciback is waiting for ack by checking _PCIB_op_pending*/
+	if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
+	    && test_bit(_PCIB_op_pending, &pdev->flags)) {
+		wake_up(&aer_wait_queue);
+	}
+}
+
+/* Performing the configuration space reads/writes must not be done in atomic
+ * context because some of the pci_* functions can sleep (mostly due to ACPI
+ * use of semaphores). This function is intended to be called from a work
+ * queue in process context taking a struct pciback_device as a parameter */
+
+void pciback_do_op(struct work_struct *data)
+{
+	struct pciback_device *pdev =
+		container_of(data, struct pciback_device, op_work);
+	struct pci_dev *dev;
+	struct pciback_dev_data *dev_data = NULL;
+	struct xen_pci_op *op = &pdev->sh_info->op;
+	int test_intx = 0;
+
+	dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
+
+	if (dev == NULL)
+		op->err = XEN_PCI_ERR_dev_not_found;
+	else {
+		dev_data = pci_get_drvdata(dev);
+		if (dev_data)
+			test_intx = dev_data->enable_intx;
+		switch (op->cmd) {
+		case XEN_PCI_OP_conf_read:
+			op->err = pciback_config_read(dev,
+				  op->offset, op->size, &op->value);
+			break;
+		case XEN_PCI_OP_conf_write:
+			op->err = pciback_config_write(dev,
+				  op->offset, op->size,	op->value);
+			break;
+#ifdef CONFIG_PCI_MSI
+		case XEN_PCI_OP_enable_msi:
+			op->err = pciback_enable_msi(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_disable_msi:
+			op->err = pciback_disable_msi(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_enable_msix:
+			op->err = pciback_enable_msix(pdev, dev, op);
+			break;
+		case XEN_PCI_OP_disable_msix:
+			op->err = pciback_disable_msix(pdev, dev, op);
+			break;
+#endif
+		default:
+			op->err = XEN_PCI_ERR_not_implemented;
+			break;
+		}
+	}
+	if (!op->err && dev && dev_data) {
+		/* Transition detected */
+		if ((dev_data->enable_intx != test_intx))
+			pciback_control_isr(dev, 0 /* no reset */);
+	}
+	/* Tell the driver domain that we're done. */
+	wmb();
+	clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+	notify_remote_via_irq(pdev->evtchn_irq);
+
+	/* Mark that we're done. */
+	smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
+	clear_bit(_PDEVF_op_active, &pdev->flags);
+	smp_mb__after_clear_bit(); /* /before/ final check for work */
+
+	/* Check to see if the driver domain tried to start another request in
+	 * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
+	*/
+	test_and_schedule_op(pdev);
+}
+
+irqreturn_t pciback_handle_event(int irq, void *dev_id)
+{
+	struct pciback_device *pdev = dev_id;
+
+	test_and_schedule_op(pdev);
+
+	return IRQ_HANDLED;
+}
+irqreturn_t pciback_guest_interrupt(int irq, void *dev_id)
+{
+	struct pci_dev *dev = (struct pci_dev *)dev_id;
+	struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
+
+	if (dev_data->isr_on && dev_data->ack_intr) {
+		dev_data->handled++;
+		if ((dev_data->handled % 1000) == 0) {
+			if (xen_test_irq_shared(irq)) {
+				printk(KERN_INFO "%s IRQ line is not shared "
+					"with other domains. Turning ISR off\n",
+					 dev_data->irq_name);
+				dev_data->ack_intr = 0;
+			}
+		}
+		return IRQ_HANDLED;
+	}
+	return IRQ_NONE;
+}
diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c
new file mode 100644
index 0000000..efb922d
--- /dev/null
+++ b/drivers/xen/pciback/slot.c
@@ -0,0 +1,191 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
+ *   Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+/* There are at most 32 slots in a pci bus.  */
+#define PCI_SLOT_MAX 32
+
+#define PCI_BUS_NBR 2
+
+struct slot_dev_data {
+	/* Access to dev_list must be protected by lock */
+	struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
+	spinlock_t lock;
+};
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus,
+				    unsigned int devfn)
+{
+	struct pci_dev *dev = NULL;
+	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+	unsigned long flags;
+
+	if (domain != 0 || PCI_FUNC(devfn) != 0)
+		return NULL;
+
+	if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
+		return NULL;
+
+	spin_lock_irqsave(&slot_dev->lock, flags);
+	dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
+	spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+	return dev;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+			int devid, publish_pci_dev_cb publish_cb)
+{
+	int err = 0, slot, bus;
+	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+	unsigned long flags;
+
+	if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+		err = -EFAULT;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Can't export bridges on the virtual PCI bus");
+		goto out;
+	}
+
+	spin_lock_irqsave(&slot_dev->lock, flags);
+
+	/* Assign to a new slot on the virtual PCI bus */
+	for (bus = 0; bus < PCI_BUS_NBR; bus++)
+		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+			if (slot_dev->slots[bus][slot] == NULL) {
+				printk(KERN_INFO
+				       "pciback: slot: %s: assign to virtual "
+				       "slot %d, bus %d\n",
+				       pci_name(dev), slot, bus);
+				slot_dev->slots[bus][slot] = dev;
+				goto unlock;
+			}
+		}
+
+	err = -ENOMEM;
+	xenbus_dev_fatal(pdev->xdev, err,
+			 "No more space on root virtual PCI bus");
+
+unlock:
+	spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+	/* Publish this device. */
+	if (!err)
+		err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
+
+out:
+	return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+	int slot, bus;
+	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&slot_dev->lock, flags);
+
+	for (bus = 0; bus < PCI_BUS_NBR; bus++)
+		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+			if (slot_dev->slots[bus][slot] == dev) {
+				slot_dev->slots[bus][slot] = NULL;
+				found_dev = dev;
+				goto out;
+			}
+		}
+
+out:
+	spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+	if (found_dev)
+		pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+	int slot, bus;
+	struct slot_dev_data *slot_dev;
+
+	slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
+	if (!slot_dev)
+		return -ENOMEM;
+
+	spin_lock_init(&slot_dev->lock);
+
+	for (bus = 0; bus < PCI_BUS_NBR; bus++)
+		for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+			slot_dev->slots[bus][slot] = NULL;
+
+	pdev->pci_dev_data = slot_dev;
+
+	return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+			      publish_pci_root_cb publish_cb)
+{
+	/* The Virtual PCI bus has only one root */
+	return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+	int slot, bus;
+	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+	struct pci_dev *dev;
+
+	for (bus = 0; bus < PCI_BUS_NBR; bus++)
+		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+			dev = slot_dev->slots[bus][slot];
+			if (dev != NULL)
+				pcistub_put_pci_dev(dev);
+		}
+
+	kfree(slot_dev);
+	pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+			     struct pciback_device *pdev,
+			     unsigned int *domain, unsigned int *bus,
+			     unsigned int *devfn)
+{
+	int slot, busnr;
+	struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+	struct pci_dev *dev;
+	int found = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&slot_dev->lock, flags);
+
+	for (busnr = 0; busnr < PCI_BUS_NBR; bus++)
+		for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+			dev = slot_dev->slots[busnr][slot];
+			if (dev && dev->bus->number == pcidev->bus->number
+				&& dev->devfn == pcidev->devfn
+				&& pci_domain_nr(dev->bus) ==
+					pci_domain_nr(pcidev->bus)) {
+				found = 1;
+				*domain = 0;
+				*bus = busnr;
+				*devfn = PCI_DEVFN(slot, 0);
+				goto out;
+			}
+		}
+out:
+	spin_unlock_irqrestore(&slot_dev->lock, flags);
+	return found;
+
+}
diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
new file mode 100644
index 0000000..2857ab8
--- /dev/null
+++ b/drivers/xen/pciback/vpci.c
@@ -0,0 +1,244 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+#define PCI_SLOT_MAX 32
+
+struct vpci_dev_data {
+	/* Access to dev_list must be protected by lock */
+	struct list_head dev_list[PCI_SLOT_MAX];
+	spinlock_t lock;
+};
+
+static inline struct list_head *list_first(struct list_head *head)
+{
+	return head->next;
+}
+
+struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus,
+				    unsigned int devfn)
+{
+	struct pci_dev_entry *entry;
+	struct pci_dev *dev = NULL;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+	unsigned long flags;
+
+	if (domain != 0 || bus != 0)
+		return NULL;
+
+	if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
+		spin_lock_irqsave(&vpci_dev->lock, flags);
+
+		list_for_each_entry(entry,
+				    &vpci_dev->dev_list[PCI_SLOT(devfn)],
+				    list) {
+			if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
+				dev = entry->dev;
+				break;
+			}
+		}
+
+		spin_unlock_irqrestore(&vpci_dev->lock, flags);
+	}
+	return dev;
+}
+
+static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
+{
+	if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
+	    && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
+		return 1;
+
+	return 0;
+}
+
+int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
+			int devid, publish_pci_dev_cb publish_cb)
+{
+	int err = 0, slot, func = -1;
+	struct pci_dev_entry *t, *dev_entry;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+	unsigned long flags;
+
+	if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+		err = -EFAULT;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Can't export bridges on the virtual PCI bus");
+		goto out;
+	}
+
+	dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
+	if (!dev_entry) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error adding entry to virtual PCI bus");
+		goto out;
+	}
+
+	dev_entry->dev = dev;
+
+	spin_lock_irqsave(&vpci_dev->lock, flags);
+
+	/* Keep multi-function devices together on the virtual PCI bus */
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		if (!list_empty(&vpci_dev->dev_list[slot])) {
+			t = list_entry(list_first(&vpci_dev->dev_list[slot]),
+				       struct pci_dev_entry, list);
+
+			if (match_slot(dev, t->dev)) {
+				pr_info("pciback: vpci: %s: "
+					"assign to virtual slot %d func %d\n",
+					pci_name(dev), slot,
+					PCI_FUNC(dev->devfn));
+				list_add_tail(&dev_entry->list,
+					      &vpci_dev->dev_list[slot]);
+				func = PCI_FUNC(dev->devfn);
+				goto unlock;
+			}
+		}
+	}
+
+	/* Assign to a new slot on the virtual PCI bus */
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		if (list_empty(&vpci_dev->dev_list[slot])) {
+			printk(KERN_INFO
+			       "pciback: vpci: %s: assign to virtual slot %d\n",
+			       pci_name(dev), slot);
+			list_add_tail(&dev_entry->list,
+				      &vpci_dev->dev_list[slot]);
+			func = PCI_FUNC(dev->devfn);
+			goto unlock;
+		}
+	}
+
+	err = -ENOMEM;
+	xenbus_dev_fatal(pdev->xdev, err,
+			 "No more space on root virtual PCI bus");
+
+unlock:
+	spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+	/* Publish this device. */
+	if (!err)
+		err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
+
+out:
+	return err;
+}
+
+void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
+{
+	int slot;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+	struct pci_dev *found_dev = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vpci_dev->lock, flags);
+
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		struct pci_dev_entry *e, *tmp;
+		list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+					 list) {
+			if (e->dev == dev) {
+				list_del(&e->list);
+				found_dev = e->dev;
+				kfree(e);
+				goto out;
+			}
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&vpci_dev->lock, flags);
+
+	if (found_dev)
+		pcistub_put_pci_dev(found_dev);
+}
+
+int pciback_init_devices(struct pciback_device *pdev)
+{
+	int slot;
+	struct vpci_dev_data *vpci_dev;
+
+	vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
+	if (!vpci_dev)
+		return -ENOMEM;
+
+	spin_lock_init(&vpci_dev->lock);
+
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++)
+		INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
+
+	pdev->pci_dev_data = vpci_dev;
+
+	return 0;
+}
+
+int pciback_publish_pci_roots(struct pciback_device *pdev,
+			      publish_pci_root_cb publish_cb)
+{
+	/* The Virtual PCI bus has only one root */
+	return publish_cb(pdev, 0, 0);
+}
+
+void pciback_release_devices(struct pciback_device *pdev)
+{
+	int slot;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		struct pci_dev_entry *e, *tmp;
+		list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
+					 list) {
+			list_del(&e->list);
+			pcistub_put_pci_dev(e->dev);
+			kfree(e);
+		}
+	}
+
+	kfree(vpci_dev);
+	pdev->pci_dev_data = NULL;
+}
+
+int pciback_get_pcifront_dev(struct pci_dev *pcidev,
+			     struct pciback_device *pdev,
+			     unsigned int *domain, unsigned int *bus,
+			     unsigned int *devfn)
+{
+	struct pci_dev_entry *entry;
+	struct pci_dev *dev = NULL;
+	struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
+	unsigned long flags;
+	int found = 0, slot;
+
+	spin_lock_irqsave(&vpci_dev->lock, flags);
+	for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+		list_for_each_entry(entry,
+			    &vpci_dev->dev_list[slot],
+			    list) {
+			dev = entry->dev;
+			if (dev && dev->bus->number == pcidev->bus->number
+				&& pci_domain_nr(dev->bus) ==
+					pci_domain_nr(pcidev->bus)
+				&& dev->devfn == pcidev->devfn) {
+				found = 1;
+				*domain = 0;
+				*bus = 0;
+				*devfn = PCI_DEVFN(slot,
+					 PCI_FUNC(pcidev->devfn));
+			}
+		}
+	}
+	spin_unlock_irqrestore(&vpci_dev->lock, flags);
+	return found;
+}
diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c
new file mode 100644
index 0000000..70030c4
--- /dev/null
+++ b/drivers/xen/pciback/xenbus.c
@@ -0,0 +1,726 @@
+/*
+ * PCI Backend Xenbus Setup - handles setup with frontend and xend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <linux/workqueue.h>
+#include "pciback.h"
+
+#define INVALID_EVTCHN_IRQ  (-1)
+struct workqueue_struct *pciback_wq;
+
+static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
+{
+	struct pciback_device *pdev;
+
+	pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
+	if (pdev == NULL)
+		goto out;
+	dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
+
+	pdev->xdev = xdev;
+	dev_set_drvdata(&xdev->dev, pdev);
+
+	spin_lock_init(&pdev->dev_lock);
+
+	pdev->sh_info = NULL;
+	pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+	pdev->be_watching = 0;
+
+	INIT_WORK(&pdev->op_work, pciback_do_op);
+
+	if (pciback_init_devices(pdev)) {
+		kfree(pdev);
+		pdev = NULL;
+	}
+out:
+	return pdev;
+}
+
+static void pciback_disconnect(struct pciback_device *pdev)
+{
+	spin_lock(&pdev->dev_lock);
+
+	/* Ensure the guest can't trigger our handler before removing devices */
+	if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
+		unbind_from_irqhandler(pdev->evtchn_irq, pdev);
+		pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
+	}
+	spin_unlock(&pdev->dev_lock);
+
+	/* If the driver domain started an op, make sure we complete it
+	 * before releasing the shared memory */
+
+	/* Note, the workqueue does not use spinlocks at all.*/
+	flush_workqueue(pciback_wq);
+
+	spin_lock(&pdev->dev_lock);
+	if (pdev->sh_info != NULL) {
+		xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+		pdev->sh_info = NULL;
+	}
+	spin_unlock(&pdev->dev_lock);
+
+}
+
+static void free_pdev(struct pciback_device *pdev)
+{
+	if (pdev->be_watching) {
+		unregister_xenbus_watch(&pdev->be_watch);
+		pdev->be_watching = 0;
+	}
+
+	pciback_disconnect(pdev);
+
+	pciback_release_devices(pdev);
+
+	dev_set_drvdata(&pdev->xdev->dev, NULL);
+	pdev->xdev = NULL;
+
+	kfree(pdev);
+}
+
+static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
+			     int remote_evtchn)
+{
+	int err = 0;
+	void *vaddr;
+
+	dev_dbg(&pdev->xdev->dev,
+		"Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
+		gnt_ref, remote_evtchn);
+
+	err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+	if (err < 0) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				"Error mapping other domain page in ours.");
+		goto out;
+	}
+
+	spin_lock(&pdev->dev_lock);
+	pdev->sh_info = vaddr;
+	spin_unlock(&pdev->dev_lock);
+
+	err = bind_interdomain_evtchn_to_irqhandler(
+		pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
+		0, "pciback", pdev);
+	if (err < 0) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error binding event channel to IRQ");
+		goto out;
+	}
+
+	spin_lock(&pdev->dev_lock);
+	pdev->evtchn_irq = err;
+	spin_unlock(&pdev->dev_lock);
+	err = 0;
+
+	dev_dbg(&pdev->xdev->dev, "Attached!\n");
+out:
+	return err;
+}
+
+static int pciback_attach(struct pciback_device *pdev)
+{
+	int err = 0;
+	int gnt_ref, remote_evtchn;
+	char *magic = NULL;
+
+
+	/* Make sure we only do this setup once */
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateInitialised)
+		goto out;
+
+	/* Wait for frontend to state that it has published the configuration */
+	if (xenbus_read_driver_state(pdev->xdev->otherend) !=
+	    XenbusStateInitialised)
+		goto out;
+
+	dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
+
+	err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
+			    "pci-op-ref", "%u", &gnt_ref,
+			    "event-channel", "%u", &remote_evtchn,
+			    "magic", NULL, &magic, NULL);
+	if (err) {
+		/* If configuration didn't get read correctly, wait longer */
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading configuration from frontend");
+		goto out;
+	}
+
+	if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
+		xenbus_dev_fatal(pdev->xdev, -EFAULT,
+				 "version mismatch (%s/%s) with pcifront - "
+				 "halting pciback",
+				 magic, XEN_PCI_MAGIC);
+		goto out;
+	}
+
+	err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
+	if (err)
+		goto out;
+
+	dev_dbg(&pdev->xdev->dev, "Connecting...\n");
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error switching to connected state!");
+
+	dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
+out:
+
+	kfree(magic);
+
+	return err;
+}
+
+static int pciback_publish_pci_dev(struct pciback_device *pdev,
+				   unsigned int domain, unsigned int bus,
+				   unsigned int devfn, unsigned int devid)
+{
+	int err;
+	int len;
+	char str[64];
+
+	len = snprintf(str, sizeof(str), "vdev-%d", devid);
+	if (unlikely(len >= (sizeof(str) - 1))) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+			    "%04x:%02x:%02x.%02x", domain, bus,
+			    PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+out:
+	return err;
+}
+
+static int pciback_export_device(struct pciback_device *pdev,
+				 int domain, int bus, int slot, int func,
+				 int devid)
+{
+	struct pci_dev *dev;
+	int err = 0;
+
+	dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
+		domain, bus, slot, func);
+
+	dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
+	if (!dev) {
+		err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Couldn't locate PCI device "
+				 "(%04x:%02x:%02x.%01x)! "
+				 "perhaps already in-use?",
+				 domain, bus, slot, func);
+		goto out;
+	}
+
+	err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
+	if (err)
+		goto out;
+
+	dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
+	if (xen_register_device_domain_owner(dev,
+					     pdev->xdev->otherend_id) != 0) {
+		dev_err(&dev->dev, "device has been assigned to another " \
+			"domain! Over-writting the ownership, but beware.\n");
+		xen_unregister_device_domain_owner(dev);
+		xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
+	}
+
+	/* TODO: It'd be nice to export a bridge and have all of its children
+	 * get exported with it. This may be best done in xend (which will
+	 * have to calculate resource usage anyway) but we probably want to
+	 * put something in here to ensure that if a bridge gets given to a
+	 * driver domain, that all devices under that bridge are not given
+	 * to other driver domains (as he who controls the bridge can disable
+	 * it and stop the other devices from working).
+	 */
+out:
+	return err;
+}
+
+static int pciback_remove_device(struct pciback_device *pdev,
+				 int domain, int bus, int slot, int func)
+{
+	int err = 0;
+	struct pci_dev *dev;
+
+	dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
+		domain, bus, slot, func);
+
+	dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
+	if (!dev) {
+		err = -EINVAL;
+		dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
+			"(%04x:%02x:%02x.%01x)! not owned by this domain\n",
+			domain, bus, slot, func);
+		goto out;
+	}
+
+	dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
+	xen_unregister_device_domain_owner(dev);
+
+	pciback_release_pci_dev(pdev, dev);
+
+out:
+	return err;
+}
+
+static int pciback_publish_pci_root(struct pciback_device *pdev,
+				    unsigned int domain, unsigned int bus)
+{
+	unsigned int d, b;
+	int i, root_num, len, err;
+	char str[64];
+
+	dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+			   "root_num", "%d", &root_num);
+	if (err == 0 || err == -ENOENT)
+		root_num = 0;
+	else if (err < 0)
+		goto out;
+
+	/* Verify that we haven't already published this pci root */
+	for (i = 0; i < root_num; i++) {
+		len = snprintf(str, sizeof(str), "root-%d", i);
+		if (unlikely(len >= (sizeof(str) - 1))) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+				   str, "%x:%x", &d, &b);
+		if (err < 0)
+			goto out;
+		if (err != 2) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (d == domain && b == bus) {
+			err = 0;
+			goto out;
+		}
+	}
+
+	len = snprintf(str, sizeof(str), "root-%d", root_num);
+	if (unlikely(len >= (sizeof(str) - 1))) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
+		root_num, domain, bus);
+
+	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+			    "%04x:%02x", domain, bus);
+	if (err)
+		goto out;
+
+	err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+			    "root_num", "%d", (root_num + 1));
+
+out:
+	return err;
+}
+
+static int pciback_reconfigure(struct pciback_device *pdev)
+{
+	int err = 0;
+	int num_devs;
+	int domain, bus, slot, func;
+	int substate;
+	int i, len;
+	char state_str[64];
+	char dev_str[64];
+
+
+	dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
+
+	/* Make sure we only reconfigure once */
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateReconfiguring)
+		goto out;
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+			   &num_devs);
+	if (err != 1) {
+		if (err >= 0)
+			err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading number of devices");
+		goto out;
+	}
+
+	for (i = 0; i < num_devs; i++) {
+		len = snprintf(state_str, sizeof(state_str), "state-%d", i);
+		if (unlikely(len >= (sizeof(state_str) - 1))) {
+			err = -ENOMEM;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "String overflow while reading "
+					 "configuration");
+			goto out;
+		}
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
+				   "%d", &substate);
+		if (err != 1)
+			substate = XenbusStateUnknown;
+
+		switch (substate) {
+		case XenbusStateInitialising:
+			dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
+
+			len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+			if (unlikely(len >= (sizeof(dev_str) - 1))) {
+				err = -ENOMEM;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "String overflow while "
+						 "reading configuration");
+				goto out;
+			}
+			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+					   dev_str, "%x:%x:%x.%x",
+					   &domain, &bus, &slot, &func);
+			if (err < 0) {
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error reading device "
+						 "configuration");
+				goto out;
+			}
+			if (err != 4) {
+				err = -EINVAL;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error parsing pci device "
+						 "configuration");
+				goto out;
+			}
+
+			err = pciback_export_device(pdev, domain, bus, slot,
+						    func, i);
+			if (err)
+				goto out;
+
+			/* Publish pci roots. */
+			err = pciback_publish_pci_roots(pdev,
+						pciback_publish_pci_root);
+			if (err) {
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error while publish PCI root"
+						 "buses for frontend");
+				goto out;
+			}
+
+			err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
+					    state_str, "%d",
+					    XenbusStateInitialised);
+			if (err) {
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error switching substate of "
+						 "dev-%d\n", i);
+				goto out;
+			}
+			break;
+
+		case XenbusStateClosing:
+			dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
+
+			len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
+			if (unlikely(len >= (sizeof(dev_str) - 1))) {
+				err = -ENOMEM;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "String overflow while "
+						 "reading configuration");
+				goto out;
+			}
+			err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+					   dev_str, "%x:%x:%x.%x",
+					   &domain, &bus, &slot, &func);
+			if (err < 0) {
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error reading device "
+						 "configuration");
+				goto out;
+			}
+			if (err != 4) {
+				err = -EINVAL;
+				xenbus_dev_fatal(pdev->xdev, err,
+						 "Error parsing pci device "
+						 "configuration");
+				goto out;
+			}
+
+			err = pciback_remove_device(pdev, domain, bus, slot,
+						    func);
+			if (err)
+				goto out;
+
+			/* TODO: If at some point we implement support for pci
+			 * root hot-remove on pcifront side, we'll need to
+			 * remove unnecessary xenstore nodes of pci roots here.
+			 */
+
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
+	if (err) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error switching to reconfigured state!");
+		goto out;
+	}
+
+out:
+	return 0;
+}
+
+static void pciback_frontend_changed(struct xenbus_device *xdev,
+				     enum xenbus_state fe_state)
+{
+	struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
+
+	dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
+
+	switch (fe_state) {
+	case XenbusStateInitialised:
+		pciback_attach(pdev);
+		break;
+
+	case XenbusStateReconfiguring:
+		pciback_reconfigure(pdev);
+		break;
+
+	case XenbusStateConnected:
+		/* pcifront switched its state from reconfiguring to connected.
+		 * Then switch to connected state.
+		 */
+		xenbus_switch_state(xdev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosing:
+		pciback_disconnect(pdev);
+		xenbus_switch_state(xdev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		pciback_disconnect(pdev);
+		xenbus_switch_state(xdev, XenbusStateClosed);
+		if (xenbus_dev_is_online(xdev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
+		device_unregister(&xdev->dev);
+		break;
+
+	default:
+		break;
+	}
+}
+
+static int pciback_setup_backend(struct pciback_device *pdev)
+{
+	/* Get configuration from xend (if available now) */
+	int domain, bus, slot, func;
+	int err = 0;
+	int i, num_devs;
+	char dev_str[64];
+	char state_str[64];
+
+	/* It's possible we could get the call to setup twice, so make sure
+	 * we're not already connected.
+	 */
+	if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+	    XenbusStateInitWait)
+		goto out;
+
+	dev_dbg(&pdev->xdev->dev, "getting be setup\n");
+
+	err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
+			   &num_devs);
+	if (err != 1) {
+		if (err >= 0)
+			err = -EINVAL;
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error reading number of devices");
+		goto out;
+	}
+
+	for (i = 0; i < num_devs; i++) {
+		int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
+		if (unlikely(l >= (sizeof(dev_str) - 1))) {
+			err = -ENOMEM;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "String overflow while reading "
+					 "configuration");
+			goto out;
+		}
+
+		err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
+				   "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+		if (err < 0) {
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error reading device configuration");
+			goto out;
+		}
+		if (err != 4) {
+			err = -EINVAL;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "Error parsing pci device "
+					 "configuration");
+			goto out;
+		}
+
+		err = pciback_export_device(pdev, domain, bus, slot, func, i);
+		if (err)
+			goto out;
+
+		/* Switch substate of this device. */
+		l = snprintf(state_str, sizeof(state_str), "state-%d", i);
+		if (unlikely(l >= (sizeof(state_str) - 1))) {
+			err = -ENOMEM;
+			xenbus_dev_fatal(pdev->xdev, err,
+					 "String overflow while reading "
+					 "configuration");
+			goto out;
+		}
+		err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
+				    "%d", XenbusStateInitialised);
+		if (err) {
+			xenbus_dev_fatal(pdev->xdev, err, "Error switching "
+					 "substate of dev-%d\n", i);
+			goto out;
+		}
+	}
+
+	err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
+	if (err) {
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error while publish PCI root buses "
+				 "for frontend");
+		goto out;
+	}
+
+	err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+	if (err)
+		xenbus_dev_fatal(pdev->xdev, err,
+				 "Error switching to initialised state!");
+
+out:
+	if (!err)
+		/* see if pcifront is already configured (if not, we'll wait) */
+		pciback_attach(pdev);
+
+	return err;
+}
+
+static void pciback_be_watch(struct xenbus_watch *watch,
+			     const char **vec, unsigned int len)
+{
+	struct pciback_device *pdev =
+	    container_of(watch, struct pciback_device, be_watch);
+
+	switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
+	case XenbusStateInitWait:
+		pciback_setup_backend(pdev);
+		break;
+
+	default:
+		break;
+	}
+}
+
+static int pciback_xenbus_probe(struct xenbus_device *dev,
+				const struct xenbus_device_id *id)
+{
+	int err = 0;
+	struct pciback_device *pdev = alloc_pdev(dev);
+
+	if (pdev == NULL) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(dev, err,
+				 "Error allocating pciback_device struct");
+		goto out;
+	}
+
+	/* wait for xend to configure us */
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto out;
+
+	/* watch the backend node for backend configuration information */
+	err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
+				pciback_be_watch);
+	if (err)
+		goto out;
+
+	pdev->be_watching = 1;
+
+	/* We need to force a call to our callback here in case
+	 * xend already configured us!
+	 */
+	pciback_be_watch(&pdev->be_watch, NULL, 0);
+
+out:
+	return err;
+}
+
+static int pciback_xenbus_remove(struct xenbus_device *dev)
+{
+	struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
+
+	if (pdev != NULL)
+		free_pdev(pdev);
+
+	return 0;
+}
+
+static const struct xenbus_device_id xenpci_ids[] = {
+	{"pci"},
+	{""},
+};
+
+static struct xenbus_driver xenbus_pciback_driver = {
+	.name			= "pciback",
+	.owner			= THIS_MODULE,
+	.ids			= xenpci_ids,
+	.probe			= pciback_xenbus_probe,
+	.remove			= pciback_xenbus_remove,
+	.otherend_changed	= pciback_frontend_changed,
+};
+
+int __init pciback_xenbus_register(void)
+{
+	pciback_wq = create_workqueue("pciback_workqueue");
+	if (!pciback_wq) {
+		printk(KERN_ERR "%s: create"
+			"pciback_workqueue failed\n", __func__);
+		return -EFAULT;
+	}
+	return xenbus_register_backend(&xenbus_pciback_driver);
+}
+
+void __exit pciback_xenbus_unregister(void)
+{
+	destroy_workqueue(pciback_wq);
+	xenbus_unregister_driver(&xenbus_pciback_driver);
+}
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index afbe041..319dd0a 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -156,9 +156,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
 	if (ret)
 		goto out;
 	xenbus_probe(NULL);
-	ret = xen_setup_shutdown_event();
-	if (ret)
-		goto out;
 	return 0;

 out:
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 1da8af6..2024a74 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -50,13 +50,15 @@ struct ttm_backend_func {
 	 * @pages: Array of pointers to ttm pages.
 	 * @dummy_read_page: Page to be used instead of NULL pages in the
 	 * array @pages.
+	 * @dma_addrs: Array of DMA (bus) address of the ttm pages.
 	 *
 	 * Populate the backend with ttm pages. Depending on the backend,
 	 * it may or may not copy the @pages array.
 	 */
 	int (*populate) (struct ttm_backend *backend,
 			 unsigned long num_pages, struct page **pages,
-			 struct page *dummy_read_page);
+			 struct page *dummy_read_page,
+			 dma_addr_t *dma_addrs);
 	/**
 	 * struct ttm_backend_func member clear
 	 *
@@ -149,6 +151,7 @@ enum ttm_caching_state {
  * @swap_storage: Pointer to shmem struct file for swap storage.
  * @caching_state: The current caching state of the pages.
  * @state: The current binding state of the pages.
+ * @dma_address: The DMA (bus) addresses of the pages (if TTM_PAGE_FLAG_DMA32)
  *
  * This is a structure holding the pages, caching- and aperture binding
  * status for a buffer object that isn't backed by fixed (VRAM / AGP)
@@ -173,6 +176,8 @@ struct ttm_tt {
 		tt_unbound,
 		tt_unpopulated,
 	} state;
+	dma_addr_t *dma_address;
+	struct device *dev;
 };

 #define TTM_MEMTYPE_FLAG_FIXED         (1 << 0)	/* Fixed (on-card) PCI memory */
@@ -547,6 +552,7 @@ struct ttm_bo_device {
 	struct list_head device_list;
 	struct ttm_bo_global *glob;
 	struct ttm_bo_driver *driver;
+	struct device *dev;
 	rwlock_t vm_lock;
 	struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
 	spinlock_t fence_lock;
@@ -787,6 +793,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
  * @file_page_offset: Offset into the device address space that is available
  * for buffer data. This ensures compatibility with other users of the
  * address space.
+ * @need_dma32: Allocate pages under 4GB
+ * @dev: 'struct device' of the PCI device.
  *
  * Initializes a struct ttm_bo_device:
  * Returns:
@@ -795,7 +803,8 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev);
 extern int ttm_bo_device_init(struct ttm_bo_device *bdev,
 			      struct ttm_bo_global *glob,
 			      struct ttm_bo_driver *driver,
-			      uint64_t file_page_offset, bool need_dma32);
+			      uint64_t file_page_offset, bool need_dma32,
+			      struct device *dev);

 /**
  * ttm_bo_unmap_virtual
diff --git a/include/drm/ttm/ttm_page_alloc.h b/include/drm/ttm/ttm_page_alloc.h
index 1168214..ccb6b7a 100644
--- a/include/drm/ttm/ttm_page_alloc.h
+++ b/include/drm/ttm/ttm_page_alloc.h
@@ -36,11 +36,15 @@
  * @flags: ttm flags for page allocation.
  * @cstate: ttm caching state for the page.
  * @count: number of pages to allocate.
+ * @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropiate DMA accounting.
  */
 int ttm_get_pages(struct list_head *pages,
 		  int flags,
 		  enum ttm_caching_state cstate,
-		  unsigned count);
+		  unsigned count,
+		  dma_addr_t *dma_address,
+		  struct device *dev);
 /**
  * Put linked list of pages to pool.
  *
@@ -49,11 +53,15 @@ int ttm_get_pages(struct list_head *pages,
  * count.
  * @flags: ttm flags for page allocation.
  * @cstate: ttm caching state.
+ * @dma_address: The DMA (bus) address of pages (if TTM_PAGE_FLAG_DMA32 set).
+ * @dev: struct device for appropiate DMA accounting.
  */
 void ttm_put_pages(struct list_head *pages,
 		   unsigned page_count,
 		   int flags,
-		   enum ttm_caching_state cstate);
+		   enum ttm_caching_state cstate,
+		   dma_addr_t *dma_address,
+		   struct device *dev);
 /**
  * Initialize pool allocator.
  */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d42..d746da1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -55,7 +55,7 @@
  *                Used by threaded interrupts which need to keep the
  *                irq line disabled until the threaded handler has been run.
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
- *
+ * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
  */
 #define IRQF_DISABLED		0x00000020
 #define IRQF_SAMPLE_RANDOM	0x00000040
@@ -67,6 +67,7 @@
 #define IRQF_IRQPOLL		0x00001000
 #define IRQF_ONESHOT		0x00002000
 #define IRQF_NO_SUSPEND		0x00004000
+#define IRQF_FORCE_RESUME	0x00008000

 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)

diff --git a/include/xen/blkif.h b/include/xen/blkif.h
new file mode 100644
index 0000000..ab79426
--- /dev/null
+++ b/include/xen/blkif.h
@@ -0,0 +1,122 @@
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Not a real protocol.  Used to generate ring structs which contain
+ * the elements common to all protocols only.  This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places.  */
+struct blkif_common_request {
+	char dummy;
+};
+struct blkif_common_response {
+	char dummy;
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_32_response {
+	uint64_t        id;              /* copied from request */
+	uint8_t         operation;       /* copied from request */
+	int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       __attribute__((__aligned__(8))) id;
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_64_response {
+	uint64_t       __attribute__((__aligned__(8))) id;
+	uint8_t         operation;       /* copied from request */
+	int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+union blkif_back_rings {
+	struct blkif_back_ring        native;
+	struct blkif_common_back_ring common;
+	struct blkif_x86_32_back_ring x86_32;
+	struct blkif_x86_64_back_ring x86_64;
+};
+
+enum blkif_protocol {
+	BLKIF_PROTOCOL_NATIVE = 1,
+	BLKIF_PROTOCOL_X86_32 = 2,
+	BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src)
+{
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	dst->operation = src->operation;
+	dst->nr_segments = src->nr_segments;
+	dst->handle = src->handle;
+	dst->id = src->id;
+	dst->u.rw.sector_number = src->sector_number;
+	barrier();
+	if (n > dst->nr_segments)
+		n = dst->nr_segments;
+	for (i = 0; i < n; i++)
+		dst->u.rw.seg[i] = src->seg[i];
+}
+
+static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src)
+{
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	dst->operation = src->operation;
+	dst->nr_segments = src->nr_segments;
+	dst->handle = src->handle;
+	dst->id = src->id;
+	dst->u.rw.sector_number = src->sector_number;
+	barrier();
+	if (n > dst->nr_segments)
+		n = dst->nr_segments;
+	for (i = 0; i < n; i++)
+		dst->u.rw.seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
diff --git a/include/xen/events.h b/include/xen/events.h
index 00f53dd..a0c8185 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -23,6 +23,12 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
 			   unsigned long irqflags,
 			   const char *devname,
 			   void *dev_id);
+int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
+					  unsigned int remote_port,
+					  irq_handler_t handler,
+					  unsigned long irqflags,
+					  const char *devname,
+					  void *dev_id);

 /*
  * Common unbind function for all event sources. Takes IRQ to unbind from.
@@ -75,11 +81,10 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
 int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);

 #ifdef CONFIG_PCI_MSI
-/* Allocate an irq and a pirq to be used with MSIs. */
-#define XEN_ALLOC_PIRQ (1 << 0)
-#define XEN_ALLOC_IRQ  (1 << 1)
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_mask);
-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc);
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+			     int pirq, int vector, const char *name,
+			     domid_t domid);
 #endif

 /* De-allocates the above mentioned physical interrupt. */
@@ -94,4 +99,10 @@ int xen_gsi_from_irq(unsigned pirq);
 /* Return irq from pirq */
 int xen_irq_from_pirq(unsigned pirq);

+/* Return the pirq allocated to the irq. */
+int xen_pirq_from_irq(unsigned irq);
+
+/* Determine whether to ignore this IRQ if it is passed to a guest. */
+int xen_test_irq_shared(int irq);
+
 #endif	/* _XEN_EVENTS_H */
diff --git a/include/xen/gntalloc.h b/include/xen/gntalloc.h
new file mode 100644
index 0000000..76bd580
--- /dev/null
+++ b/include/xen/gntalloc.h
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * gntalloc.h
+ *
+ * Interface to /dev/xen/gntalloc.
+ *
+ * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov>
+ *
+ * This file is in the public domain.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTALLOC_H__
+#define __LINUX_PUBLIC_GNTALLOC_H__
+
+/*
+ * Allocates a new page and creates a new grant reference.
+ */
+#define IOCTL_GNTALLOC_ALLOC_GREF \
+_IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref))
+struct ioctl_gntalloc_alloc_gref {
+	/* IN parameters */
+	/* The ID of the domain to be given access to the grants. */
+	uint16_t domid;
+	/* Flags for this mapping */
+	uint16_t flags;
+	/* Number of pages to map */
+	uint32_t count;
+	/* OUT parameters */
+	/* The offset to be used on a subsequent call to mmap(). */
+	uint64_t index;
+	/* The grant references of the newly created grant, one per page */
+	/* Variable size, depending on count */
+	uint32_t gref_ids[1];
+};
+
+#define GNTALLOC_FLAG_WRITABLE 1
+
+/*
+ * Deallocates the grant reference, allowing the associated page to be freed if
+ * no other domains are using it.
+ */
+#define IOCTL_GNTALLOC_DEALLOC_GREF \
+_IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref))
+struct ioctl_gntalloc_dealloc_gref {
+	/* IN parameters */
+	/* The offset returned in the map operation */
+	uint64_t index;
+	/* Number of references to unmap */
+	uint32_t count;
+};
+
+/*
+ * Sets up an unmap notification within the page, so that the other side can do
+ * cleanup if this side crashes. Required to implement cross-domain robust
+ * mutexes or close notification on communication channels.
+ *
+ * Each mapped page only supports one notification; multiple calls referring to
+ * the same page overwrite the previous notification. You must clear the
+ * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
+ * to occur.
+ */
+#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \
+_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntalloc_unmap_notify))
+struct ioctl_gntalloc_unmap_notify {
+	/* IN parameters */
+	/* Offset in the file descriptor for a byte within the page (same as
+	 * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
+	 * be cleared. Otherwise, it can be any byte in the page whose
+	 * notification we are adjusting.
+	 */
+	uint64_t index;
+	/* Action(s) to take on unmap */
+	uint32_t action;
+	/* Event channel to notify */
+	uint32_t event_channel_port;
+};
+
+/* Clear (set to zero) the byte specified by index */
+#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
+/* Send an interrupt on the indicated event channel */
+#define UNMAP_NOTIFY_SEND_EVENT 0x2
+
+#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
index eb23f41..5304bd3 100644
--- a/include/xen/gntdev.h
+++ b/include/xen/gntdev.h
@@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants {
 	uint32_t count;
 };

+/*
+ * Sets up an unmap notification within the page, so that the other side can do
+ * cleanup if this side crashes. Required to implement cross-domain robust
+ * mutexes or close notification on communication channels.
+ *
+ * Each mapped page only supports one notification; multiple calls referring to
+ * the same page overwrite the previous notification. You must clear the
+ * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
+ * to occur.
+ */
+#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \
+_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify))
+struct ioctl_gntdev_unmap_notify {
+	/* IN parameters */
+	/* Offset in the file descriptor for a byte within the page (same as
+	 * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
+	 * be cleared. Otherwise, it can be any byte in the page whose
+	 * notification we are adjusting.
+	 */
+	uint64_t index;
+	/* Action(s) to take on unmap */
+	uint32_t action;
+	/* Event channel to notify */
+	uint32_t event_channel_port;
+};
+
+/* Clear (set to zero) the byte specified by index */
+#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
+/* Send an interrupt on the indicated event channel */
+#define UNMAP_NOTIFY_SEND_EVENT 0x2
+
 #endif /* __LINUX_PUBLIC_GNTDEV_H__ */
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c2d1fa4..61e523a 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -51,11 +51,7 @@ typedef uint64_t blkif_sector_t;
  */
 #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11

-struct blkif_request {
-	uint8_t        operation;    /* BLKIF_OP_???                         */
-	uint8_t        nr_segments;  /* number of segments                   */
-	blkif_vdev_t   handle;       /* only for read/write requests         */
-	uint64_t       id;           /* private guest value, echoed in resp  */
+struct blkif_request_rw {
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
 	struct blkif_request_segment {
 		grant_ref_t gref;        /* reference to I/O buffer frame        */
@@ -65,6 +61,16 @@ struct blkif_request {
 	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 };

+struct blkif_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	union {
+		struct blkif_request_rw rw;
+	} u;
+};
+
 struct blkif_response {
 	uint64_t        id;              /* copied from request */
 	uint8_t         operation;       /* copied from request */
@@ -91,4 +97,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
 #define VDISK_REMOVABLE    0x2
 #define VDISK_READONLY     0x4

+/* Xen-defined major numbers for virtual disks, they look strangely
+ * familiar */
+#define XEN_IDE0_MAJOR	3
+#define XEN_IDE1_MAJOR	22
+#define XEN_SCSI_DISK0_MAJOR	8
+#define XEN_SCSI_DISK1_MAJOR	65
+#define XEN_SCSI_DISK2_MAJOR	66
+#define XEN_SCSI_DISK3_MAJOR	67
+#define XEN_SCSI_DISK4_MAJOR	68
+#define XEN_SCSI_DISK5_MAJOR	69
+#define XEN_SCSI_DISK6_MAJOR	70
+#define XEN_SCSI_DISK7_MAJOR	71
+#define XEN_SCSI_DISK8_MAJOR	128
+#define XEN_SCSI_DISK9_MAJOR	129
+#define XEN_SCSI_DISK10_MAJOR	130
+#define XEN_SCSI_DISK11_MAJOR	131
+#define XEN_SCSI_DISK12_MAJOR	132
+#define XEN_SCSI_DISK13_MAJOR	133
+#define XEN_SCSI_DISK14_MAJOR	134
+#define XEN_SCSI_DISK15_MAJOR	135
+
 #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h
index 518481c..cb94668 100644
--- a/include/xen/interface/io/netif.h
+++ b/include/xen/interface/io/netif.h
@@ -22,50 +22,50 @@

 /*
  * This is the 'wire' format for packets:
- *  Request 1: netif_tx_request -- NETTXF_* (any flags)
- * [Request 2: netif_tx_extra]  (only if request 1 has NETTXF_extra_info)
- * [Request 3: netif_tx_extra]  (only if request 2 has XEN_NETIF_EXTRA_MORE)
- *  Request 4: netif_tx_request -- NETTXF_more_data
- *  Request 5: netif_tx_request -- NETTXF_more_data
+ *  Request 1: xen_netif_tx_request  -- XEN_NETTXF_* (any flags)
+ * [Request 2: xen_netif_extra_info]    (only if request 1 has XEN_NETTXF_extra_info)
+ * [Request 3: xen_netif_extra_info]    (only if request 2 has XEN_NETIF_EXTRA_MORE)
+ *  Request 4: xen_netif_tx_request  -- XEN_NETTXF_more_data
+ *  Request 5: xen_netif_tx_request  -- XEN_NETTXF_more_data
  *  ...
- *  Request N: netif_tx_request -- 0
+ *  Request N: xen_netif_tx_request  -- 0
  */

 /* Protocol checksum field is blank in the packet (hardware offload)? */
-#define _NETTXF_csum_blank     (0)
-#define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
+#define _XEN_NETTXF_csum_blank		(0)
+#define  XEN_NETTXF_csum_blank		(1U<<_XEN_NETTXF_csum_blank)

 /* Packet data has been validated against protocol checksum. */
-#define _NETTXF_data_validated (1)
-#define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
+#define _XEN_NETTXF_data_validated	(1)
+#define  XEN_NETTXF_data_validated	(1U<<_XEN_NETTXF_data_validated)

 /* Packet continues in the next request descriptor. */
-#define _NETTXF_more_data      (2)
-#define  NETTXF_more_data      (1U<<_NETTXF_more_data)
+#define _XEN_NETTXF_more_data		(2)
+#define  XEN_NETTXF_more_data		(1U<<_XEN_NETTXF_more_data)

 /* Packet to be followed by extra descriptor(s). */
-#define _NETTXF_extra_info     (3)
-#define  NETTXF_extra_info     (1U<<_NETTXF_extra_info)
+#define _XEN_NETTXF_extra_info		(3)
+#define  XEN_NETTXF_extra_info		(1U<<_XEN_NETTXF_extra_info)

 struct xen_netif_tx_request {
     grant_ref_t gref;      /* Reference to buffer page */
     uint16_t offset;       /* Offset within buffer page */
-    uint16_t flags;        /* NETTXF_* */
+    uint16_t flags;        /* XEN_NETTXF_* */
     uint16_t id;           /* Echoed in response message. */
     uint16_t size;         /* Packet size in bytes.       */
 };

-/* Types of netif_extra_info descriptors. */
-#define XEN_NETIF_EXTRA_TYPE_NONE  (0)  /* Never used - invalid */
-#define XEN_NETIF_EXTRA_TYPE_GSO   (1)  /* u.gso */
-#define XEN_NETIF_EXTRA_TYPE_MAX   (2)
+/* Types of xen_netif_extra_info descriptors. */
+#define XEN_NETIF_EXTRA_TYPE_NONE	(0)  /* Never used - invalid */
+#define XEN_NETIF_EXTRA_TYPE_GSO	(1)  /* u.gso */
+#define XEN_NETIF_EXTRA_TYPE_MAX	(2)

-/* netif_extra_info flags. */
-#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
-#define XEN_NETIF_EXTRA_FLAG_MORE  (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
+/* xen_netif_extra_info flags. */
+#define _XEN_NETIF_EXTRA_FLAG_MORE	(0)
+#define  XEN_NETIF_EXTRA_FLAG_MORE	(1U<<_XEN_NETIF_EXTRA_FLAG_MORE)

 /* GSO types - only TCPv4 currently supported. */
-#define XEN_NETIF_GSO_TYPE_TCPV4        (1)
+#define XEN_NETIF_GSO_TYPE_TCPV4	(1)

 /*
  * This structure needs to fit within both netif_tx_request and
@@ -107,7 +107,7 @@ struct xen_netif_extra_info {

 struct xen_netif_tx_response {
 	uint16_t id;
-	int16_t  status;       /* NETIF_RSP_* */
+	int16_t  status;       /* XEN_NETIF_RSP_* */
 };

 struct xen_netif_rx_request {
@@ -116,25 +116,29 @@ struct xen_netif_rx_request {
 };

 /* Packet data has been validated against protocol checksum. */
-#define _NETRXF_data_validated (0)
-#define  NETRXF_data_validated (1U<<_NETRXF_data_validated)
+#define _XEN_NETRXF_data_validated	(0)
+#define  XEN_NETRXF_data_validated	(1U<<_XEN_NETRXF_data_validated)

 /* Protocol checksum field is blank in the packet (hardware offload)? */
-#define _NETRXF_csum_blank     (1)
-#define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
+#define _XEN_NETRXF_csum_blank		(1)
+#define  XEN_NETRXF_csum_blank		(1U<<_XEN_NETRXF_csum_blank)

 /* Packet continues in the next request descriptor. */
-#define _NETRXF_more_data      (2)
-#define  NETRXF_more_data      (1U<<_NETRXF_more_data)
+#define _XEN_NETRXF_more_data		(2)
+#define  XEN_NETRXF_more_data		(1U<<_XEN_NETRXF_more_data)

 /* Packet to be followed by extra descriptor(s). */
-#define _NETRXF_extra_info     (3)
-#define  NETRXF_extra_info     (1U<<_NETRXF_extra_info)
+#define _XEN_NETRXF_extra_info		(3)
+#define  XEN_NETRXF_extra_info		(1U<<_XEN_NETRXF_extra_info)
+
+/* GSO Prefix descriptor. */
+#define _XEN_NETRXF_gso_prefix		(4)
+#define  XEN_NETRXF_gso_prefix		(1U<<_XEN_NETRXF_gso_prefix)

 struct xen_netif_rx_response {
     uint16_t id;
     uint16_t offset;       /* Offset in page of start of received packet  */
-    uint16_t flags;        /* NETRXF_* */
+    uint16_t flags;        /* XEN_NETRXF_* */
     int16_t  status;       /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
 };

@@ -149,10 +153,10 @@ DEFINE_RING_TYPES(xen_netif_rx,
 		  struct xen_netif_rx_request,
 		  struct xen_netif_rx_response);

-#define NETIF_RSP_DROPPED         -2
-#define NETIF_RSP_ERROR           -1
-#define NETIF_RSP_OKAY             0
-/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
-#define NETIF_RSP_NULL             1
+#define XEN_NETIF_RSP_DROPPED	-2
+#define XEN_NETIF_RSP_ERROR	-1
+#define XEN_NETIF_RSP_OKAY	 0
+/* No response: used for auxiliary requests (e.g., xen_netif_extra_info). */
+#define XEN_NETIF_RSP_NULL	 1

 #endif
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 2befa3e..b33257b 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -30,7 +30,7 @@
 #define __HYPERVISOR_stack_switch          3
 #define __HYPERVISOR_set_callbacks         4
 #define __HYPERVISOR_fpu_taskswitch        5
-#define __HYPERVISOR_sched_op              6
+#define __HYPERVISOR_sched_op_compat       6
 #define __HYPERVISOR_dom0_op               7
 #define __HYPERVISOR_set_debugreg          8
 #define __HYPERVISOR_get_debugreg          9
@@ -52,7 +52,7 @@
 #define __HYPERVISOR_mmuext_op            26
 #define __HYPERVISOR_acm_op               27
 #define __HYPERVISOR_nmi_op               28
-#define __HYPERVISOR_sched_op_new         29
+#define __HYPERVISOR_sched_op             29
 #define __HYPERVISOR_callback_op          30
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 98b9215..03c85d7 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -5,9 +5,9 @@

 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);

-void xen_pre_suspend(void);
-void xen_post_suspend(int suspend_cancelled);
-void xen_hvm_post_suspend(int suspend_cancelled);
+void xen_arch_pre_suspend(void);
+void xen_arch_post_suspend(int suspend_cancelled);
+void xen_arch_hvm_post_suspend(int suspend_cancelled);

 void xen_mm_pin_all(void);
 void xen_mm_unpin_all(void);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9033c1c..2782bac 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -282,8 +282,17 @@ EXPORT_SYMBOL(disable_irq);

 void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 {
-	if (resume)
+	if (resume) {
+		if (!(desc->status & IRQ_SUSPENDED)) {
+			if (!desc->action)
+				return;
+			if (!(desc->action->flags & IRQF_FORCE_RESUME))
+				return;
+			/* Pretend that it got disabled ! */
+			desc->depth++;
+		}
 		desc->status &= ~IRQ_SUSPENDED;
+	}

 	switch (desc->depth) {
 	case 0:
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d8..d6bfb89 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
 	for_each_irq_desc(irq, desc) {
 		unsigned long flags;

-		if (!(desc->status & IRQ_SUSPENDED))
-			continue;
-
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		__enable_irq(desc, irq, true);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);