Add patches to fix nouveau issues preventing booting the installer or system
This commit is contained in:
parent
9d763c2454
commit
c242538da1
141
0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
Normal file
141
0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
Normal file
@ -0,0 +1,141 @@
|
||||
From 7a7662fe09eb2ccd2eb93ce7261aa47c86111b4d Mon Sep 17 00:00:00 2001
|
||||
From: Karol Herbst <kherbst@redhat.com>
|
||||
Date: Tue, 24 Mar 2020 21:29:23 +0100
|
||||
Subject: [PATCH 1/2] drm/nouveau: workaround runpm fail by disabling PCI power
|
||||
management on certain intel bridges
|
||||
|
||||
Fixes the infamous 'runtime PM' bug many users are facing on Laptops with
|
||||
Nvidia Pascal GPUs by skipping said PCI power state changes on the GPU.
|
||||
|
||||
Depending on the used kernel there might be messages like those in dmesg:
|
||||
|
||||
"nouveau 0000:01:00.0: Refused to change power state, currently in D3"
|
||||
"nouveau 0000:01:00.0: can't change power state from D3cold to D0 (config
|
||||
space inaccessible)"
|
||||
followed by backtraces of kernel crashes or timeouts within nouveau.
|
||||
|
||||
It's still unknown why this issue exists, but this is a reliable workaround
|
||||
and solves a very annoying issue for users having to choose between a
|
||||
crashing kernel or higher power consumption of their Laptops.
|
||||
|
||||
Signed-off-by: Karol Herbst <kherbst@redhat.com>
|
||||
Cc: Bjorn Helgaas <bhelgaas@google.com>
|
||||
Cc: Lyude Paul <lyude@redhat.com>
|
||||
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
|
||||
Cc: Mika Westerberg <mika.westerberg@intel.com>
|
||||
Cc: linux-pci@vger.kernel.org
|
||||
Cc: linux-pm@vger.kernel.org
|
||||
Cc: dri-devel@lists.freedesktop.org
|
||||
Cc: nouveau@lists.freedesktop.org
|
||||
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205623
|
||||
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
|
||||
---
|
||||
drivers/gpu/drm/nouveau/nouveau_drm.c | 63 +++++++++++++++++++++++++++
|
||||
drivers/gpu/drm/nouveau/nouveau_drv.h | 2 +
|
||||
2 files changed, 65 insertions(+)
|
||||
|
||||
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
|
||||
index 6b1629c14dd7..ca4087f5a15b 100644
|
||||
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
|
||||
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
|
||||
@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev)
|
||||
kfree(drm);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * On some Intel PCIe bridge controllers doing a
|
||||
+ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear.
|
||||
+ * Skipping the intermediate D3hot step seems to make it work again. This is
|
||||
+ * probably caused by not meeting the expectation the involved AML code has
|
||||
+ * when the GPU is put into D3hot state before invoking it.
|
||||
+ *
|
||||
+ * This leads to various manifestations of this issue:
|
||||
+ * - AML code execution to power on the GPU hits an infinite loop (as the
|
||||
+ * code waits on device memory to change).
|
||||
+ * - kernel crashes, as all PCI reads return -1, which most code isn't able
|
||||
+ * to handle well enough.
|
||||
+ *
|
||||
+ * In all cases dmesg will contain at least one line like this:
|
||||
+ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3'
|
||||
+ * followed by a lot of nouveau timeouts.
|
||||
+ *
|
||||
+ * In the \_SB.PCI0.PEG0.PG00._OFF code deeper down writes bit 0x80 to the not
|
||||
+ * documented PCI config space register 0x248 of the Intel PCIe bridge
|
||||
+ * controller (0x1901) in order to change the state of the PCIe link between
|
||||
+ * the PCIe port and the GPU. There are alternative code paths using other
|
||||
+ * registers, which seem to work fine (executed pre Windows 8):
|
||||
+ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved')
|
||||
+ * - 0xb0 bit 0x10 (link disable)
|
||||
+ * Changing the conditions inside the firmware by poking into the relevant
|
||||
+ * addresses does resolve the issue, but it seemed to be ACPI private memory
|
||||
+ * and not any device accessible memory at all, so there is no portable way of
|
||||
+ * changing the conditions.
|
||||
+ * On a XPS 9560 that means bits [0,3] on \CPEX need to be cleared.
|
||||
+ *
|
||||
+ * The only systems where this behavior can be seen are hybrid graphics laptops
|
||||
+ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether
|
||||
+ * this issue only occurs in combination with listed Intel PCIe bridge
|
||||
+ * controllers and the mentioned GPUs or other devices as well.
|
||||
+ *
|
||||
+ * documentation on the PCIe bridge controller can be found in the
|
||||
+ * "7th Generation Intel® Processor Families for H Platforms Datasheet Volume 2"
|
||||
+ * Section "12 PCI Express* Controller (x16) Registers"
|
||||
+ */
|
||||
+
|
||||
+static void quirk_broken_nv_runpm(struct pci_dev *pdev)
|
||||
+{
|
||||
+ struct drm_device *dev = pci_get_drvdata(pdev);
|
||||
+ struct nouveau_drm *drm = nouveau_drm(dev);
|
||||
+ struct pci_dev *bridge = pci_upstream_bridge(pdev);
|
||||
+
|
||||
+ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL)
|
||||
+ return;
|
||||
+
|
||||
+ switch (bridge->device) {
|
||||
+ case 0x1901:
|
||||
+ drm->old_pm_cap = pdev->pm_cap;
|
||||
+ pdev->pm_cap = 0;
|
||||
+ NV_INFO(drm, "Disabling PCI power management to avoid bug\n");
|
||||
+ break;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static int nouveau_drm_probe(struct pci_dev *pdev,
|
||||
const struct pci_device_id *pent)
|
||||
{
|
||||
@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev,
|
||||
if (ret)
|
||||
goto fail_drm_dev_init;
|
||||
|
||||
+ quirk_broken_nv_runpm(pdev);
|
||||
return 0;
|
||||
|
||||
fail_drm_dev_init:
|
||||
@@ -734,7 +793,11 @@ static void
|
||||
nouveau_drm_remove(struct pci_dev *pdev)
|
||||
{
|
||||
struct drm_device *dev = pci_get_drvdata(pdev);
|
||||
+ struct nouveau_drm *drm = nouveau_drm(dev);
|
||||
|
||||
+ /* revert our workaround */
|
||||
+ if (drm->old_pm_cap)
|
||||
+ pdev->pm_cap = drm->old_pm_cap;
|
||||
nouveau_drm_device_remove(dev);
|
||||
pci_disable_device(pdev);
|
||||
}
|
||||
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
|
||||
index c2c332fbde97..2a6519737800 100644
|
||||
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
|
||||
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
|
||||
@@ -140,6 +140,8 @@ struct nouveau_drm {
|
||||
|
||||
struct list_head clients;
|
||||
|
||||
+ u8 old_pm_cap;
|
||||
+
|
||||
struct {
|
||||
struct agp_bridge_data *bridge;
|
||||
u32 base;
|
||||
--
|
||||
2.25.1
|
||||
|
@ -0,0 +1,68 @@
|
||||
From 37b556606d1217b4367e622d88cef11c65764386 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Skeggs <bskeggs@redhat.com>
|
||||
Date: Tue, 31 Mar 2020 16:08:44 +1000
|
||||
Subject: [PATCH 2/2] drm/nouveau/gr/gp107,gp108: implement workaround for HW
|
||||
hanging during init
|
||||
|
||||
Certain boards with GP107/GP108 chipsets hang (often, but randomly) for
|
||||
unknown reasons during GR initialisation.
|
||||
|
||||
The first tell-tale symptom of this issue is:
|
||||
|
||||
nouveau 0000:01:00.0: bus: MMIO read of 00000000 FAULT at 409800 [ TIMEOUT ]
|
||||
|
||||
appearing in dmesg, likely followed by many other failures being logged.
|
||||
|
||||
Karol found this WAR for the issue a while back, but efforts to isolate
|
||||
the root cause and proper fix have not yielded success so far. I've
|
||||
modified the original patch to include a few more details, limit it to
|
||||
GP107/GP108 by default, and added a config option to override this choice.
|
||||
|
||||
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
|
||||
Reviewed-by: Karol Herbst <kherbst@redhat.com>
|
||||
---
|
||||
.../gpu/drm/nouveau/nvkm/engine/gr/gf100.c | 26 +++++++++++++++++++
|
||||
1 file changed, 26 insertions(+)
|
||||
|
||||
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
|
||||
index dd8f85b8b3a7..f2f5636efac4 100644
|
||||
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
|
||||
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
|
||||
@@ -1981,8 +1981,34 @@ gf100_gr_init_(struct nvkm_gr *base)
|
||||
{
|
||||
struct gf100_gr *gr = gf100_gr(base);
|
||||
struct nvkm_subdev *subdev = &base->engine.subdev;
|
||||
+ struct nvkm_device *device = subdev->device;
|
||||
+ bool reset = device->chipset == 0x137 || device->chipset == 0x138;
|
||||
u32 ret;
|
||||
|
||||
+ /* On certain GP107/GP108 boards, we trigger a weird issue where
|
||||
+ * GR will stop responding to PRI accesses after we've asked the
|
||||
+ * SEC2 RTOS to boot the GR falcons. This happens with far more
|
||||
+ * frequency when cold-booting a board (ie. returning from D3).
|
||||
+ *
|
||||
+ * The root cause for this is not known and has proven difficult
|
||||
+ * to isolate, with many avenues being dead-ends.
|
||||
+ *
|
||||
+ * A workaround was discovered by Karol, whereby putting GR into
|
||||
+ * reset for an extended period right before initialisation
|
||||
+ * prevents the problem from occurring.
|
||||
+ *
|
||||
+ * XXX: As RM does not require any such workaround, this is more
|
||||
+ * of a hack than a true fix.
|
||||
+ */
|
||||
+ reset = nvkm_boolopt(device->cfgopt, "NvGrResetWar", reset);
|
||||
+ if (reset) {
|
||||
+ nvkm_mask(device, 0x000200, 0x00001000, 0x00000000);
|
||||
+ nvkm_rd32(device, 0x000200);
|
||||
+ msleep(50);
|
||||
+ nvkm_mask(device, 0x000200, 0x00001000, 0x00001000);
|
||||
+ nvkm_rd32(device, 0x000200);
|
||||
+ }
|
||||
+
|
||||
nvkm_pmu_pgob(gr->base.engine.subdev.device->pmu, false);
|
||||
|
||||
ret = nvkm_falcon_get(&gr->fecs.falcon, subdev);
|
||||
--
|
||||
2.25.1
|
||||
|
@ -865,6 +865,12 @@ Patch511: 0001-ALSA-hda-realtek-Add-quirk-for-Lenovo-Carbon-X1-8th-.patch
|
||||
# Fixes build on s390 and should be upstream after rc1
|
||||
Patch512: export_sysrq_mask.patch
|
||||
|
||||
# nouveau runpm and secboot fixes
|
||||
# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/f5755e7069d4acbcce1a93692421f358241ead7b
|
||||
Patch513: 0001-drm-nouveau-workaround-runpm-fail-by-disabling-PCI-p.patch
|
||||
# Accepted nouveau upstream https://github.com/skeggsb/nouveau/commit/41c6a13e8143af71928749ea9895d2ebc2fb4ffd
|
||||
Patch514: 0002-drm-nouveau-gr-gp107-gp108-implement-workaround-for-.patch
|
||||
|
||||
# END OF PATCH DEFINITIONS
|
||||
|
||||
%endif
|
||||
@ -2960,6 +2966,9 @@ fi
|
||||
#
|
||||
#
|
||||
%changelog
|
||||
* Tue Apr 07 2020 Karol Herbst <kherbst@redhat.com>
|
||||
- Add patches to fix nouveau issues preventing booting the installer or system
|
||||
|
||||
* Mon Apr 06 2020 Justin M. Forbes <jforbes@fedoraproject.org> - 5.7.0-0.rc0.git6.1
|
||||
- Linux v5.6-11374-ga10c9c710f9e
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user