436 lines
15 KiB
Diff
436 lines
15 KiB
Diff
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||
|
From: Daniel Axtens <dja@axtens.net>
|
||
|
Date: Mon, 6 Feb 2023 10:03:22 -0500
|
||
|
Subject: [PATCH] ieee1275: support runtime memory claiming
|
||
|
|
||
|
On powerpc-ieee1275, we are running out of memory trying to verify
|
||
|
anything. This is because:
|
||
|
|
||
|
- We have to load an entire file into memory to verify it. This is
|
||
|
difficult to change with appended signatures.
|
||
|
- We only have 32MB of heap.
|
||
|
- Distro kernels are now often around 30MB.
|
||
|
|
||
|
So we want to be able to claim more memory from OpenFirmware for our heap
|
||
|
at runtime.
|
||
|
|
||
|
There are some complications:
|
||
|
|
||
|
- The grub mm code isn't the only thing that will make claims on
|
||
|
memory from OpenFirmware:
|
||
|
|
||
|
* PFW/SLOF will have claimed some for their own use.
|
||
|
|
||
|
* The ieee1275 loader will try to find other bits of memory that we
|
||
|
haven't claimed to place the kernel and initrd when we go to boot.
|
||
|
|
||
|
* Once we load Linux, it will also try to claim memory. It claims
|
||
|
memory without any reference to /memory/available, it just starts
|
||
|
at min(top of RMO, 768MB) and works down. So we need to avoid this
|
||
|
area. See arch/powerpc/kernel/prom_init.c as of v5.11.
|
||
|
|
||
|
- The smallest amount of memory a ppc64 KVM guest can have is 256MB.
|
||
|
It doesn't work with distro kernels but can work with custom kernels.
|
||
|
We should maintain support for that. (ppc32 can boot with even less,
|
||
|
and we shouldn't break that either.)
|
||
|
|
||
|
- Even if a VM has more memory, the memory OpenFirmware makes available
|
||
|
as Real Memory Area can be restricted. Even with our CAS work, an LPAR
|
||
|
on a PowerVM box is likely to have only 512MB available to OpenFirmware
|
||
|
even if it has many gigabytes of memory allocated.
|
||
|
|
||
|
What should we do?
|
||
|
|
||
|
We don't know in advance how big the kernel and initrd are going to be,
|
||
|
which makes figuring out how much memory we can take a bit tricky.
|
||
|
|
||
|
To figure out how much memory we should leave unused, I looked at:
|
||
|
|
||
|
- an Ubuntu 20.04.1 ppc64le pseries KVM guest:
|
||
|
vmlinux: ~30MB
|
||
|
initrd: ~50MB
|
||
|
|
||
|
- a RHEL8.2 ppc64le pseries KVM guest:
|
||
|
vmlinux: ~30MB
|
||
|
initrd: ~30MB
|
||
|
|
||
|
So to give us a little wriggle room, I think we want to leave at least
|
||
|
128MB for the loader to put vmlinux and initrd in memory and leave Linux
|
||
|
with space to satisfy its early allocations.
|
||
|
|
||
|
Allow other space to be allocated at runtime.
|
||
|
|
||
|
Tested-by: Stefan Berger <stefanb@linux.ibm.com>
|
||
|
Signed-off-by: Daniel Axtens <dja@axtens.net>
|
||
|
(cherry picked from commit a5c710789ccdd27a84ae4a34c7d453bd585e2b66)
|
||
|
[rharwood: _start?]
|
||
|
---
|
||
|
grub-core/kern/ieee1275/init.c | 270 ++++++++++++++++++++++++++++++++++++++---
|
||
|
docs/grub-dev.texi | 7 +-
|
||
|
2 files changed, 257 insertions(+), 20 deletions(-)
|
||
|
|
||
|
diff --git a/grub-core/kern/ieee1275/init.c b/grub-core/kern/ieee1275/init.c
|
||
|
index c8d551759d..85af8fa97b 100644
|
||
|
--- a/grub-core/kern/ieee1275/init.c
|
||
|
+++ b/grub-core/kern/ieee1275/init.c
|
||
|
@@ -46,13 +46,26 @@
|
||
|
#endif
|
||
|
#include <grub/lockdown.h>
|
||
|
|
||
|
-/* The maximum heap size we're going to claim */
|
||
|
+/* The maximum heap size we're going to claim at boot. Not used by sparc. */
|
||
|
#ifdef __i386__
|
||
|
#define HEAP_MAX_SIZE (unsigned long) (64 * 1024 * 1024)
|
||
|
-#else
|
||
|
+#else /* __powerpc__ */
|
||
|
#define HEAP_MAX_SIZE (unsigned long) (32 * 1024 * 1024)
|
||
|
#endif
|
||
|
|
||
|
+/* RMO max. address at 768 MB */
|
||
|
+#define RMO_ADDR_MAX (grub_uint64_t) (768 * 1024 * 1024)
|
||
|
+
|
||
|
+/*
|
||
|
+ * The amount of OF space we will not claim here so as to leave space for
|
||
|
+ * the loader and linux to service early allocations.
|
||
|
+ *
|
||
|
+ * In 2021, Daniel Axtens claims that we should leave at least 128MB to
|
||
|
+ * ensure we can load a stock kernel and initrd on a pseries guest with
|
||
|
+ * a 512MB real memory area under PowerVM.
|
||
|
+ */
|
||
|
+#define RUNTIME_MIN_SPACE (128UL * 1024 * 1024)
|
||
|
+
|
||
|
extern char _end[];
|
||
|
|
||
|
#ifdef __sparc__
|
||
|
@@ -147,16 +160,52 @@ grub_claim_heap (void)
|
||
|
+ GRUB_KERNEL_MACHINE_STACK_SIZE), 0x200000);
|
||
|
}
|
||
|
#else
|
||
|
-/* Helper for grub_claim_heap. */
|
||
|
+/* Helpers for mm on powerpc. */
|
||
|
+
|
||
|
+/*
|
||
|
+ * How much memory does OF believe exists in total?
|
||
|
+ *
|
||
|
+ * This isn't necessarily the true total. It can be the total memory
|
||
|
+ * accessible in real mode for a pseries guest, for example.
|
||
|
+ */
|
||
|
+static grub_uint64_t rmo_top;
|
||
|
+
|
||
|
static int
|
||
|
-heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
- void *data)
|
||
|
+count_free (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
+ void *data)
|
||
|
{
|
||
|
- unsigned long *total = data;
|
||
|
+ if (type != GRUB_MEMORY_AVAILABLE)
|
||
|
+ return 0;
|
||
|
+
|
||
|
+ /* Do not consider memory beyond 4GB */
|
||
|
+ if (addr > 0xffffffffULL)
|
||
|
+ return 0;
|
||
|
+
|
||
|
+ if (addr + len > 0xffffffffULL)
|
||
|
+ len = 0xffffffffULL - addr;
|
||
|
+
|
||
|
+ *(grub_uint32_t *) data += len;
|
||
|
+
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
+static int
|
||
|
+regions_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
+ unsigned int flags, void *data)
|
||
|
+{
|
||
|
+ grub_uint32_t total = *(grub_uint32_t *) data;
|
||
|
+ grub_uint64_t linux_rmo_save;
|
||
|
|
||
|
if (type != GRUB_MEMORY_AVAILABLE)
|
||
|
return 0;
|
||
|
|
||
|
+ /* Do not consider memory beyond 4GB */
|
||
|
+ if (addr > 0xffffffffULL)
|
||
|
+ return 0;
|
||
|
+
|
||
|
+ if (addr + len > 0xffffffffULL)
|
||
|
+ len = 0xffffffffULL - addr;
|
||
|
+
|
||
|
if (grub_ieee1275_test_flag (GRUB_IEEE1275_FLAG_NO_PRE1_5M_CLAIM))
|
||
|
{
|
||
|
if (addr + len <= 0x180000)
|
||
|
@@ -169,10 +218,6 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
- /* Never exceed HEAP_MAX_SIZE */
|
||
|
- if (*total + len > HEAP_MAX_SIZE)
|
||
|
- len = HEAP_MAX_SIZE - *total;
|
||
|
-
|
||
|
/* In theory, firmware should already prevent this from happening by not
|
||
|
listing our own image in /memory/available. The check below is intended
|
||
|
as a safeguard in case that doesn't happen. However, it doesn't protect
|
||
|
@@ -184,6 +229,108 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
len = 0;
|
||
|
}
|
||
|
|
||
|
+ /*
|
||
|
+ * Linux likes to claim memory at min(RMO top, 768MB) and works down
|
||
|
+ * without reference to /memory/available. (See prom_init.c::alloc_down)
|
||
|
+ *
|
||
|
+ * If this block contains min(RMO top, 768MB), do not claim below that for
|
||
|
+ * at least a few MB (this is where RTAS, SML and potentially TCEs live).
|
||
|
+ *
|
||
|
+ * We also need to leave enough space for the DT in the RMA. (See
|
||
|
+ * prom_init.c::alloc_up)
|
||
|
+ *
|
||
|
+ * Finally, we also want to make sure that when grub loads the kernel,
|
||
|
+ * it isn't going to use up all the memory we're trying to reserve! So
|
||
|
+ * enforce our entire RUNTIME_MIN_SPACE here:
|
||
|
+ *
|
||
|
+ * |---------- Top of memory ----------|
|
||
|
+ * | |
|
||
|
+ * | available |
|
||
|
+ * | |
|
||
|
+ * |---------- 768 MB ----------|
|
||
|
+ * | |
|
||
|
+ * | reserved |
|
||
|
+ * | |
|
||
|
+ * |--- 768 MB - runtime min space ---|
|
||
|
+ * | |
|
||
|
+ * | available |
|
||
|
+ * | |
|
||
|
+ * |---------- 0 MB ----------|
|
||
|
+ *
|
||
|
+ * Edge cases:
|
||
|
+ *
|
||
|
+ * - Total memory less than RUNTIME_MIN_SPACE: only claim up to HEAP_MAX_SIZE.
|
||
|
+ * (enforced elsewhere)
|
||
|
+ *
|
||
|
+ * - Total memory between RUNTIME_MIN_SPACE and 768MB:
|
||
|
+ *
|
||
|
+ * |---------- Top of memory ----------|
|
||
|
+ * | |
|
||
|
+ * | reserved |
|
||
|
+ * | |
|
||
|
+ * |---- top - runtime min space ----|
|
||
|
+ * | |
|
||
|
+ * | available |
|
||
|
+ * | |
|
||
|
+ * |---------- 0 MB ----------|
|
||
|
+ *
|
||
|
+ * This by itself would not leave us with RUNTIME_MIN_SPACE of free bytes: if
|
||
|
+ * rmo_top < 768MB, we will almost certainly have FW claims in the reserved
|
||
|
+ * region. We try to address that elsewhere: grub_ieee1275_mm_add_region will
|
||
|
+ * not call us if the resulting free space would be less than RUNTIME_MIN_SPACE.
|
||
|
+ */
|
||
|
+ linux_rmo_save = grub_min (RMO_ADDR_MAX, rmo_top) - RUNTIME_MIN_SPACE;
|
||
|
+ if (rmo_top > RUNTIME_MIN_SPACE)
|
||
|
+ {
|
||
|
+ if (rmo_top <= RMO_ADDR_MAX)
|
||
|
+ {
|
||
|
+ if (addr > linux_rmo_save)
|
||
|
+ {
|
||
|
+ grub_dprintf ("ieee1275", "rejecting region in RUNTIME_MIN_SPACE reservation (%llx)\n",
|
||
|
+ addr);
|
||
|
+ return 0;
|
||
|
+ }
|
||
|
+ else if (addr + len > linux_rmo_save)
|
||
|
+ {
|
||
|
+ grub_dprintf ("ieee1275", "capping region: (%llx -> %llx) -> (%llx -> %llx)\n",
|
||
|
+ addr, addr + len, addr, rmo_top - RUNTIME_MIN_SPACE);
|
||
|
+ len = linux_rmo_save - addr;
|
||
|
+ }
|
||
|
+ }
|
||
|
+ else
|
||
|
+ {
|
||
|
+ /*
|
||
|
+ * we order these cases to prefer higher addresses and avoid some
|
||
|
+ * splitting issues
|
||
|
+ */
|
||
|
+ if (addr < RMO_ADDR_MAX && (addr + len) > RMO_ADDR_MAX)
|
||
|
+ {
|
||
|
+ grub_dprintf ("ieee1275",
|
||
|
+ "adjusting region for RUNTIME_MIN_SPACE: (%llx -> %llx) -> (%llx -> %llx)\n",
|
||
|
+ addr, addr + len, RMO_ADDR_MAX, addr + len);
|
||
|
+ len = (addr + len) - RMO_ADDR_MAX;
|
||
|
+ addr = RMO_ADDR_MAX;
|
||
|
+ }
|
||
|
+ else if ((addr < linux_rmo_save) && ((addr + len) > linux_rmo_save))
|
||
|
+ {
|
||
|
+ grub_dprintf ("ieee1275", "capping region: (%llx -> %llx) -> (%llx -> %llx)\n",
|
||
|
+ addr, addr + len, addr, linux_rmo_save);
|
||
|
+ len = linux_rmo_save - addr;
|
||
|
+ }
|
||
|
+ else if (addr >= linux_rmo_save && (addr + len) <= RMO_ADDR_MAX)
|
||
|
+ {
|
||
|
+ grub_dprintf ("ieee1275", "rejecting region in RUNTIME_MIN_SPACE reservation (%llx)\n",
|
||
|
+ addr);
|
||
|
+ return 0;
|
||
|
+ }
|
||
|
+ }
|
||
|
+ }
|
||
|
+ if (flags & GRUB_MM_ADD_REGION_CONSECUTIVE && len < total)
|
||
|
+ return 0;
|
||
|
+
|
||
|
+ if (len > total)
|
||
|
+ len = total;
|
||
|
+
|
||
|
if (len)
|
||
|
{
|
||
|
grub_err_t err;
|
||
|
@@ -192,15 +339,95 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
if (err)
|
||
|
return err;
|
||
|
grub_mm_init_region ((void *) (grub_addr_t) addr, len);
|
||
|
+ total -= len;
|
||
|
}
|
||
|
|
||
|
- *total += len;
|
||
|
- if (*total >= HEAP_MAX_SIZE)
|
||
|
+ *(grub_uint32_t *) data = total;
|
||
|
+
|
||
|
+ if (total == 0)
|
||
|
return 1;
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+static int
|
||
|
+heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
+ void *data)
|
||
|
+{
|
||
|
+ return regions_claim (addr, len, type, GRUB_MM_ADD_REGION_NONE, data);
|
||
|
+}
|
||
|
+
|
||
|
+static int
|
||
|
+region_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
|
||
|
+ void *data)
|
||
|
+{
|
||
|
+ return regions_claim (addr, len, type, GRUB_MM_ADD_REGION_CONSECUTIVE, data);
|
||
|
+}
|
||
|
+
|
||
|
+static grub_err_t
|
||
|
+grub_ieee1275_mm_add_region (grub_size_t size, unsigned int flags)
|
||
|
+{
|
||
|
+ grub_uint32_t free_memory = 0;
|
||
|
+ grub_uint32_t avail = 0;
|
||
|
+ grub_uint32_t total;
|
||
|
+
|
||
|
+ grub_dprintf ("ieee1275", "mm requested region of size %x, flags %x\n",
|
||
|
+ size, flags);
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Update free memory each time, which is a bit inefficient but guards us
|
||
|
+ * against a situation where some OF driver goes out to firmware for
|
||
|
+ * memory and we don't realise.
|
||
|
+ */
|
||
|
+ grub_machine_mmap_iterate (count_free, &free_memory);
|
||
|
+
|
||
|
+ /* Ensure we leave enough space to boot. */
|
||
|
+ if (free_memory <= RUNTIME_MIN_SPACE + size)
|
||
|
+ {
|
||
|
+ grub_dprintf ("ieee1275", "Cannot satisfy allocation and retain minimum runtime space\n");
|
||
|
+ return GRUB_ERR_OUT_OF_MEMORY;
|
||
|
+ }
|
||
|
+
|
||
|
+ if (free_memory > RUNTIME_MIN_SPACE)
|
||
|
+ avail = free_memory - RUNTIME_MIN_SPACE;
|
||
|
+
|
||
|
+ grub_dprintf ("ieee1275", "free = 0x%x available = 0x%x\n", free_memory, avail);
|
||
|
+
|
||
|
+ if (flags & GRUB_MM_ADD_REGION_CONSECUTIVE)
|
||
|
+ {
|
||
|
+ /* first try rounding up hard for the sake of speed */
|
||
|
+ total = grub_max (ALIGN_UP (size, 1024 * 1024) + 1024 * 1024, 32 * 1024 * 1024);
|
||
|
+ total = grub_min (avail, total);
|
||
|
+
|
||
|
+ grub_dprintf ("ieee1275", "looking for %x bytes of memory (%x requested)\n", total, size);
|
||
|
+
|
||
|
+ grub_machine_mmap_iterate (region_claim, &total);
|
||
|
+ grub_dprintf ("ieee1275", "get memory from fw %s\n", total == 0 ? "succeeded" : "failed");
|
||
|
+
|
||
|
+ if (total != 0)
|
||
|
+ {
|
||
|
+ total = grub_min (avail, size);
|
||
|
+
|
||
|
+ grub_dprintf ("ieee1275", "fallback for %x bytes of memory (%x requested)\n", total, size);
|
||
|
+
|
||
|
+ grub_machine_mmap_iterate (region_claim, &total);
|
||
|
+ grub_dprintf ("ieee1275", "fallback from fw %s\n", total == 0 ? "succeeded" : "failed");
|
||
|
+ }
|
||
|
+ }
|
||
|
+ else
|
||
|
+ {
|
||
|
+ /* claim only the requested size, capped at avail; no padding is added */
|
||
|
+ total = grub_min (avail, size);
|
||
|
+ grub_machine_mmap_iterate (heap_init, &total);
|
||
|
+ grub_dprintf ("ieee1275", "get noncontig memory from fw %s\n", total == 0 ? "succeeded" : "failed");
|
||
|
+ }
|
||
|
+
|
||
|
+ if (total == 0)
|
||
|
+ return GRUB_ERR_NONE;
|
||
|
+ else
|
||
|
+ return GRUB_ERR_OUT_OF_MEMORY;
|
||
|
+}
|
||
|
+
|
||
|
/*
|
||
|
* How much memory does OF believe it has? (regardless of whether
|
||
|
* it's accessible or not)
|
||
|
@@ -356,17 +583,24 @@ grub_ieee1275_ibm_cas (void)
|
||
|
static void
|
||
|
grub_claim_heap (void)
|
||
|
{
|
||
|
- unsigned long total = 0;
|
||
|
+ grub_err_t err;
|
||
|
+ grub_uint32_t total = HEAP_MAX_SIZE;
|
||
|
+
|
||
|
+ err = grub_ieee1275_total_mem (&rmo_top);
|
||
|
+
|
||
|
+ /*
|
||
|
+ * If we cannot size the available memory, we can't be sure we're leaving
|
||
|
+ * space for the kernel, initrd and things Linux loads early in boot. So only
|
||
|
+ * allow further allocations from firmware on success
|
||
|
+ */
|
||
|
+ if (err == GRUB_ERR_NONE)
|
||
|
+ grub_mm_add_region_fn = grub_ieee1275_mm_add_region;
|
||
|
|
||
|
#if defined(__powerpc__)
|
||
|
if (grub_ieee1275_test_flag (GRUB_IEEE1275_FLAG_CAN_TRY_CAS_FOR_MORE_MEMORY))
|
||
|
{
|
||
|
- grub_uint64_t rma_size;
|
||
|
- grub_err_t err;
|
||
|
-
|
||
|
- err = grub_ieee1275_total_mem (&rma_size);
|
||
|
/* if we have an error, don't call CAS, just hope for the best */
|
||
|
- if (err == GRUB_ERR_NONE && rma_size < (512 * 1024 * 1024))
|
||
|
+ if (err == GRUB_ERR_NONE && rmo_top < (512 * 1024 * 1024))
|
||
|
grub_ieee1275_ibm_cas ();
|
||
|
}
|
||
|
#endif
|
||
|
diff --git a/docs/grub-dev.texi b/docs/grub-dev.texi
|
||
|
index 7b2455a8fe..7edc5b7e2b 100644
|
||
|
--- a/docs/grub-dev.texi
|
||
|
+++ b/docs/grub-dev.texi
|
||
|
@@ -1047,7 +1047,10 @@ space is limited to 4GiB. GRUB allocates pages from EFI for its heap, at most
|
||
|
1.6 GiB.
|
||
|
|
||
|
On i386-ieee1275 and powerpc-ieee1275 GRUB uses same stack as IEEE1275.
|
||
|
-It allocates at most 32MiB for its heap.
|
||
|
+
|
||
|
+On i386-ieee1275 and powerpc-ieee1275, GRUB will allocate 32MiB for its heap on
|
||
|
+startup. It may allocate more at runtime, as long as at least 128MiB remain free
|
||
|
+in OpenFirmware.
|
||
|
|
||
|
On sparc64-ieee1275 stack is 256KiB and heap is 2MiB.
|
||
|
|
||
|
@@ -1075,7 +1078,7 @@ In short:
|
||
|
@item i386-qemu @tab 60 KiB @tab < 4 GiB
|
||
|
@item *-efi @tab ? @tab < 1.6 GiB
|
||
|
@item i386-ieee1275 @tab ? @tab < 32 MiB
|
||
|
-@item powerpc-ieee1275 @tab ? @tab < 32 MiB
|
||
|
+@item powerpc-ieee1275 @tab ? @tab available memory - 128MiB
|
||
|
@item sparc64-ieee1275 @tab 256KiB @tab 2 MiB
|
||
|
@item arm-uboot @tab 256KiB @tab 2 MiB
|
||
|
@item mips(el)-qemu_mips @tab 2MiB @tab 253 MiB
|