diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7b56ca80e37a..e34d3c938729 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1001,6 +1001,29 @@ PAGE_SIZE multiple when read back.
 	The total amount of memory currently being used by the cgroup
 	and its descendants.

+  memory.min
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	Hard memory protection.  If the memory usage of a cgroup
+	is within its effective min boundary, the cgroup's memory
+	won't be reclaimed under any conditions. If there is no
+	unprotected reclaimable memory available, the OOM killer
+	is invoked.
+
+	Effective min boundary is limited by memory.min values of
+	all ancestor cgroups. If there is memory.min overcommitment
+	(child cgroup or cgroups are requiring more protected memory
+	than parent will allow), then each child cgroup will get
+	the part of parent's protection proportional to its
+	actual memory usage below memory.min.
+
+	Putting more memory than generally available under this
+	protection is discouraged and may lead to constant OOMs.
+
+	If a memory cgroup is not populated with processes,
+	its memory.min is ignored.
+
   memory.low
 	A read-write single value file which exists on non-root
 	cgroups.  The default is "0".
@@ -1012,9 +1035,9 @@ PAGE_SIZE multiple when read back.

 	Effective low boundary is limited by memory.low values of
 	all ancestor cgroups. If there is memory.low overcommitment
-	(child cgroup or cgroups are requiring more protected memory,
+	(child cgroup or cgroups are requiring more protected memory
 	than parent will allow), then each child cgroup will get
-	the part of parent's protection proportional to the its
+	the part of parent's protection proportional to its
 	actual memory usage below memory.low.

 	Putting more memory than generally available under this
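For illustration only (not part of the patch): a minimal userspace sketch of driving the new interface. The cgroup path and the 512M value are hypothetical; memory.min accepts a byte value (with the usual K/M/G suffixes) and reads back as a PAGE_SIZE multiple.

/* Editor's illustration, not kernel code: set memory.min for one cgroup. */
#include <stdio.h>

int main(void)
{
	/* hypothetical cgroup path; adjust to the real hierarchy */
	const char *path = "/sys/fs/cgroup/workload/memory.min";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* reserve 512M of protected memory; writing "max" protects everything */
	if (fprintf(f, "%llu\n", 512ULL << 20) < 0)
		perror("fprintf");
	if (fclose(f) != 0)
		perror("fclose");
	return 0;
}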
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 10d741e8fe51..9c04cf8e6487 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -58,6 +58,12 @@ enum memcg_memory_event {
 	MEMCG_NR_MEMORY_EVENTS,
 };

+enum mem_cgroup_protection {
+	MEMCG_PROT_NONE,
+	MEMCG_PROT_LOW,
+	MEMCG_PROT_MIN,
+};
+
 struct mem_cgroup_reclaim_cookie {
 	pg_data_t *pgdat;
 	int priority;
@@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }

-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+						struct mem_cgroup *memcg);

 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
 {
 }

-static inline bool mem_cgroup_low(struct mem_cgroup *root,
-				  struct mem_cgroup *memcg)
+static inline enum mem_cgroup_protection mem_cgroup_protected(
+	struct mem_cgroup *root, struct mem_cgroup *memcg)
 {
-	return false;
+	return MEMCG_PROT_NONE;
 }

 static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 7902a727d3b6..bab7e57f659b 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -8,10 +8,16 @@

 struct page_counter {
 	atomic_long_t usage;
-	unsigned long max;
+	unsigned long min;
 	unsigned long low;
+	unsigned long max;
 	struct page_counter *parent;

+	/* effective memory.min and memory.min usage tracking */
+	unsigned long emin;
+	atomic_long_t min_usage;
+	atomic_long_t children_min_usage;
+
 	/* effective memory.low and memory.low usage tracking */
 	unsigned long elow;
 	atomic_long_t low_usage;
@@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter,
 			     unsigned long nr_pages,
 			     struct page_counter **fail);
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
 int page_counter_memparse(const char *buf, const char *max,
 			  unsigned long *nr_pages);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6de0d6a3a8d..e3d56927a724 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4275,6 +4275,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);

+	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);

 	memcg_offline_kmem(memcg);
@@ -4329,6 +4330,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 	memcg->high = PAGE_COUNTER_MAX;
 	memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -5066,6 +5068,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
 	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
 }

+static int memory_min_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	unsigned long min = READ_ONCE(memcg->memory.min);
+
+	if (min == PAGE_COUNTER_MAX)
+		seq_puts(m, "max\n");
+	else
+		seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
+
+	return 0;
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long min;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &min);
+	if (err)
+		return err;
+
+	page_counter_set_min(&memcg->memory, min);
+
+	return nbytes;
+}
+
 static int memory_low_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5300,6 +5332,12 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = memory_current_read,
 	},
+	{
+		.name = "min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_min_show,
+		.write = memory_min_write,
+	},
 	{
 		.name = "low",
 		.flags = CFTYPE_NOT_ON_ROOT,
@@ -5349,19 +5387,24 @@ struct cgroup_subsys memory_cgrp_subsys = {
 };

 /**
- * mem_cgroup_low - check if memory consumption is in the normal range
+ * mem_cgroup_protected - check if memory consumption is in the normal range
  * @root: the top ancestor of the sub-tree being checked
  * @memcg: the memory cgroup to check
  *
  * WARNING: This function is not stateless! It can only be used as part
  * of a top-down tree iteration, not for isolated queries.
  *
- * Returns %true if memory consumption of @memcg is in the normal range.
+ * Returns one of the following:
+ *   MEMCG_PROT_NONE: cgroup memory is not protected
+ *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is
+ *     an unprotected supply of reclaimable memory from other cgroups.
+ *   MEMCG_PROT_MIN: cgroup memory is protected
  *
- * @root is exclusive; it is never low when looked at directly
+ * @root is exclusive; it is never protected when looked at directly
  *
- * To provide a proper hierarchical behavior, effective memory.low value
- * is used.
+ * To provide a proper hierarchical behavior, effective memory.min/low values
+ * are used. Below is a description of how effective memory.low is calculated.
+ * The effective memory.min value is calculated in the same way.
  *
  * Effective memory.low is always equal or less than the original memory.low.
  * If there is no memory.low overcommittment (which is always true for
@@ -5406,51 +5449,78 @@ struct cgroup_subsys memory_cgrp_subsys = {
  * E/memory.current = 0
  *
  * These calculations require constant tracking of the actual low usages
- * (see propagate_low_usage()), as well as recursive calculation of
- * effective memory.low values. But as we do call mem_cgroup_low()
+ * (see propagate_protected_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we do call mem_cgroup_protected()
  * path for each memory cgroup top-down from the reclaim,
  * it's possible to optimize this part, and save calculated elow
  * for next usage. This part is intentionally racy, but it's ok,
  * as memory.low is a best-effort mechanism.
  */
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+						struct mem_cgroup *memcg)
 {
-	unsigned long usage, low_usage, siblings_low_usage;
-	unsigned long elow, parent_elow;
 	struct mem_cgroup *parent;
+	unsigned long emin, parent_emin;
+	unsigned long elow, parent_elow;
+	unsigned long usage;

 	if (mem_cgroup_disabled())
-		return false;
+		return MEMCG_PROT_NONE;

 	if (!root)
 		root = root_mem_cgroup;
 	if (memcg == root)
-		return false;
+		return MEMCG_PROT_NONE;

-	elow = memcg->memory.low;
 	usage = page_counter_read(&memcg->memory);
-	parent = parent_mem_cgroup(memcg);
+	if (!usage)
+		return MEMCG_PROT_NONE;

+	emin = memcg->memory.min;
+	elow = memcg->memory.low;
+
+	parent = parent_mem_cgroup(memcg);
 	if (parent == root)
 		goto exit;

+	parent_emin = READ_ONCE(parent->memory.emin);
+	emin = min(emin, parent_emin);
+	if (emin && parent_emin) {
+		unsigned long min_usage, siblings_min_usage;
+
+		min_usage = min(usage, memcg->memory.min);
+		siblings_min_usage = atomic_long_read(
+			&parent->memory.children_min_usage);
+
+		if (min_usage && siblings_min_usage)
+			emin = min(emin, parent_emin * min_usage /
+				   siblings_min_usage);
+	}
+
 	parent_elow = READ_ONCE(parent->memory.elow);
 	elow = min(elow, parent_elow);
+	if (elow && parent_elow) {
+		unsigned long low_usage, siblings_low_usage;

-	if (!elow || !parent_elow)
-		goto exit;
+		low_usage = min(usage, memcg->memory.low);
+		siblings_low_usage = atomic_long_read(
+			&parent->memory.children_low_usage);

-	low_usage = min(usage, memcg->memory.low);
-	siblings_low_usage = atomic_long_read(
-		&parent->memory.children_low_usage);
+		if (low_usage && siblings_low_usage)
+			elow = min(elow, parent_elow * low_usage /
+				   siblings_low_usage);
+	}

-	if (!low_usage || !siblings_low_usage)
-		goto exit;
-
-	elow = min(elow, parent_elow * low_usage / siblings_low_usage);
 exit:
+	memcg->memory.emin = emin;
 	memcg->memory.elow = elow;
-	return usage && usage <= elow;
+
+	if (usage <= emin)
+		return MEMCG_PROT_MIN;
+	else if (usage <= elow)
+		return MEMCG_PROT_LOW;
+	else
+		return MEMCG_PROT_NONE;
 }

 /**
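To make the proportional scaling described in the comment above concrete, here is a standalone editor's sketch (not kernel code; effective_min() and the 2G/1G figures are invented for the example) of how an overcommitted parent guarantee is split between two children:

/* Illustrative only: effective memory.min distribution under overcommit. */
#include <stdio.h>

/* Mirrors emin = min(emin, parent_emin * min_usage / siblings_min_usage). */
static unsigned long effective_min(unsigned long min, unsigned long usage,
				   unsigned long parent_emin,
				   unsigned long siblings_min_usage)
{
	unsigned long emin = min < parent_emin ? min : parent_emin;
	unsigned long min_usage = usage < min ? usage : min;

	if (emin && parent_emin && min_usage && siblings_min_usage) {
		unsigned long scaled = parent_emin * min_usage /
				       siblings_min_usage;
		if (scaled < emin)
			emin = scaled;
	}
	return emin;
}

int main(void)
{
	/* Parent guarantees 2G; children ask for 2G + 1G = 3G (overcommit). */
	unsigned long parent_emin = 2048;	/* in MB for readability */
	unsigned long siblings = 2048 + 1024;	/* sum of children's clamped usage */

	printf("child A emin = %lu MB\n",
	       effective_min(2048, 4096, parent_emin, siblings)); /* 1365 */
	printf("child B emin = %lu MB\n",
	       effective_min(1024, 4096, parent_emin, siblings)); /* 682 */
	return 0;
}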
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a5ff4cbc355a..de31470655f6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,26 +13,38 @@
 #include <linux/bug.h>
 #include <asm/page.h>

-static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+static void propagate_protected_usage(struct page_counter *c,
+				      unsigned long usage)
 {
-	unsigned long low_usage, old;
+	unsigned long protected, old_protected;
 	long delta;

 	if (!c->parent)
 		return;

-	if (!c->low && !atomic_long_read(&c->low_usage))
-		return;
+	if (c->min || atomic_long_read(&c->min_usage)) {
+		if (usage <= c->min)
+			protected = usage;
+		else
+			protected = 0;

-	if (usage <= c->low)
-		low_usage = usage;
-	else
-		low_usage = 0;
+		old_protected = atomic_long_xchg(&c->min_usage, protected);
+		delta = protected - old_protected;
+		if (delta)
+			atomic_long_add(delta, &c->parent->children_min_usage);
+	}

-	old = atomic_long_xchg(&c->low_usage, low_usage);
-	delta = low_usage - old;
-	if (delta)
-		atomic_long_add(delta, &c->parent->children_low_usage);
+	if (c->low || atomic_long_read(&c->low_usage)) {
+		if (usage <= c->low)
+			protected = usage;
+		else
+			protected = 0;
+
+		old_protected = atomic_long_xchg(&c->low_usage, protected);
+		delta = protected - old_protected;
+		if (delta)
+			atomic_long_add(delta, &c->parent->children_low_usage);
+	}
 }

 /**
@@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
 	long new;

 	new = atomic_long_sub_return(nr_pages, &counter->usage);
-	propagate_low_usage(counter, new);
+	propagate_protected_usage(counter, new);
 	/* More uncharges than charges? */
 	WARN_ON_ONCE(new < 0);
 }
@@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
 		long new;

 		new = atomic_long_add_return(nr_pages, &c->usage);
-		propagate_low_usage(counter, new);
+		propagate_protected_usage(counter, new);
 		/*
 		 * This is indeed racy, but we can live with some
 		 * inaccuracy in the watermark.
@@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
 		new = atomic_long_add_return(nr_pages, &c->usage);
 		if (new > c->max) {
 			atomic_long_sub(nr_pages, &c->usage);
-			propagate_low_usage(counter, new);
+			propagate_protected_usage(counter, new);
 			/*
 			 * This is racy, but we can live with some
 			 * inaccuracy in the failcnt.
@@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
 			*fail = c;
 			goto failed;
 		}
-		propagate_low_usage(counter, new);
+		propagate_protected_usage(counter, new);
 		/*
 		 * Just like with failcnt, we can live with some
 		 * inaccuracy in the watermark.
@@ -190,6 +202,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
 	}
 }

+/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	counter->min = nr_pages;
+
+	for (c = counter; c; c = c->parent)
+		propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
 /**
  * page_counter_set_low - set the amount of protected memory
  * @counter: counter
@@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
 	counter->low = nr_pages;

 	for (c = counter; c; c = c->parent)
-		propagate_low_usage(c, atomic_long_read(&c->usage));
+		propagate_protected_usage(c, atomic_long_read(&c->usage));
 }

 /**
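A simplified, single-threaded model of the propagation above (editor's sketch: plain longs instead of atomic_long_t, and struct counter/propagate() are invented names) shows how children_min_usage stays equal to the sum of the children's protected usage:

/* Illustrative only: children_min_usage is maintained by deltas on
 * every charge/uncharge, not recomputed from scratch. */
#include <stdio.h>

struct counter {
	long usage;
	long min;
	long min_usage;			/* this counter's clamped usage, as last seen */
	long children_min_usage;	/* sum of the children's min_usage */
	struct counter *parent;
};

static void propagate(struct counter *c)
{
	long protected = c->usage <= c->min ? c->usage : 0;
	long delta = protected - c->min_usage;

	c->min_usage = protected;
	if (c->parent && delta)
		c->parent->children_min_usage += delta;
}

int main(void)
{
	struct counter parent = { 0 };
	struct counter child = { .min = 100, .parent = &parent };

	child.usage = 80;	/* below min: fully protected */
	propagate(&child);
	printf("children_min_usage = %ld\n", parent.children_min_usage); /* 80 */

	child.usage = 150;	/* above min: drops out of the protected sum */
	propagate(&child);
	printf("children_min_usage = %ld\n", parent.children_min_usage); /* 0 */
	return 0;
}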
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0e67b477ecef..03822f86f288 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			unsigned long reclaimed;
 			unsigned long scanned;

-			if (mem_cgroup_low(root, memcg)) {
+			switch (mem_cgroup_protected(root, memcg)) {
+			case MEMCG_PROT_MIN:
+				/*
+				 * Hard protection.
+				 * If there is no reclaimable memory, OOM.
+				 */
+				continue;
+			case MEMCG_PROT_LOW:
+				/*
+				 * Soft protection.
+				 * Respect the protection only as long as
+				 * there is an unprotected supply
+				 * of reclaimable memory from other cgroups.
+				 */
 				if (!sc->memcg_low_reclaim) {
 					sc->memcg_low_skipped = 1;
 					continue;
 				}
 				memcg_memory_event(memcg, MEMCG_LOW);
+				break;
+			case MEMCG_PROT_NONE:
+				break;
 			}

 			reclaimed = sc->nr_reclaimed;
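For reference, an editor's sketch of the classification the reclaim loop above reacts to (enum prot and classify() are invented stand-ins; the real mem_cgroup_protected() additionally returns MEMCG_PROT_NONE for zero usage):

/* Illustrative only: how usage relates to the effective boundaries and
 * what the reclaim loop does with each answer. */
#include <stdio.h>

enum prot { PROT_NONE, PROT_LOW, PROT_MIN };

static enum prot classify(unsigned long usage, unsigned long emin,
			  unsigned long elow)
{
	if (usage <= emin)
		return PROT_MIN;	/* never reclaimed; may lead to OOM */
	if (usage <= elow)
		return PROT_LOW;	/* skipped unless memcg_low_reclaim is set */
	return PROT_NONE;		/* reclaimed normally */
}

int main(void)
{
	printf("%d\n", classify(50, 100, 200));		/* PROT_MIN */
	printf("%d\n", classify(150, 100, 200));	/* PROT_LOW */
	printf("%d\n", classify(250, 100, 200));	/* PROT_NONE */
	return 0;
}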