mirror of
git://git.openwrt.org/openwrt/openwrt.git
synced 2025-01-22 22:53:11 +00:00
708a507af0
Refresh kernel patches for generic kernel 5.15 due to new backport version of MGLRU patchset. Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
499 lines
15 KiB
Diff
499 lines
15 KiB
Diff
From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
Date: Sun, 18 Sep 2022 02:00:07 -0600
|
|
Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
|
|
can be disabled include:
|
|
0x0001: the multi-gen LRU core
|
|
0x0002: walking page tables, when arch_has_hw_pte_young() returns
|
|
true
|
|
0x0004: clearing the accessed bit in non-leaf PMD entries, when
|
|
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
|
|
[yYnN]: apply to all the components above
|
|
E.g.,
|
|
echo y >/sys/kernel/mm/lru_gen/enabled
|
|
cat /sys/kernel/mm/lru_gen/enabled
|
|
0x0007
|
|
echo 5 >/sys/kernel/mm/lru_gen/enabled
|
|
cat /sys/kernel/mm/lru_gen/enabled
|
|
0x0005
|
|
|
|
NB: the page table walks happen on the scale of seconds under heavy memory
|
|
pressure, in which case the mmap_lock contention is a lesser concern,
|
|
compared with the LRU lock contention and the I/O congestion. So far the
|
|
only well-known case of the mmap_lock contention happens on Android, due
|
|
to Scudo [1] which allocates several thousand VMAs for merely a few
|
|
hundred MBs. The SPF and the Maple Tree have also provided their own
|
|
assessments [2][3]. However, if walking page tables does worsen the
|
|
mmap_lock contention, the kill switch can be used to disable it. In this
|
|
case the multi-gen LRU will suffer a minor performance degradation, as
|
|
shown previously.
|
|
|
|
Clearing the accessed bit in non-leaf PMD entries can also be disabled,
|
|
since this behavior was not tested on x86 varieties other than Intel and
|
|
AMD.
|
|
|
|
[1] https://source.android.com/devices/tech/debug/scudo
|
|
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
|
|
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/
|
|
|
|
Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
Acked-by: Brian Geffon <bgeffon@google.com>
|
|
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
|
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
|
Acked-by: Steven Barrett <steven@liquorix.net>
|
|
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
|
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
|
Tested-by: Donald Carr <d@chaos-reins.com>
|
|
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
|
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
|
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
|
Cc: Barry Song <baohua@kernel.org>
|
|
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Cc: Hillf Danton <hdanton@sina.com>
|
|
Cc: Jens Axboe <axboe@kernel.dk>
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
Cc: Jonathan Corbet <corbet@lwn.net>
|
|
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Cc: Matthew Wilcox <willy@infradead.org>
|
|
Cc: Mel Gorman <mgorman@suse.de>
|
|
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
|
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
|
Cc: Michal Hocko <mhocko@kernel.org>
|
|
Cc: Mike Rapoport <rppt@kernel.org>
|
|
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
|
Cc: Tejun Heo <tj@kernel.org>
|
|
Cc: Vlastimil Babka <vbabka@suse.cz>
|
|
Cc: Will Deacon <will@kernel.org>
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
---
|
|
include/linux/cgroup.h | 15 ++-
|
|
include/linux/mm_inline.h | 15 ++-
|
|
include/linux/mmzone.h | 9 ++
|
|
kernel/cgroup/cgroup-internal.h | 1 -
|
|
mm/Kconfig | 6 +
|
|
mm/vmscan.c | 228 +++++++++++++++++++++++++++++++-
|
|
6 files changed, 265 insertions(+), 9 deletions(-)
|
|
|
|
--- a/include/linux/cgroup.h
|
|
+++ b/include/linux/cgroup.h
|
|
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
|
|
css_put(&cgrp->self);
|
|
}
|
|
|
|
+extern struct mutex cgroup_mutex;
|
|
+
|
|
+static inline void cgroup_lock(void)
|
|
+{
|
|
+ mutex_lock(&cgroup_mutex);
|
|
+}
|
|
+
|
|
+static inline void cgroup_unlock(void)
|
|
+{
|
|
+ mutex_unlock(&cgroup_mutex);
|
|
+}
|
|
+
|
|
/**
|
|
* task_css_set_check - obtain a task's css_set with extra access conditions
|
|
* @task: the task to obtain css_set for
|
|
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
|
|
* as locks used during the cgroup_subsys::attach() methods.
|
|
*/
|
|
#ifdef CONFIG_PROVE_RCU
|
|
-extern struct mutex cgroup_mutex;
|
|
extern spinlock_t css_set_lock;
|
|
#define task_css_set_check(task, __c) \
|
|
rcu_dereference_check((task)->cgroups, \
|
|
@@ -708,6 +719,8 @@ struct cgroup;
|
|
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
|
|
static inline void css_get(struct cgroup_subsys_state *css) {}
|
|
static inline void css_put(struct cgroup_subsys_state *css) {}
|
|
+static inline void cgroup_lock(void) {}
|
|
+static inline void cgroup_unlock(void) {}
|
|
static inline int cgroup_attach_task_all(struct task_struct *from,
|
|
struct task_struct *t) { return 0; }
|
|
static inline int cgroupstats_build(struct cgroupstats *stats,
|
|
--- a/include/linux/mm_inline.h
|
|
+++ b/include/linux/mm_inline.h
|
|
@@ -91,10 +91,21 @@ static __always_inline enum lru_list pag
|
|
|
|
#ifdef CONFIG_LRU_GEN
|
|
|
|
+#ifdef CONFIG_LRU_GEN_ENABLED
|
|
static inline bool lru_gen_enabled(void)
|
|
{
|
|
- return true;
|
|
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
|
|
+
|
|
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
|
|
+}
|
|
+#else
|
|
+static inline bool lru_gen_enabled(void)
|
|
+{
|
|
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
|
|
+
|
|
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
|
|
}
|
|
+#endif
|
|
|
|
static inline bool lru_gen_in_fault(void)
|
|
{
|
|
@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(stru
|
|
|
|
VM_WARN_ON_ONCE_PAGE(gen != -1, page);
|
|
|
|
- if (PageUnevictable(page))
|
|
+ if (PageUnevictable(page) || !lrugen->enabled)
|
|
return false;
|
|
/*
|
|
* There are three common cases for this page:
|
|
--- a/include/linux/mmzone.h
|
|
+++ b/include/linux/mmzone.h
|
|
@@ -364,6 +364,13 @@ enum {
|
|
LRU_GEN_FILE,
|
|
};
|
|
|
|
+enum {
|
|
+ LRU_GEN_CORE,
|
|
+ LRU_GEN_MM_WALK,
|
|
+ LRU_GEN_NONLEAF_YOUNG,
|
|
+ NR_LRU_GEN_CAPS
|
|
+};
|
|
+
|
|
#define MIN_LRU_BATCH BITS_PER_LONG
|
|
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
|
|
|
|
@@ -405,6 +412,8 @@ struct lru_gen_struct {
|
|
/* can be modified without holding the LRU lock */
|
|
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
|
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
|
+ /* whether the multi-gen LRU is enabled */
|
|
+ bool enabled;
|
|
};
|
|
|
|
enum {
|
|
--- a/kernel/cgroup/cgroup-internal.h
|
|
+++ b/kernel/cgroup/cgroup-internal.h
|
|
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
|
|
#define DEFINE_CGROUP_MGCTX(name) \
|
|
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
|
|
|
-extern struct mutex cgroup_mutex;
|
|
extern spinlock_t css_set_lock;
|
|
extern struct cgroup_subsys *cgroup_subsys[];
|
|
extern struct list_head cgroup_roots;
|
|
--- a/mm/Kconfig
|
|
+++ b/mm/Kconfig
|
|
@@ -906,6 +906,12 @@ config LRU_GEN
|
|
help
|
|
A high performance LRU implementation to overcommit memory.
|
|
|
|
+config LRU_GEN_ENABLED
|
|
+ bool "Enable by default"
|
|
+ depends on LRU_GEN
|
|
+ help
|
|
+ This option enables the multi-gen LRU by default.
|
|
+
|
|
config LRU_GEN_STATS
|
|
bool "Full stats for debugging"
|
|
depends on LRU_GEN
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -52,6 +52,7 @@
|
|
#include <linux/psi.h>
|
|
#include <linux/pagewalk.h>
|
|
#include <linux/shmem_fs.h>
|
|
+#include <linux/ctype.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/div64.h>
|
|
@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pg
|
|
|
|
#ifdef CONFIG_LRU_GEN
|
|
|
|
+#ifdef CONFIG_LRU_GEN_ENABLED
|
|
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
|
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
|
|
+#else
|
|
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
|
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
|
|
+#endif
|
|
+
|
|
/******************************************************************************
|
|
* shorthand helpers
|
|
******************************************************************************/
|
|
@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t
|
|
goto next;
|
|
|
|
if (!pmd_trans_huge(pmd[i])) {
|
|
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
|
|
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
|
|
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
|
|
pmdp_test_and_clear_young(vma, addr, pmd + i);
|
|
goto next;
|
|
}
|
|
@@ -3815,10 +3825,12 @@ restart:
|
|
walk->mm_stats[MM_NONLEAF_TOTAL]++;
|
|
|
|
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
|
- if (!pmd_young(val))
|
|
- continue;
|
|
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
|
|
+ if (!pmd_young(val))
|
|
+ continue;
|
|
|
|
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
|
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
|
+ }
|
|
#endif
|
|
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
|
|
continue;
|
|
@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lr
|
|
* handful of PTEs. Spreading the work out over a period of time usually
|
|
* is less efficient, but it avoids bursty page faults.
|
|
*/
|
|
- if (!arch_has_hw_pte_young()) {
|
|
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
|
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
|
goto done;
|
|
}
|
|
@@ -4846,6 +4858,208 @@ done:
|
|
}
|
|
|
|
/******************************************************************************
|
|
+ * state change
|
|
+ ******************************************************************************/
|
|
+
|
|
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
|
+{
|
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+
|
|
+ if (lrugen->enabled) {
|
|
+ enum lru_list lru;
|
|
+
|
|
+ for_each_evictable_lru(lru) {
|
|
+ if (!list_empty(&lruvec->lists[lru]))
|
|
+ return false;
|
|
+ }
|
|
+ } else {
|
|
+ int gen, type, zone;
|
|
+
|
|
+ for_each_gen_type_zone(gen, type, zone) {
|
|
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool fill_evictable(struct lruvec *lruvec)
|
|
+{
|
|
+ enum lru_list lru;
|
|
+ int remaining = MAX_LRU_BATCH;
|
|
+
|
|
+ for_each_evictable_lru(lru) {
|
|
+ int type = is_file_lru(lru);
|
|
+ bool active = is_active_lru(lru);
|
|
+ struct list_head *head = &lruvec->lists[lru];
|
|
+
|
|
+ while (!list_empty(head)) {
|
|
+ bool success;
|
|
+ struct page *page = lru_to_page(head);
|
|
+
|
|
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
|
+ VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page);
|
|
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
|
+ VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page);
|
|
+
|
|
+ del_page_from_lru_list(page, lruvec);
|
|
+ success = lru_gen_add_page(lruvec, page, false);
|
|
+ VM_WARN_ON_ONCE(!success);
|
|
+
|
|
+ if (!--remaining)
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool drain_evictable(struct lruvec *lruvec)
|
|
+{
|
|
+ int gen, type, zone;
|
|
+ int remaining = MAX_LRU_BATCH;
|
|
+
|
|
+ for_each_gen_type_zone(gen, type, zone) {
|
|
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
|
|
+
|
|
+ while (!list_empty(head)) {
|
|
+ bool success;
|
|
+ struct page *page = lru_to_page(head);
|
|
+
|
|
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
|
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
|
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
|
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
|
|
+
|
|
+ success = lru_gen_del_page(lruvec, page, false);
|
|
+ VM_WARN_ON_ONCE(!success);
|
|
+ add_page_to_lru_list(page, lruvec);
|
|
+
|
|
+ if (!--remaining)
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void lru_gen_change_state(bool enabled)
|
|
+{
|
|
+ static DEFINE_MUTEX(state_mutex);
|
|
+
|
|
+ struct mem_cgroup *memcg;
|
|
+
|
|
+ cgroup_lock();
|
|
+ cpus_read_lock();
|
|
+ get_online_mems();
|
|
+ mutex_lock(&state_mutex);
|
|
+
|
|
+ if (enabled == lru_gen_enabled())
|
|
+ goto unlock;
|
|
+
|
|
+ if (enabled)
|
|
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
|
+ else
|
|
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
|
+
|
|
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
|
+ do {
|
|
+ int nid;
|
|
+
|
|
+ for_each_node(nid) {
|
|
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
|
+
|
|
+ if (!lruvec)
|
|
+ continue;
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
|
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
|
|
+
|
|
+ lruvec->lrugen.enabled = enabled;
|
|
+
|
|
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+ cond_resched();
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+ }
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+ }
|
|
+
|
|
+ cond_resched();
|
|
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
|
+unlock:
|
|
+ mutex_unlock(&state_mutex);
|
|
+ put_online_mems();
|
|
+ cpus_read_unlock();
|
|
+ cgroup_unlock();
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
+ * sysfs interface
|
|
+ ******************************************************************************/
|
|
+
|
|
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
|
+{
|
|
+ unsigned int caps = 0;
|
|
+
|
|
+ if (get_cap(LRU_GEN_CORE))
|
|
+ caps |= BIT(LRU_GEN_CORE);
|
|
+
|
|
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
|
|
+ caps |= BIT(LRU_GEN_MM_WALK);
|
|
+
|
|
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
|
|
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
|
|
+
|
|
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
|
|
+}
|
|
+
|
|
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
|
|
+ const char *buf, size_t len)
|
|
+{
|
|
+ int i;
|
|
+ unsigned int caps;
|
|
+
|
|
+ if (tolower(*buf) == 'n')
|
|
+ caps = 0;
|
|
+ else if (tolower(*buf) == 'y')
|
|
+ caps = -1;
|
|
+ else if (kstrtouint(buf, 0, &caps))
|
|
+ return -EINVAL;
|
|
+
|
|
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
|
|
+ bool enabled = caps & BIT(i);
|
|
+
|
|
+ if (i == LRU_GEN_CORE)
|
|
+ lru_gen_change_state(enabled);
|
|
+ else if (enabled)
|
|
+ static_branch_enable(&lru_gen_caps[i]);
|
|
+ else
|
|
+ static_branch_disable(&lru_gen_caps[i]);
|
|
+ }
|
|
+
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
|
|
+ enabled, 0644, show_enabled, store_enabled
|
|
+);
|
|
+
|
|
+static struct attribute *lru_gen_attrs[] = {
|
|
+ &lru_gen_enabled_attr.attr,
|
|
+ NULL
|
|
+};
|
|
+
|
|
+static struct attribute_group lru_gen_attr_group = {
|
|
+ .name = "lru_gen",
|
|
+ .attrs = lru_gen_attrs,
|
|
+};
|
|
+
|
|
+/******************************************************************************
|
|
* initialization
|
|
******************************************************************************/
|
|
|
|
@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *
|
|
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
|
|
lrugen->max_seq = MIN_NR_GENS + 1;
|
|
+ lrugen->enabled = lru_gen_enabled();
|
|
|
|
for_each_gen_type_zone(gen, type, zone)
|
|
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
|
@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void)
|
|
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
|
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
|
|
|
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
|
|
+ pr_err("lru_gen: failed to create sysfs group\n");
|
|
+
|
|
return 0;
|
|
};
|
|
late_initcall(init_lru_gen);
|