mirror of
git://git.openwrt.org/openwrt/openwrt.git
synced 2025-01-01 20:02:29 +00:00
kernel: Backport MGLRU patch from 6.4
This fixes a bug where the reclaim path could occasionally have long tail latency. Signed-off-by: Kazuki Hashimoto <kazukih0205@gmail.com>
This commit is contained in:
parent
b28b8ed1f4
commit
ae8db3941c
@ -0,0 +1,266 @@
|
||||
From 087ed25eaf5a78a678508e893f80addab9b1c103 Mon Sep 17 00:00:00 2001
|
||||
From: Kalesh Singh <kaleshsingh@google.com>
|
||||
Date: Thu, 13 Apr 2023 14:43:26 -0700
|
||||
Subject: [PATCH] mm: Multi-gen LRU: remove wait_event_killable()
|
||||
|
||||
Android 14 and later default to MGLRU [1] and field telemetry showed
|
||||
occasional long tail latency (>100ms) in the reclaim path.
|
||||
|
||||
Tracing revealed priority inversion in the reclaim path. In
|
||||
try_to_inc_max_seq(), when high priority tasks were blocked on
|
||||
wait_event_killable(), the preemption of the low priority task to call
|
||||
wake_up_all() caused those high priority tasks to wait longer than
|
||||
necessary. In general, this problem is not different from others of its
|
||||
kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU
|
||||
because it introduced the new wait queue lruvec->mm_state.wait.
|
||||
|
||||
The purpose of this new wait queue is to avoid the thundering herd
|
||||
problem. If many direct reclaimers rush into try_to_inc_max_seq(), only
|
||||
one can succeed, i.e., the one to wake up the rest, and the rest who
|
||||
failed might cause premature OOM kills if they do not wait. So far there
|
||||
is no evidence supporting this scenario, based on how often the wait has
|
||||
been hit. And this begs the question how useful the wait queue is in
|
||||
practice.
|
||||
|
||||
Based on Minchan's recommendation, which is in line with his commit
|
||||
6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
|
||||
rest of the MGLRU code which also uses trylock when possible, remove the
|
||||
wait queue.
|
||||
|
||||
[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf
|
||||
|
||||
Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
|
||||
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
|
||||
Change-Id: I911f3968fd1adb25171279cc5b6f48ccb7efc8de
|
||||
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
|
||||
Suggested-by: Minchan Kim <minchan@kernel.org>
|
||||
Reported-by: Wei Wang <wvw@google.com>
|
||||
Acked-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Minchan Kim <minchan@kernel.org>
|
||||
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Cc: Suleiman Souhlal <suleiman@google.com>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mmzone.h | 8 +--
|
||||
mm/vmscan.c | 111 +++++++++++++++--------------------------
|
||||
2 files changed, 42 insertions(+), 77 deletions(-)
|
||||
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -452,18 +452,14 @@ enum {
|
||||
struct lru_gen_mm_state {
|
||||
/* set to max_seq after each iteration */
|
||||
unsigned long seq;
|
||||
- /* where the current iteration continues (inclusive) */
|
||||
+ /* where the current iteration continues after */
|
||||
struct list_head *head;
|
||||
- /* where the last iteration ended (exclusive) */
|
||||
+ /* where the last iteration ended before */
|
||||
struct list_head *tail;
|
||||
- /* to wait for the last page table walker to finish */
|
||||
- struct wait_queue_head wait;
|
||||
/* Bloom filters flip after each iteration */
|
||||
unsigned long *filters[NR_BLOOM_FILTERS];
|
||||
/* the mm stats for debugging */
|
||||
unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
|
||||
- /* the number of concurrent page table walkers */
|
||||
- int nr_walkers;
|
||||
};
|
||||
|
||||
struct lru_gen_mm_walk {
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -2999,18 +2999,13 @@ void lru_gen_del_mm(struct mm_struct *mm
|
||||
if (!lruvec)
|
||||
continue;
|
||||
|
||||
- /* where the last iteration ended (exclusive) */
|
||||
+ /* where the current iteration continues after */
|
||||
+ if (lruvec->mm_state.head == &mm->lru_gen.list)
|
||||
+ lruvec->mm_state.head = lruvec->mm_state.head->prev;
|
||||
+
|
||||
+ /* where the last iteration ended before */
|
||||
if (lruvec->mm_state.tail == &mm->lru_gen.list)
|
||||
lruvec->mm_state.tail = lruvec->mm_state.tail->next;
|
||||
-
|
||||
- /* where the current iteration continues (inclusive) */
|
||||
- if (lruvec->mm_state.head != &mm->lru_gen.list)
|
||||
- continue;
|
||||
-
|
||||
- lruvec->mm_state.head = lruvec->mm_state.head->next;
|
||||
- /* the deletion ends the current iteration */
|
||||
- if (lruvec->mm_state.head == &mm_list->fifo)
|
||||
- WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
|
||||
}
|
||||
|
||||
list_del_init(&mm->lru_gen.list);
|
||||
@@ -3194,68 +3189,54 @@ static bool iterate_mm_list(struct lruve
|
||||
struct mm_struct **iter)
|
||||
{
|
||||
bool first = false;
|
||||
- bool last = true;
|
||||
+ bool last = false;
|
||||
struct mm_struct *mm = NULL;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
||||
struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
|
||||
|
||||
/*
|
||||
- * There are four interesting cases for this page table walker:
|
||||
- * 1. It tries to start a new iteration of mm_list with a stale max_seq;
|
||||
- * there is nothing left to do.
|
||||
- * 2. It's the first of the current generation, and it needs to reset
|
||||
- * the Bloom filter for the next generation.
|
||||
- * 3. It reaches the end of mm_list, and it needs to increment
|
||||
- * mm_state->seq; the iteration is done.
|
||||
- * 4. It's the last of the current generation, and it needs to reset the
|
||||
- * mm stats counters for the next generation.
|
||||
+ * mm_state->seq is incremented after each iteration of mm_list. There
|
||||
+ * are three interesting cases for this page table walker:
|
||||
+ * 1. It tries to start a new iteration with a stale max_seq: there is
|
||||
+ * nothing left to do.
|
||||
+ * 2. It started the next iteration: it needs to reset the Bloom filter
|
||||
+ * so that a fresh set of PTE tables can be recorded.
|
||||
+ * 3. It ended the current iteration: it needs to reset the mm stats
|
||||
+ * counters and tell its caller to increment max_seq.
|
||||
*/
|
||||
spin_lock(&mm_list->lock);
|
||||
|
||||
VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
|
||||
- VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
|
||||
- VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
|
||||
|
||||
- if (walk->max_seq <= mm_state->seq) {
|
||||
- if (!*iter)
|
||||
- last = false;
|
||||
+ if (walk->max_seq <= mm_state->seq)
|
||||
goto done;
|
||||
- }
|
||||
|
||||
- if (!mm_state->nr_walkers) {
|
||||
- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
|
||||
+ if (!mm_state->head)
|
||||
+ mm_state->head = &mm_list->fifo;
|
||||
|
||||
- mm_state->head = mm_list->fifo.next;
|
||||
+ if (mm_state->head == &mm_list->fifo)
|
||||
first = true;
|
||||
- }
|
||||
-
|
||||
- while (!mm && mm_state->head != &mm_list->fifo) {
|
||||
- mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
|
||||
|
||||
+ do {
|
||||
mm_state->head = mm_state->head->next;
|
||||
+ if (mm_state->head == &mm_list->fifo) {
|
||||
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
||||
+ last = true;
|
||||
+ break;
|
||||
+ }
|
||||
|
||||
/* force scan for those added after the last iteration */
|
||||
- if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
|
||||
- mm_state->tail = mm_state->head;
|
||||
+ if (!mm_state->tail || mm_state->tail == mm_state->head) {
|
||||
+ mm_state->tail = mm_state->head->next;
|
||||
walk->force_scan = true;
|
||||
}
|
||||
|
||||
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
|
||||
if (should_skip_mm(mm, walk))
|
||||
mm = NULL;
|
||||
- }
|
||||
-
|
||||
- if (mm_state->head == &mm_list->fifo)
|
||||
- WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
||||
+ } while (!mm);
|
||||
done:
|
||||
- if (*iter && !mm)
|
||||
- mm_state->nr_walkers--;
|
||||
- if (!*iter && mm)
|
||||
- mm_state->nr_walkers++;
|
||||
-
|
||||
- if (mm_state->nr_walkers)
|
||||
- last = false;
|
||||
-
|
||||
if (*iter || last)
|
||||
reset_mm_stats(lruvec, walk, last);
|
||||
|
||||
@@ -3283,9 +3264,9 @@ static bool iterate_mm_list_nowalk(struc
|
||||
|
||||
VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
|
||||
|
||||
- if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
|
||||
- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
|
||||
-
|
||||
+ if (max_seq > mm_state->seq) {
|
||||
+ mm_state->head = NULL;
|
||||
+ mm_state->tail = NULL;
|
||||
WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
||||
reset_mm_stats(lruvec, NULL, true);
|
||||
success = true;
|
||||
@@ -3894,10 +3875,6 @@ restart:
|
||||
|
||||
walk_pmd_range(&val, addr, next, args);
|
||||
|
||||
- /* a racy check to curtail the waiting time */
|
||||
- if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
|
||||
- return 1;
|
||||
-
|
||||
if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
|
||||
end = (addr | ~PUD_MASK) + 1;
|
||||
goto done;
|
||||
@@ -3930,8 +3907,14 @@ static void walk_mm(struct lruvec *lruve
|
||||
walk->next_addr = FIRST_USER_ADDRESS;
|
||||
|
||||
do {
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+
|
||||
err = -EBUSY;
|
||||
|
||||
+ /* another thread might have called inc_max_seq() */
|
||||
+ if (walk->max_seq != max_seq)
|
||||
+ break;
|
||||
+
|
||||
/* page_update_gen() requires stable page_memcg() */
|
||||
if (!mem_cgroup_trylock_pages(memcg))
|
||||
break;
|
||||
@@ -4164,25 +4147,12 @@ static bool try_to_inc_max_seq(struct lr
|
||||
success = iterate_mm_list(lruvec, walk, &mm);
|
||||
if (mm)
|
||||
walk_mm(lruvec, mm, walk);
|
||||
-
|
||||
- cond_resched();
|
||||
} while (mm);
|
||||
done:
|
||||
- if (!success) {
|
||||
- if (sc->priority <= DEF_PRIORITY - 2)
|
||||
- wait_event_killable(lruvec->mm_state.wait,
|
||||
- max_seq < READ_ONCE(lrugen->max_seq));
|
||||
- return false;
|
||||
- }
|
||||
+ if (success)
|
||||
+ inc_max_seq(lruvec, can_swap, force_scan);
|
||||
|
||||
- VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
|
||||
-
|
||||
- inc_max_seq(lruvec, can_swap, force_scan);
|
||||
- /* either this sees any waiters or they will see updated max_seq */
|
||||
- if (wq_has_sleeper(&lruvec->mm_state.wait))
|
||||
- wake_up_all(&lruvec->mm_state.wait);
|
||||
-
|
||||
- return true;
|
||||
+ return success;
|
||||
}
|
||||
|
||||
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
|
||||
@@ -5746,7 +5716,6 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);
|
||||
|
||||
lruvec->mm_state.seq = MIN_NR_GENS;
|
||||
- init_waitqueue_head(&lruvec->mm_state.wait);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
@ -0,0 +1,280 @@
|
||||
From 418038c22452df38cde519cc8c662bb15139764a Mon Sep 17 00:00:00 2001
|
||||
From: Kalesh Singh <kaleshsingh@google.com>
|
||||
Date: Thu, 13 Apr 2023 14:43:26 -0700
|
||||
Subject: [PATCH 19/19] mm: Multi-gen LRU: remove wait_event_killable()
|
||||
|
||||
Android 14 and later default to MGLRU [1] and field telemetry showed
|
||||
occasional long tail latency (>100ms) in the reclaim path.
|
||||
|
||||
Tracing revealed priority inversion in the reclaim path. In
|
||||
try_to_inc_max_seq(), when high priority tasks were blocked on
|
||||
wait_event_killable(), the preemption of the low priority task to call
|
||||
wake_up_all() caused those high priority tasks to wait longer than
|
||||
necessary. In general, this problem is not different from others of its
|
||||
kind, e.g., one caused by mutex_lock(). However, it is specific to MGLRU
|
||||
because it introduced the new wait queue lruvec->mm_state.wait.
|
||||
|
||||
The purpose of this new wait queue is to avoid the thundering herd
|
||||
problem. If many direct reclaimers rush into try_to_inc_max_seq(), only
|
||||
one can succeed, i.e., the one to wake up the rest, and the rest who
|
||||
failed might cause premature OOM kills if they do not wait. So far there
|
||||
is no evidence supporting this scenario, based on how often the wait has
|
||||
been hit. And this begs the question how useful the wait queue is in
|
||||
practice.
|
||||
|
||||
Based on Minchan's recommendation, which is in line with his commit
|
||||
6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
|
||||
rest of the MGLRU code which also uses trylock when possible, remove the
|
||||
wait queue.
|
||||
|
||||
[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf
|
||||
|
||||
Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
|
||||
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
|
||||
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
|
||||
Suggested-by: Minchan Kim <minchan@kernel.org>
|
||||
Reported-by: Wei Wang <wvw@google.com>
|
||||
Acked-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Minchan Kim <minchan@kernel.org>
|
||||
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Cc: Suleiman Souhlal <suleiman@google.com>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mmzone.h | 8 +--
|
||||
mm/vmscan.c | 112 +++++++++++++++--------------------------
|
||||
2 files changed, 42 insertions(+), 78 deletions(-)
|
||||
|
||||
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
|
||||
index 403c7461e7a70..d62a5accf1be4 100644
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -453,18 +453,14 @@ enum {
|
||||
struct lru_gen_mm_state {
|
||||
/* set to max_seq after each iteration */
|
||||
unsigned long seq;
|
||||
- /* where the current iteration continues (inclusive) */
|
||||
+ /* where the current iteration continues after */
|
||||
struct list_head *head;
|
||||
- /* where the last iteration ended (exclusive) */
|
||||
+ /* where the last iteration ended before */
|
||||
struct list_head *tail;
|
||||
- /* to wait for the last page table walker to finish */
|
||||
- struct wait_queue_head wait;
|
||||
/* Bloom filters flip after each iteration */
|
||||
unsigned long *filters[NR_BLOOM_FILTERS];
|
||||
/* the mm stats for debugging */
|
||||
unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
|
||||
- /* the number of concurrent page table walkers */
|
||||
- int nr_walkers;
|
||||
};
|
||||
|
||||
struct lru_gen_mm_walk {
|
||||
diff --git a/mm/vmscan.c b/mm/vmscan.c
|
||||
index f6ce7a1fd78a3..851758303dbf4 100644
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -3371,18 +3371,13 @@ void lru_gen_del_mm(struct mm_struct *mm)
|
||||
if (!lruvec)
|
||||
continue;
|
||||
|
||||
- /* where the last iteration ended (exclusive) */
|
||||
+ /* where the current iteration continues after */
|
||||
+ if (lruvec->mm_state.head == &mm->lru_gen.list)
|
||||
+ lruvec->mm_state.head = lruvec->mm_state.head->prev;
|
||||
+
|
||||
+ /* where the last iteration ended before */
|
||||
if (lruvec->mm_state.tail == &mm->lru_gen.list)
|
||||
lruvec->mm_state.tail = lruvec->mm_state.tail->next;
|
||||
-
|
||||
- /* where the current iteration continues (inclusive) */
|
||||
- if (lruvec->mm_state.head != &mm->lru_gen.list)
|
||||
- continue;
|
||||
-
|
||||
- lruvec->mm_state.head = lruvec->mm_state.head->next;
|
||||
- /* the deletion ends the current iteration */
|
||||
- if (lruvec->mm_state.head == &mm_list->fifo)
|
||||
- WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
|
||||
}
|
||||
|
||||
list_del_init(&mm->lru_gen.list);
|
||||
@@ -3478,68 +3473,54 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
|
||||
struct mm_struct **iter)
|
||||
{
|
||||
bool first = false;
|
||||
- bool last = true;
|
||||
+ bool last = false;
|
||||
struct mm_struct *mm = NULL;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
||||
struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
|
||||
|
||||
/*
|
||||
- * There are four interesting cases for this page table walker:
|
||||
- * 1. It tries to start a new iteration of mm_list with a stale max_seq;
|
||||
- * there is nothing left to do.
|
||||
- * 2. It's the first of the current generation, and it needs to reset
|
||||
- * the Bloom filter for the next generation.
|
||||
- * 3. It reaches the end of mm_list, and it needs to increment
|
||||
- * mm_state->seq; the iteration is done.
|
||||
- * 4. It's the last of the current generation, and it needs to reset the
|
||||
- * mm stats counters for the next generation.
|
||||
+ * mm_state->seq is incremented after each iteration of mm_list. There
|
||||
+ * are three interesting cases for this page table walker:
|
||||
+ * 1. It tries to start a new iteration with a stale max_seq: there is
|
||||
+ * nothing left to do.
|
||||
+ * 2. It started the next iteration: it needs to reset the Bloom filter
|
||||
+ * so that a fresh set of PTE tables can be recorded.
|
||||
+ * 3. It ended the current iteration: it needs to reset the mm stats
|
||||
+ * counters and tell its caller to increment max_seq.
|
||||
*/
|
||||
spin_lock(&mm_list->lock);
|
||||
|
||||
VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
|
||||
- VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
|
||||
- VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
|
||||
|
||||
- if (walk->max_seq <= mm_state->seq) {
|
||||
- if (!*iter)
|
||||
- last = false;
|
||||
+ if (walk->max_seq <= mm_state->seq)
|
||||
goto done;
|
||||
- }
|
||||
|
||||
- if (!mm_state->nr_walkers) {
|
||||
- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
|
||||
+ if (!mm_state->head)
|
||||
+ mm_state->head = &mm_list->fifo;
|
||||
|
||||
- mm_state->head = mm_list->fifo.next;
|
||||
+ if (mm_state->head == &mm_list->fifo)
|
||||
first = true;
|
||||
- }
|
||||
-
|
||||
- while (!mm && mm_state->head != &mm_list->fifo) {
|
||||
- mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
|
||||
|
||||
+ do {
|
||||
mm_state->head = mm_state->head->next;
|
||||
+ if (mm_state->head == &mm_list->fifo) {
|
||||
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
||||
+ last = true;
|
||||
+ break;
|
||||
+ }
|
||||
|
||||
/* force scan for those added after the last iteration */
|
||||
- if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
|
||||
- mm_state->tail = mm_state->head;
|
||||
+ if (!mm_state->tail || mm_state->tail == mm_state->head) {
|
||||
+ mm_state->tail = mm_state->head->next;
|
||||
walk->force_scan = true;
|
||||
}
|
||||
|
||||
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
|
||||
if (should_skip_mm(mm, walk))
|
||||
mm = NULL;
|
||||
- }
|
||||
-
|
||||
- if (mm_state->head == &mm_list->fifo)
|
||||
- WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
||||
+ } while (!mm);
|
||||
done:
|
||||
- if (*iter && !mm)
|
||||
- mm_state->nr_walkers--;
|
||||
- if (!*iter && mm)
|
||||
- mm_state->nr_walkers++;
|
||||
-
|
||||
- if (mm_state->nr_walkers)
|
||||
- last = false;
|
||||
-
|
||||
if (*iter || last)
|
||||
reset_mm_stats(lruvec, walk, last);
|
||||
|
||||
@@ -3567,9 +3548,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
|
||||
|
||||
VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
|
||||
|
||||
- if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
|
||||
- VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
|
||||
-
|
||||
+ if (max_seq > mm_state->seq) {
|
||||
+ mm_state->head = NULL;
|
||||
+ mm_state->tail = NULL;
|
||||
WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
||||
reset_mm_stats(lruvec, NULL, true);
|
||||
success = true;
|
||||
@@ -4172,10 +4153,6 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
|
||||
|
||||
walk_pmd_range(&val, addr, next, args);
|
||||
|
||||
- /* a racy check to curtail the waiting time */
|
||||
- if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
|
||||
- return 1;
|
||||
-
|
||||
if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
|
||||
end = (addr | ~PUD_MASK) + 1;
|
||||
goto done;
|
||||
@@ -4208,8 +4185,14 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
|
||||
walk->next_addr = FIRST_USER_ADDRESS;
|
||||
|
||||
do {
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+
|
||||
err = -EBUSY;
|
||||
|
||||
+ /* another thread might have called inc_max_seq() */
|
||||
+ if (walk->max_seq != max_seq)
|
||||
+ break;
|
||||
+
|
||||
/* folio_update_gen() requires stable folio_memcg() */
|
||||
if (!mem_cgroup_trylock_pages(memcg))
|
||||
break;
|
||||
@@ -4442,25 +4425,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
|
||||
success = iterate_mm_list(lruvec, walk, &mm);
|
||||
if (mm)
|
||||
walk_mm(lruvec, mm, walk);
|
||||
-
|
||||
- cond_resched();
|
||||
} while (mm);
|
||||
done:
|
||||
- if (!success) {
|
||||
- if (sc->priority <= DEF_PRIORITY - 2)
|
||||
- wait_event_killable(lruvec->mm_state.wait,
|
||||
- max_seq < READ_ONCE(lrugen->max_seq));
|
||||
- return false;
|
||||
- }
|
||||
+ if (success)
|
||||
+ inc_max_seq(lruvec, can_swap, force_scan);
|
||||
|
||||
- VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
|
||||
-
|
||||
- inc_max_seq(lruvec, can_swap, force_scan);
|
||||
- /* either this sees any waiters or they will see updated max_seq */
|
||||
- if (wq_has_sleeper(&lruvec->mm_state.wait))
|
||||
- wake_up_all(&lruvec->mm_state.wait);
|
||||
-
|
||||
- return true;
|
||||
+ return success;
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
@@ -6105,7 +6075,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||
INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
|
||||
|
||||
lruvec->mm_state.seq = MIN_NR_GENS;
|
||||
- init_waitqueue_head(&lruvec->mm_state.wait);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
@@ -6138,7 +6107,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
||||
for_each_node(nid) {
|
||||
struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
|
||||
- VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers);
|
||||
VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
|
||||
sizeof(lruvec->lrugen.nr_pages)));
|
||||
|
||||
--
|
||||
2.40.1
|
||||
|
Loading…
Reference in New Issue
Block a user