Date: Sat, 13 Mar 2021 00:57:43 -0700
In-Reply-To: <20210313075747.3781593-1-yuzhao@google.com>
Message-Id: <20210313075747.3781593-11-yuzhao@google.com>
Mime-Version: 1.0
References: <20210313075747.3781593-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.0.rc2.261.g7f71774620-goog
Subject: [PATCH v1 10/14] mm: multigenerational lru: core
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alex.shi@linux.alibaba.com>,
        Andrew Morton <akpm@linux-foundation.org>,
        Dave Hansen <dave.hansen@linux.intel.com>,
        Hillf Danton <hdanton@sina.com>,
        Johannes Weiner <hannes@cmpxchg.org>,
        Joonsoo Kim <iamjoonsoo.kim@lge.com>,
        Matthew Wilcox <willy@infradead.org>,
        Mel Gorman <mgorman@suse.de>, Michal Hocko <mhocko@suse.com>,
        Roman Gushchin <guro@fb.com>, Vlastimil Babka <vbabka@suse.cz>,
        Wei Yang <richard.weiyang@linux.alibaba.com>,
        Yang Shi <shy828301@gmail.com>,
        Ying Huang <ying.huang@intel.com>,
        linux-kernel@vger.kernel.org, page-reclaim@google.com,
        Yu Zhao <yuzhao@google.com>
Content-Type: text/plain; charset="UTF-8"
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Archived-At: <https://lore.kernel.org/lkml/20210313075747.3781593-11-yuzhao@google.com/>
List-Archive: <https://lore.kernel.org/lkml/>
List-Post: <mailto:linux-kernel@vger.kernel.org>

Evictable pages are divided into multiple generations for each lruvec.
The youngest generation number is stored in max_seq for both anon and
file types as they are aged on an equal footing. The oldest generation
numbers are stored in min_seq[2] separately for anon and file types as
clean file pages can be evicted regardless of may_swap or
may_writepage. Generation numbers are truncated to
ilog2(MAX_NR_GENS)+1 bits in order to fit into page->flags. A sliding
window technique is used to prevent truncated generation numbers from
overlapping. Each truncated generation number is an index into
lruvec->evictable.lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
Evictable pages are added to the per-zone lists indexed by max_seq or
min_seq[2] (modulo MAX_NR_GENS), depending on whether they are being
faulted in or read ahead.

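For illustration only (not part of this patch), a minimal user-space
model of the truncation and the sliding-window invariant; MAX_NR_GENS
and the seq values below are made up for the example:

#include <assert.h>
#include <stdio.h>

#define MAX_NR_GENS 4	/* stands in for CONFIG_NR_LRU_GENS */

/* mirrors lru_gen_from_seq(): raw seq -> cyclic index into the lists */
static int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

int main(void)
{
	unsigned long min_seq = 12, max_seq = 14;
	unsigned long seq;

	/* sliding window: live generations satisfy
	 * max_seq - min_seq < MAX_NR_GENS, so their truncated numbers
	 * never collide */
	assert(max_seq - min_seq < MAX_NR_GENS);

	for (seq = min_seq; seq <= max_seq; seq++)
		printf("seq %lu -> gen %d\n", seq, lru_gen_from_seq(seq));

	return 0;
}
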
The workflow comprises two conceptually independent functions: the
aging and the eviction. The aging produces young generations. Given an
lruvec, the aging walks the mm_struct list associated with this
lruvec, i.e., memcg->mm_list or global_mm_list, to scan page tables
for referenced pages. Upon finding one, the aging updates its
generation number to max_seq. After each round of scanning, the aging
increments max_seq. Since scans are differential with respect to
referenced pages, the cost is roughly proportional to their number.

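For illustration only (not part of this patch), a minimal user-space
model of what one round of aging does to referenced pages; the struct
and the values below are made up for the example:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NR_GENS 4	/* stands in for CONFIG_NR_LRU_GENS */

struct page_model { bool referenced; unsigned long seq; };

/* promote referenced pages to the youngest generation, then create a
 * new youngest generation (cf. inc_max_seq() in this patch) */
static void age(struct page_model *pages, int n, unsigned long *max_seq)
{
	int i;

	for (i = 0; i < n; i++) {
		if (pages[i].referenced) {
			pages[i].seq = *max_seq;
			pages[i].referenced = false;
		}
	}
	(*max_seq)++;
}

int main(void)
{
	unsigned long max_seq = 3;
	struct page_model pages[3] = { { true, 1 }, { false, 2 }, { true, 2 } };
	int i;

	age(pages, 3, &max_seq);

	for (i = 0; i < 3; i++)
		printf("page %d: seq %lu (gen %lu)\n", i,
		       pages[i].seq, pages[i].seq % MAX_NR_GENS);

	return 0;
}
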
The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either entry of
min_seq[2]. It selects a type based on the values of min_seq[2] and
swappiness. During a scan, the eviction either sorts or isolates a
page, depending on whether the aging has updated its generation number
or not. When it finds that all the per-zone lists of the selected type
are empty, the eviction increments the min_seq[2] entry indexed by
that type. The eviction triggers the aging when both entries of
min_seq[2] reach max_seq-1, assuming both anon and file types are
reclaimable.

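For illustration only (not part of this patch), a minimal user-space
model of how the eviction picks anon (0) or file (1), following the
selection expression in isolate_lru_gen_pages() below; the numbers in
main() are made up for the example:

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

/* prefer the type whose oldest generation is older; break ties by
 * comparing isolated counts weighted by swappiness */
static int pick_type(int swappiness, const unsigned long min_seq[2],
		     const unsigned long isolated[2])
{
	return !swappiness || min_seq[0] > min_seq[1] ||
	       (min_seq[0] == min_seq[1] &&
		max_ul(isolated[0], 1) * (200 - swappiness) >
		max_ul(isolated[1], 1) * (swappiness - 1));
}

int main(void)
{
	unsigned long min_seq[2] = { 5, 5 }, isolated[2] = { 100, 10 };

	/* more anon than file already isolated, so file is chosen */
	printf("type to scan: %s\n",
	       pick_type(60, min_seq, isolated) ? "file" : "anon");

	return 0;
}
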
Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/mm.h                |    1 +
 include/linux/mm_inline.h         |  194 +++++
 include/linux/mmzone.h            |   54 ++
 include/linux/page-flags-layout.h |   20 +-
 mm/huge_memory.c                  |    3 +-
 mm/mm_init.c                      |   13 +-
 mm/mmzone.c                       |    2 +
 mm/swap.c                         |    4 +
 mm/swapfile.c                     |    4 +
 mm/vmscan.c                       | 1255 +++++++++++++++++++++++++
 10 files changed, 1541 insertions(+), 9 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
|
|
index 77e64e3eac80..ac57ea124fb8 100644
|
|
--- a/include/linux/mm.h
|
|
+++ b/include/linux/mm.h
|
|
@@ -1070,6 +1070,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
|
|
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
|
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
|
|
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
|
|
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
|
|
|
|
/*
|
|
* Define the bit shifts to access each section. For non-existent
|
|
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
|
|
index 355ea1ee32bd..2d306cab36bc 100644
|
|
--- a/include/linux/mm_inline.h
|
|
+++ b/include/linux/mm_inline.h
|
|
@@ -79,11 +79,199 @@ static __always_inline enum lru_list page_lru(struct page *page)
|
|
return lru;
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+
|
|
+#ifdef CONFIG_LRU_GEN_ENABLED
|
|
+DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
|
|
+#define lru_gen_enabled() static_branch_likely(&lru_gen_static_key)
|
|
+#else
|
|
+DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
|
|
+#define lru_gen_enabled() static_branch_unlikely(&lru_gen_static_key)
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * Raw generation numbers (seq) from struct lru_gen are in unsigned long and
|
|
+ * therefore (virtually) monotonic; truncated generation numbers (gen) occupy
|
|
+ * at most ilog2(MAX_NR_GENS)+1 bits in page flags and therefore are cyclic.
|
|
+ */
|
|
+static inline int lru_gen_from_seq(unsigned long seq)
|
|
+{
|
|
+ return seq % MAX_NR_GENS;
|
|
+}
|
|
+
|
|
+/* The youngest and the second youngest generations are considered active. */
|
|
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
|
+{
|
|
+ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq);
|
|
+
|
|
+ VM_BUG_ON(!max_seq);
|
|
+ VM_BUG_ON(gen >= MAX_NR_GENS);
|
|
+
|
|
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
|
|
+}
|
|
+
|
|
+/* Returns -1 when multigenerational lru is disabled or page is isolated. */
|
|
+static inline int page_lru_gen(struct page *page)
|
|
+{
|
|
+ return ((READ_ONCE(page->flags) & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+}
|
|
+
|
|
+/* Update multigenerational lru sizes in addition to active/inactive lru sizes. */
|
|
+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
|
|
+ int old_gen, int new_gen)
|
|
+{
|
|
+ int file = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+ int delta = thp_nr_pages(page);
|
|
+ enum lru_list lru = LRU_FILE * file;
|
|
+
|
|
+ lockdep_assert_held(&lruvec->lru_lock);
|
|
+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
|
+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
|
+ VM_BUG_ON(old_gen == -1 && new_gen == -1);
|
|
+
|
|
+ if (old_gen >= 0)
|
|
+ WRITE_ONCE(lruvec->evictable.sizes[old_gen][file][zone],
|
|
+ lruvec->evictable.sizes[old_gen][file][zone] - delta);
|
|
+ if (new_gen >= 0)
|
|
+ WRITE_ONCE(lruvec->evictable.sizes[new_gen][file][zone],
|
|
+ lruvec->evictable.sizes[new_gen][file][zone] + delta);
|
|
+
|
|
+ if (old_gen < 0) {
|
|
+ if (lru_gen_is_active(lruvec, new_gen))
|
|
+ lru += LRU_ACTIVE;
|
|
+ update_lru_size(lruvec, lru, zone, delta);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (new_gen < 0) {
|
|
+ if (lru_gen_is_active(lruvec, old_gen))
|
|
+ lru += LRU_ACTIVE;
|
|
+ update_lru_size(lruvec, lru, zone, -delta);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
|
|
+ update_lru_size(lruvec, lru, zone, -delta);
|
|
+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
|
|
+ }
|
|
+
|
|
+ /* can't deactivate a page without deleting it first */
|
|
+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
|
|
+}
|
|
+
|
|
+/* Add a page to a multigenerational lru list. Returns true on success. */
|
|
+static inline bool page_set_lru_gen(struct page *page, struct lruvec *lruvec, bool front)
|
|
+{
|
|
+ int gen;
|
|
+ unsigned long old_flags, new_flags;
|
|
+ int file = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+
|
|
+ if (PageUnevictable(page) || !lruvec->evictable.enabled[file])
|
|
+ return false;
|
|
+ /*
|
|
+ * If a page is being faulted in, mark it as the youngest generation.
|
|
+ * try_walk_mm_list() may look at the size of the youngest generation
|
|
+ * to determine if a page table walk is needed.
|
|
+ *
|
|
+ * If an unmapped page is being activated, e.g., mark_page_accessed(),
|
|
+ * mark it as the second youngest generation so it won't affect
|
|
+ * try_walk_mm_list().
|
|
+ *
|
|
+ * If a page is being evicted, i.e., waiting for writeback, mark it
|
|
+ * as the second oldest generation so it won't be scanned again
|
|
+ * immediately. And if there are more than three generations, it won't
|
|
+ * be counted as active either.
|
|
+ *
|
|
+ * If a page is being deactivated, rotated by writeback or allocated
|
|
+ * by readahead, mark it as the oldest generation so it will be evicted
|
|
+ * first.
|
|
+ */
|
|
+ if (PageActive(page) && page_mapped(page))
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.max_seq);
|
|
+ else if (PageActive(page))
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.max_seq - 1);
|
|
+ else if (PageReclaim(page))
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.min_seq[file] + 1);
|
|
+ else
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.min_seq[file]);
|
|
+
|
|
+ do {
|
|
+ old_flags = READ_ONCE(page->flags);
|
|
+ VM_BUG_ON_PAGE(old_flags & LRU_GEN_MASK, page);
|
|
+
|
|
+ new_flags = (old_flags & ~(LRU_GEN_MASK | BIT(PG_active) | BIT(PG_workingset))) |
|
|
+ ((gen + 1UL) << LRU_GEN_PGOFF);
|
|
+ /* mark page as workingset if active */
|
|
+ if (PageActive(page))
|
|
+ new_flags |= BIT(PG_workingset);
|
|
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
+
|
|
+ lru_gen_update_size(page, lruvec, -1, gen);
|
|
+ if (front)
|
|
+ list_add(&page->lru, &lruvec->evictable.lists[gen][file][zone]);
|
|
+ else
|
|
+ list_add_tail(&page->lru, &lruvec->evictable.lists[gen][file][zone]);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Delete a page from a multigenerational lru list. Returns true on success. */
|
|
+static inline bool page_clear_lru_gen(struct page *page, struct lruvec *lruvec)
|
|
+{
|
|
+ int gen;
|
|
+ unsigned long old_flags, new_flags;
|
|
+
|
|
+ do {
|
|
+ old_flags = READ_ONCE(page->flags);
|
|
+ if (!(old_flags & LRU_GEN_MASK))
|
|
+ return false;
|
|
+
|
|
+ VM_BUG_ON_PAGE(PageActive(page), page);
|
|
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
|
+
|
|
+ gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+
|
|
+ new_flags = old_flags & ~LRU_GEN_MASK;
|
|
+ /* mark page active accordingly */
|
|
+ if (lru_gen_is_active(lruvec, gen))
|
|
+ new_flags |= BIT(PG_active);
|
|
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
+
|
|
+ lru_gen_update_size(page, lruvec, gen, -1);
|
|
+ list_del(&page->lru);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#else /* CONFIG_LRU_GEN */
|
|
+
|
|
+static inline bool lru_gen_enabled(void)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool page_set_lru_gen(struct page *page, struct lruvec *lruvec, bool front)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool page_clear_lru_gen(struct page *page, struct lruvec *lruvec)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
static __always_inline void add_page_to_lru_list(struct page *page,
|
|
struct lruvec *lruvec)
|
|
{
|
|
enum lru_list lru = page_lru(page);
|
|
|
|
+ if (page_set_lru_gen(page, lruvec, true))
|
|
+ return;
|
|
+
|
|
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
|
list_add(&page->lru, &lruvec->lists[lru]);
|
|
}
|
|
@@ -93,6 +281,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
|
|
{
|
|
enum lru_list lru = page_lru(page);
|
|
|
|
+ if (page_set_lru_gen(page, lruvec, false))
|
|
+ return;
|
|
+
|
|
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
|
list_add_tail(&page->lru, &lruvec->lists[lru]);
|
|
}
|
|
@@ -100,6 +291,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
|
|
static __always_inline void del_page_from_lru_list(struct page *page,
|
|
struct lruvec *lruvec)
|
|
{
|
|
+ if (page_clear_lru_gen(page, lruvec))
|
|
+ return;
|
|
+
|
|
list_del(&page->lru);
|
|
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
|
|
-thp_nr_pages(page));
|
|
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
|
|
index a99a1050565a..173083bb846e 100644
|
|
--- a/include/linux/mmzone.h
|
|
+++ b/include/linux/mmzone.h
|
|
@@ -291,6 +291,56 @@ enum lruvec_flags {
|
|
*/
|
|
};
|
|
|
|
+struct lruvec;
|
|
+
|
|
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
|
+
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+
|
|
+#define MAX_NR_GENS CONFIG_NR_LRU_GENS
|
|
+
|
|
+/*
|
|
+ * For a common x86_64 configuration that has 3 zones and 7 generations,
|
|
+ * the size of this struct is 1112 bytes; with 4 zones and 15 generations, the
|
|
+ * size is 3048. Though it can be configured to have 6 zones and 63
|
|
+ * generations, there is unlikely to be a need for it.
|
|
+ */
|
|
+struct lru_gen {
|
|
+ /* aging increments max generation number */
|
|
+ unsigned long max_seq;
|
|
+ /* eviction increments min generation numbers */
|
|
+ unsigned long min_seq[ANON_AND_FILE];
|
|
+ /* birth time of each generation in jiffies */
|
|
+ unsigned long timestamps[MAX_NR_GENS];
|
|
+ /* multigenerational lru lists */
|
|
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
+ /* sizes of multigenerational lru lists in pages */
|
|
+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
+ /* used with swappiness to determine which to reclaim */
|
|
+ unsigned long isolated[ANON_AND_FILE];
|
|
+#ifdef CONFIG_MEMCG
|
|
+ /* reclaim priority to compare with other memcgs */
|
|
+ atomic_t priority;
|
|
+#endif
|
|
+ /* whether multigenerational lru is enabled */
|
|
+ bool enabled[ANON_AND_FILE];
|
|
+};
|
|
+
|
|
+void lru_gen_init_lruvec(struct lruvec *lruvec);
|
|
+void lru_gen_set_state(bool enable, bool main, bool swap);
|
|
+
|
|
+#else /* CONFIG_LRU_GEN */
|
|
+
|
|
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void lru_gen_set_state(bool enable, bool main, bool swap)
|
|
+{
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
struct lruvec {
|
|
struct list_head lists[NR_LRU_LISTS];
|
|
/* per lruvec lru_lock for memcg */
|
|
@@ -308,6 +358,10 @@ struct lruvec {
|
|
unsigned long refaults[ANON_AND_FILE];
|
|
/* Various lruvec state flags (enum lruvec_flags) */
|
|
unsigned long flags;
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+ /* unevictable pages are on LRU_UNEVICTABLE */
|
|
+ struct lru_gen evictable;
|
|
+#endif
|
|
#ifdef CONFIG_MEMCG
|
|
struct pglist_data *pgdat;
|
|
#endif
|
|
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
|
|
index 7d4ec26d8a3e..0c24ace9da3c 100644
|
|
--- a/include/linux/page-flags-layout.h
|
|
+++ b/include/linux/page-flags-layout.h
|
|
@@ -24,6 +24,20 @@
|
|
#error ZONES_SHIFT -- too many zones configured adjust calculation
|
|
#endif
|
|
|
|
+#ifndef CONFIG_LRU_GEN
|
|
+#define LRU_GEN_WIDTH 0
|
|
+#else
|
|
+#if CONFIG_NR_LRU_GENS < 8
|
|
+#define LRU_GEN_WIDTH 3
|
|
+#elif CONFIG_NR_LRU_GENS < 16
|
|
+#define LRU_GEN_WIDTH 4
|
|
+#elif CONFIG_NR_LRU_GENS < 32
|
|
+#define LRU_GEN_WIDTH 5
|
|
+#else
|
|
+#define LRU_GEN_WIDTH 6
|
|
+#endif
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
#ifdef CONFIG_SPARSEMEM
|
|
#include <asm/sparsemem.h>
|
|
|
|
@@ -56,7 +70,7 @@
|
|
|
|
#define ZONES_WIDTH ZONES_SHIFT
|
|
|
|
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
|
+#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
|
#define NODES_WIDTH NODES_SHIFT
|
|
#else
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
|
@@ -83,14 +97,14 @@
|
|
#define KASAN_TAG_WIDTH 0
|
|
#endif
|
|
|
|
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
|
|
+#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+NODES_WIDTH+KASAN_TAG_WIDTH+LAST_CPUPID_SHIFT \
|
|
<= BITS_PER_LONG - NR_PAGEFLAGS
|
|
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
|
|
#else
|
|
#define LAST_CPUPID_WIDTH 0
|
|
#endif
|
|
|
|
-#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \
|
|
+#if SECTIONS_WIDTH+ZONES_WIDTH+LRU_GEN_WIDTH+NODES_WIDTH+KASAN_TAG_WIDTH+LAST_CPUPID_WIDTH \
|
|
> BITS_PER_LONG - NR_PAGEFLAGS
|
|
#error "Not enough bits in page flags"
|
|
#endif
|
|
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
|
|
index 395c75111d33..be9bf681313c 100644
|
|
--- a/mm/huge_memory.c
|
|
+++ b/mm/huge_memory.c
|
|
@@ -2422,7 +2422,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
|
|
#ifdef CONFIG_64BIT
|
|
(1L << PG_arch_2) |
|
|
#endif
|
|
- (1L << PG_dirty)));
|
|
+ (1L << PG_dirty) |
|
|
+ LRU_GEN_MASK));
|
|
|
|
/* ->mapping in first tail page is compound_mapcount */
|
|
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
|
|
diff --git a/mm/mm_init.c b/mm/mm_init.c
|
|
index 8e02e865cc65..0b91a25fbdee 100644
|
|
--- a/mm/mm_init.c
|
|
+++ b/mm/mm_init.c
|
|
@@ -71,27 +71,30 @@ void __init mminit_verify_pageflags_layout(void)
|
|
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
|
|
- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
|
|
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
|
|
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d lru_gen %d Flags %d\n",
|
|
SECTIONS_WIDTH,
|
|
NODES_WIDTH,
|
|
ZONES_WIDTH,
|
|
LAST_CPUPID_WIDTH,
|
|
KASAN_TAG_WIDTH,
|
|
+ LRU_GEN_WIDTH,
|
|
NR_PAGEFLAGS);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
|
|
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
|
|
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d lru_gen %d\n",
|
|
SECTIONS_SHIFT,
|
|
NODES_SHIFT,
|
|
ZONES_SHIFT,
|
|
LAST_CPUPID_SHIFT,
|
|
- KASAN_TAG_WIDTH);
|
|
+ KASAN_TAG_WIDTH,
|
|
+ LRU_GEN_WIDTH);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
|
|
- "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
|
|
+ "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu lru_gen %lu\n",
|
|
(unsigned long)SECTIONS_PGSHIFT,
|
|
(unsigned long)NODES_PGSHIFT,
|
|
(unsigned long)ZONES_PGSHIFT,
|
|
(unsigned long)LAST_CPUPID_PGSHIFT,
|
|
- (unsigned long)KASAN_TAG_PGSHIFT);
|
|
+ (unsigned long)KASAN_TAG_PGSHIFT,
|
|
+ (unsigned long)LRU_GEN_PGOFF);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
|
|
"Node/Zone ID: %lu -> %lu\n",
|
|
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
|
|
diff --git a/mm/mmzone.c b/mm/mmzone.c
|
|
index eb89d6e018e2..2ec0d7793424 100644
|
|
--- a/mm/mmzone.c
|
|
+++ b/mm/mmzone.c
|
|
@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
|
|
|
|
for_each_lru(lru)
|
|
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
|
+
|
|
+ lru_gen_init_lruvec(lruvec);
|
|
}
|
|
|
|
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
|
|
diff --git a/mm/swap.c b/mm/swap.c
|
|
index f20ed56ebbbf..bd10efe00684 100644
|
|
--- a/mm/swap.c
|
|
+++ b/mm/swap.c
|
|
@@ -300,6 +300,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
|
|
|
|
void lru_note_cost_page(struct page *page)
|
|
{
|
|
+ /* multigenerational lru doesn't use any heuristics */
|
|
+ if (lru_gen_enabled())
|
|
+ return;
|
|
+
|
|
lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
|
|
page_is_file_lru(page), thp_nr_pages(page));
|
|
}
|
|
diff --git a/mm/swapfile.c b/mm/swapfile.c
|
|
index 084a5b9a18e5..fe03cfeaa08f 100644
|
|
--- a/mm/swapfile.c
|
|
+++ b/mm/swapfile.c
|
|
@@ -2702,6 +2702,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
|
err = 0;
|
|
atomic_inc(&proc_poll_event);
|
|
wake_up_interruptible(&proc_poll_wait);
|
|
+ /* stop anon multigenerational lru if it's enabled */
|
|
+ lru_gen_set_state(false, false, true);
|
|
|
|
out_dput:
|
|
filp_close(victim, NULL);
|
|
@@ -3348,6 +3350,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
|
|
mutex_unlock(&swapon_mutex);
|
|
atomic_inc(&proc_poll_event);
|
|
wake_up_interruptible(&proc_poll_wait);
|
|
+ /* start anon multigenerational lru if it's enabled */
|
|
+ lru_gen_set_state(true, false, true);
|
|
|
|
error = 0;
|
|
goto out;
|
|
diff --git a/mm/vmscan.c b/mm/vmscan.c
|
|
index f7657ab0d4b7..fd49a9a5d7f5 100644
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -49,6 +49,8 @@
|
|
#include <linux/printk.h>
|
|
#include <linux/dax.h>
|
|
#include <linux/psi.h>
|
|
+#include <linux/pagewalk.h>
|
|
+#include <linux/memory.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/div64.h>
|
|
@@ -1110,6 +1112,10 @@ static unsigned int shrink_page_list(struct list_head *page_list,
|
|
if (!sc->may_unmap && page_mapped(page))
|
|
goto keep_locked;
|
|
|
|
+ /* in case this page was found accessed after it was isolated */
|
|
+ if (lru_gen_enabled() && !ignore_references && PageReferenced(page))
|
|
+ goto activate_locked;
|
|
+
|
|
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
|
|
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
|
|
|
|
@@ -2229,6 +2235,10 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
|
|
unsigned long file;
|
|
struct lruvec *target_lruvec;
|
|
|
|
+ /* multigenerational lru doesn't use any heuristics */
|
|
+ if (lru_gen_enabled())
|
|
+ return;
|
|
+
|
|
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
|
|
|
|
/*
|
|
@@ -2518,6 +2528,19 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
|
|
}
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+static void age_lru_gens(struct pglist_data *pgdat, struct scan_control *sc);
|
|
+static void shrink_lru_gens(struct lruvec *lruvec, struct scan_control *sc);
|
|
+#else
|
|
+static void age_lru_gens(struct pglist_data *pgdat, struct scan_control *sc)
|
|
+{
|
|
+}
|
|
+
|
|
+static void shrink_lru_gens(struct lruvec *lruvec, struct scan_control *sc)
|
|
+{
|
|
+}
|
|
+#endif
|
|
+
|
|
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|
{
|
|
unsigned long nr[NR_LRU_LISTS];
|
|
@@ -2529,6 +2552,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|
struct blk_plug plug;
|
|
bool scan_adjusted;
|
|
|
|
+ if (lru_gen_enabled()) {
|
|
+ shrink_lru_gens(lruvec, sc);
|
|
+ return;
|
|
+ }
|
|
+
|
|
get_scan_count(lruvec, sc, nr);
|
|
|
|
/* Record the original scan target for proportional adjustments later */
|
|
@@ -2995,6 +3023,10 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
|
|
struct lruvec *target_lruvec;
|
|
unsigned long refaults;
|
|
|
|
+ /* multigenerational lru doesn't use any heuristics */
|
|
+ if (lru_gen_enabled())
|
|
+ return;
|
|
+
|
|
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
|
|
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
|
|
target_lruvec->refaults[0] = refaults;
|
|
@@ -3369,6 +3401,11 @@ static void age_active_anon(struct pglist_data *pgdat,
|
|
struct mem_cgroup *memcg;
|
|
struct lruvec *lruvec;
|
|
|
|
+ if (lru_gen_enabled()) {
|
|
+ age_lru_gens(pgdat, sc);
|
|
+ return;
|
|
+ }
|
|
+
|
|
if (!total_swap_pages)
|
|
return;
|
|
|
|
@@ -4553,12 +4590,1227 @@ static bool get_next_mm(struct lruvec *lruvec, unsigned long next_seq,
|
|
return last;
|
|
}
|
|
|
|
+/******************************************************************************
|
|
+ * aging (page table walk)
|
|
+ ******************************************************************************/
|
|
+
|
|
+#define DEFINE_MAX_SEQ(lruvec) \
|
|
+ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq)
|
|
+
|
|
+#define DEFINE_MIN_SEQ(lruvec) \
|
|
+ unsigned long min_seq[ANON_AND_FILE] = { \
|
|
+ READ_ONCE((lruvec)->evictable.min_seq[0]), \
|
|
+ READ_ONCE((lruvec)->evictable.min_seq[1]), \
|
|
+ }
|
|
+
|
|
+#define for_each_gen_type_zone(gen, file, zone) \
|
|
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
|
|
+ for ((file) = 0; (file) < ANON_AND_FILE; (file)++) \
|
|
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
|
+
|
|
+#define for_each_type_zone(file, zone) \
|
|
+ for ((file) = 0; (file) < ANON_AND_FILE; (file)++) \
|
|
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
|
+
|
|
+#define MAX_BATCH_SIZE 8192
|
|
+
|
|
+static DEFINE_PER_CPU(int [MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES], lru_batch_size);
|
|
+
|
|
+static void update_batch_size(struct page *page, int old_gen, int new_gen)
|
|
+{
|
|
+ int file = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+ int delta = thp_nr_pages(page);
|
|
+
|
|
+ VM_BUG_ON(preemptible());
|
|
+ VM_BUG_ON(in_interrupt());
|
|
+ VM_BUG_ON(old_gen >= MAX_NR_GENS);
|
|
+ VM_BUG_ON(new_gen >= MAX_NR_GENS);
|
|
+
|
|
+ __this_cpu_sub(lru_batch_size[old_gen][file][zone], delta);
|
|
+ __this_cpu_add(lru_batch_size[new_gen][file][zone], delta);
|
|
+}
|
|
+
|
|
+static void reset_batch_size(struct lruvec *lruvec)
|
|
+{
|
|
+ int gen, file, zone;
|
|
+
|
|
+ VM_BUG_ON(preemptible());
|
|
+ VM_BUG_ON(in_interrupt());
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ for_each_gen_type_zone(gen, file, zone) {
|
|
+ enum lru_list lru = LRU_FILE * file;
|
|
+ int total = __this_cpu_read(lru_batch_size[gen][file][zone]);
|
|
+
|
|
+ if (!total)
|
|
+ continue;
|
|
+
|
|
+ __this_cpu_write(lru_batch_size[gen][file][zone], 0);
|
|
+
|
|
+ WRITE_ONCE(lruvec->evictable.sizes[gen][file][zone],
|
|
+ lruvec->evictable.sizes[gen][file][zone] + total);
|
|
+
|
|
+ if (lru_gen_is_active(lruvec, gen))
|
|
+ lru += LRU_ACTIVE;
|
|
+ update_lru_size(lruvec, lru, zone, total);
|
|
+ }
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+}
|
|
+
|
|
+static int page_update_lru_gen(struct page *page, int new_gen)
|
|
+{
|
|
+ int old_gen;
|
|
+ unsigned long old_flags, new_flags;
|
|
+
|
|
+ VM_BUG_ON(new_gen >= MAX_NR_GENS);
|
|
+
|
|
+ do {
|
|
+ old_flags = READ_ONCE(page->flags);
|
|
+
|
|
+ old_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+ if (old_gen < 0) {
|
|
+ /* make sure shrink_page_list() rejects this page */
|
|
+ if (!PageReferenced(page))
|
|
+ SetPageReferenced(page);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ new_flags = (old_flags & ~LRU_GEN_MASK) | ((new_gen + 1UL) << LRU_GEN_PGOFF);
|
|
+ if (old_flags == new_flags)
|
|
+ break;
|
|
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
+
|
|
+ /* sort_page_by_gen() will sort this page during eviction */
|
|
+
|
|
+ return old_gen;
|
|
+}
|
|
+
|
|
+struct mm_walk_args {
|
|
+ struct mem_cgroup *memcg;
|
|
+ unsigned long max_seq;
|
|
+ unsigned long next_addr;
|
|
+ unsigned long start_pfn;
|
|
+ unsigned long end_pfn;
|
|
+ unsigned long addr_bitmap;
|
|
+ int node_id;
|
|
+ int batch_size;
|
|
+ bool should_walk[ANON_AND_FILE];
|
|
+};
|
|
+
|
|
+static inline unsigned long get_addr_mask(unsigned long addr)
|
|
+{
|
|
+ return BIT((addr & ~PUD_MASK) >> ilog2(PUD_SIZE / BITS_PER_LONG));
|
|
+}
|
|
+
|
|
+static int walk_pte_range(pmd_t *pmdp, unsigned long start, unsigned long end,
|
|
+ struct mm_walk *walk)
|
|
+{
|
|
+ pmd_t pmd;
|
|
+ pte_t *pte;
|
|
+ spinlock_t *ptl;
|
|
+ struct mm_walk_args *args = walk->private;
|
|
+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
|
|
+
|
|
+ pmd = pmd_read_atomic(pmdp);
|
|
+ barrier();
|
|
+ if (!pmd_present(pmd) || pmd_trans_huge(pmd))
|
|
+ return 0;
|
|
+
|
|
+ VM_BUG_ON(pmd_huge(pmd) || pmd_devmap(pmd) || is_hugepd(__hugepd(pmd_val(pmd))));
|
|
+
|
|
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG) && !pmd_young(pmd))
|
|
+ return 0;
|
|
+
|
|
+ pte = pte_offset_map_lock(walk->mm, &pmd, start, &ptl);
|
|
+ arch_enter_lazy_mmu_mode();
|
|
+
|
|
+ for (; start != end; pte++, start += PAGE_SIZE) {
|
|
+ struct page *page;
|
|
+ unsigned long pfn = pte_pfn(*pte);
|
|
+
|
|
+ if (!pte_present(*pte) || !pte_young(*pte) || is_zero_pfn(pfn))
|
|
+ continue;
|
|
+
|
|
+ /*
|
|
+ * If this pte maps a page from a different node, set the
|
|
+ * bitmap to prevent the accessed bit on its parent pmd from
|
|
+ * being cleared.
|
|
+ */
|
|
+ if (pfn < args->start_pfn || pfn >= args->end_pfn) {
|
|
+ args->addr_bitmap |= get_addr_mask(start);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ page = compound_head(pte_page(*pte));
|
|
+ if (page_to_nid(page) != args->node_id) {
|
|
+ args->addr_bitmap |= get_addr_mask(start);
|
|
+ continue;
|
|
+ }
|
|
+ if (page_memcg_rcu(page) != args->memcg)
|
|
+ continue;
|
|
+
|
|
+ if (ptep_test_and_clear_young(walk->vma, start, pte)) {
|
|
+ old_gen = page_update_lru_gen(page, new_gen);
|
|
+ if (old_gen >= 0 && old_gen != new_gen) {
|
|
+ update_batch_size(page, old_gen, new_gen);
|
|
+ args->batch_size++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (pte_dirty(*pte) && !PageDirty(page) &&
|
|
+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
|
|
+ set_page_dirty(page);
|
|
+ }
|
|
+
|
|
+ arch_leave_lazy_mmu_mode();
|
|
+ pte_unmap_unlock(pte, ptl);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int walk_pmd_range(pud_t *pudp, unsigned long start, unsigned long end,
|
|
+ struct mm_walk *walk)
|
|
+{
|
|
+ pud_t pud;
|
|
+ pmd_t *pmd;
|
|
+ spinlock_t *ptl;
|
|
+ struct mm_walk_args *args = walk->private;
|
|
+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
|
|
+
|
|
+ pud = READ_ONCE(*pudp);
|
|
+ if (!pud_present(pud) || WARN_ON_ONCE(pud_trans_huge(pud)))
|
|
+ return 0;
|
|
+
|
|
+ VM_BUG_ON(pud_huge(pud) || pud_devmap(pud) || is_hugepd(__hugepd(pud_val(pud))));
|
|
+
|
|
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
|
|
+ !IS_ENABLED(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG))
|
|
+ goto done;
|
|
+
|
|
+ pmd = pmd_offset(&pud, start);
|
|
+ ptl = pmd_lock(walk->mm, pmd);
|
|
+ arch_enter_lazy_mmu_mode();
|
|
+
|
|
+ for (; start != end; pmd++, start = pmd_addr_end(start, end)) {
|
|
+ struct page *page;
|
|
+ unsigned long pfn = pmd_pfn(*pmd);
|
|
+
|
|
+ if (!pmd_present(*pmd) || !pmd_young(*pmd) || is_huge_zero_pmd(*pmd))
|
|
+ continue;
|
|
+
|
|
+ if (!pmd_trans_huge(*pmd)) {
|
|
+ if (!(args->addr_bitmap & get_addr_mask(start)) &&
|
|
+ (!(pmd_addr_end(start, end) & ~PMD_MASK) ||
|
|
+ !walk->vma->vm_next ||
|
|
+ (walk->vma->vm_next->vm_start & PMD_MASK) > end))
|
|
+ pmdp_test_and_clear_young(walk->vma, start, pmd);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (pfn < args->start_pfn || pfn >= args->end_pfn)
|
|
+ continue;
|
|
+
|
|
+ page = pmd_page(*pmd);
|
|
+ if (page_to_nid(page) != args->node_id)
|
|
+ continue;
|
|
+ if (page_memcg_rcu(page) != args->memcg)
|
|
+ continue;
|
|
+
|
|
+ if (pmdp_test_and_clear_young(walk->vma, start, pmd)) {
|
|
+ old_gen = page_update_lru_gen(page, new_gen);
|
|
+ if (old_gen >= 0 && old_gen != new_gen) {
|
|
+ update_batch_size(page, old_gen, new_gen);
|
|
+ args->batch_size++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (pmd_dirty(*pmd) && !PageDirty(page) &&
|
|
+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
|
|
+ set_page_dirty(page);
|
|
+ }
|
|
+
|
|
+ arch_leave_lazy_mmu_mode();
|
|
+ spin_unlock(ptl);
|
|
+done:
|
|
+ args->addr_bitmap = 0;
|
|
+
|
|
+ if (args->batch_size < MAX_BATCH_SIZE)
|
|
+ return 0;
|
|
+
|
|
+ args->next_addr = end;
|
|
+
|
|
+ return -EAGAIN;
|
|
+}
|
|
+
|
|
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
|
|
+{
|
|
+ struct vm_area_struct *vma = walk->vma;
|
|
+ struct mm_walk_args *args = walk->private;
|
|
+
|
|
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_HUGETLB))
|
|
+ return true;
|
|
+
|
|
+ if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
|
|
+ return true;
|
|
+
|
|
+ if (vma_is_anonymous(vma))
|
|
+ return !args->should_walk[0];
|
|
+
|
|
+ if (vma_is_shmem(vma))
|
|
+ return !args->should_walk[0] ||
|
|
+ mapping_unevictable(vma->vm_file->f_mapping);
|
|
+
|
|
+ return !args->should_walk[1] || vma_is_dax(vma) ||
|
|
+ vma == get_gate_vma(vma->vm_mm) ||
|
|
+ mapping_unevictable(vma->vm_file->f_mapping);
|
|
+}
|
|
+
|
|
+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, int swappiness)
|
|
+{
|
|
+ int err;
|
|
+ int file;
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
|
+ struct mm_walk_args args = {};
|
|
+ struct mm_walk_ops ops = {
|
|
+ .test_walk = should_skip_vma,
|
|
+ .pmd_entry = walk_pte_range,
|
|
+ .pud_entry_post = walk_pmd_range,
|
|
+ };
|
|
+
|
|
+ args.memcg = memcg;
|
|
+ args.max_seq = READ_ONCE(lruvec->evictable.max_seq);
|
|
+ args.next_addr = FIRST_USER_ADDRESS;
|
|
+ args.start_pfn = pgdat->node_start_pfn;
|
|
+ args.end_pfn = pgdat_end_pfn(pgdat);
|
|
+ args.node_id = pgdat->node_id;
|
|
+
|
|
+ for (file = !swappiness; file < ANON_AND_FILE; file++)
|
|
+ args.should_walk[file] = lru_gen_mm_is_active(mm) ||
|
|
+ node_isset(pgdat->node_id, mm->lru_gen.nodes[file]);
|
|
+
|
|
+ do {
|
|
+ unsigned long start = args.next_addr;
|
|
+ unsigned long end = mm->highest_vm_end;
|
|
+
|
|
+ err = -EBUSY;
|
|
+
|
|
+ preempt_disable();
|
|
+ rcu_read_lock();
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+ if (memcg && atomic_read(&memcg->moving_account))
|
|
+ goto contended;
|
|
+#endif
|
|
+ if (!mmap_read_trylock(mm))
|
|
+ goto contended;
|
|
+
|
|
+ args.batch_size = 0;
|
|
+
|
|
+ err = walk_page_range(mm, start, end, &ops, &args);
|
|
+
|
|
+ mmap_read_unlock(mm);
|
|
+
|
|
+ if (args.batch_size)
|
|
+ reset_batch_size(lruvec);
|
|
+contended:
|
|
+ rcu_read_unlock();
|
|
+ preempt_enable();
|
|
+
|
|
+ cond_resched();
|
|
+ } while (err == -EAGAIN && !mm_is_oom_victim(mm) && !mm_has_migrated(mm, memcg));
|
|
+
|
|
+ if (err)
|
|
+ return;
|
|
+
|
|
+ for (file = !swappiness; file < ANON_AND_FILE; file++) {
|
|
+ if (args.should_walk[file])
|
|
+ node_clear(pgdat->node_id, mm->lru_gen.nodes[file]);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void page_inc_lru_gen(struct page *page, struct lruvec *lruvec, bool front)
|
|
+{
|
|
+ int old_gen, new_gen;
|
|
+ unsigned long old_flags, new_flags;
|
|
+ int file = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+
|
|
+ old_gen = lru_gen_from_seq(lruvec->evictable.min_seq[file]);
|
|
+
|
|
+ do {
|
|
+ old_flags = READ_ONCE(page->flags);
|
|
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+ VM_BUG_ON_PAGE(new_gen < 0, page);
|
|
+ if (new_gen >= 0 && new_gen != old_gen)
|
|
+ goto sort;
|
|
+
|
|
+ new_gen = (old_gen + 1) % MAX_NR_GENS;
|
|
+ new_flags = (old_flags & ~LRU_GEN_MASK) | ((new_gen + 1UL) << LRU_GEN_PGOFF);
|
|
+ /* mark page for reclaim if pending writeback */
|
|
+ if (front)
|
|
+ new_flags |= BIT(PG_reclaim);
|
|
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
+
|
|
+ lru_gen_update_size(page, lruvec, old_gen, new_gen);
|
|
+sort:
|
|
+ if (front)
|
|
+ list_move(&page->lru, &lruvec->evictable.lists[new_gen][file][zone]);
|
|
+ else
|
|
+ list_move_tail(&page->lru, &lruvec->evictable.lists[new_gen][file][zone]);
|
|
+}
|
|
+
|
|
+static int get_nr_gens(struct lruvec *lruvec, int file)
|
|
+{
|
|
+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[file] + 1;
|
|
+}
|
|
+
|
|
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
|
|
+{
|
|
+ lockdep_assert_held(&lruvec->lru_lock);
|
|
+
|
|
+ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS &&
|
|
+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS &&
|
|
+ get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
|
|
+ get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
|
|
+}
|
|
+
|
|
+static bool try_inc_min_seq(struct lruvec *lruvec, int file)
|
|
+{
|
|
+ int gen, zone;
|
|
+ bool success = false;
|
|
+
|
|
+ VM_BUG_ON(!seq_is_valid(lruvec));
|
|
+
|
|
+ while (get_nr_gens(lruvec, file) > MIN_NR_GENS) {
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.min_seq[file]);
|
|
+
|
|
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
|
|
+ if (!list_empty(&lruvec->evictable.lists[gen][file][zone]))
|
|
+ return success;
|
|
+ }
|
|
+
|
|
+ lruvec->evictable.isolated[file] = 0;
|
|
+ WRITE_ONCE(lruvec->evictable.min_seq[file],
|
|
+ lruvec->evictable.min_seq[file] + 1);
|
|
+
|
|
+ success = true;
|
|
+ }
|
|
+
|
|
+ return success;
|
|
+}
|
|
+
|
|
+static bool inc_min_seq(struct lruvec *lruvec, int file)
|
|
+{
|
|
+ int gen, zone;
|
|
+ int batch_size = 0;
|
|
+
|
|
+ VM_BUG_ON(!seq_is_valid(lruvec));
|
|
+
|
|
+ if (get_nr_gens(lruvec, file) != MAX_NR_GENS)
|
|
+ return true;
|
|
+
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.min_seq[file]);
|
|
+
|
|
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
|
|
+ struct list_head *head = &lruvec->evictable.lists[gen][file][zone];
|
|
+
|
|
+ while (!list_empty(head)) {
|
|
+ struct page *page = lru_to_page(head);
|
|
+
|
|
+ VM_BUG_ON_PAGE(PageTail(page), page);
|
|
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
|
+ VM_BUG_ON_PAGE(PageActive(page), page);
|
|
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != file, page);
|
|
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
|
|
+
|
|
+ prefetchw_prev_lru_page(page, head, flags);
|
|
+
|
|
+ page_inc_lru_gen(page, lruvec, false);
|
|
+
|
|
+ if (++batch_size == MAX_BATCH_SIZE)
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ VM_BUG_ON(lruvec->evictable.sizes[gen][file][zone]);
|
|
+ }
|
|
+
|
|
+ lruvec->evictable.isolated[file] = 0;
|
|
+ WRITE_ONCE(lruvec->evictable.min_seq[file],
|
|
+ lruvec->evictable.min_seq[file] + 1);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void inc_max_seq(struct lruvec *lruvec)
|
|
+{
|
|
+ int gen, file, zone;
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ VM_BUG_ON(!seq_is_valid(lruvec));
|
|
+
|
|
+ for (file = 0; file < ANON_AND_FILE; file++) {
|
|
+ if (try_inc_min_seq(lruvec, file))
|
|
+ continue;
|
|
+
|
|
+ while (!inc_min_seq(lruvec, file)) {
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+ cond_resched();
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.max_seq - 1);
|
|
+ for_each_type_zone(file, zone) {
|
|
+ enum lru_list lru = LRU_FILE * file;
|
|
+ long total = lruvec->evictable.sizes[gen][file][zone];
|
|
+
|
|
+ WARN_ON_ONCE(total != (int)total);
|
|
+
|
|
+ update_lru_size(lruvec, lru, zone, total);
|
|
+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -total);
|
|
+ }
|
|
+
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.max_seq + 1);
|
|
+ for_each_type_zone(file, zone) {
|
|
+ VM_BUG_ON(lruvec->evictable.sizes[gen][file][zone]);
|
|
+ VM_BUG_ON(!list_empty(&lruvec->evictable.lists[gen][file][zone]));
|
|
+ }
|
|
+
|
|
+ WRITE_ONCE(lruvec->evictable.timestamps[gen], jiffies);
|
|
+ /* make sure the birth time is valid when read locklessly */
|
|
+ smp_store_release(&lruvec->evictable.max_seq, lruvec->evictable.max_seq + 1);
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+}
|
|
+
|
|
+/* Main function used by foreground, background and user-triggered aging. */
|
|
+static bool walk_mm_list(struct lruvec *lruvec, unsigned long next_seq,
|
|
+ struct scan_control *sc, int swappiness)
|
|
+{
|
|
+ bool last;
|
|
+ struct mm_struct *mm = NULL;
|
|
+ int nid = lruvec_pgdat(lruvec)->node_id;
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
|
+
|
|
+ VM_BUG_ON(next_seq > READ_ONCE(lruvec->evictable.max_seq));
|
|
+
|
|
+ /*
|
|
+ * For each walk of the mm list of a memcg, we decrement the priority
|
|
+ * of its lruvec. For each walk of memcgs in kswapd, we increment the
|
|
+ * priorities of all lruvecs.
|
|
+ *
|
|
+ * So if this lruvec has a higher priority (smaller value), it means
|
|
+ * other concurrent reclaimers (global or memcg reclaim) have walked
|
|
+ * its mm list. Skip it for this priority to balance the pressure on
|
|
+ * all memcgs.
|
|
+ */
|
|
+#ifdef CONFIG_MEMCG
|
|
+ if (!mem_cgroup_disabled() && !cgroup_reclaim(sc) &&
|
|
+ sc->priority > atomic_read(&lruvec->evictable.priority))
|
|
+ return false;
|
|
+#endif
|
|
+
|
|
+ do {
|
|
+ last = get_next_mm(lruvec, next_seq, swappiness, &mm);
|
|
+ if (mm)
|
|
+ walk_mm(lruvec, mm, swappiness);
|
|
+
|
|
+ cond_resched();
|
|
+ } while (mm);
|
|
+
|
|
+ if (!last) {
|
|
+ /* foreground aging prefers not to wait unless "necessary" */
|
|
+ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2)
|
|
+ wait_event_killable(mm_list->nodes[nid].wait,
|
|
+ next_seq < READ_ONCE(lruvec->evictable.max_seq));
|
|
+
|
|
+ return next_seq < READ_ONCE(lruvec->evictable.max_seq);
|
|
+ }
|
|
+
|
|
+ VM_BUG_ON(next_seq != READ_ONCE(lruvec->evictable.max_seq));
|
|
+
|
|
+ inc_max_seq(lruvec);
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+ if (!mem_cgroup_disabled())
|
|
+ atomic_add_unless(&lruvec->evictable.priority, -1, 0);
|
|
+#endif
|
|
+
|
|
+ /* order against inc_max_seq() */
|
|
+ smp_mb();
|
|
+ /* either we see any waiters or they will see updated max_seq */
|
|
+ if (waitqueue_active(&mm_list->nodes[nid].wait))
|
|
+ wake_up_all(&mm_list->nodes[nid].wait);
|
|
+
|
|
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
+ * eviction (lru list scan)
|
|
+ ******************************************************************************/
|
|
+
|
|
+static int max_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness)
|
|
+{
|
|
+ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1;
|
|
+}
|
|
+
|
|
+static bool sort_page_by_gen(struct page *page, struct lruvec *lruvec)
|
|
+{
|
|
+ bool success;
|
|
+ int gen = page_lru_gen(page);
|
|
+ int file = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+
|
|
+ VM_BUG_ON_PAGE(gen == -1, page);
|
|
+
|
|
+ /* a lazy free page that has been written into */
|
|
+ if (file && PageDirty(page) && PageAnon(page)) {
|
|
+ success = page_clear_lru_gen(page, lruvec);
|
|
+ VM_BUG_ON_PAGE(!success, page);
|
|
+ SetPageSwapBacked(page);
|
|
+ add_page_to_lru_list_tail(page, lruvec);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /* page_update_lru_gen() has updated the page */
|
|
+ if (gen != lru_gen_from_seq(lruvec->evictable.min_seq[file])) {
|
|
+ list_move(&page->lru, &lruvec->evictable.lists[gen][file][zone]);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * A page can't be immediately evicted, and page_inc_lru_gen() will
|
|
+ * mark it for reclaim and hopefully writeback will write it soon.
|
|
+ *
|
|
+ * During page table walk, we call set_page_dirty() on pages that have
|
|
+ * dirty PTEs, which helps account dirty pages so writeback should do
|
|
+ * its job.
|
|
+ */
|
|
+ if (PageLocked(page) || PageWriteback(page) || (file && PageDirty(page))) {
|
|
+ page_inc_lru_gen(page, lruvec, true);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static bool should_skip_page(struct page *page, struct scan_control *sc)
|
|
+{
|
|
+ if (!sc->may_unmap && page_mapped(page))
|
|
+ return true;
|
|
+
|
|
+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
|
|
+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
|
|
+ return true;
|
|
+
|
|
+ if (!get_page_unless_zero(page))
|
|
+ return true;
|
|
+
|
|
+ if (!TestClearPageLRU(page)) {
|
|
+ put_page(page);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static void isolate_page_by_gen(struct page *page, struct lruvec *lruvec)
|
|
+{
|
|
+ bool success;
|
|
+
|
|
+ success = page_clear_lru_gen(page, lruvec);
|
|
+ VM_BUG_ON_PAGE(!success, page);
|
|
+
|
|
+ if (PageActive(page)) {
|
|
+ ClearPageActive(page);
|
|
+ /* make sure shrink_page_list() rejects this page */
|
|
+ if (!PageReferenced(page))
|
|
+ SetPageReferenced(page);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* make sure shrink_page_list() doesn't write back this page */
|
|
+ if (PageReclaim(page))
|
|
+ ClearPageReclaim(page);
|
|
+ /* make sure shrink_page_list() doesn't reject this page */
|
|
+ if (PageReferenced(page))
|
|
+ ClearPageReferenced(page);
|
|
+}
|
|
+
|
|
+static int scan_lru_gen_pages(struct lruvec *lruvec, struct scan_control *sc,
|
|
+ long *nr_to_scan, int file, struct list_head *list)
|
|
+{
|
|
+ bool success;
|
|
+ int gen, zone;
|
|
+ enum vm_event_item item;
|
|
+ int sorted = 0;
|
|
+ int scanned = 0;
|
|
+ int isolated = 0;
|
|
+ int batch_size = 0;
|
|
+
|
|
+ VM_BUG_ON(!list_empty(list));
|
|
+
|
|
+ if (get_nr_gens(lruvec, file) == MIN_NR_GENS)
|
|
+ return -ENOENT;
|
|
+
|
|
+ gen = lru_gen_from_seq(lruvec->evictable.min_seq[file]);
|
|
+
|
|
+ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
|
|
+ LIST_HEAD(moved);
|
|
+ int skipped = 0;
|
|
+ struct list_head *head = &lruvec->evictable.lists[gen][file][zone];
|
|
+
|
|
+ while (!list_empty(head)) {
|
|
+ struct page *page = lru_to_page(head);
|
|
+ int delta = thp_nr_pages(page);
|
|
+
|
|
+ VM_BUG_ON_PAGE(PageTail(page), page);
|
|
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
|
+ VM_BUG_ON_PAGE(PageActive(page), page);
|
|
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != file, page);
|
|
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
|
|
+
|
|
+ prefetchw_prev_lru_page(page, head, flags);
|
|
+
|
|
+ scanned += delta;
|
|
+
|
|
+ if (sort_page_by_gen(page, lruvec))
|
|
+ sorted += delta;
|
|
+ else if (should_skip_page(page, sc)) {
|
|
+ list_move(&page->lru, &moved);
|
|
+ skipped += delta;
|
|
+ } else {
|
|
+ isolate_page_by_gen(page, lruvec);
|
|
+ list_add(&page->lru, list);
|
|
+ isolated += delta;
|
|
+ }
|
|
+
|
|
+ if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
|
|
+ ++batch_size == MAX_BATCH_SIZE)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ list_splice(&moved, head);
|
|
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
|
|
+
|
|
+ if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
|
|
+ batch_size == MAX_BATCH_SIZE)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ success = try_inc_min_seq(lruvec, file);
|
|
+
|
|
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
|
|
+ if (!cgroup_reclaim(sc))
|
|
+ __count_vm_events(item, scanned);
|
|
+ __count_memcg_events(lruvec_memcg(lruvec), item, scanned);
|
|
+ __count_vm_events(PGSCAN_ANON + file, scanned);
|
|
+
|
|
+ *nr_to_scan -= scanned;
|
|
+
|
|
+ if (*nr_to_scan <= 0 || success || isolated)
|
|
+ return isolated;
|
|
+ /*
|
|
+ * We may have trouble finding eligible pages due to restrictions from
|
|
+ * reclaim_idx, may_unmap and may_writepage. The following check makes
|
|
+ * sure we won't be stuck if we aren't making enough progress.
|
|
+ */
|
|
+ return batch_size == MAX_BATCH_SIZE && sorted >= SWAP_CLUSTER_MAX ? 0 : -ENOENT;
|
|
+}
|
|
+
|
|
+static int isolate_lru_gen_pages(struct lruvec *lruvec, struct scan_control *sc,
|
|
+ int swappiness, long *nr_to_scan, int *file,
|
|
+ struct list_head *list)
|
|
+{
|
|
+ int i;
|
|
+ int isolated;
|
|
+ DEFINE_MAX_SEQ(lruvec);
|
|
+ DEFINE_MIN_SEQ(lruvec);
|
|
+
|
|
+ VM_BUG_ON(!seq_is_valid(lruvec));
|
|
+
|
|
+ if (max_nr_gens(max_seq, min_seq, swappiness) == MIN_NR_GENS)
|
|
+ return 0;
|
|
+
|
|
+ /* simply choose a type based on generations and swappiness */
|
|
+ *file = !swappiness || min_seq[0] > min_seq[1] ||
|
|
+ (min_seq[0] == min_seq[1] &&
|
|
+ max(lruvec->evictable.isolated[0], 1UL) * (200 - swappiness) >
|
|
+ max(lruvec->evictable.isolated[1], 1UL) * (swappiness - 1));
|
|
+
|
|
+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
|
|
+ isolated = scan_lru_gen_pages(lruvec, sc, nr_to_scan, *file, list);
|
|
+ if (isolated >= 0)
|
|
+ break;
|
|
+
|
|
+ *file = !*file;
|
|
+ }
|
|
+
|
|
+ if (isolated < 0)
|
|
+ isolated = *nr_to_scan = 0;
|
|
+
|
|
+ lruvec->evictable.isolated[*file] += isolated;
|
|
+
|
|
+ return isolated;
|
|
+}
|
|
+
|
|
+/* Main function used by foreground, background and user-triggered eviction. */
|
|
+static bool evict_lru_gen_pages(struct lruvec *lruvec, struct scan_control *sc,
|
|
+ int swappiness, long *nr_to_scan)
|
|
+{
|
|
+ int file;
|
|
+ int isolated;
|
|
+ int reclaimed;
|
|
+ LIST_HEAD(list);
|
|
+ struct page *page;
|
|
+ enum vm_event_item item;
|
|
+ struct reclaim_stat stat;
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ isolated = isolate_lru_gen_pages(lruvec, sc, swappiness, nr_to_scan, &file, &list);
|
|
+ VM_BUG_ON(list_empty(&list) == !!isolated);
|
|
+
|
|
+ if (isolated)
|
|
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, isolated);
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ if (!isolated)
|
|
+ goto done;
|
|
+
|
|
+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
|
|
+ /*
|
|
+ * We have to prevent any pages from being added back to the same list
|
|
+ * they were isolated from. Otherwise we may risk looping on them forever.
|
|
+ */
|
|
+ list_for_each_entry(page, &list, lru) {
|
|
+ if (!PageReclaim(page) && !PageMlocked(page) && !PageActive(page))
|
|
+ SetPageActive(page);
|
|
+ }
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ move_pages_to_lru(lruvec, &list);
|
|
+
|
|
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -isolated);
|
|
+
|
|
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
|
|
+ if (!cgroup_reclaim(sc))
|
|
+ __count_vm_events(item, reclaimed);
|
|
+ __count_memcg_events(lruvec_memcg(lruvec), item, reclaimed);
|
|
+ __count_vm_events(PGSTEAL_ANON + file, reclaimed);
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ mem_cgroup_uncharge_list(&list);
|
|
+ free_unref_page_list(&list);
|
|
+
|
|
+ sc->nr_reclaimed += reclaimed;
|
|
+done:
|
|
+ return *nr_to_scan > 0 && sc->nr_reclaimed < sc->nr_to_reclaim;
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
+ * reclaim (aging + eviction)
|
|
+ ******************************************************************************/
|
|
+
|
|
+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
|
|
+ int swappiness)
|
|
+{
|
|
+ int gen, file, zone;
|
|
+ long nr_to_scan = 0;
|
|
+ DEFINE_MAX_SEQ(lruvec);
|
|
+ DEFINE_MIN_SEQ(lruvec);
|
|
+
|
|
+ lru_add_drain();
|
|
+
|
|
+ for (file = !swappiness; file < ANON_AND_FILE; file++) {
|
|
+ unsigned long seq;
|
|
+
|
|
+ for (seq = min_seq[file]; seq <= max_seq; seq++) {
|
|
+ gen = lru_gen_from_seq(seq);
|
|
+
|
|
+ for (zone = 0; zone <= sc->reclaim_idx; zone++)
|
|
+ nr_to_scan += READ_ONCE(
|
|
+ lruvec->evictable.sizes[gen][file][zone]);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ nr_to_scan = max(nr_to_scan, 0L);
|
|
+ nr_to_scan = round_up(nr_to_scan >> sc->priority, SWAP_CLUSTER_MAX);
|
|
+
|
|
+ if (max_nr_gens(max_seq, min_seq, swappiness) > MIN_NR_GENS)
|
|
+ return nr_to_scan;
|
|
+
|
|
+ /* kswapd does background aging, i.e., age_lru_gens() */
|
|
+ if (current_is_kswapd())
|
|
+ return 0;
|
|
+
|
|
+ return walk_mm_list(lruvec, max_seq, sc, swappiness) ? nr_to_scan : 0;
|
|
+}
|
|
+
|
|
+static int get_swappiness(struct lruvec *lruvec)
|
|
+{
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+ int swappiness = mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ?
|
|
+ mem_cgroup_swappiness(memcg) : 0;
|
|
+
|
|
+ VM_BUG_ON(swappiness > 200U);
|
|
+
|
|
+ return swappiness;
|
|
+}
|
|
+
|
|
+static void shrink_lru_gens(struct lruvec *lruvec, struct scan_control *sc)
|
|
+{
|
|
+ struct blk_plug plug;
|
|
+ unsigned long scanned = 0;
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+
|
|
+ blk_start_plug(&plug);
|
|
+
|
|
+ while (true) {
|
|
+ long nr_to_scan;
|
|
+ int swappiness = sc->may_swap ? get_swappiness(lruvec) : 0;
|
|
+
|
|
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness) - scanned;
|
|
+ if (nr_to_scan < (long)SWAP_CLUSTER_MAX)
|
|
+ break;
|
|
+
|
|
+ scanned += nr_to_scan;
|
|
+
|
|
+ if (!evict_lru_gen_pages(lruvec, sc, swappiness, &nr_to_scan))
|
|
+ break;
|
|
+
|
|
+ scanned -= nr_to_scan;
|
|
+
|
|
+ if (mem_cgroup_below_min(memcg) ||
|
|
+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
|
|
+ break;
|
|
+
|
|
+ cond_resched();
|
|
+ }
|
|
+
|
|
+ blk_finish_plug(&plug);
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
+ * background aging
|
|
+ ******************************************************************************/
|
|
+
|
|
+static int lru_gen_spread = MIN_NR_GENS;
|
|
+
|
|
+static int min_nr_gens(unsigned long max_seq, unsigned long *min_seq, int swappiness)
|
|
+{
|
|
+ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1;
|
|
+}
|
|
+
|
|
+static void try_walk_mm_list(struct lruvec *lruvec, struct scan_control *sc)
+{
+	int gen, file, zone;
+	long old_and_young[2] = {};
+	int spread = READ_ONCE(lru_gen_spread);
+	int swappiness = get_swappiness(lruvec);
+	DEFINE_MAX_SEQ(lruvec);
+	DEFINE_MIN_SEQ(lruvec);
+
+	lru_add_drain();
+
+	for (file = !swappiness; file < ANON_AND_FILE; file++) {
+		unsigned long seq;
+
+		for (seq = min_seq[file]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone < MAX_NR_ZONES; zone++)
+				old_and_young[seq == max_seq] += READ_ONCE(
+					lruvec->evictable.sizes[gen][file][zone]);
+		}
+	}
+
+	old_and_young[0] = max(old_and_young[0], 0L);
+	old_and_young[1] = max(old_and_young[1], 0L);
+
+	if (old_and_young[0] + old_and_young[1] < SWAP_CLUSTER_MAX)
+		return;
+
+	/* try to spread pages out across spread+1 generations */
+	if (old_and_young[0] >= old_and_young[1] * spread &&
+	    min_nr_gens(max_seq, min_seq, swappiness) > max(spread, MIN_NR_GENS))
+		return;
+
+	walk_mm_list(lruvec, max_seq, sc, swappiness);
+}
+
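+/*
+ * Background aging, called by kswapd only: age every memcg on this node that
+ * is not protected, then nudge the per-lruvec reclaim priority back toward
+ * DEF_PRIORITY.
+ */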
+static void age_lru_gens(struct pglist_data *pgdat, struct scan_control *sc)
+{
+	struct mem_cgroup *memcg;
+
+	VM_BUG_ON(!current_is_kswapd());
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+		if (!mem_cgroup_below_min(memcg) &&
+		    (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
+			try_walk_mm_list(lruvec, sc);
+
+#ifdef CONFIG_MEMCG
+		if (!mem_cgroup_disabled())
+			atomic_add_unless(&lruvec->evictable.priority, 1, DEF_PRIORITY);
+#endif
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
+#else
+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
+#endif
+
+static DEFINE_MUTEX(lru_gen_state_mutex);
+static int lru_gen_nr_swapfiles __read_mostly;
+
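+/*
+ * Move pages of the enabled types from the active/inactive lists onto the
+ * multigenerational lists; return false once MAX_BATCH_SIZE pages have been
+ * moved so that the caller can drop the lru lock and reschedule.
+ */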
+static bool fill_lru_gen_lists(struct lruvec *lruvec)
+{
+	enum lru_list lru;
+	int batch_size = 0;
+
+	for_each_evictable_lru(lru) {
+		int file = is_file_lru(lru);
+		bool active = is_active_lru(lru);
+		struct list_head *head = &lruvec->lists[lru];
+
+		if (!lruvec->evictable.enabled[file])
+			continue;
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page) != active, page);
+			VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != file, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			del_page_from_lru_list(page, lruvec);
+			success = page_set_lru_gen(page, lruvec, true);
+			VM_BUG_ON(!success);
+
+			if (++batch_size == MAX_BATCH_SIZE)
+				return false;
+		}
+	}
+
+	return true;
+}
+
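+/*
+ * The reverse of fill_lru_gen_lists(): move pages of the disabled types from
+ * the multigenerational lists back onto the active/inactive lists, also in
+ * batches of MAX_BATCH_SIZE pages.
+ */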
+static bool drain_lru_gen_lists(struct lruvec *lruvec)
+{
+	int gen, file, zone;
+	int batch_size = 0;
+
+	for_each_gen_type_zone(gen, file, zone) {
+		struct list_head *head = &lruvec->evictable.lists[gen][file][zone];
+
+		if (lruvec->evictable.enabled[file])
+			continue;
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != file, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			success = page_clear_lru_gen(page, lruvec);
+			VM_BUG_ON(!success);
+			add_page_to_lru_list(page, lruvec);
+
+			if (++batch_size == MAX_BATCH_SIZE)
+				return false;
+		}
+	}
+
+	return true;
+}
+
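+/*
+ * Sanity check: an enabled type must have empty active/inactive lists and a
+ * disabled type must have empty multigenerational lists.
+ */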
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+	int gen, file, zone;
+	enum lru_list lru;
+
+	for_each_evictable_lru(lru) {
+		file = is_file_lru(lru);
+
+		if (lruvec->evictable.enabled[file] &&
+		    !list_empty(&lruvec->lists[lru]))
+			return false;
+	}
+
+	for_each_gen_type_zone(gen, file, zone) {
+		if (!lruvec->evictable.enabled[file] &&
+		    !list_empty(&lruvec->evictable.lists[gen][file][zone]))
+			return false;
+
+		VM_WARN_ONCE(!lruvec->evictable.enabled[file] &&
+			     lruvec->evictable.sizes[gen][file][zone],
+			     "lru_gen: possible unbalanced number of pages");
+	}
+
+	return true;
+}
+
+/*
+ * We enable/disable the file multigenerational lru according to the main
+ * switch.
+ *
+ * For the anon multigenerational lru, we only enable it when the main switch
+ * is on and there is at least one swapfile; we disable it when there is no
+ * swapfile, regardless of the value of the main switch. Otherwise, we may
+ * eventually run out of generation numbers and have to call inc_min_seq(),
+ * which brings an unnecessary cost.
+ */
+void lru_gen_set_state(bool enable, bool main, bool swap)
+{
+	struct mem_cgroup *memcg;
+
+	mem_hotplug_begin();
+	mutex_lock(&lru_gen_state_mutex);
+	cgroup_lock();
+
+	main = main && enable != lru_gen_enabled();
+	swap = swap && !(enable ? lru_gen_nr_swapfiles++ : --lru_gen_nr_swapfiles);
+	swap = swap && lru_gen_enabled();
+	if (!main && !swap)
+		goto unlock;
+
+	if (main) {
+		if (enable)
+			static_branch_enable(&lru_gen_static_key);
+		else
+			static_branch_disable(&lru_gen_static_key);
+	}
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		for_each_node_state(nid, N_MEMORY) {
+			struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+			spin_lock_irq(&lruvec->lru_lock);
+
+			VM_BUG_ON(!seq_is_valid(lruvec));
+			VM_BUG_ON(!state_is_valid(lruvec));
+
+			WRITE_ONCE(lruvec->evictable.enabled[0],
+				   lru_gen_enabled() && lru_gen_nr_swapfiles);
+			WRITE_ONCE(lruvec->evictable.enabled[1],
+				   lru_gen_enabled());
+
+			while (!(enable ? fill_lru_gen_lists(lruvec) :
+					  drain_lru_gen_lists(lruvec))) {
+				spin_unlock_irq(&lruvec->lru_lock);
+				cond_resched();
+				spin_lock_irq(&lruvec->lru_lock);
+			}
+
+			spin_unlock_irq(&lruvec->lru_lock);
+		}
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+	cgroup_unlock();
+	mutex_unlock(&lru_gen_state_mutex);
+	mem_hotplug_done();
+}
+
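+/*
+ * Memory hotplug notifier: when a node is about to come online, resync the
+ * enabled[] state of its lruvecs with the main switch and the current number
+ * of swapfiles.
+ */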
+static int __meminit __maybe_unused
+lru_gen_online_mem(struct notifier_block *self, unsigned long action, void *arg)
+{
+	struct mem_cgroup *memcg;
+	struct memory_notify *mnb = arg;
+	int nid = mnb->status_change_nid;
+
+	if (action != MEM_GOING_ONLINE || nid == NUMA_NO_NODE)
+		return NOTIFY_DONE;
+
+	mutex_lock(&lru_gen_state_mutex);
+	cgroup_lock();
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+		VM_BUG_ON(!seq_is_valid(lruvec));
+		VM_BUG_ON(!state_is_valid(lruvec));
+
+		WRITE_ONCE(lruvec->evictable.enabled[0],
+			   lru_gen_enabled() && lru_gen_nr_swapfiles);
+		WRITE_ONCE(lruvec->evictable.enabled[1],
+			   lru_gen_enabled());
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	cgroup_unlock();
+	mutex_unlock(&lru_gen_state_mutex);
+
+	return NOTIFY_DONE;
+}
+
 /******************************************************************************
  * initialization
  ******************************************************************************/
 
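+/*
+ * Set up the multigenerational state of a new lruvec: start max_seq at
+ * MIN_NR_GENS, record the current time for the initial generations, derive
+ * enabled[] from the main switch and the number of swapfiles, and initialize
+ * the per-generation lists.
+ */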
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+	int i;
+	int gen, file, zone;
+
+#ifdef CONFIG_MEMCG
+	atomic_set(&lruvec->evictable.priority, DEF_PRIORITY);
+#endif
+
+	lruvec->evictable.max_seq = MIN_NR_GENS;
+	lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+	lruvec->evictable.enabled[1] = lru_gen_enabled();
+
+	for (i = 0; i <= MIN_NR_GENS; i++)
+		lruvec->evictable.timestamps[i] = jiffies;
+
+	for_each_gen_type_zone(gen, file, zone)
+		INIT_LIST_HEAD(&lruvec->evictable.lists[gen][file][zone]);
+}
+
 static int __init init_lru_gen(void)
 {
+	BUILD_BUG_ON(MAX_NR_GENS <= MIN_NR_GENS);
+	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
 	if (mem_cgroup_disabled()) {
 		global_mm_list = alloc_mm_list();
 		if (!global_mm_list) {
@@ -4567,6 +5819,9 @@ static int __init init_lru_gen(void)
 		}
 	}
 
+	if (hotplug_memory_notifier(lru_gen_online_mem, 0))
+		pr_err("lru_gen: failed to subscribe hotplug notifications\n");
+
 	return 0;
 };
 /*
--
2.31.0.rc2.261.g7f71774620-goog