From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Thu, 20 May 2021 00:53:49 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-9-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 08/14] mm: multigenerational lru: activation
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
        Andrew Morton <akpm@linux-foundation.org>,
        Dave Chinner <david@fromorbit.com>,
        Dave Hansen <dave.hansen@linux.intel.com>,
        Donald Carr <sirspudd@gmail.com>,
        Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
        Johannes Weiner <hannes@cmpxchg.org>,
        Jonathan Corbet <corbet@lwn.net>,
        Joonsoo Kim <iamjoonsoo.kim@lge.com>,
        Konstantin Kharlamov <hi-angel@yandex.ru>,
        Marcus Seyfarth <m.seyfarth@gmail.com>,
        Matthew Wilcox <willy@infradead.org>,
        Mel Gorman <mgorman@suse.de>,
        Miaohe Lin <linmiaohe@huawei.com>,
        Michael Larabel <michael@michaellarabel.com>,
        Michal Hocko <mhocko@suse.com>,
        Michel Lespinasse <michel@lespinasse.org>,
        Rik van Riel <riel@surriel.com>,
        Roman Gushchin <guro@fb.com>,
        Tim Chen <tim.c.chen@linux.intel.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Yang Shi <shy828301@gmail.com>,
        Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
        linux-kernel@vger.kernel.org, lkp@lists.01.org,
        page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
        Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>

For pages accessed multiple times via file descriptors, instead of
activating them upon the second access, we activate them based on the
refault rates of their tiers. Each generation contains at most
MAX_NR_TIERS tiers, and they require an additional MAX_NR_TIERS-2 bits
in page->flags. Pages accessed N times via file descriptors belong to
tier order_base_2(N). Tier 0 is the base tier; it contains pages read
ahead, pages accessed once via file descriptors, and pages accessed
only via page tables. Pages from the base tier are evicted regardless
of the refault rate. Pages from upper tiers that have higher refault
rates than the base tier will be moved to the next generation. A
feedback loop modeled after the PID controller monitors refault rates
across all tiers and decides when, and from which upper tiers, to
activate pages in the reclaim path (a toy sketch of the tier mapping
and the refault-rate comparison follows the list below). The
advantages of this model are:
1) It has a negligible cost in the buffered IO access path because
   activations are done optionally in the reclaim path.
2) It takes mapped pages into account and avoids overprotecting pages
   accessed multiple times via file descriptors.
3) More tiers offer better protection to pages accessed more than
   twice when workloads doing intensive buffered IO are under memory
   pressure.

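As an illustration only (a userspace sketch, not kernel code and not
part of this patch), the mapping from access counts to tiers and the
division-free refault-rate comparison can be modeled as below. The
helper names tier_of() and can_evict() are made up, order_base_2() is
reimplemented locally, the per-tier gain factor is omitted, and the
numbers in main() are arbitrary:

#include <stdio.h>

/* ceil(log2(n)) for n >= 1, standing in for the kernel's order_base_2() */
static int order_base_2(unsigned long n)
{
	int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

/* pages accessed N times via file descriptors belong to tier order_base_2(N) */
static int tier_of(unsigned long accesses)
{
	return accesses ? order_base_2(accesses) : 0;
}

/*
 * Allow eviction from an upper tier when its refault rate is no higher
 * than the base tier's, i.e. refaulted_tier/total_tier <=
 * refaulted_base/total_base, rewritten as a cross-multiplication to
 * avoid division.
 */
static int can_evict(unsigned long refaulted_base, unsigned long total_base,
		     unsigned long refaulted_tier, unsigned long total_tier)
{
	return refaulted_tier * (total_base ? total_base : 1) <=
	       refaulted_base * (total_tier ? total_tier : 1);
}

int main(void)
{
	unsigned long n;

	for (n = 1; n <= 8; n++)
		printf("accessed %lu time(s) -> tier %d\n", n, tier_of(n));

	/* arbitrary numbers: tier 2 refaults 10/100, base tier refaults 30/200 */
	printf("evict from tier 2? %s\n",
	       can_evict(30, 200, 10, 100) ? "yes" : "no");
	return 0;
}

Running it shows one access mapping to tier 0, two to tier 1, three or
four to tier 2, and so on, and that tier 2 (10% refault rate) may be
evicted because the base tier refaults more often (15%).
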
For pages mapped upon page faults, the accessed bit is set during the
initial faults. Ideally we add them to the per-zone lists indexed by
max_seq, i.e., the youngest generation, so that eviction will not
consider them before the aging has scanned them. For anon pages not in
swap cache, this can be done easily in the page fault path: we rename
lru_cache_add_inactive_or_unevictable() to lru_cache_add_page_vma()
and add a new parameter, which is set to true for pages mapped upon
page faults. For pages in page cache or swap cache, we cannot
differentiate the page fault path from the read ahead path at the time
we call lru_cache_add(). So we add them to the per-zone lists indexed
by min_seq, i.e., the oldest generation, for now.

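Purely as a sketch of the placement policy above (again userspace code
rather than kernel code; the helper name initial_generation() is made
up), the decision reduces to:

#include <stdbool.h>
#include <stdio.h>

/* 0 stands for the oldest generation (min_seq), 1 for the youngest (max_seq) */
static int initial_generation(bool mapped_on_fault)
{
	/*
	 * Pages known to be faulted in go to the youngest generation so the
	 * aging sees them before eviction does; pages added through the
	 * common page-cache/swap-cache path go to the oldest generation.
	 */
	return mapped_on_fault ? 1 : 0;
}

int main(void)
{
	printf("page fault path: generation %d (youngest)\n", initial_generation(true));
	printf("lru_cache_add(): generation %d (oldest)\n", initial_generation(false));
	return 0;
}
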
Finally, we need to make sure deactivation works when the
multigenerational lru is enabled. We cannot use PageActive() because
it is not set on pages from active generations, in order to spare the
aging the trouble of clearing it when active generations become
inactive. So we deactivate pages unconditionally, since deactivation
is not a hot code path worth additional optimization.

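To make the last point concrete, here is a minimal sketch of the
deactivation check, mirroring the condition used in the mm/swap.c
hunks below (should_deactivate() is a made-up name, not a kernel
function):

#include <stdbool.h>
#include <stdio.h>

/*
 * With the classic lru, only active pages are worth deactivating. With
 * the multigenerational lru, PG_active is not maintained, so the check
 * degenerates to "deactivate unless unevictable".
 */
static bool should_deactivate(bool page_active, bool page_unevictable,
			      bool lru_gen_enabled)
{
	return !page_unevictable && (page_active || lru_gen_enabled);
}

int main(void)
{
	printf("classic lru, inactive page:  %d\n", should_deactivate(false, false, false));
	printf("multigen lru, inactive page: %d\n", should_deactivate(false, false, true));
	return 0;
}
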
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
 include/linux/mm_inline.h |  40 ++++++++++++++
 include/linux/swap.h      |   4 +-
 kernel/events/uprobes.c   |   2 +-
 mm/huge_memory.c          |   2 +-
 mm/khugepaged.c           |   2 +-
 mm/memory.c               |  10 ++--
 mm/migrate.c              |   2 +-
 mm/swap.c                 |  22 +++++---
 mm/swapfile.c             |   2 +-
 mm/userfaultfd.c          |   2 +-
 mm/vmscan.c               |  91 ++++++++++++++++++++++++++++++-
 mm/workingset.c           | 112 ++++++++++++++++++++++++++++++++++++++
 12 files changed, 269 insertions(+), 22 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ae3e3826dd7f..f3b99f65a652 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
 	return seq % MAX_NR_GENS;
 }
 
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+	return order_base_2(usage + 1);
+}
+
 /* Return a proper index regardless whether we keep a full history of stats. */
 static inline int hist_from_seq_or_gen(int seq_or_gen)
 {
@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
 	return true;
 }
 
+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
+static inline int page_tier_usage(struct page *page)
+{
+	unsigned long flags = READ_ONCE(page->flags);
+
+	return flags & BIT(PG_workingset) ?
+	       ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
+}
+
+/* Increment the usage counter after a page is accessed via file descriptors. */
+static inline void page_inc_usage(struct page *page)
+{
+	unsigned long usage;
+	unsigned long old_flags, new_flags;
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+
+		if (!(old_flags & BIT(PG_workingset))) {
+			new_flags = old_flags | BIT(PG_workingset);
+			continue;
+		}
+
+		usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
+
+		new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
+	} while (new_flags != old_flags &&
+		 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
 #else /* CONFIG_LRU_GEN */
 
 static inline bool lru_gen_enabled(void)
@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
 	return false;
 }
 
+static inline void page_inc_usage(struct page *page)
+{
+}
+
 #endif /* CONFIG_LRU_GEN */
 
 static __always_inline void add_page_to_lru_list(struct page *page,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..30b1f15f5c6e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
-						struct vm_area_struct *vma);
+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+				   bool faulting);
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6addc9780319..4e93e5602723 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	if (new_page) {
 		get_page(new_page);
 		page_add_new_anon_rmap(new_page, vma, addr, false);
-		lru_cache_add_inactive_or_unevictable(new_page, vma);
+		lru_cache_add_page_vma(new_page, vma, false);
 	} else
 		/* no new page, just dec_mm_counter for old_page */
 		dec_mm_counter(mm, MM_ANONPAGES);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8ac9093e5a0d..681da4a3cf61 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,7 +636,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		entry = mk_huge_pmd(page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr, true);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, true);
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c0185fdd815..09e5346c2754 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1198,7 +1198,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address, true);
-	lru_cache_add_inactive_or_unevictable(new_page, vma);
+	lru_cache_add_page_vma(new_page, vma, true);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
diff --git a/mm/memory.c b/mm/memory.c
index 730daa00952b..a76196885f92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -839,7 +839,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	copy_user_highpage(new_page, page, addr, src_vma);
 	__SetPageUptodate(new_page);
 	page_add_new_anon_rmap(new_page, dst_vma, addr, false);
-	lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+	lru_cache_add_page_vma(new_page, dst_vma, false);
 	rss[mm_counter(new_page)]++;
 
 	/* All done, just insert the new page copy in the child */
@@ -2950,7 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		 */
 		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
 		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
-		lru_cache_add_inactive_or_unevictable(new_page, vma);
+		lru_cache_add_page_vma(new_page, vma, true);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
@@ -3479,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	/* ksm created a completely new copy */
 	if (unlikely(page != swapcache && swapcache)) {
 		page_add_new_anon_rmap(page, vma, vmf->address, false);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, true);
 	} else {
 		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
 	}
@@ -3625,7 +3625,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, vmf->address, false);
-	lru_cache_add_inactive_or_unevictable(page, vma);
+	lru_cache_add_page_vma(page, vma, true);
 setpte:
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3793,7 +3793,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, addr, false);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, true);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, false);
diff --git a/mm/migrate.c b/mm/migrate.c
index b234c3f3acb7..d3307c9eced4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2967,7 +2967,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	inc_mm_counter(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, addr, false);
 	if (!is_zone_device_page(page))
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, false);
 	get_page(page);
 
 	if (flush) {
diff --git a/mm/swap.c b/mm/swap.c
index dfb48cf9c2c9..96ce95eeb2c9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -433,6 +433,8 @@ void mark_page_accessed(struct page *page)
 		 * this list is never rotated or maintained, so marking an
 		 * evictable page accessed has no effect.
 		 */
+	} else if (lru_gen_enabled()) {
+		page_inc_usage(page);
 	} else if (!PageActive(page)) {
 		/*
 		 * If the page is on the LRU, queue it for activation via
@@ -478,15 +480,14 @@ void lru_cache_add(struct page *page)
 EXPORT_SYMBOL(lru_cache_add);
 
 /**
- * lru_cache_add_inactive_or_unevictable
+ * lru_cache_add_page_vma
  * @page: the page to be added to LRU
  * @vma: vma in which page is mapped for determining reclaimability
  *
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * Place @page on an LRU list, depending on its evictability.
  */
-void lru_cache_add_inactive_or_unevictable(struct page *page,
-					 struct vm_area_struct *vma)
+void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+			    bool faulting)
 {
 	bool unevictable;
 
@@ -503,6 +504,11 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
 		__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
 		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
 	}
+
+	/* tell the multigenerational lru that the page is being faulted in */
+	if (lru_gen_enabled() && !unevictable && faulting)
+		SetPageActive(page);
+
 	lru_cache_add(page);
 }
 
@@ -529,7 +535,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
  */
 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 {
-	bool active = PageActive(page);
+	bool active = PageActive(page) || lru_gen_enabled();
 	int nr_pages = thp_nr_pages(page);
 
 	if (PageUnevictable(page))
@@ -569,7 +575,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 
 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 {
-	if (PageActive(page) && !PageUnevictable(page)) {
+	if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
 		int nr_pages = thp_nr_pages(page);
 
 		del_page_from_lru_list(page, lruvec);
@@ -684,7 +690,7 @@ void deactivate_file_page(struct page *page)
  */
 void deactivate_page(struct page *page)
 {
-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+	if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
 		struct pagevec *pvec;
 
 		local_lock(&lru_pvecs.lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3598b668f533..549e94318b2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		page_add_anon_rmap(page, vma, addr, false);
 	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, addr, false);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, false);
 	}
 	swap_free(entry);
 out:
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e14b3820c6a8..175d55b4f594 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 	inc_mm_counter(dst_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
-	lru_cache_add_inactive_or_unevictable(page, dst_vma);
+	lru_cache_add_page_vma(page, dst_vma, true);
 
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7bbfc0b1ebd..84d25079092e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
-		mem_cgroup_swapout(page, swap);
+
+		/* get a shadow entry before page_memcg() is cleared */
 		if (reclaimed && !mapping_exiting(mapping))
 			shadow = workingset_eviction(page, target_memcg);
+		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page, swap, shadow);
 		xa_unlock_irqrestore(&mapping->i_pages, flags);
 		put_swap_page(page, swap);
@@ -2780,6 +2782,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
 	       get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
 }
 
+/******************************************************************************
+ *                          refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop modeled after the PID controller. Currently supports the
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
+ * added if necessary. The setpoint (SP) is the desired position; the process
+ * variable (PV) is the measured position. The error is the difference between
+ * the SP and the PV. A positive error results in a positive control output
+ * correction, which, in our case, is to allow eviction.
+ *
+ * The P term is the current refault rate refaulted/(evicted+activated), which
+ * has a weight of 1. The I term is the arithmetic mean of the last N refault
+ * rates, weighted by geometric series 1/2, 1/4, ..., 1/(1<<N).
+ *
+ * Our goal is to make sure upper tiers have similar refault rates as the base
+ * tier. That is we try to be fair to all tiers by maintaining similar refault
+ * rates across them.
+ */
+struct controller_pos {
+	unsigned long refaulted;
+	unsigned long total;
+	int gain;
+};
+
+static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec,
+				int type, int tier, int gain)
+{
+	struct lrugen *lrugen = &lruvec->evictable;
+	int hist = hist_from_seq_or_gen(lrugen->min_seq[type]);
+
+	pos->refaulted = lrugen->avg_refaulted[type][tier] +
+			 atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+	pos->total = lrugen->avg_total[type][tier] +
+		     atomic_long_read(&lrugen->evicted[hist][type][tier]);
+	if (tier)
+		pos->total += lrugen->activated[hist][type][tier - 1];
+	pos->gain = gain;
+}
+
+static void reset_controller_pos(struct lruvec *lruvec, int gen, int type)
+{
+	int tier;
+	int hist = hist_from_seq_or_gen(gen);
+	struct lrugen *lrugen = &lruvec->evictable;
+	bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
+
+	if (!carryover && NR_STAT_GENS == 1)
+		return;
+
+	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+		if (carryover) {
+			unsigned long sum;
+
+			sum = lrugen->avg_refaulted[type][tier] +
+			      atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+			WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+			sum = lrugen->avg_total[type][tier] +
+			      atomic_long_read(&lrugen->evicted[hist][type][tier]);
+			if (tier)
+				sum += lrugen->activated[hist][type][tier - 1];
+			WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+
+			if (NR_STAT_GENS > 1)
+				continue;
+		}
+
+		atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+		atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+		if (tier)
+			WRITE_ONCE(lrugen->activated[hist][type][tier - 1], 0);
+	}
+}
+
+static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv)
+{
+	/*
+	 * Allow eviction if the PV has a limited number of refaulted pages or a
+	 * lower refault rate than the SP.
+	 */
+	return pv->refaulted < SWAP_CLUSTER_MAX ||
+	       pv->refaulted * max(sp->total, 1UL) * sp->gain <=
+	       sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
diff --git a/mm/workingset.c b/mm/workingset.c
index edb8aed2587e..3f3f03d51ea7 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da
 	return val >> MEM_CGROUP_ID_SHIFT;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
+#endif
+
+static void page_set_usage(struct page *page, int usage)
+{
+	unsigned long old_flags, new_flags;
+
+	VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
+	if (!usage)
+		return;
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+		new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
+			    ((usage - 1UL) << LRU_USAGE_PGOFF);
+	} while (new_flags != old_flags &&
+		 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
+/* Return a token to be stored in the shadow entry of a page being evicted. */
+static void *lru_gen_eviction(struct page *page)
+{
+	int hist, tier;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lrugen *lrugen;
+	int type = page_is_file_lru(page);
+	int usage = page_tier_usage(page);
+	struct mem_cgroup *memcg = page_memcg(page);
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->evictable;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	token = (min_seq << LRU_USAGE_SHIFT) | usage;
+
+	hist = hist_from_seq_or_gen(min_seq);
+	tier = lru_tier_from_usage(usage);
+	atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
+	return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
+}
+
+/* Account a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+	int hist, tier, usage;
+	int memcg_id;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lrugen *lrugen;
+	struct pglist_data *pgdat;
+	struct mem_cgroup *memcg;
+	int type = page_is_file_lru(page);
+
+	token = unpack_shadow(shadow, &memcg_id, &pgdat);
+	if (page_pgdat(page) != pgdat)
+		return;
+
+	rcu_read_lock();
+	memcg = page_memcg_rcu(page);
+	if (mem_cgroup_id(memcg) != memcg_id)
+		goto unlock;
+
+	usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
+	token >>= LRU_USAGE_SHIFT;
+
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->evictable;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
+		goto unlock;
+
+	page_set_usage(page, usage);
+
+	hist = hist_from_seq_or_gen(min_seq);
+	tier = lru_tier_from_usage(usage);
+	atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
+	inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
+	if (tier)
+		inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
+unlock:
+	rcu_read_unlock();
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct page *page)
+{
+	return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 /**
  * workingset_age_nonresident - age non-resident entries as LRU ages
  * @lruvec: the lruvec that was aged
@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
+	if (lru_gen_enabled())
+		return lru_gen_eviction(page);
+
 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
 	/* XXX: target_memcg can be NULL, go through lruvec */
 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow)
 	bool workingset;
 	int memcgid;
 
+	if (lru_gen_enabled()) {
+		lru_gen_refault(page, shadow);
+		return;
+	}
+
 	eviction = unpack_shadow(shadow, &memcgid, &pgdat);
 
 	rcu_read_lock();
--
2.31.1.751.gd2f1c929bd-goog