From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Thu, 20 May 2021 00:53:49 -0600
In-Reply-To: <20210520065355.2736558-1-yuzhao@google.com>
Message-Id: <20210520065355.2736558-9-yuzhao@google.com>
Mime-Version: 1.0
References: <20210520065355.2736558-1-yuzhao@google.com>
X-Mailer: git-send-email 2.31.1.751.gd2f1c929bd-goog
Subject: [PATCH v3 08/14] mm: multigenerational lru: activation
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
        Andrew Morton <akpm@linux-foundation.org>,
        Dave Chinner <david@fromorbit.com>,
        Dave Hansen <dave.hansen@linux.intel.com>,
        Donald Carr <sirspudd@gmail.com>,
        Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
        Johannes Weiner <hannes@cmpxchg.org>,
        Jonathan Corbet <corbet@lwn.net>,
        Joonsoo Kim <iamjoonsoo.kim@lge.com>,
        Konstantin Kharlamov <hi-angel@yandex.ru>,
        Marcus Seyfarth <m.seyfarth@gmail.com>,
        Matthew Wilcox <willy@infradead.org>,
        Mel Gorman <mgorman@suse.de>,
        Miaohe Lin <linmiaohe@huawei.com>,
        Michael Larabel <michael@michaellarabel.com>,
        Michal Hocko <mhocko@suse.com>,
        Michel Lespinasse <michel@lespinasse.org>,
        Rik van Riel <riel@surriel.com>,
        Roman Gushchin <guro@fb.com>,
        Tim Chen <tim.c.chen@linux.intel.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Yang Shi <shy828301@gmail.com>,
        Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
        linux-kernel@vger.kernel.org, lkp@lists.01.org,
        page-reclaim@google.com, Yu Zhao <yuzhao@google.com>,
        Konstantin Kharlamov <Hi-Angel@yandex.ru>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
List-Archive: <https://lore.kernel.org/lkml/>

For pages accessed multiple times via file descriptors, instead of
activating them upon the second access, we activate them based on the
refault rates of their tiers. Each generation contains at most
MAX_NR_TIERS tiers, and they require an additional MAX_NR_TIERS-2 bits
in page->flags. Pages accessed N times via file descriptors belong to
tier order_base_2(N). Tier 0 is the base tier; it contains pages read
ahead, pages accessed once via file descriptors, and pages accessed
only via page tables. Pages from the base tier are evicted regardless
of the refault rate. Pages from upper tiers that have higher refault
rates than the base tier will be moved to the next generation. A
feedback loop modeled after the PID controller monitors refault rates
across all tiers and decides when, and from which upper tiers, to
activate pages in the reclaim path (a toy sketch of the tier mapping
and the refault-rate comparison follows the list below). The
advantages of this model are:
1) It has a negligible cost in the buffered IO access path because
   activations are done optionally in the reclaim path.
2) It takes mapped pages into account and avoids overprotecting pages
   accessed multiple times via file descriptors.
3) More tiers offer better protection to pages accessed more than
   twice when workloads doing intensive buffered IO are under memory
   pressure.

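As an illustration only (a userspace sketch, not kernel code and not
part of this patch), the mapping from access counts to tiers and the
division-free refault-rate comparison can be modeled as below. The
helper names tier_of() and can_evict() are made up, order_base_2() is
reimplemented locally, the per-tier gain factor is omitted, and the
numbers in main() are arbitrary:

#include <stdio.h>

/* ceil(log2(n)) for n >= 1, standing in for the kernel's order_base_2() */
static int order_base_2(unsigned long n)
{
	int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

/* pages accessed N times via file descriptors belong to tier order_base_2(N) */
static int tier_of(unsigned long accesses)
{
	return accesses ? order_base_2(accesses) : 0;
}

/*
 * Allow eviction from an upper tier when its refault rate is no higher
 * than the base tier's, i.e. refaulted_tier/total_tier <=
 * refaulted_base/total_base, rewritten as a cross-multiplication to
 * avoid division.
 */
static int can_evict(unsigned long refaulted_base, unsigned long total_base,
		     unsigned long refaulted_tier, unsigned long total_tier)
{
	return refaulted_tier * (total_base ? total_base : 1) <=
	       refaulted_base * (total_tier ? total_tier : 1);
}

int main(void)
{
	unsigned long n;

	for (n = 1; n <= 8; n++)
		printf("accessed %lu time(s) -> tier %d\n", n, tier_of(n));

	/* arbitrary numbers: tier 2 refaults 10/100, base tier refaults 30/200 */
	printf("evict from tier 2? %s\n",
	       can_evict(30, 200, 10, 100) ? "yes" : "no");
	return 0;
}

Running it shows one access mapping to tier 0, two to tier 1, three or
four to tier 2, and so on, and that tier 2 (10% refault rate) may be
evicted because the base tier refaults more often (15%).
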
For pages mapped upon page faults, the accessed bit is set during the
initial faults. Ideally we add them to the per-zone lists indexed by
max_seq, i.e., the youngest generation, so that eviction will not
consider them before the aging has scanned them. For anon pages not in
swap cache, this can be done easily in the page fault path: we rename
lru_cache_add_inactive_or_unevictable() to lru_cache_add_page_vma()
and add a new parameter, which is set to true for pages mapped upon
page faults. For pages in page cache or swap cache, we cannot
differentiate the page fault path from the read ahead path at the time
we call lru_cache_add(). So we add them to the per-zone lists indexed
by min_seq, i.e., the oldest generation, for now.

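Purely as a sketch of the placement policy above (again userspace code
rather than kernel code; the helper name initial_generation() is made
up), the decision reduces to:

#include <stdbool.h>
#include <stdio.h>

/* 0 stands for the oldest generation (min_seq), 1 for the youngest (max_seq) */
static int initial_generation(bool mapped_on_fault)
{
	/*
	 * Pages known to be faulted in go to the youngest generation so the
	 * aging sees them before eviction does; pages added through the
	 * common page-cache/swap-cache path go to the oldest generation.
	 */
	return mapped_on_fault ? 1 : 0;
}

int main(void)
{
	printf("page fault path: generation %d (youngest)\n", initial_generation(true));
	printf("lru_cache_add(): generation %d (oldest)\n", initial_generation(false));
	return 0;
}
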
Finally, we need to make sure deactivation works when the
multigenerational lru is enabled. We cannot use PageActive() because
it is not set on pages from active generations, in order to spare the
aging the trouble of clearing it when active generations become
inactive. So we deactivate pages unconditionally, since deactivation
is not a hot code path worth additional optimization.

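To make the last point concrete, here is a minimal sketch of the
deactivation check, mirroring the condition used in the mm/swap.c
hunks below (should_deactivate() is a made-up name, not a kernel
function):

#include <stdbool.h>
#include <stdio.h>

/*
 * With the classic lru, only active pages are worth deactivating. With
 * the multigenerational lru, PG_active is not maintained, so the check
 * degenerates to "deactivate unless unevictable".
 */
static bool should_deactivate(bool page_active, bool page_unevictable,
			      bool lru_gen_enabled)
{
	return !page_unevictable && (page_active || lru_gen_enabled);
}

int main(void)
{
	printf("classic lru, inactive page:  %d\n", should_deactivate(false, false, false));
	printf("multigen lru, inactive page: %d\n", should_deactivate(false, false, true));
	return 0;
}
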
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
---
 include/linux/mm_inline.h |  40 ++++++++++++++
 include/linux/swap.h      |   4 +-
 kernel/events/uprobes.c   |   2 +-
 mm/huge_memory.c          |   2 +-
 mm/khugepaged.c           |   2 +-
 mm/memory.c               |  10 ++--
 mm/migrate.c              |   2 +-
 mm/swap.c                 |  22 +++++---
 mm/swapfile.c             |   2 +-
 mm/userfaultfd.c          |   2 +-
 mm/vmscan.c               |  91 ++++++++++++++++++++++++++++++-
 mm/workingset.c           | 112 ++++++++++++++++++++++++++++++++++++++
 12 files changed, 269 insertions(+), 22 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ae3e3826dd7f..f3b99f65a652 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -103,6 +103,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
 	return seq % MAX_NR_GENS;
 }
 
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+	return order_base_2(usage + 1);
+}
+
 /* Return a proper index regardless whether we keep a full history of stats. */
 static inline int hist_from_seq_or_gen(int seq_or_gen)
 {
@@ -245,6 +251,36 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
 	return true;
 }
 
+/* Return the level of usage of a page. See the comment on MAX_NR_TIERS. */
+static inline int page_tier_usage(struct page *page)
+{
+	unsigned long flags = READ_ONCE(page->flags);
+
+	return flags & BIT(PG_workingset) ?
+	       ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0;
+}
+
+/* Increment the usage counter after a page is accessed via file descriptors. */
+static inline void page_inc_usage(struct page *page)
+{
+	unsigned long usage;
+	unsigned long old_flags, new_flags;
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+
+		if (!(old_flags & BIT(PG_workingset))) {
+			new_flags = old_flags | BIT(PG_workingset);
+			continue;
+		}
+
+		usage = (old_flags & LRU_USAGE_MASK) + BIT(LRU_USAGE_PGOFF);
+
+		new_flags = (old_flags & ~LRU_USAGE_MASK) | min(usage, LRU_USAGE_MASK);
+	} while (new_flags != old_flags &&
+		 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
 #else /* CONFIG_LRU_GEN */
 
 static inline bool lru_gen_enabled(void)
@@ -262,6 +298,10 @@ static inline bool lru_gen_deletion(struct page *page, struct lruvec *lruvec)
 	return false;
 }
 
+static inline void page_inc_usage(struct page *page)
+{
+}
+
 #endif /* CONFIG_LRU_GEN */
 
 static __always_inline void add_page_to_lru_list(struct page *page,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..30b1f15f5c6e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,8 +365,8 @@ extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
-						struct vm_area_struct *vma);
+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+				   bool faulting);
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6addc9780319..4e93e5602723 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	if (new_page) {
 		get_page(new_page);
 		page_add_new_anon_rmap(new_page, vma, addr, false);
-		lru_cache_add_inactive_or_unevictable(new_page, vma);
+		lru_cache_add_page_vma(new_page, vma, false);
 	} else
 		/* no new page, just dec_mm_counter for old_page */
 		dec_mm_counter(mm, MM_ANONPAGES);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8ac9093e5a0d..681da4a3cf61 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,7 +636,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		entry = mk_huge_pmd(page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr, true);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, true);
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c0185fdd815..09e5346c2754 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1198,7 +1198,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address, true);
-	lru_cache_add_inactive_or_unevictable(new_page, vma);
+	lru_cache_add_page_vma(new_page, vma, true);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
diff --git a/mm/memory.c b/mm/memory.c
index 730daa00952b..a76196885f92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -839,7 +839,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	copy_user_highpage(new_page, page, addr, src_vma);
 	__SetPageUptodate(new_page);
 	page_add_new_anon_rmap(new_page, dst_vma, addr, false);
-	lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+	lru_cache_add_page_vma(new_page, dst_vma, false);
 	rss[mm_counter(new_page)]++;
 
 	/* All done, just insert the new page copy in the child */
@@ -2950,7 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		 */
 		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
 		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
-		lru_cache_add_inactive_or_unevictable(new_page, vma);
+		lru_cache_add_page_vma(new_page, vma, true);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
@@ -3479,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	/* ksm created a completely new copy */
 	if (unlikely(page != swapcache && swapcache)) {
 		page_add_new_anon_rmap(page, vma, vmf->address, false);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, true);
 	} else {
 		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
 	}
@@ -3625,7 +3625,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, vmf->address, false);
-	lru_cache_add_inactive_or_unevictable(page, vma);
+	lru_cache_add_page_vma(page, vma, true);
 setpte:
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3793,7 +3793,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, addr, false);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, true);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, false);
diff --git a/mm/migrate.c b/mm/migrate.c
index b234c3f3acb7..d3307c9eced4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2967,7 +2967,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	inc_mm_counter(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, addr, false);
 	if (!is_zone_device_page(page))
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, false);
 	get_page(page);
 
 	if (flush) {
diff --git a/mm/swap.c b/mm/swap.c
index dfb48cf9c2c9..96ce95eeb2c9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -433,6 +433,8 @@ void mark_page_accessed(struct page *page)
 		 * this list is never rotated or maintained, so marking an
 		 * evictable page accessed has no effect.
 		 */
+	} else if (lru_gen_enabled()) {
+		page_inc_usage(page);
 	} else if (!PageActive(page)) {
 		/*
 		 * If the page is on the LRU, queue it for activation via
@@ -478,15 +480,14 @@ void lru_cache_add(struct page *page)
 EXPORT_SYMBOL(lru_cache_add);
 
 /**
- * lru_cache_add_inactive_or_unevictable
+ * lru_cache_add_page_vma
  * @page: the page to be added to LRU
  * @vma: vma in which page is mapped for determining reclaimability
  *
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * Place @page on an LRU list, depending on its evictability.
  */
-void lru_cache_add_inactive_or_unevictable(struct page *page,
-					 struct vm_area_struct *vma)
+void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+			    bool faulting)
 {
 	bool unevictable;
 
@@ -503,6 +504,11 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
 		__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
 		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
 	}
+
+	/* tell the multigenerational lru that the page is being faulted in */
+	if (lru_gen_enabled() && !unevictable && faulting)
+		SetPageActive(page);
+
 	lru_cache_add(page);
 }
 
@@ -529,7 +535,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
  */
 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 {
-	bool active = PageActive(page);
+	bool active = PageActive(page) || lru_gen_enabled();
 	int nr_pages = thp_nr_pages(page);
 
 	if (PageUnevictable(page))
@@ -569,7 +575,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 
 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 {
-	if (PageActive(page) && !PageUnevictable(page)) {
+	if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
 		int nr_pages = thp_nr_pages(page);
 
 		del_page_from_lru_list(page, lruvec);
@@ -684,7 +690,7 @@ void deactivate_file_page(struct page *page)
  */
 void deactivate_page(struct page *page)
 {
-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+	if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
 		struct pagevec *pvec;
 
 		local_lock(&lru_pvecs.lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3598b668f533..549e94318b2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 		page_add_anon_rmap(page, vma, addr, false);
 	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, addr, false);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		lru_cache_add_page_vma(page, vma, false);
 	}
 	swap_free(entry);
 out:
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e14b3820c6a8..175d55b4f594 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 	inc_mm_counter(dst_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
-	lru_cache_add_inactive_or_unevictable(page, dst_vma);
+	lru_cache_add_page_vma(page, dst_vma, true);
 
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7bbfc0b1ebd..84d25079092e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
-		mem_cgroup_swapout(page, swap);
+
+		/* get a shadow entry before page_memcg() is cleared */
 		if (reclaimed && !mapping_exiting(mapping))
 			shadow = workingset_eviction(page, target_memcg);
+		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page, swap, shadow);
 		xa_unlock_irqrestore(&mapping->i_pages, flags);
 		put_swap_page(page, swap);
@@ -2780,6 +2782,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
 	       get_nr_gens(lruvec, 1) <= MAX_NR_GENS;
 }
 
+/******************************************************************************
+ *                          refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop modeled after the PID controller. Currently supports the
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
+ * added if necessary. The setpoint (SP) is the desired position; the process
+ * variable (PV) is the measured position. The error is the difference between
+ * the SP and the PV. A positive error results in a positive control output
+ * correction, which, in our case, is to allow eviction.
+ *
+ * The P term is the current refault rate refaulted/(evicted+activated), which
+ * has a weight of 1. The I term is the arithmetic mean of the last N refault
+ * rates, weighted by geometric series 1/2, 1/4, ..., 1/(1<<N).
+ *
+ * Our goal is to make sure upper tiers have similar refault rates as the base
+ * tier. That is we try to be fair to all tiers by maintaining similar refault
+ * rates across them.
+ */
+struct controller_pos {
+	unsigned long refaulted;
+	unsigned long total;
+	int gain;
+};
+
+static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec,
+				int type, int tier, int gain)
+{
+	struct lrugen *lrugen = &lruvec->evictable;
+	int hist = hist_from_seq_or_gen(lrugen->min_seq[type]);
+
+	pos->refaulted = lrugen->avg_refaulted[type][tier] +
+			 atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+	pos->total = lrugen->avg_total[type][tier] +
+		     atomic_long_read(&lrugen->evicted[hist][type][tier]);
+	if (tier)
+		pos->total += lrugen->activated[hist][type][tier - 1];
+	pos->gain = gain;
+}
+
+static void reset_controller_pos(struct lruvec *lruvec, int gen, int type)
+{
+	int tier;
+	int hist = hist_from_seq_or_gen(gen);
+	struct lrugen *lrugen = &lruvec->evictable;
+	bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
+
+	if (!carryover && NR_STAT_GENS == 1)
+		return;
+
+	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+		if (carryover) {
+			unsigned long sum;
+
+			sum = lrugen->avg_refaulted[type][tier] +
+			      atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+			WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+			sum = lrugen->avg_total[type][tier] +
+			      atomic_long_read(&lrugen->evicted[hist][type][tier]);
+			if (tier)
+				sum += lrugen->activated[hist][type][tier - 1];
+			WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+
+			if (NR_STAT_GENS > 1)
+				continue;
+		}
+
+		atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+		atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+		if (tier)
+			WRITE_ONCE(lrugen->activated[hist][type][tier - 1], 0);
+	}
+}
+
+static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv)
+{
+	/*
+	 * Allow eviction if the PV has a limited number of refaulted pages or a
+	 * lower refault rate than the SP.
+	 */
+	return pv->refaulted < SWAP_CLUSTER_MAX ||
+	       pv->refaulted * max(sp->total, 1UL) * sp->gain <=
+	       sp->refaulted * max(pv->total, 1UL) * pv->gain;
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
diff --git a/mm/workingset.c b/mm/workingset.c
index edb8aed2587e..3f3f03d51ea7 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -201,6 +201,110 @@ static unsigned long unpack_shadow(void *shadow, int *memcg_id, struct pglist_da
 	return val >> MEM_CGROUP_ID_SHIFT;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+#if LRU_GEN_SHIFT + LRU_USAGE_SHIFT >= EVICTION_SHIFT
+#error "Please try smaller NODES_SHIFT, NR_LRU_GENS and TIERS_PER_GEN configurations"
+#endif
+
+static void page_set_usage(struct page *page, int usage)
+{
+	unsigned long old_flags, new_flags;
+
+	VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH));
+
+	if (!usage)
+		return;
+
+	do {
+		old_flags = READ_ONCE(page->flags);
+		new_flags = (old_flags & ~LRU_USAGE_MASK) | LRU_TIER_FLAGS |
+			    ((usage - 1UL) << LRU_USAGE_PGOFF);
+	} while (new_flags != old_flags &&
+		 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+}
+
+/* Return a token to be stored in the shadow entry of a page being evicted. */
+static void *lru_gen_eviction(struct page *page)
+{
+	int hist, tier;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lrugen *lrugen;
+	int type = page_is_file_lru(page);
+	int usage = page_tier_usage(page);
+	struct mem_cgroup *memcg = page_memcg(page);
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->evictable;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	token = (min_seq << LRU_USAGE_SHIFT) | usage;
+
+	hist = hist_from_seq_or_gen(min_seq);
+	tier = lru_tier_from_usage(usage);
+	atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]);
+
+	return pack_shadow(mem_cgroup_id(memcg), pgdat, token);
+}
+
+/* Account a refaulted page based on the token stored in its shadow entry. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+	int hist, tier, usage;
+	int memcg_id;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lrugen *lrugen;
+	struct pglist_data *pgdat;
+	struct mem_cgroup *memcg;
+	int type = page_is_file_lru(page);
+
+	token = unpack_shadow(shadow, &memcg_id, &pgdat);
+	if (page_pgdat(page) != pgdat)
+		return;
+
+	rcu_read_lock();
+	memcg = page_memcg_rcu(page);
+	if (mem_cgroup_id(memcg) != memcg_id)
+		goto unlock;
+
+	usage = token & (BIT(LRU_USAGE_SHIFT) - 1);
+	token >>= LRU_USAGE_SHIFT;
+
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->evictable;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_SHIFT)))
+		goto unlock;
+
+	page_set_usage(page, usage);
+
+	hist = hist_from_seq_or_gen(min_seq);
+	tier = lru_tier_from_usage(usage);
+	atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]);
+	inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type);
+	if (tier)
+		inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type);
+unlock:
+	rcu_read_unlock();
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct page *page)
+{
+	return NULL;
+}
+
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 /**
  * workingset_age_nonresident - age non-resident entries as LRU ages
  * @lruvec: the lruvec that was aged
@@ -249,6 +353,9 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
+	if (lru_gen_enabled())
+		return lru_gen_eviction(page);
+
 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
 	/* XXX: target_memcg can be NULL, go through lruvec */
 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
@@ -283,6 +390,11 @@ void workingset_refault(struct page *page, void *shadow)
 	bool workingset;
 	int memcgid;
 
+	if (lru_gen_enabled()) {
+		lru_gen_refault(page, shadow);
+		return;
+	}
+
 	eviction = unpack_shadow(shadow, &memcgid, &pgdat);
 
 	rcu_read_lock();
--
2.31.1.751.gd2f1c929bd-goog