diff --git a/sys-kernel/pinephone-sources/Manifest b/sys-kernel/pinephone-sources/Manifest index 12c93e3..de4a879 100644 --- a/sys-kernel/pinephone-sources/Manifest +++ b/sys-kernel/pinephone-sources/Manifest @@ -1,6 +1,6 @@ DIST all-5.13.5.patch 15071574 BLAKE2B f0b44888b216a60bb12a920a170ffb8ee705e357b82b0cacd58551e2d0e257c0f4419c34976263dc062335bb37f4b3a7418f3d9674e601fd8adda88bacad97d6 SHA512 046f42a5c8fe6477cdda82f47a07093ea51cf26b231b1c58230885954b7ecab9faa9eb72ac3c0cb1603dd6ca2b5b0d76421de6d2c3c05a0bee3ca6e080bfa084 -DIST all-5.14.1.patch 15063858 BLAKE2B dead0428f722a0af4488162b071850f3261198022f966183f0ae5aa5cf802388ea7fd7d1a75046c3ef9f3f67d4760a0fd89cd5a33e9a4546b48a974ca60e3a1b SHA512 f084775d5506450abf89e6e86284a6e23b973c65d5992f9702cd74e60b2551091db2eb8fb63a50dec1393bf5ff38afa27c552dd0cfc70e55815dce0b050dd7e8 +DIST all-5.14.2.patch 15063858 BLAKE2B dead0428f722a0af4488162b071850f3261198022f966183f0ae5aa5cf802388ea7fd7d1a75046c3ef9f3f67d4760a0fd89cd5a33e9a4546b48a974ca60e3a1b SHA512 f084775d5506450abf89e6e86284a6e23b973c65d5992f9702cd74e60b2551091db2eb8fb63a50dec1393bf5ff38afa27c552dd0cfc70e55815dce0b050dd7e8 DIST linux-5.13.tar.xz 119297284 BLAKE2B 9c4c12e2394dec064adff51f7ccdf389192eb27ba7906db5eda543afe3d04afca6b9ea0848a057571bf2534eeb98e1e3a67734deff82c0d3731be205ad995668 SHA512 a8edf97e9d38a49f1be2bde1e29ad96274bb2c6f7e8a2bebaa1161dd4df9cabcbaec4ff644c45bee94f86ae47725087d6deed0cd954209cec717621d137db85e DIST linux-5.14.tar.xz 120669872 BLAKE2B 0047f5aaa3940dff97f4055ef544faafbbb5282128e6afe21d2f47d8dc8c395806a17016febfa050117d16f59e74b882cb8b9c5011d68f119c230d0a4d120524 SHA512 8e4f3ec3d36f774280f75dc7b004a43e09417af58f12e9c9f8348976659d4cfda7ad905f306f43fed66a27922e5c45db22e46bbfa7a0b9f365012380de3b6f64 DIST patch-5.13.5.xz 473120 BLAKE2B a0dd9f3f972a16de87f0d2d8daa7f5d35b27314d22597a28f471cdbe6cedfa7d4bf69e41504d6a9b9d4c1f085146604394747771185dd0a09276cfd92820b4a8 SHA512 1e4eb575775ccbc2e88b34b902a75562e49d6dfb4699dadd5b41fff9db8c2bc994d946d1e60f6320f48ef233aa721d3725582d4ec57458f2293da9a85806c7b1 -DIST patch-5.14.1.xz 4708 BLAKE2B 334d56ba26abc0d29432c27a31fefff332d9b134899b6f0d5fdeadaf68c3bf95eaeba37a9500eb3c245ae2eea70786c55056586ce747277e9f2be16697ceab73 SHA512 409108139ba2842d100d69e47d88eee87bea58c210a84e242c82aae48fe4bc57ccd0a507c4a0ecb791d0a2962f46cda3ba900f728a9b6856e986bff018a2d19c +DIST patch-5.14.2.xz 8376 BLAKE2B 0fd1b718801661f5fe70866b0301bdcdaa4c2c0ca75aefcc20d4edb7439326cbc2aa1a0f78dd105bc7939f52107aca032f0e6e43d39c077e4f784220d0a8f752 SHA512 b8f181cf5d19e5206e4e0524a1bef5639346b71d002523bfd65e3a2fc2b914fa979c381c8a613c062503160510f31c003f712773cb6945497aa3eba71595c697 diff --git a/sys-kernel/pinephone-sources/files/2423aac2d6f5db55da99e11fd799ee66fe6f54c6.patch b/sys-kernel/pinephone-sources/files/2423aac2d6f5db55da99e11fd799ee66fe6f54c6.patch new file mode 100644 index 0000000..753600f --- /dev/null +++ b/sys-kernel/pinephone-sources/files/2423aac2d6f5db55da99e11fd799ee66fe6f54c6.patch @@ -0,0 +1,172 @@ +From 2423aac2d6f5db55da99e11fd799ee66fe6f54c6 Mon Sep 17 00:00:00 2001 +From: Samuel Holland +Date: Mon, 9 Aug 2021 19:30:18 -0500 +Subject: [PATCH] Input: kb151 - Add support for the FN layer + +Signed-off-by: Samuel Holland +--- + .../dts/allwinner/sun50i-a64-pinephone.dtsi | 34 +++++++++++++++++-- + drivers/input/keyboard/kb151.c | 33 ++++++++++-------- + 2 files changed, 51 insertions(+), 16 deletions(-) + +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +index 
0bdc6eceec6099..68f5730cf164c7 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +@@ -557,7 +557,7 @@ + reg = <0x15>; + interrupt-parent = <&r_pio>; + interrupts = <0 12 IRQ_TYPE_EDGE_FALLING>; /* PL12 */ +- keypad,num-rows = <6>; ++ keypad,num-rows = <12>; + keypad,num-columns = <12>; + linux,keymap = ; ++ MATRIX_KEY(5, 5, KEY_RIGHTALT) ++ ++ /* FN layer */ ++ MATRIX_KEY(6, 1, KEY_BACKSLASH) ++ MATRIX_KEY(6, 2, KEY_BACKSLASH) ++ MATRIX_KEY(6, 3, KEY_DOLLAR) ++ MATRIX_KEY(6, 4, KEY_EURO) ++ MATRIX_KEY(6, 5, KEY_GRAVE) ++ MATRIX_KEY(6, 6, KEY_GRAVE) ++ MATRIX_KEY(6, 7, KEY_MINUS) ++ MATRIX_KEY(6, 8, KEY_EQUAL) ++ MATRIX_KEY(6, 9, KEY_MINUS) ++ MATRIX_KEY(6, 10, KEY_EQUAL) ++ MATRIX_KEY(6, 11, KEY_DELETE) ++ ++ MATRIX_KEY(8, 0, KEY_SYSRQ) ++ MATRIX_KEY(8, 10, KEY_INSERT) ++ ++ MATRIX_KEY(9, 0, KEY_LEFTSHIFT) ++ MATRIX_KEY(9, 8, KEY_HOME) ++ MATRIX_KEY(9, 9, KEY_UP) ++ MATRIX_KEY(9, 10, KEY_END) ++ ++ MATRIX_KEY(10, 1, KEY_LEFTCTRL) ++ MATRIX_KEY(10, 6, KEY_LEFT) ++ MATRIX_KEY(10, 8, KEY_RIGHT) ++ MATRIX_KEY(10, 9, KEY_DOWN) ++ ++ MATRIX_KEY(11, 2, KEY_FN) ++ MATRIX_KEY(11, 3, KEY_LEFTALT) ++ MATRIX_KEY(11, 5, KEY_RIGHTALT)>; + wakeup-source; + }; + }; +diff --git a/drivers/input/keyboard/kb151.c b/drivers/input/keyboard/kb151.c +index 595275d4f9d96f..bb6250efe93419 100644 +--- a/drivers/input/keyboard/kb151.c ++++ b/drivers/input/keyboard/kb151.c +@@ -29,6 +29,7 @@ struct kb151 { + u8 row_shift; + u8 rows; + u8 cols; ++ u8 fn_state; + u8 buf_swap; + u8 buf[]; + }; +@@ -55,7 +56,7 @@ static void kb151_update(struct i2c_client *client) + return; + } + +- dev_info(dev, "%02x | %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x", ++ dev_dbg(dev, "%02x | %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x", + new_buf[0], new_buf[1], new_buf[2], new_buf[3], new_buf[4], new_buf[5], + new_buf[6], new_buf[7], new_buf[8], new_buf[9], new_buf[10], new_buf[11], + new_buf[12]); +@@ -65,8 +66,6 @@ static void kb151_update(struct i2c_client *client) + crc, new_buf[0]); + return; + } +- dev_info(dev, "Good scan data (%02x == %02x)\n", +- crc, new_buf[0]); + + for (col = 0; col < kb151->cols; ++col) { + u8 old = *(++old_buf); +@@ -74,14 +73,20 @@ static void kb151_update(struct i2c_client *client) + u8 changed = old ^ new; + + for (row = 0; row < kb151->rows; ++row) { +- int code = MATRIX_SCAN_CODE(row, col, kb151->row_shift); + u8 pressed = new & BIT(row); ++ u8 map_row = row + (kb151->fn_state ? kb151->rows : 0); ++ int code = MATRIX_SCAN_CODE(map_row, col, kb151->row_shift); + + if (!(changed & BIT(row))) + continue; + + dev_dbg(&client->dev, "row %u col %u %sed\n", +- row, col, pressed ? "press" : "releas"); ++ map_row, col, pressed ? "press" : "releas"); ++ if (keymap[code] == KEY_FN) { ++ dev_dbg(&client->dev, "FN is now %s\n", ++ pressed ? 
"pressed" : "released"); ++ kb151->fn_state = pressed; ++ } else + input_report_key(kb151->input, keymap[code], pressed); + } + } +@@ -151,7 +156,7 @@ static int kb151_probe(struct i2c_client *client) + struct device *dev = &client->dev; + u8 info[KB151_MATRIX_SIZE + 1]; + unsigned int kb_rows, kb_cols; +- unsigned int rows, cols; ++ unsigned int map_rows, map_cols; + struct kb151 *kb151; + int ret; + +@@ -168,20 +173,20 @@ static int kb151_probe(struct i2c_client *client) + info[KB151_FW_REVISION] & 0xf, + info[KB151_FW_FEATURES]); + +- ret = matrix_keypad_parse_properties(dev, &rows, &cols); ++ ret = matrix_keypad_parse_properties(dev, &map_rows, &map_cols); + if (ret) + return ret; + + kb_rows = info[KB151_MATRIX_SIZE] & 0xf; + kb_cols = info[KB151_MATRIX_SIZE] >> 4; +- if (rows > kb_rows || cols != kb_cols) { ++ if (map_rows != 2 * kb_rows || map_cols != kb_cols) { + dev_err(dev, "Keyboard matrix is %ux%u, but key map is %ux%u\n", +- kb_rows, kb_cols, rows, cols); ++ kb_rows, kb_cols, map_rows, map_cols); + return -EINVAL; + } + + /* Allocate two buffers, and include space for the CRC. */ +- kb151 = devm_kzalloc(dev, struct_size(kb151, buf, 2 * (cols + 1)), GFP_KERNEL); ++ kb151 = devm_kzalloc(dev, struct_size(kb151, buf, 2 * (kb_cols + 1)), GFP_KERNEL); + if (!kb151) + return -ENOMEM; + +@@ -189,9 +194,9 @@ static int kb151_probe(struct i2c_client *client) + + crc8_populate_msb(kb151->crc_table, KB151_CRC8_POLYNOMIAL); + +- kb151->row_shift = get_count_order(cols); +- kb151->rows = rows; +- kb151->cols = cols; ++ kb151->row_shift = get_count_order(kb_cols); ++ kb151->rows = kb_rows; ++ kb151->cols = kb_cols; + + kb151->input = devm_input_allocate_device(dev); + if (!kb151->input) +@@ -207,7 +212,7 @@ static int kb151_probe(struct i2c_client *client) + + __set_bit(EV_REP, kb151->input->evbit); + +- ret = matrix_keypad_build_keymap(NULL, NULL, rows, cols, ++ ret = matrix_keypad_build_keymap(NULL, NULL, map_rows, map_cols, + NULL, kb151->input); + if (ret) + return dev_err_probe(dev, ret, "Failed to build keymap\n"); diff --git a/sys-kernel/pinephone-sources/files/5.11.5.patch b/sys-kernel/pinephone-sources/files/5.11.5.patch deleted file mode 100644 index cd533a8..0000000 --- a/sys-kernel/pinephone-sources/files/5.11.5.patch +++ /dev/null @@ -1,1526 +0,0 @@ -diff --git a/Makefile b/Makefile -index cb9a8e8239511..1673c12fb4b35 100644 ---- a/Makefile -+++ b/Makefile -@@ -1,7 +1,7 @@ - # SPDX-License-Identifier: GPL-2.0 - VERSION = 5 - PATCHLEVEL = 11 --SUBLEVEL = 4 -+SUBLEVEL = 5 - EXTRAVERSION = - NAME = 💕 Valentine's Day Edition 💕 - -diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c -index e67b22fc3c60b..c1b299760bf7a 100644 ---- a/arch/ia64/kernel/signal.c -+++ b/arch/ia64/kernel/signal.c -@@ -341,7 +341,8 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) - * need to push through a forced SIGSEGV. 
- */ - while (1) { -- get_signal(&ksig); -+ if (!get_signal(&ksig)) -+ break; - - /* - * get_signal() may have run a debugger (via notify_parent()) -diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c -index bfda153b1a41d..87682dcb64ec3 100644 ---- a/drivers/base/power/runtime.c -+++ b/drivers/base/power/runtime.c -@@ -325,22 +325,22 @@ static void rpm_put_suppliers(struct device *dev) - static int __rpm_callback(int (*cb)(struct device *), struct device *dev) - __releases(&dev->power.lock) __acquires(&dev->power.lock) - { -- int retval, idx; - bool use_links = dev->power.links_count > 0; -+ bool get = false; -+ int retval, idx; -+ bool put; - - if (dev->power.irq_safe) { - spin_unlock(&dev->power.lock); -+ } else if (!use_links) { -+ spin_unlock_irq(&dev->power.lock); - } else { -+ get = dev->power.runtime_status == RPM_RESUMING; -+ - spin_unlock_irq(&dev->power.lock); - -- /* -- * Resume suppliers if necessary. -- * -- * The device's runtime PM status cannot change until this -- * routine returns, so it is safe to read the status outside of -- * the lock. -- */ -- if (use_links && dev->power.runtime_status == RPM_RESUMING) { -+ /* Resume suppliers if necessary. */ -+ if (get) { - idx = device_links_read_lock(); - - retval = rpm_get_suppliers(dev); -@@ -355,24 +355,36 @@ static int __rpm_callback(int (*cb)(struct device *), struct device *dev) - - if (dev->power.irq_safe) { - spin_lock(&dev->power.lock); -- } else { -- /* -- * If the device is suspending and the callback has returned -- * success, drop the usage counters of the suppliers that have -- * been reference counted on its resume. -- * -- * Do that if resume fails too. -- */ -- if (use_links -- && ((dev->power.runtime_status == RPM_SUSPENDING && !retval) -- || (dev->power.runtime_status == RPM_RESUMING && retval))) { -- idx = device_links_read_lock(); -+ return retval; -+ } - -- fail: -- rpm_put_suppliers(dev); -+ spin_lock_irq(&dev->power.lock); - -- device_links_read_unlock(idx); -- } -+ if (!use_links) -+ return retval; -+ -+ /* -+ * If the device is suspending and the callback has returned success, -+ * drop the usage counters of the suppliers that have been reference -+ * counted on its resume. -+ * -+ * Do that if the resume fails too. 
-+ */ -+ put = dev->power.runtime_status == RPM_SUSPENDING && !retval; -+ if (put) -+ __update_runtime_status(dev, RPM_SUSPENDED); -+ else -+ put = get && retval; -+ -+ if (put) { -+ spin_unlock_irq(&dev->power.lock); -+ -+ idx = device_links_read_lock(); -+ -+fail: -+ rpm_put_suppliers(dev); -+ -+ device_links_read_unlock(idx); - - spin_lock_irq(&dev->power.lock); - } -diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c -index 63f549889f875..5ac1881396afb 100644 ---- a/drivers/block/rsxx/core.c -+++ b/drivers/block/rsxx/core.c -@@ -165,15 +165,17 @@ static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, - { - struct rsxx_cardinfo *card = file_inode(fp)->i_private; - char *buf; -- ssize_t st; -+ int st; - - buf = kzalloc(cnt, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - st = rsxx_creg_read(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1); -- if (!st) -- st = copy_to_user(ubuf, buf, cnt); -+ if (!st) { -+ if (copy_to_user(ubuf, buf, cnt)) -+ st = -EFAULT; -+ } - kfree(buf); - if (st) - return st; -diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c -index 431919d5f48af..a2e0395cbe618 100644 ---- a/drivers/char/tpm/tpm_tis_core.c -+++ b/drivers/char/tpm/tpm_tis_core.c -@@ -707,12 +707,22 @@ static int tpm_tis_gen_interrupt(struct tpm_chip *chip) - const char *desc = "attempting to generate an interrupt"; - u32 cap2; - cap_t cap; -+ int ret; - -+ /* TPM 2.0 */ - if (chip->flags & TPM_CHIP_FLAG_TPM2) - return tpm2_get_tpm_pt(chip, 0x100, &cap2, desc); -- else -- return tpm1_getcap(chip, TPM_CAP_PROP_TIS_TIMEOUT, &cap, desc, -- 0); -+ -+ /* TPM 1.2 */ -+ ret = request_locality(chip, 0); -+ if (ret < 0) -+ return ret; -+ -+ ret = tpm1_getcap(chip, TPM_CAP_PROP_TIS_TIMEOUT, &cap, desc, 0); -+ -+ release_locality(chip, 0); -+ -+ return ret; - } - - /* Register the IRQ and issue a command that will cause an interrupt. If an -@@ -1019,11 +1029,21 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, - init_waitqueue_head(&priv->read_queue); - init_waitqueue_head(&priv->int_queue); - if (irq != -1) { -- /* Before doing irq testing issue a command to the TPM in polling mode -+ /* -+ * Before doing irq testing issue a command to the TPM in polling mode - * to make sure it works. May as well use that command to set the - * proper timeouts for the driver. 
- */ -- if (tpm_get_timeouts(chip)) { -+ -+ rc = request_locality(chip, 0); -+ if (rc < 0) -+ goto out_err; -+ -+ rc = tpm_get_timeouts(chip); -+ -+ release_locality(chip, 0); -+ -+ if (rc) { - dev_err(dev, "Could not get TPM timeouts and durations\n"); - rc = -ENODEV; - goto out_err; -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c -index 8155c54392c88..36a741d63ddcf 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c -@@ -903,10 +903,11 @@ void amdgpu_acpi_fini(struct amdgpu_device *adev) - */ - bool amdgpu_acpi_is_s0ix_supported(struct amdgpu_device *adev) - { -+#if defined(CONFIG_AMD_PMC) - if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) { - if (adev->flags & AMD_IS_APU) - return true; - } -- -+#endif - return false; - } -diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c -index a6667a2ca0db3..c2190c3e97f31 100644 ---- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c -+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c -@@ -356,7 +356,7 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf, - while (size) { - uint32_t value; - -- value = RREG32_PCIE(*pos >> 2); -+ value = RREG32_PCIE(*pos); - r = put_user(value, (uint32_t *)buf); - if (r) { - pm_runtime_mark_last_busy(adev_to_drm(adev)->dev); -@@ -423,7 +423,7 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user - return r; - } - -- WREG32_PCIE(*pos >> 2, value); -+ WREG32_PCIE(*pos, value); - - result += 4; - buf += 4; -diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c -index 6bee3677394ac..22b96b7d3647f 100644 ---- a/drivers/gpu/drm/amd/amdgpu/nv.c -+++ b/drivers/gpu/drm/amd/amdgpu/nv.c -@@ -498,7 +498,8 @@ static bool nv_is_headless_sku(struct pci_dev *pdev) - { - if ((pdev->device == 0x731E && - (pdev->revision == 0xC6 || pdev->revision == 0xC7)) || -- (pdev->device == 0x7340 && pdev->revision == 0xC9)) -+ (pdev->device == 0x7340 && pdev->revision == 0xC9) || -+ (pdev->device == 0x7360 && pdev->revision == 0xC7)) - return true; - return false; - } -@@ -568,7 +569,8 @@ int nv_set_ip_blocks(struct amdgpu_device *adev) - if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT && - !amdgpu_sriov_vf(adev)) - amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block); -- amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block); -+ if (!nv_is_headless_sku(adev->pdev)) -+ amdgpu_device_ip_block_add(adev, &vcn_v2_0_ip_block); - if (!amdgpu_sriov_vf(adev)) - amdgpu_device_ip_block_add(adev, &jpeg_v2_0_ip_block); - break; -diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c -index 5aeb5f5a04478..9be8e1888daf4 100644 ---- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c -+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c -@@ -78,6 +78,9 @@ MODULE_FIRMWARE("amdgpu/dimgrey_cavefish_smc.bin"); - #define PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE_MASK 0xC000 - #define PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE__SHIFT 0xE - -+#define mmTHM_BACO_CNTL_ARCT 0xA7 -+#define mmTHM_BACO_CNTL_ARCT_BASE_IDX 0 -+ - static int link_width[] = {0, 1, 2, 4, 8, 12, 16}; - static int link_speed[] = {25, 50, 80, 160}; - -@@ -1581,9 +1584,15 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state) - break; - default: - if (!ras || !ras->supported) { -- data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL); -- data |= 0x80000000; -- WREG32_SOC15(THM, 0, mmTHM_BACO_CNTL, 
data); -+ if (adev->asic_type == CHIP_ARCTURUS) { -+ data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL_ARCT); -+ data |= 0x80000000; -+ WREG32_SOC15(THM, 0, mmTHM_BACO_CNTL_ARCT, data); -+ } else { -+ data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL); -+ data |= 0x80000000; -+ WREG32_SOC15(THM, 0, mmTHM_BACO_CNTL, data); -+ } - - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnterBaco, 0, NULL); - } else { -diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c -index be996dba040cc..3d194bb608405 100644 ---- a/drivers/infiniband/core/cm.c -+++ b/drivers/infiniband/core/cm.c -@@ -3651,6 +3651,7 @@ static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, - struct ib_cm_sidr_rep_param *param) - { - struct ib_mad_send_buf *msg; -+ unsigned long flags; - int ret; - - lockdep_assert_held(&cm_id_priv->lock); -@@ -3676,12 +3677,12 @@ static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, - return ret; - } - cm_id_priv->id.state = IB_CM_IDLE; -- spin_lock_irq(&cm.lock); -+ spin_lock_irqsave(&cm.lock, flags); - if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { - rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); - RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); - } -- spin_unlock_irq(&cm.lock); -+ spin_unlock_irqrestore(&cm.lock, flags); - return 0; - } - -diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c -index ff8e17d7f7ca8..8161035eb7740 100644 ---- a/drivers/infiniband/hw/mlx5/devx.c -+++ b/drivers/infiniband/hw/mlx5/devx.c -@@ -1970,8 +1970,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)( - - num_alloc_xa_entries++; - event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL); -- if (!event_sub) -+ if (!event_sub) { -+ err = -ENOMEM; - goto err; -+ } - - list_add_tail(&event_sub->event_list, &sub_list); - uverbs_uobject_get(&ev_file->uobj); -diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig -index 4521490667925..06b8dc5093f77 100644 ---- a/drivers/infiniband/sw/rxe/Kconfig -+++ b/drivers/infiniband/sw/rxe/Kconfig -@@ -4,6 +4,7 @@ config RDMA_RXE - depends on INET && PCI && INFINIBAND - depends on INFINIBAND_VIRT_DMA - select NET_UDP_TUNNEL -+ select CRYPTO - select CRYPTO_CRC32 - help - This driver implements the InfiniBand RDMA transport over -diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c -index 4078358ed66ea..00fbc591a1425 100644 ---- a/drivers/iommu/dma-iommu.c -+++ b/drivers/iommu/dma-iommu.c -@@ -309,6 +309,11 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad) - domain->ops->flush_iotlb_all(domain); - } - -+static bool dev_is_untrusted(struct device *dev) -+{ -+ return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; -+} -+ - /** - * iommu_dma_init_domain - Initialise a DMA mapping domain - * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() -@@ -363,8 +368,9 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, - - init_iova_domain(iovad, 1UL << order, base_pfn); - -- if (!cookie->fq_domain && !iommu_domain_get_attr(domain, -- DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && attr) { -+ if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) && -+ !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) && -+ attr) { - if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, - iommu_dma_entry_dtor)) - pr_warn("iova flush queue initialization failed\n"); -@@ -521,11 +527,6 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, - iova_align(iovad, 
size), dir, attrs); - } - --static bool dev_is_untrusted(struct device *dev) --{ -- return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; --} -- - static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, - size_t size, int prot, u64 dma_mask) - { -diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h -index 97dfcffbf495a..444c0bec221a4 100644 ---- a/drivers/iommu/intel/pasid.h -+++ b/drivers/iommu/intel/pasid.h -@@ -30,8 +30,8 @@ - #define VCMD_VRSP_IP 0x1 - #define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) - #define VCMD_VRSP_SC_SUCCESS 0 --#define VCMD_VRSP_SC_NO_PASID_AVAIL 1 --#define VCMD_VRSP_SC_INVALID_PASID 1 -+#define VCMD_VRSP_SC_NO_PASID_AVAIL 2 -+#define VCMD_VRSP_SC_INVALID_PASID 2 - #define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) - #define VCMD_CMD_OPERAND(e) ((e) << 8) - /* -diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c -index 4a3f095a1c267..97eb62f667d22 100644 ---- a/drivers/iommu/tegra-smmu.c -+++ b/drivers/iommu/tegra-smmu.c -@@ -798,10 +798,70 @@ static phys_addr_t tegra_smmu_iova_to_phys(struct iommu_domain *domain, - return SMMU_PFN_PHYS(pfn) + SMMU_OFFSET_IN_PAGE(iova); - } - -+static struct tegra_smmu *tegra_smmu_find(struct device_node *np) -+{ -+ struct platform_device *pdev; -+ struct tegra_mc *mc; -+ -+ pdev = of_find_device_by_node(np); -+ if (!pdev) -+ return NULL; -+ -+ mc = platform_get_drvdata(pdev); -+ if (!mc) -+ return NULL; -+ -+ return mc->smmu; -+} -+ -+static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, -+ struct of_phandle_args *args) -+{ -+ const struct iommu_ops *ops = smmu->iommu.ops; -+ int err; -+ -+ err = iommu_fwspec_init(dev, &dev->of_node->fwnode, ops); -+ if (err < 0) { -+ dev_err(dev, "failed to initialize fwspec: %d\n", err); -+ return err; -+ } -+ -+ err = ops->of_xlate(dev, args); -+ if (err < 0) { -+ dev_err(dev, "failed to parse SW group ID: %d\n", err); -+ iommu_fwspec_free(dev); -+ return err; -+ } -+ -+ return 0; -+} -+ - static struct iommu_device *tegra_smmu_probe_device(struct device *dev) - { -- struct tegra_smmu *smmu = dev_iommu_priv_get(dev); -+ struct device_node *np = dev->of_node; -+ struct tegra_smmu *smmu = NULL; -+ struct of_phandle_args args; -+ unsigned int index = 0; -+ int err; -+ -+ while (of_parse_phandle_with_args(np, "iommus", "#iommu-cells", index, -+ &args) == 0) { -+ smmu = tegra_smmu_find(args.np); -+ if (smmu) { -+ err = tegra_smmu_configure(smmu, dev, &args); -+ of_node_put(args.np); - -+ if (err < 0) -+ return ERR_PTR(err); -+ -+ break; -+ } -+ -+ of_node_put(args.np); -+ index++; -+ } -+ -+ smmu = dev_iommu_priv_get(dev); - if (!smmu) - return ERR_PTR(-ENODEV); - -@@ -1028,6 +1088,16 @@ struct tegra_smmu *tegra_smmu_probe(struct device *dev, - if (!smmu) - return ERR_PTR(-ENOMEM); - -+ /* -+ * This is a bit of a hack. Ideally we'd want to simply return this -+ * value. However the IOMMU registration process will attempt to add -+ * all devices to the IOMMU when bus_set_iommu() is called. In order -+ * not to rely on global variables to track the IOMMU instance, we -+ * set it here so that it can be looked up from the .probe_device() -+ * callback via the IOMMU device's .drvdata field. 
-+ */ -+ mc->smmu = smmu; -+ - size = BITS_TO_LONGS(soc->num_asids) * sizeof(long); - - smmu->asids = devm_kzalloc(dev, size, GFP_KERNEL); -diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c -index fce4cbf9529d6..50f3e673729c3 100644 ---- a/drivers/md/dm-bufio.c -+++ b/drivers/md/dm-bufio.c -@@ -1526,6 +1526,10 @@ EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); - sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) - { - sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT; -+ if (s >= c->start) -+ s -= c->start; -+ else -+ s = 0; - if (likely(c->sectors_per_block_bits >= 0)) - s >>= c->sectors_per_block_bits; - else -diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c -index fb41b4f23c489..66f4c6398f670 100644 ---- a/drivers/md/dm-verity-fec.c -+++ b/drivers/md/dm-verity-fec.c -@@ -61,19 +61,18 @@ static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, - static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, - unsigned *offset, struct dm_buffer **buf) - { -- u64 position, block; -+ u64 position, block, rem; - u8 *res; - - position = (index + rsb) * v->fec->roots; -- block = position >> v->data_dev_block_bits; -- *offset = (unsigned)(position - (block << v->data_dev_block_bits)); -+ block = div64_u64_rem(position, v->fec->roots << SECTOR_SHIFT, &rem); -+ *offset = (unsigned)rem; - -- res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); -+ res = dm_bufio_read(v->fec->bufio, block, buf); - if (IS_ERR(res)) { - DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", - v->data_dev->name, (unsigned long long)rsb, -- (unsigned long long)(v->fec->start + block), -- PTR_ERR(res)); -+ (unsigned long long)block, PTR_ERR(res)); - *buf = NULL; - } - -@@ -155,7 +154,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, - - /* read the next block when we run out of parity bytes */ - offset += v->fec->roots; -- if (offset >= 1 << v->data_dev_block_bits) { -+ if (offset >= v->fec->roots << SECTOR_SHIFT) { - dm_bufio_release(buf); - - par = fec_read_parity(v, rsb, block_offset, &offset, &buf); -@@ -674,7 +673,7 @@ int verity_fec_ctr(struct dm_verity *v) - { - struct dm_verity_fec *f = v->fec; - struct dm_target *ti = v->ti; -- u64 hash_blocks; -+ u64 hash_blocks, fec_blocks; - int ret; - - if (!verity_fec_is_enabled(v)) { -@@ -744,15 +743,17 @@ int verity_fec_ctr(struct dm_verity *v) - } - - f->bufio = dm_bufio_client_create(f->dev->bdev, -- 1 << v->data_dev_block_bits, -+ f->roots << SECTOR_SHIFT, - 1, 0, NULL, NULL); - if (IS_ERR(f->bufio)) { - ti->error = "Cannot initialize FEC bufio client"; - return PTR_ERR(f->bufio); - } - -- if (dm_bufio_get_device_size(f->bufio) < -- ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) { -+ dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT)); -+ -+ fec_blocks = div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT); -+ if (dm_bufio_get_device_size(f->bufio) < fec_blocks) { - ti->error = "FEC device is too small"; - return -E2BIG; - } -diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c -index 470ff6b3ebef1..35b015c9ab025 100644 ---- a/drivers/net/ethernet/realtek/r8169_main.c -+++ b/drivers/net/ethernet/realtek/r8169_main.c -@@ -2208,6 +2208,7 @@ static void rtl_pll_power_down(struct rtl8169_private *tp) - - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: -+ case RTL_GIGA_MAC_VER_29 ... 
RTL_GIGA_MAC_VER_30: - case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: - case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39: -@@ -2235,6 +2236,7 @@ static void rtl_pll_power_up(struct rtl8169_private *tp) - { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: -+ case RTL_GIGA_MAC_VER_29 ... RTL_GIGA_MAC_VER_30: - case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: - case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39: -diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c -index 3b1c387375a6b..3cf1b953f5236 100644 ---- a/fs/btrfs/block-group.c -+++ b/fs/btrfs/block-group.c -@@ -1150,6 +1150,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); - -+ if (cache->swap_extents) { -+ ret = -ETXTBSY; -+ goto out; -+ } -+ - if (cache->ro) { - cache->ro++; - ret = 0; -@@ -2253,7 +2258,7 @@ again: - } - - ret = inc_block_group_ro(cache, 0); -- if (!do_chunk_alloc) -+ if (!do_chunk_alloc || ret == -ETXTBSY) - goto unlock_out; - if (!ret) - goto out; -@@ -2262,6 +2267,8 @@ again: - if (ret < 0) - goto out; - ret = inc_block_group_ro(cache, 0); -+ if (ret == -ETXTBSY) -+ goto unlock_out; - out: - if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); -@@ -3345,6 +3352,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) - ASSERT(list_empty(&block_group->io_list)); - ASSERT(list_empty(&block_group->bg_list)); - ASSERT(refcount_read(&block_group->refs) == 1); -+ ASSERT(block_group->swap_extents == 0); - btrfs_put_block_group(block_group); - - spin_lock(&info->block_group_cache_lock); -@@ -3411,3 +3419,26 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) - __btrfs_remove_free_space_cache(block_group->free_space_ctl); - } - } -+ -+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) -+{ -+ bool ret = true; -+ -+ spin_lock(&bg->lock); -+ if (bg->ro) -+ ret = false; -+ else -+ bg->swap_extents++; -+ spin_unlock(&bg->lock); -+ -+ return ret; -+} -+ -+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) -+{ -+ spin_lock(&bg->lock); -+ ASSERT(!bg->ro); -+ ASSERT(bg->swap_extents >= amount); -+ bg->swap_extents -= amount; -+ spin_unlock(&bg->lock); -+} -diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h -index 8f74a96074f7b..8a925741dc34a 100644 ---- a/fs/btrfs/block-group.h -+++ b/fs/btrfs/block-group.h -@@ -181,6 +181,12 @@ struct btrfs_block_group { - */ - int needs_free_space; - -+ /* -+ * Number of extents in this block group used for swap files. -+ * All accesses protected by the spinlock 'lock'. -+ */ -+ int swap_extents; -+ - /* Record locked full stripes for RAID5/6 block group */ - struct btrfs_full_stripe_locks_tree full_stripe_locks_root; - }; -@@ -301,4 +307,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len); - #endif - -+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg); -+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount); -+ - #endif /* BTRFS_BLOCK_GROUP_H */ -diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h -index 4debdbdde2abb..0c8c55a41d7b2 100644 ---- a/fs/btrfs/ctree.h -+++ b/fs/btrfs/ctree.h -@@ -523,6 +523,11 @@ struct btrfs_swapfile_pin { - * points to a struct btrfs_device. 
- */ - bool is_block_group; -+ /* -+ * Only used when 'is_block_group' is true and it is the number of -+ * extents used by a swapfile for this block group ('ptr' field). -+ */ -+ int bg_extent_count; - }; - - bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); -diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c -index 70c0340d839cb..f12e6a0aa3c70 100644 ---- a/fs/btrfs/delayed-inode.c -+++ b/fs/btrfs/delayed-inode.c -@@ -649,7 +649,7 @@ static int btrfs_delayed_inode_reserve_metadata( - btrfs_ino(inode), - num_bytes, 1); - } else { -- btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize); -+ btrfs_qgroup_free_meta_prealloc(root, num_bytes); - } - return ret; - } -diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c -index 0e41459b8de66..f851a1a63833d 100644 ---- a/fs/btrfs/file.c -+++ b/fs/btrfs/file.c -@@ -3264,8 +3264,11 @@ reserve_space: - goto out; - ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, - alloc_start, bytes_to_reserve); -- if (ret) -+ if (ret) { -+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, -+ lockend, &cached_state); - goto out; -+ } - ret = btrfs_prealloc_file_range(inode, mode, alloc_start, - alloc_end - alloc_start, - i_blocksize(inode), -diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c -index 71d0d14bc18b3..b64b88987367c 100644 ---- a/fs/btrfs/free-space-cache.c -+++ b/fs/btrfs/free-space-cache.c -@@ -2708,8 +2708,10 @@ static void __btrfs_return_cluster_to_free_space( - struct rb_node *node; - - spin_lock(&cluster->lock); -- if (cluster->block_group != block_group) -- goto out; -+ if (cluster->block_group != block_group) { -+ spin_unlock(&cluster->lock); -+ return; -+ } - - cluster->block_group = NULL; - cluster->window_start = 0; -@@ -2747,8 +2749,6 @@ static void __btrfs_return_cluster_to_free_space( - entry->offset, &entry->offset_index, bitmap); - } - cluster->root = RB_ROOT; -- --out: - spin_unlock(&cluster->lock); - btrfs_put_block_group(block_group); - } -@@ -3028,8 +3028,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, - entry->bytes -= bytes; - } - -- if (entry->bytes == 0) -- rb_erase(&entry->offset_index, &cluster->root); - break; - } - out: -@@ -3046,7 +3044,10 @@ out: - ctl->free_space -= bytes; - if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) - ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; -+ -+ spin_lock(&cluster->lock); - if (entry->bytes == 0) { -+ rb_erase(&entry->offset_index, &cluster->root); - ctl->free_extents--; - if (entry->bitmap) { - kmem_cache_free(btrfs_free_space_bitmap_cachep, -@@ -3059,6 +3060,7 @@ out: - kmem_cache_free(btrfs_free_space_cachep, entry); - } - -+ spin_unlock(&cluster->lock); - spin_unlock(&ctl->tree_lock); - - return ret; -diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c -index ad34c5a09befc..40ccb8ddab23a 100644 ---- a/fs/btrfs/inode.c -+++ b/fs/btrfs/inode.c -@@ -9993,6 +9993,7 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, - sp->ptr = ptr; - sp->inode = inode; - sp->is_block_group = is_block_group; -+ sp->bg_extent_count = 1; - - spin_lock(&fs_info->swapfile_pins_lock); - p = &fs_info->swapfile_pins.rb_node; -@@ -10006,6 +10007,8 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, - (sp->ptr == entry->ptr && sp->inode > entry->inode)) { - p = &(*p)->rb_right; - } else { -+ if (is_block_group) -+ entry->bg_extent_count++; - spin_unlock(&fs_info->swapfile_pins_lock); - kfree(sp); - return 1; -@@ -10031,8 +10034,11 @@ static void btrfs_free_swapfile_pins(struct 
inode *inode) - sp = rb_entry(node, struct btrfs_swapfile_pin, node); - if (sp->inode == inode) { - rb_erase(&sp->node, &fs_info->swapfile_pins); -- if (sp->is_block_group) -+ if (sp->is_block_group) { -+ btrfs_dec_block_group_swap_extents(sp->ptr, -+ sp->bg_extent_count); - btrfs_put_block_group(sp->ptr); -+ } - kfree(sp); - } - node = next; -@@ -10093,7 +10099,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) - { - struct inode *inode = file_inode(file); -- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; -+ struct btrfs_root *root = BTRFS_I(inode)->root; -+ struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; -@@ -10144,13 +10151,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, - "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; - } -+ -+ /* -+ * Prevent snapshot creation while we are activating the swap file. -+ * We do not want to race with snapshot creation. If snapshot creation -+ * already started before we bumped nr_swapfiles from 0 to 1 and -+ * completes before the first write into the swap file after it is -+ * activated, than that write would fallback to COW. -+ */ -+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { -+ btrfs_exclop_finish(fs_info); -+ btrfs_warn(fs_info, -+ "cannot activate swapfile because snapshot creation is in progress"); -+ return -EINVAL; -+ } - /* - * Snapshots can create extents which require COW even if NODATACOW is - * set. We use this counter to prevent snapshots. We must increment it - * before walking the extents because we don't want a concurrent - * snapshot to run after we've already checked the extents. - */ -- atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); -+ atomic_inc(&root->nr_swapfiles); - - isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - -@@ -10247,6 +10268,17 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, - goto out; - } - -+ if (!btrfs_inc_block_group_swap_extents(bg)) { -+ btrfs_warn(fs_info, -+ "block group for swapfile at %llu is read-only%s", -+ bg->start, -+ atomic_read(&fs_info->scrubs_running) ? 
-+ " (scrub running)" : ""); -+ btrfs_put_block_group(bg); -+ ret = -EINVAL; -+ goto out; -+ } -+ - ret = btrfs_add_swapfile_pin(inode, bg, true); - if (ret) { - btrfs_put_block_group(bg); -@@ -10285,6 +10317,8 @@ out: - if (ret) - btrfs_swap_deactivate(file); - -+ btrfs_drew_write_unlock(&root->snapshot_lock); -+ - btrfs_exclop_finish(fs_info); - - if (ret) -diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c -index dde49a791f3e2..0a4ab121c684b 100644 ---- a/fs/btrfs/ioctl.c -+++ b/fs/btrfs/ioctl.c -@@ -1926,7 +1926,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, - if (vol_args->flags & BTRFS_SUBVOL_RDONLY) - readonly = true; - if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { -- if (vol_args->size > PAGE_SIZE) { -+ u64 nums; -+ -+ if (vol_args->size < sizeof(*inherit) || -+ vol_args->size > PAGE_SIZE) { - ret = -EINVAL; - goto free_args; - } -@@ -1935,6 +1938,20 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, - ret = PTR_ERR(inherit); - goto free_args; - } -+ -+ if (inherit->num_qgroups > PAGE_SIZE || -+ inherit->num_ref_copies > PAGE_SIZE || -+ inherit->num_excl_copies > PAGE_SIZE) { -+ ret = -EINVAL; -+ goto free_inherit; -+ } -+ -+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + -+ 2 * inherit->num_excl_copies; -+ if (vol_args->size != struct_size(inherit, qgroups, nums)) { -+ ret = -EINVAL; -+ goto free_inherit; -+ } - } - - ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, -diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c -index 93fbf87bdc8d3..123b79672c63c 100644 ---- a/fs/btrfs/raid56.c -+++ b/fs/btrfs/raid56.c -@@ -2363,16 +2363,21 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - SetPageUptodate(p_page); - - if (has_qstripe) { -+ /* RAID6, allocate and map temp space for the Q stripe */ - q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!q_page) { - __free_page(p_page); - goto cleanup; - } - SetPageUptodate(q_page); -+ pointers[rbio->real_stripes - 1] = kmap(q_page); - } - - atomic_set(&rbio->error, 0); - -+ /* Map the parity stripe just once */ -+ pointers[nr_data] = kmap(p_page); -+ - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *p; - void *parity; -@@ -2382,16 +2387,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - pointers[stripe] = kmap(p); - } - -- /* then add the parity stripe */ -- pointers[stripe++] = kmap(p_page); -- - if (has_qstripe) { -- /* -- * raid6, add the qstripe and call the -- * library function to fill in our p/q -- */ -- pointers[stripe++] = kmap(q_page); -- -+ /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, - pointers); - } else { -@@ -2412,12 +2409,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - - for (stripe = 0; stripe < nr_data; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); -- kunmap(p_page); - } - -+ kunmap(p_page); - __free_page(p_page); -- if (q_page) -+ if (q_page) { -+ kunmap(q_page); - __free_page(q_page); -+ } - - writeback: - /* -diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c -index b03e7891394e3..a3bc721bab7c8 100644 ---- a/fs/btrfs/reflink.c -+++ b/fs/btrfs/reflink.c -@@ -550,6 +550,24 @@ process_slot: - */ - btrfs_release_path(path); - -+ /* -+ * When using NO_HOLES and we are cloning a range that covers -+ * only a hole (no extents) into a range beyond the current -+ * i_size, punching a hole in the target range will not create -+ * an extent map defining a hole, 
because the range starts at or -+ * beyond current i_size. If the file previously had an i_size -+ * greater than the new i_size set by this clone operation, we -+ * need to make sure the next fsync is a full fsync, so that it -+ * detects and logs a hole covering a range from the current -+ * i_size to the new i_size. If the clone range covers extents, -+ * besides a hole, then we know the full sync flag was already -+ * set by previous calls to btrfs_replace_file_extents() that -+ * replaced file extent items. -+ */ -+ if (last_dest_end >= i_size_read(inode)) -+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, -+ &BTRFS_I(inode)->runtime_flags); -+ - ret = btrfs_replace_file_extents(inode, path, last_dest_end, - destoff + len - 1, NULL, &trans); - if (ret) -diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c -index 5f4f88a4d2c8a..c09a494be8c68 100644 ---- a/fs/btrfs/scrub.c -+++ b/fs/btrfs/scrub.c -@@ -3630,6 +3630,13 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, - * commit_transactions. - */ - ro_set = 0; -+ } else if (ret == -ETXTBSY) { -+ btrfs_warn(fs_info, -+ "skipping scrub of block group %llu due to active swapfile", -+ cache->start); -+ scrub_pause_off(fs_info); -+ ret = 0; -+ goto skip_unfreeze; - } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); -@@ -3719,7 +3726,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, - } else { - spin_unlock(&cache->lock); - } -- -+skip_unfreeze: - btrfs_unfreeze_block_group(cache); - btrfs_put_block_group(cache); - if (ret) -diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c -index 12d7d3be7cd45..8baa806f43d76 100644 ---- a/fs/btrfs/super.c -+++ b/fs/btrfs/super.c -@@ -1919,8 +1919,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) - btrfs_resize_thread_pool(fs_info, - fs_info->thread_pool_size, old_thread_pool_size); - -- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) != -- btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && -+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != -+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && - (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { - btrfs_warn(fs_info, - "remount supports changing free space tree only from ro to rw"); -diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c -index 582061c7b5471..f4ade821307d7 100644 ---- a/fs/btrfs/tree-checker.c -+++ b/fs/btrfs/tree-checker.c -@@ -1453,22 +1453,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf, - return -EUCLEAN; - } - for (; ptr < end; ptr += sizeof(*dref)) { -- u64 root_objectid; -- u64 owner; - u64 offset; -- u64 hash; - -+ /* -+ * We cannot check the extent_data_ref hash due to possible -+ * overflow from the leaf due to hash collisions. 
-+ */ - dref = (struct btrfs_extent_data_ref *)ptr; -- root_objectid = btrfs_extent_data_ref_root(leaf, dref); -- owner = btrfs_extent_data_ref_objectid(leaf, dref); - offset = btrfs_extent_data_ref_offset(leaf, dref); -- hash = hash_extent_data_ref(root_objectid, owner, offset); -- if (unlikely(hash != key->offset)) { -- extent_err(leaf, slot, -- "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx", -- hash, key->offset); -- return -EUCLEAN; -- } - if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { - extent_err(leaf, slot, - "invalid extent data backref offset, have %llu expect aligned to %u", -diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c -index af6246f36a9e5..03135dbb318a5 100644 ---- a/fs/btrfs/xattr.c -+++ b/fs/btrfs/xattr.c -@@ -229,11 +229,33 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, - { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; -+ const bool start_trans = (current->journal_info == NULL); - int ret; - -- trans = btrfs_start_transaction(root, 2); -- if (IS_ERR(trans)) -- return PTR_ERR(trans); -+ if (start_trans) { -+ /* -+ * 1 unit for inserting/updating/deleting the xattr -+ * 1 unit for the inode item update -+ */ -+ trans = btrfs_start_transaction(root, 2); -+ if (IS_ERR(trans)) -+ return PTR_ERR(trans); -+ } else { -+ /* -+ * This can happen when smack is enabled and a directory is being -+ * created. It happens through d_instantiate_new(), which calls -+ * smack_d_instantiate(), which in turn calls __vfs_setxattr() to -+ * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the -+ * inode. We have already reserved space for the xattr and inode -+ * update at btrfs_mkdir(), so just use the transaction handle. -+ * We don't join or start a transaction, as that will reset the -+ * block_rsv of the handle and trigger a warning for the start -+ * case. 
-+ */ -+ ASSERT(strncmp(name, XATTR_SECURITY_PREFIX, -+ XATTR_SECURITY_PREFIX_LEN) == 0); -+ trans = current->journal_info; -+ } - - ret = btrfs_setxattr(trans, inode, name, value, size, flags); - if (ret) -@@ -244,7 +266,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); - out: -- btrfs_end_transaction(trans); -+ if (start_trans) -+ btrfs_end_transaction(trans); - return ret; - } - -diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c -index c388466590191..2f80de4403595 100644 ---- a/fs/btrfs/zoned.c -+++ b/fs/btrfs/zoned.c -@@ -152,7 +152,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) - sector_t sector = 0; - struct blk_zone *zones = NULL; - unsigned int i, nreported = 0, nr_zones; -- unsigned int zone_sectors; -+ sector_t zone_sectors; - int ret; - - if (!bdev_is_zoned(bdev)) -@@ -485,7 +485,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, - u64 *bytenr_ret) - { - struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; -- unsigned int zone_sectors; -+ sector_t zone_sectors; - u32 sb_zone; - int ret; - u64 zone_size; -diff --git a/fs/io_uring.c b/fs/io_uring.c -index 4d0ede0418571..38bfd168ad3b7 100644 ---- a/fs/io_uring.c -+++ b/fs/io_uring.c -@@ -5316,6 +5316,9 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, - pt->error = -EINVAL; - return; - } -+ /* double add on the same waitqueue head, ignore */ -+ if (poll->head == head) -+ return; - poll = kmalloc(sizeof(*poll), GFP_ATOMIC); - if (!poll) { - pt->error = -ENOMEM; -diff --git a/include/crypto/hash.h b/include/crypto/hash.h -index af2ff31ff619f..13f8a6a54ca87 100644 ---- a/include/crypto/hash.h -+++ b/include/crypto/hash.h -@@ -149,7 +149,7 @@ struct ahash_alg { - - struct shash_desc { - struct crypto_shash *tfm; -- void *__ctx[] CRYPTO_MINALIGN_ATTR; -+ void *__ctx[] __aligned(ARCH_SLAB_MINALIGN); - }; - - #define HASH_MAX_DIGESTSIZE 64 -@@ -162,9 +162,9 @@ struct shash_desc { - - #define HASH_MAX_STATESIZE 512 - --#define SHASH_DESC_ON_STACK(shash, ctx) \ -- char __##shash##_desc[sizeof(struct shash_desc) + \ -- HASH_MAX_DESCSIZE] CRYPTO_MINALIGN_ATTR; \ -+#define SHASH_DESC_ON_STACK(shash, ctx) \ -+ char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \ -+ __aligned(__alignof__(struct shash_desc)); \ - struct shash_desc *shash = (struct shash_desc *)__##shash##_desc - - /** -diff --git a/include/linux/crypto.h b/include/linux/crypto.h -index ef90e07c9635c..e3abd1f8646a1 100644 ---- a/include/linux/crypto.h -+++ b/include/linux/crypto.h -@@ -151,9 +151,12 @@ - * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual - * declaration) is used to ensure that the crypto_tfm context structure is - * aligned correctly for the given architecture so that there are no alignment -- * faults for C data types. In particular, this is required on platforms such -- * as arm where pointers are 32-bit aligned but there are data types such as -- * u64 which require 64-bit alignment. -+ * faults for C data types. On architectures that support non-cache coherent -+ * DMA, such as ARM or arm64, it also takes into account the minimal alignment -+ * that is required to ensure that the context struct member does not share any -+ * cachelines with the rest of the struct. This is needed to ensure that cache -+ * maintenance for non-coherent DMA (cache invalidation in particular) does not -+ * affect data that may be accessed by the CPU concurrently. 
- */ - #define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN - -diff --git a/include/sound/intel-nhlt.h b/include/sound/intel-nhlt.h -index 743c2f4422806..d0574805865f9 100644 ---- a/include/sound/intel-nhlt.h -+++ b/include/sound/intel-nhlt.h -@@ -112,6 +112,11 @@ struct nhlt_vendor_dmic_array_config { - /* TODO add vendor mic config */ - } __packed; - -+enum { -+ NHLT_CONFIG_TYPE_GENERIC = 0, -+ NHLT_CONFIG_TYPE_MIC_ARRAY = 1 -+}; -+ - enum { - NHLT_MIC_ARRAY_2CH_SMALL = 0xa, - NHLT_MIC_ARRAY_2CH_BIG = 0xb, -diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c -index ec08f948dd80e..063f8ea6aad97 100644 ---- a/kernel/trace/ring_buffer.c -+++ b/kernel/trace/ring_buffer.c -@@ -2821,6 +2821,17 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - write_stamp, write_stamp - delta)) - return 0; - -+ /* -+ * It's possible that the event time delta is zero -+ * (has the same time stamp as the previous event) -+ * in which case write_stamp and before_stamp could -+ * be the same. In such a case, force before_stamp -+ * to be different than write_stamp. It doesn't -+ * matter what it is, as long as its different. -+ */ -+ if (!delta) -+ rb_time_set(&cpu_buffer->before_stamp, 0); -+ - /* - * If an event were to come in now, it would see that the - * write_stamp and the before_stamp are different, and assume -diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c -index b9c2ee7ab43fa..cce12e1971d85 100644 ---- a/scripts/recordmcount.c -+++ b/scripts/recordmcount.c -@@ -438,7 +438,7 @@ static int arm_is_fake_mcount(Elf32_Rel const *rp) - - static int arm64_is_fake_mcount(Elf64_Rel const *rp) - { -- return ELF64_R_TYPE(w(rp->r_info)) != R_AARCH64_CALL26; -+ return ELF64_R_TYPE(w8(rp->r_info)) != R_AARCH64_CALL26; - } - - /* 64-bit EM_MIPS has weird ELF64_Rela.r_info. -diff --git a/security/tomoyo/network.c b/security/tomoyo/network.c -index a89ed55d85d41..478f757ff8435 100644 ---- a/security/tomoyo/network.c -+++ b/security/tomoyo/network.c -@@ -613,7 +613,7 @@ static int tomoyo_check_unix_address(struct sockaddr *addr, - static bool tomoyo_kernel_service(void) - { - /* Nothing to do if I am a kernel service. 
*/ -- return uaccess_kernel(); -+ return (current->flags & (PF_KTHREAD | PF_IO_WORKER)) == PF_KTHREAD; - } - - /** -diff --git a/sound/hda/intel-nhlt.c b/sound/hda/intel-nhlt.c -index 059aaf04f536a..d053beccfaec3 100644 ---- a/sound/hda/intel-nhlt.c -+++ b/sound/hda/intel-nhlt.c -@@ -31,18 +31,44 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) - struct nhlt_endpoint *epnt; - struct nhlt_dmic_array_config *cfg; - struct nhlt_vendor_dmic_array_config *cfg_vendor; -+ struct nhlt_fmt *fmt_configs; - unsigned int dmic_geo = 0; -- u8 j; -+ u16 max_ch = 0; -+ u8 i, j; - - if (!nhlt) - return 0; - -- epnt = (struct nhlt_endpoint *)nhlt->desc; -+ for (j = 0, epnt = nhlt->desc; j < nhlt->endpoint_count; j++, -+ epnt = (struct nhlt_endpoint *)((u8 *)epnt + epnt->length)) { - -- for (j = 0; j < nhlt->endpoint_count; j++) { -- if (epnt->linktype == NHLT_LINK_DMIC) { -- cfg = (struct nhlt_dmic_array_config *) -- (epnt->config.caps); -+ if (epnt->linktype != NHLT_LINK_DMIC) -+ continue; -+ -+ cfg = (struct nhlt_dmic_array_config *)(epnt->config.caps); -+ fmt_configs = (struct nhlt_fmt *)(epnt->config.caps + epnt->config.size); -+ -+ /* find max number of channels based on format_configuration */ -+ if (fmt_configs->fmt_count) { -+ dev_dbg(dev, "%s: found %d format definitions\n", -+ __func__, fmt_configs->fmt_count); -+ -+ for (i = 0; i < fmt_configs->fmt_count; i++) { -+ struct wav_fmt_ext *fmt_ext; -+ -+ fmt_ext = &fmt_configs->fmt_config[i].fmt_ext; -+ -+ if (fmt_ext->fmt.channels > max_ch) -+ max_ch = fmt_ext->fmt.channels; -+ } -+ dev_dbg(dev, "%s: max channels found %d\n", __func__, max_ch); -+ } else { -+ dev_dbg(dev, "%s: No format information found\n", __func__); -+ } -+ -+ if (cfg->device_config.config_type != NHLT_CONFIG_TYPE_MIC_ARRAY) { -+ dmic_geo = max_ch; -+ } else { - switch (cfg->array_type) { - case NHLT_MIC_ARRAY_2CH_SMALL: - case NHLT_MIC_ARRAY_2CH_BIG: -@@ -59,13 +85,23 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) - dmic_geo = cfg_vendor->nb_mics; - break; - default: -- dev_warn(dev, "undefined DMIC array_type 0x%0x\n", -- cfg->array_type); -+ dev_warn(dev, "%s: undefined DMIC array_type 0x%0x\n", -+ __func__, cfg->array_type); -+ } -+ -+ if (dmic_geo > 0) { -+ dev_dbg(dev, "%s: Array with %d dmics\n", __func__, dmic_geo); -+ } -+ if (max_ch > dmic_geo) { -+ dev_dbg(dev, "%s: max channels %d exceed dmic number %d\n", -+ __func__, max_ch, dmic_geo); - } - } -- epnt = (struct nhlt_endpoint *)((u8 *)epnt + epnt->length); - } - -+ dev_dbg(dev, "%s: dmic number %d max_ch %d\n", -+ __func__, dmic_geo, max_ch); -+ - return dmic_geo; - } - EXPORT_SYMBOL_GPL(intel_nhlt_get_dmic_geo); -diff --git a/sound/pci/ctxfi/cthw20k2.c b/sound/pci/ctxfi/cthw20k2.c -index fc1bc18caee98..85d1fc76f59e1 100644 ---- a/sound/pci/ctxfi/cthw20k2.c -+++ b/sound/pci/ctxfi/cthw20k2.c -@@ -991,7 +991,7 @@ static int daio_mgr_dao_init(void *blk, unsigned int idx, unsigned int conf) - - if (idx < 4) { - /* S/PDIF output */ -- switch ((conf & 0x7)) { -+ switch ((conf & 0xf)) { - case 1: - set_field(&ctl->txctl[idx], ATXCTL_NUC, 0); - break; -diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c -index 5f4f8c2d760f0..b47504fa8dfd0 100644 ---- a/sound/pci/hda/patch_realtek.c -+++ b/sound/pci/hda/patch_realtek.c -@@ -6408,6 +6408,7 @@ enum { - ALC236_FIXUP_DELL_AIO_HEADSET_MIC, - ALC282_FIXUP_ACER_DISABLE_LINEOUT, - ALC255_FIXUP_ACER_LIMIT_INT_MIC_BOOST, -+ ALC256_FIXUP_ACER_HEADSET_MIC, - }; - - static const struct hda_fixup 
alc269_fixups[] = { -@@ -7864,6 +7865,16 @@ static const struct hda_fixup alc269_fixups[] = { - .chained = true, - .chain_id = ALC255_FIXUP_ACER_MIC_NO_PRESENCE, - }, -+ [ALC256_FIXUP_ACER_HEADSET_MIC] = { -+ .type = HDA_FIXUP_PINS, -+ .v.pins = (const struct hda_pintbl[]) { -+ { 0x19, 0x02a1113c }, /* use as headset mic, without its own jack detect */ -+ { 0x1a, 0x90a1092f }, /* use as internal mic */ -+ { } -+ }, -+ .chained = true, -+ .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC -+ }, - }; - - static const struct snd_pci_quirk alc269_fixup_tbl[] = { -@@ -7890,9 +7901,11 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { - SND_PCI_QUIRK(0x1025, 0x1246, "Acer Predator Helios 500", ALC299_FIXUP_PREDATOR_SPK), - SND_PCI_QUIRK(0x1025, 0x1247, "Acer vCopperbox", ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS), - SND_PCI_QUIRK(0x1025, 0x1248, "Acer Veriton N4660G", ALC269VC_FIXUP_ACER_MIC_NO_PRESENCE), -+ SND_PCI_QUIRK(0x1025, 0x1269, "Acer SWIFT SF314-54", ALC256_FIXUP_ACER_HEADSET_MIC), - SND_PCI_QUIRK(0x1025, 0x128f, "Acer Veriton Z6860G", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), - SND_PCI_QUIRK(0x1025, 0x1290, "Acer Veriton Z4860G", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), - SND_PCI_QUIRK(0x1025, 0x1291, "Acer Veriton Z4660G", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), -+ SND_PCI_QUIRK(0x1025, 0x129c, "Acer SWIFT SF314-55", ALC256_FIXUP_ACER_HEADSET_MIC), - SND_PCI_QUIRK(0x1025, 0x1308, "Acer Aspire Z24-890", ALC286_FIXUP_ACER_AIO_HEADSET_MIC), - SND_PCI_QUIRK(0x1025, 0x132a, "Acer TravelMate B114-21", ALC233_FIXUP_ACER_HEADSET_MIC), - SND_PCI_QUIRK(0x1025, 0x1330, "Acer TravelMate X514-51T", ALC255_FIXUP_ACER_HEADSET_MIC), -diff --git a/sound/usb/clock.c b/sound/usb/clock.c -index dc68ed65e4787..771b652329571 100644 ---- a/sound/usb/clock.c -+++ b/sound/usb/clock.c -@@ -646,10 +646,10 @@ static int set_sample_rate_v2v3(struct snd_usb_audio *chip, - cur_rate = prev_rate; - - if (cur_rate != rate) { -- usb_audio_warn(chip, -- "%d:%d: freq mismatch (RO clock): req %d, clock runs @%d\n", -- fmt->iface, fmt->altsetting, rate, cur_rate); -- return -ENXIO; -+ usb_audio_dbg(chip, -+ "%d:%d: freq mismatch: req %d, clock runs @%d\n", -+ fmt->iface, fmt->altsetting, rate, cur_rate); -+ /* continue processing */ - } - - validation: -diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c -index 12b15ed59eaa1..d5bdc9c4f452b 100644 ---- a/sound/usb/mixer.c -+++ b/sound/usb/mixer.c -@@ -1301,6 +1301,17 @@ no_res_check: - /* totally crap, return an error */ - return -EINVAL; - } -+ } else { -+ /* if the max volume is too low, it's likely a bogus range; -+ * here we use -96dB as the threshold -+ */ -+ if (cval->dBmax <= -9600) { -+ usb_audio_info(cval->head.mixer->chip, -+ "%d:%d: bogus dB values (%d/%d), disabling dB reporting\n", -+ cval->head.id, mixer_ctrl_intf(cval->head.mixer), -+ cval->dBmin, cval->dBmax); -+ cval->dBmin = cval->dBmax = 0; -+ } - } - - return 0; -diff --git a/sound/usb/mixer_maps.c b/sound/usb/mixer_maps.c -index a7212f16660ec..646deb6244b15 100644 ---- a/sound/usb/mixer_maps.c -+++ b/sound/usb/mixer_maps.c -@@ -536,6 +536,16 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = { - .id = USB_ID(0x05a7, 0x1020), - .map = bose_companion5_map, - }, -+ { -+ /* Corsair Virtuoso SE (wired mode) */ -+ .id = USB_ID(0x1b1c, 0x0a3d), -+ .map = corsair_virtuoso_map, -+ }, -+ { -+ /* Corsair Virtuoso SE (wireless mode) */ -+ .id = USB_ID(0x1b1c, 0x0a3e), -+ .map = corsair_virtuoso_map, -+ }, - { - /* Corsair Virtuoso (wired mode) */ - .id = USB_ID(0x1b1c, 0x0a41), -diff --git a/sound/usb/pcm.c 
b/sound/usb/pcm.c -index bf5a0f3c1fade..e5311b6bb3f65 100644 ---- a/sound/usb/pcm.c -+++ b/sound/usb/pcm.c -@@ -845,13 +845,19 @@ get_sync_ep_from_substream(struct snd_usb_substream *subs) - - list_for_each_entry(fp, &subs->fmt_list, list) { - ep = snd_usb_get_endpoint(chip, fp->endpoint); -- if (ep && ep->cur_rate) -- return ep; -+ if (ep && ep->cur_audiofmt) { -+ /* if EP is already opened solely for this substream, -+ * we still allow us to change the parameter; otherwise -+ * this substream has to follow the existing parameter -+ */ -+ if (ep->cur_audiofmt != subs->cur_audiofmt || ep->opened > 1) -+ return ep; -+ } - if (!fp->implicit_fb) - continue; - /* for the implicit fb, check the sync ep as well */ - ep = snd_usb_get_endpoint(chip, fp->sync_ep); -- if (ep && ep->cur_rate) -+ if (ep && ep->cur_audiofmt) - return ep; - } - return NULL; -diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c -index 9ba4682ebc482..737b2729c0d37 100644 ---- a/sound/usb/quirks.c -+++ b/sound/usb/quirks.c -@@ -1482,7 +1482,7 @@ static int pioneer_djm_set_format_quirk(struct snd_usb_substream *subs, - usb_set_interface(subs->dev, 0, 1); - // we should derive windex from fmt-sync_ep but it's not set - snd_usb_ctl_msg(subs->stream->chip->dev, -- usb_rcvctrlpipe(subs->stream->chip->dev, 0), -+ usb_sndctrlpipe(subs->stream->chip->dev, 0), - 0x01, 0x22, 0x0100, windex, &sr, 0x0003); - return 0; - } diff --git a/sys-kernel/pinephone-sources/files/Multigenerational-LRU-Framework.patch b/sys-kernel/pinephone-sources/files/Multigenerational-LRU-Framework.patch new file mode 100644 index 0000000..c41fee8 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/Multigenerational-LRU-Framework.patch @@ -0,0 +1,6357 @@ +From patchwork Wed Aug 18 06:30:57 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443379 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham autolearn_force=no + version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id C5C62C432BE + for ; Wed, 18 Aug 2021 06:31:14 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 6EFEF60720 + for ; Wed, 18 Aug 2021 06:31:14 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 6EFEF60720 +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 0AD846B0074; Wed, 18 Aug 2021 02:31:14 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id E3F6F6B0075; Wed, 18 Aug 2021 02:31:13 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id C6A8E6B0078; Wed, 18 Aug 2021 02:31:13 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0107.hostedemail.com + [216.40.44.107]) + by kanga.kvack.org (Postfix) with ESMTP id AAB506B0074 + for ; Wed, 18 Aug 2021 02:31:13 -0400 (EDT) +Received: from 
smtpin28.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay03.hostedemail.com (Postfix) with ESMTP id 5FE438249980 + for ; Wed, 18 Aug 2021 06:31:13 +0000 (UTC) +X-FDA: 78487229226.28.BBB4431 +Received: from mail-yb1-f202.google.com (mail-yb1-f202.google.com + [209.85.219.202]) + by imf18.hostedemail.com (Postfix) with ESMTP id 1839D4001EBB + for ; Wed, 18 Aug 2021 06:31:12 +0000 (UTC) +Received: by mail-yb1-f202.google.com with SMTP id + p71-20020a25424a0000b029056092741626so1764846yba.19 + for ; Tue, 17 Aug 2021 23:31:12 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=axDz8FmbUkmwoP5Po1n0WrbURWOp6eK+Sy9KgBfPnX8=; + b=tMJdGo0VdmwhtsmfaZ+8rkd4E9lkxdQ44NTW3d4/TNpCinw/fAFDzywEKib15cClPi + W29Wg6lpdN+bZgNq9Vd3h4q/1uzr+5hqEfenQ6HyCLa6QbL33fZSCMLzbTULgjUGwpI+ + BXmZg8SRzvZbXVtkQBlaLzEKm8WV77UfxsQ8EG6tQhxw6/Eh23ojg4fHfZC7QgWEo1ji + DDtk9ZcuKvr1BNn+esg0iIEsM4XejbvsJ+161wy6H40w1DQU0zhTeR6Qc0qXfc3jhHfB + OSV1hDsLPUfb0gbAc/64uz9OCgvP0IMwvygx6lIZRwZoCsDlkU53OLv5tpYI+ITQhfO/ + zGEA== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=axDz8FmbUkmwoP5Po1n0WrbURWOp6eK+Sy9KgBfPnX8=; + b=HEyHpRA7YkfffNvslIvW6HuN/6lnGvyQzgi2vghhPPVeVN1+ZUdzybOhUmp8V1aIsd + R6fuYvHIpvNaRU142y3ulk0Hu9MLPLhdMWDHgdWfAbIZbPHtXgIshRoGehpK84rKwtDo + 5tSnPqV8YuFY1LsWdBgNyDHlrWBWFvpwRw77Zp/UM4h4INljVWXA58iykQGq6VBngklo + rsRicm6hIe9fEMon2Pnmx2zr8mAqhb8NSrlKRey0Bl7gmRdM8+cbn9EYrklokoXvCtKk + lW64Y3rnhPEoczcQygRAiBGYyJ05hDiyz3qZ18SGS7sYMABsjNr0g83Bu9xDoPa91RVC + MSkw== +X-Gm-Message-State: AOAM533RJ4Wj2uAqtXyGyNOhUUTEKfFVtgLHkrIUi6WCw6BsLiZ0OUjk + DRJPW3BODZlhqj+fBg4YowtzkJpwM9h0gx2aA6A/hNtOB2FHNaK+tsi74IrNsA92Ev/+6euQD6R + MDW0nY//owiss6a0Zn+KAhFxRVqnkJyheQOyWmPpErefzhtBP/L5D1Mmw +X-Google-Smtp-Source: + ABdhPJz9k/2W2LmgADoTsZbfpB5d6btee0nxlX6DF/x7Ki85TCn69a1MVMx/WICufFas9KT4dBOAj2dLDVU= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:cf8a:: with SMTP id + f132mr9779192ybg.387.1629268272384; + Tue, 17 Aug 2021 23:31:12 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:30:57 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-2-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 01/11] mm: x86, arm64: add arch_has_hw_pte_young() +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao +Authentication-Results: imf18.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=tMJdGo0V; + spf=pass (imf18.hostedemail.com: domain of + 3MKkcYQYKCAU3z4mftlttlqj.htrqnsz2-rrp0fhp.twl@flex--yuzhao.bounces.google.com + designates 209.85.219.202 as permitted sender) + smtp.mailfrom=3MKkcYQYKCAU3z4mftlttlqj.htrqnsz2-rrp0fhp.twl@flex--yuzhao.bounces.google.com; + dmarc=pass (policy=reject) header.from=google.com +X-Rspamd-Server: rspam06 +X-Rspamd-Queue-Id: 1839D4001EBB +X-Stat-Signature: ierg7zqi1iutxm8fjk9smhyogpegi6rn +X-HE-Tag: 1629268272-765849 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Some architectures set 
the accessed bit in PTEs automatically, e.g., +x86, and arm64 v8.2 and later. On architectures that do not have this +capability, clearing the accessed bit in a PTE triggers a page fault +following the TLB miss. + +Being aware of this capability can help make better decisions, i.e., +whether to limit the size of each batch of PTEs and the burst of +batches when clearing the accessed bit. + +Signed-off-by: Yu Zhao +--- + arch/arm64/include/asm/cpufeature.h | 19 ++++++------------- + arch/arm64/include/asm/pgtable.h | 10 ++++------ + arch/arm64/kernel/cpufeature.c | 19 +++++++++++++++++++ + arch/arm64/mm/proc.S | 12 ------------ + arch/arm64/tools/cpucaps | 1 + + arch/x86/include/asm/pgtable.h | 6 +++--- + include/linux/pgtable.h | 12 ++++++++++++ + mm/memory.c | 14 +------------- + 8 files changed, 46 insertions(+), 47 deletions(-) + +diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h +index 9bb9d11750d7..2020b9e818c8 100644 +--- a/arch/arm64/include/asm/cpufeature.h ++++ b/arch/arm64/include/asm/cpufeature.h +@@ -776,6 +776,12 @@ static inline bool system_supports_tlb_range(void) + cpus_have_const_cap(ARM64_HAS_TLB_RANGE); + } + ++/* Check whether hardware update of the Access flag is supported. */ ++static inline bool system_has_hw_af(void) ++{ ++ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); ++} ++ + extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); + + static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +@@ -799,19 +805,6 @@ static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) + } + } + +-/* Check whether hardware update of the Access flag is supported */ +-static inline bool cpu_has_hw_af(void) +-{ +- u64 mmfr1; +- +- if (!IS_ENABLED(CONFIG_ARM64_HW_AFDBM)) +- return false; +- +- mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); +- return cpuid_feature_extract_unsigned_field(mmfr1, +- ID_AA64MMFR1_HADBS_SHIFT); +-} +- + static inline bool cpu_has_pan(void) + { + u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); +diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h +index f09bf5c02891..b63a6a7b62ee 100644 +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -993,13 +993,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, + * page after fork() + CoW for pfn mappings. We don't always have a + * hardware-managed access flag on arm64. 
+ */ +-static inline bool arch_faults_on_old_pte(void) ++static inline bool arch_has_hw_pte_young(void) + { +- WARN_ON(preemptible()); +- +- return !cpu_has_hw_af(); ++ return system_has_hw_af(); + } +-#define arch_faults_on_old_pte arch_faults_on_old_pte ++#define arch_has_hw_pte_young arch_has_hw_pte_young + + /* + * Experimentally, it's cheap to set the access flag in hardware and we +@@ -1007,7 +1005,7 @@ static inline bool arch_faults_on_old_pte(void) + */ + static inline bool arch_wants_old_prefaulted_pte(void) + { +- return !arch_faults_on_old_pte(); ++ return arch_has_hw_pte_young(); + } + #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte + +diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c +index 0ead8bfedf20..d05de77626f5 100644 +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -1650,6 +1650,14 @@ static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap, + return true; + } + ++static void cpu_enable_hw_af(struct arm64_cpu_capabilities const *cap) ++{ ++ u64 val = read_sysreg(tcr_el1); ++ ++ write_sysreg(val | TCR_HA, tcr_el1); ++ isb(); ++ local_flush_tlb_all(); ++} + #endif + + #ifdef CONFIG_ARM64_AMU_EXTN +@@ -2126,6 +2134,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { + .matches = has_hw_dbm, + .cpu_enable = cpu_enable_hw_dbm, + }, ++ { ++ .desc = "Hardware update of the Access flag", ++ .type = ARM64_CPUCAP_SYSTEM_FEATURE, ++ .capability = ARM64_HW_AF, ++ .sys_reg = SYS_ID_AA64MMFR1_EL1, ++ .sign = FTR_UNSIGNED, ++ .field_pos = ID_AA64MMFR1_HADBS_SHIFT, ++ .min_field_value = 1, ++ .matches = has_cpuid_feature, ++ .cpu_enable = cpu_enable_hw_af, ++ }, + #endif + { + .desc = "CRC32 instructions", +diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S +index 35936c5ae1ce..b066d5712e3d 100644 +--- a/arch/arm64/mm/proc.S ++++ b/arch/arm64/mm/proc.S +@@ -478,18 +478,6 @@ SYM_FUNC_START(__cpu_setup) + * Set the IPS bits in TCR_EL1. + */ + tcr_compute_pa_size tcr, #TCR_IPS_SHIFT, x5, x6 +-#ifdef CONFIG_ARM64_HW_AFDBM +- /* +- * Enable hardware update of the Access Flags bit. +- * Hardware dirty bit management is enabled later, +- * via capabilities. 
+- */ +- mrs x9, ID_AA64MMFR1_EL1 +- and x9, x9, #0xf +- cbz x9, 1f +- orr tcr, tcr, #TCR_HA // hardware Access flag update +-1: +-#endif /* CONFIG_ARM64_HW_AFDBM */ + msr mair_el1, mair + msr tcr_el1, tcr + /* +diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps +index 49305c2e6dfd..d52f50671e60 100644 +--- a/arch/arm64/tools/cpucaps ++++ b/arch/arm64/tools/cpucaps +@@ -35,6 +35,7 @@ HAS_STAGE2_FWB + HAS_SYSREG_GIC_CPUIF + HAS_TLB_RANGE + HAS_VIRT_HOST_EXTN ++HW_AF + HW_DBM + KVM_PROTECTED_MODE + MISMATCHED_CACHE_TYPE +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 448cd01eb3ec..3908780fc408 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_check(void) + return boot_cpu_has_bug(X86_BUG_L1TF); + } + +-#define arch_faults_on_old_pte arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) ++#define arch_has_hw_pte_young arch_has_hw_pte_young ++static inline bool arch_has_hw_pte_young(void) + { +- return false; ++ return true; + } + + #endif /* __ASSEMBLY__ */ +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index e24d2c992b11..3a8221fa2c76 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -258,6 +258,18 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #endif + ++#ifndef arch_has_hw_pte_young ++static inline bool arch_has_hw_pte_young(void) ++{ ++ /* ++ * Those arches which have hw access flag feature need to implement ++ * their own helper. By default, "false" means pagefault will be hit ++ * on old pte. ++ */ ++ return false; ++} ++#endif ++ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR + static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, +diff --git a/mm/memory.c b/mm/memory.c +index 25fc46e87214..2f96179db219 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = + 2; + #endif + +-#ifndef arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) +-{ +- /* +- * Those arches which don't have hw access flag feature need to +- * implement their own helper. By default, "true" means pagefault +- * will be hit on old pte. +- */ +- return true; +-} +-#endif +- + #ifndef arch_wants_old_prefaulted_pte + static inline bool arch_wants_old_prefaulted_pte(void) + { +@@ -2769,7 +2757,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. 
+ */ +- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { ++ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + +From patchwork Wed Aug 18 06:30:58 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443381 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id AF8BCC4338F + for ; Wed, 18 Aug 2021 06:31:16 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 4766360720 + for ; Wed, 18 Aug 2021 06:31:16 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 4766360720 +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 95F966B0075; Wed, 18 Aug 2021 02:31:15 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 8E8E16B0078; Wed, 18 Aug 2021 02:31:15 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 789176B007B; Wed, 18 Aug 2021 02:31:15 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0176.hostedemail.com + [216.40.44.176]) + by kanga.kvack.org (Postfix) with ESMTP id 5F1446B0075 + for ; Wed, 18 Aug 2021 02:31:15 -0400 (EDT) +Received: from smtpin38.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay01.hostedemail.com (Postfix) with ESMTP id E31B4184138FF + for ; Wed, 18 Aug 2021 06:31:14 +0000 (UTC) +X-FDA: 78487229268.38.72C5F94 +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf12.hostedemail.com (Postfix) with ESMTP id 9C9951004EDF + for ; Wed, 18 Aug 2021 06:31:14 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + w201-20020a25dfd2000000b00594695384d1so1788511ybg.20 + for ; Tue, 17 Aug 2021 23:31:14 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=r28OOl9H0j68087MxXenLvNFF4q/L5LmucJPA1P1rF0=; + b=rVacPqv+ksAvDz+9AzHa3UA8YFJnchxgAhITykWT0kxNmHdqCt/DAMGYBXtFWBzdAw + UzvzekE5eXZhtYXiInhdds4rbazWL2e8SFPVHUTGBWlGoWlFjQjsFG7H8fdm3sVFRrQW + CFVuV2fjqGRF2ixE+rJiDYBWDDGU/m+XKJeE5lrzOOrs34yZw1Ln+xkk4ovZWFoIZoZa + qCrHFEneIQhdNP2RXcj/F4CpHXU0rbR7/8YAGVM5R73tsv7EDGpvPGvc4a+NpC+R39BQ + Ur9JQcBxVNokNwE8GV6vwh3SSZ/rfbxBm3LxCp2vVHVrgTnlp/1EuNSgPFpyUabNGaju + +HbA== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=r28OOl9H0j68087MxXenLvNFF4q/L5LmucJPA1P1rF0=; + b=rhGqOrpzKCVP1kqrXzQr7nNlykZjoqhG7Aq1WDZC1vtx5VJ50/IhsIwiM5AFT6svC9 + 
4fRnksFFREBF45DFYDPdJro4p2VtkMglxu/jN5Zps+jwHG3n2Yew7LIBG1Tm5cS9f6XG + VfNl2V007FhQOTpn10z3qc9TT36kUxTMxlqmkzD4L9MbliQnXgkXV534Wr6T0QoiLWEa + DjpXuG+KWc/6CUwaamMcAqIV6sSs6ybQ74nQVc72+S2IpAcNNoH/I9o7347+LucMk2kT + twwmAkCcS6MSQuKxsszpUsvXofF+giBWH9PF8uGuzt8kVyVtjq4wSbhvQTaLvVLCcxdk + mqhw== +X-Gm-Message-State: AOAM533tFDVqNULti6Tdyly25YgIWTiusXKk7azjs60a3jRmYck14xKV + azSMcKZnE2+7zk87soK22ZDyrs5BXK3F5NaIEFQWZX8ieeX7nyAj4Hf1cyolRZO8lorC6fQok14 + di+2QdQEk9Va3eNj0TtEC/70S6aJ3NByCxJWMTJW3yBdmYFxAyRSLt2/f +X-Google-Smtp-Source: + ABdhPJxKa+6gSOMXKC9+9yKBARFPL7honqMsWyRtIkHW0A5Nhi7abOtfmKUl4eee1vwoKxR4P+tGRKOMAyI= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:2688:: with SMTP id + m130mr9020507ybm.146.1629268273873; + Tue, 17 Aug 2021 23:31:13 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:30:58 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-3-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 02/11] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +Authentication-Results: imf12.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=rVacPqv+; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf12.hostedemail.com: domain of + 3MakcYQYKCAY405ngumuumrk.iusrot03-ssq1giq.uxm@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3MakcYQYKCAY405ngumuumrk.iusrot03-ssq1giq.uxm@flex--yuzhao.bounces.google.com +X-Stat-Signature: 9q7crf4ydtcm4irdjb66ijaq6wsee4g7 +X-Rspamd-Queue-Id: 9C9951004EDF +X-Rspamd-Server: rspam05 +X-HE-Tag: 1629268274-142487 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Some architectures support the accessed bit on non-leaf PMD entries, +e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using +it as part of linear address translation [1]. As an optimization, page +table walkers who are interested in the accessed bit can skip the PTEs +under a non-leaf PMD entry if the accessed bit is cleared on this +non-leaf PMD entry. + +Although an inline function may be preferable, this capability is +added as a configuration option to look consistent when used with the +existing macros. + +[1]: Intel 64 and IA-32 Architectures Software Developer's Manual + Volume 3 (October 2019), section 4.8 + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + arch/Kconfig | 9 +++++++++ + arch/x86/Kconfig | 1 + + arch/x86/include/asm/pgtable.h | 3 ++- + arch/x86/mm/pgtable.c | 5 ++++- + include/linux/pgtable.h | 4 ++-- + 5 files changed, 18 insertions(+), 4 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 129df498a8e1..5b6b4f95372f 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1282,6 +1282,15 @@ config ARCH_SPLIT_ARG64 + config ARCH_HAS_ELFCORE_COMPAT + bool + ++config ARCH_HAS_NONLEAF_PMD_YOUNG ++ bool ++ depends on PGTABLE_LEVELS > 2 ++ help ++ Architectures that select this are able to set the accessed bit on ++ non-leaf PMD entries in addition to leaf PTE entries where pages are ++ mapped. 
For them, page table walkers that clear the accessed bit may ++ stop at non-leaf PMD entries if they do not see the accessed bit. ++ + source "kernel/gcov/Kconfig" + + source "scripts/gcc-plugins/Kconfig" +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 88fb922c23a0..36a81d31f711 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -84,6 +84,7 @@ config X86 + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_DEVMAP if X86_64 + select ARCH_HAS_PTE_SPECIAL ++ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 + select ARCH_HAS_COPY_MC if X86_64 + select ARCH_HAS_SET_MEMORY +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 3908780fc408..01a1763123ff 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) + + static inline int pmd_bad(pmd_t pmd) + { +- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != ++ (_KERNPG_TABLE & ~_PAGE_ACCESSED); + } + + static inline unsigned long pages_to_mb(unsigned long npg) +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 3481b35cb4ec..a224193d84bf 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, + return ret; + } + +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { +@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, + + return ret; + } ++#endif ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) + { +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index 3a8221fa2c76..483d5ff7a33e 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -211,7 +211,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + #endif + + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +@@ -232,7 +232,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + BUILD_BUG(); + return 0; + } +-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ + #endif + + #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH + +From patchwork Wed Aug 18 06:30:59 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443383 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 
A70C6C432BE + for ; Wed, 18 Aug 2021 06:31:18 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 53AEF60720 + for ; Wed, 18 Aug 2021 06:31:18 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 53AEF60720 +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 010A56B0078; Wed, 18 Aug 2021 02:31:17 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id EDAD36B007B; Wed, 18 Aug 2021 02:31:16 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id D07326B007D; Wed, 18 Aug 2021 02:31:16 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0187.hostedemail.com + [216.40.44.187]) + by kanga.kvack.org (Postfix) with ESMTP id B423C6B0078 + for ; Wed, 18 Aug 2021 02:31:16 -0400 (EDT) +Received: from smtpin25.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay02.hostedemail.com (Postfix) with ESMTP id 55BB620BF4 + for ; Wed, 18 Aug 2021 06:31:16 +0000 (UTC) +X-FDA: 78487229352.25.08BFBEA +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf25.hostedemail.com (Postfix) with ESMTP id 189DBB000298 + for ; Wed, 18 Aug 2021 06:31:15 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + f3-20020a25cf030000b029055a2303fc2dso1806506ybg.11 + for ; Tue, 17 Aug 2021 23:31:15 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=IbNlCcnWCb+KjCwpSHdD59CG4potZ0Ug8pgSOhUgoys=; + b=XDyq5Z6SoXVLKLMxSUBELU3/JYB+XigRWxLmHl+mLVjgRyxzm26UQCeVaYop0jlk8Q + /j3euucaicnXQLnrC9vi8lKXjBup6gx5AJkPyNKArxfWHKlWgPq9vlAt4zuxjNiwgemL + Z2I3+TjwsltWgI++n6c2qaGMKlifmtnYBUI+VvL25BYzfpP2AkXI/s1WUdnXgGEc13Li + Hy37gyZ+oYRHjqbMj3AL08iB3J+yX1CN2BCMOp7sjdwdce3uEAXIkkrp5qc5hemlY8nh + jm2ZLuv0bnfOtB7jLWewZP2vlOyubRZD7CM7jL7+jk6KgOx+CtLcBKkSFg1JcPNbrOOr + j0ug== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=IbNlCcnWCb+KjCwpSHdD59CG4potZ0Ug8pgSOhUgoys=; + b=jZPnPE0jlqRbi22Deu5gHa1uKexbIyxs+qLUoz108hwcNg2p4fkRksxOKcF+t3Hu7J + cZwi4LZpWP3Q1saFiAhDBxy5+39OQXWUZAqq3o9opVIsIkfVxFoIGAuCeW/gjlhNWOW8 + CSk55NNDESHOChrepRAkN1ETnNQnZsuGEh4AMoCzRMKAelXsVVmFV42ID0ie0IZVCw3O + jIJvVyveRLRQ/cooePoIqonuqh8Tski/BHX41c67cC9d6exL+C6wbjo+k3CBOlvCBgT8 + qHpf8oj8cCNtz0Gp8WJW4PLA3+F174ogs2XL9JMH7FpmGXZjNB6zuZOvoeqlp6864KS4 + H5Dw== +X-Gm-Message-State: AOAM533xoeOvffOGFMmpCR9kqnN4I1lzVBqSzp2fV0KytdcnlrcXAP7x + /VtlqfrSGaCkxWS0oLRRnb57MyUd5m8A/5CCQcNmpylSBNPBPWZlXUmZAF4IATlGaDlGtAdA9AE + S5RHs+i9x+BtXZgMSQrsJLDB/ptRZzJOL/7DXB4SIxVaikMUxobKT+QoB +X-Google-Smtp-Source: + ABdhPJx0xmbEVoVaj90boaH3OUfKySCnDfxk00twIs2YN0dnUnaA3Xv1FOpcTPviSrYSArDiy17HMf1TW/4= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:b5ce:: with SMTP id + d14mr9544572ybg.415.1629268275365; + Tue, 17 Aug 2021 23:31:15 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:30:59 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-4-yuzhao@google.com> +Mime-Version: 1.0 +References: 
<20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 03/11] mm/vmscan.c: refactor shrink_node() +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +Authentication-Results: imf25.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=XDyq5Z6S; + spf=pass (imf25.hostedemail.com: domain of + 3M6kcYQYKCAg627piwowwotm.kwutqv25-uus3iks.wzo@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3M6kcYQYKCAg627piwowwotm.kwutqv25-uus3iks.wzo@flex--yuzhao.bounces.google.com; + dmarc=pass (policy=reject) header.from=google.com +X-Rspamd-Server: rspam06 +X-Rspamd-Queue-Id: 189DBB000298 +X-Stat-Signature: sidfqz1997bcoo1mjmyswmcg3wwau9xb +X-HE-Tag: 1629268275-683964 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +This patch refactors shrink_node(). This will make the upcoming +changes to mm/vmscan.c more readable. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + mm/vmscan.c | 186 +++++++++++++++++++++++++++------------------------- + 1 file changed, 98 insertions(+), 88 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 4620df62f0ff..b6d14880bd76 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2437,6 +2437,103 @@ enum scan_balance { + SCAN_FILE, + }; + ++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long file; ++ struct lruvec *target_lruvec; ++ ++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); ++ ++ /* ++ * Determine the scan balance between anon and file LRUs. ++ */ ++ spin_lock_irq(&target_lruvec->lru_lock); ++ sc->anon_cost = target_lruvec->anon_cost; ++ sc->file_cost = target_lruvec->file_cost; ++ spin_unlock_irq(&target_lruvec->lru_lock); ++ ++ /* ++ * Target desirable inactive:active list ratios for the anon ++ * and file LRU lists. ++ */ ++ if (!sc->force_deactivate) { ++ unsigned long refaults; ++ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_ANON); ++ if (refaults != target_lruvec->refaults[0] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) ++ sc->may_deactivate |= DEACTIVATE_ANON; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_ANON; ++ ++ /* ++ * When refaults are being observed, it means a new ++ * workingset is being established. Deactivate to get ++ * rid of any stale active pages quickly. ++ */ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_FILE); ++ if (refaults != target_lruvec->refaults[1] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) ++ sc->may_deactivate |= DEACTIVATE_FILE; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_FILE; ++ } else ++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; ++ ++ /* ++ * If we have plenty of inactive file pages that aren't ++ * thrashing, try to reclaim those first before touching ++ * anonymous pages. ++ */ ++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); ++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) ++ sc->cache_trim_mode = 1; ++ else ++ sc->cache_trim_mode = 0; ++ ++ /* ++ * Prevent the reclaimer from falling into the cache trap: as ++ * cache pages start out inactive, every cache fault will tip ++ * the scan balance towards the file LRU. 
And as the file LRU ++ * shrinks, so does the window for rotation from references. ++ * This means we have a runaway feedback loop where a tiny ++ * thrashing file LRU becomes infinitely more attractive than ++ * anon pages. Try to detect this based on file LRU size. ++ */ ++ if (!cgroup_reclaim(sc)) { ++ unsigned long total_high_wmark = 0; ++ unsigned long free, anon; ++ int z; ++ ++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); ++ file = node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE); ++ ++ for (z = 0; z < MAX_NR_ZONES; z++) { ++ struct zone *zone = &pgdat->node_zones[z]; ++ ++ if (!managed_zone(zone)) ++ continue; ++ ++ total_high_wmark += high_wmark_pages(zone); ++ } ++ ++ /* ++ * Consider anon: if that's low too, this isn't a ++ * runaway file reclaim problem, but rather just ++ * extreme pressure. Reclaim as per usual then. ++ */ ++ anon = node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ sc->file_is_tiny = ++ file + free <= total_high_wmark && ++ !(sc->may_deactivate & DEACTIVATE_ANON) && ++ anon >> sc->priority; ++ } ++} ++ + /* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined +@@ -2882,7 +2979,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; +- unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + +@@ -2892,93 +2988,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + +- /* +- * Determine the scan balance between anon and file LRUs. +- */ +- spin_lock_irq(&target_lruvec->lru_lock); +- sc->anon_cost = target_lruvec->anon_cost; +- sc->file_cost = target_lruvec->file_cost; +- spin_unlock_irq(&target_lruvec->lru_lock); +- +- /* +- * Target desirable inactive:active list ratios for the anon +- * and file LRU lists. +- */ +- if (!sc->force_deactivate) { +- unsigned long refaults; +- +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_ANON); +- if (refaults != target_lruvec->refaults[0] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) +- sc->may_deactivate |= DEACTIVATE_ANON; +- else +- sc->may_deactivate &= ~DEACTIVATE_ANON; +- +- /* +- * When refaults are being observed, it means a new +- * workingset is being established. Deactivate to get +- * rid of any stale active pages quickly. +- */ +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_FILE); +- if (refaults != target_lruvec->refaults[1] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) +- sc->may_deactivate |= DEACTIVATE_FILE; +- else +- sc->may_deactivate &= ~DEACTIVATE_FILE; +- } else +- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; +- +- /* +- * If we have plenty of inactive file pages that aren't +- * thrashing, try to reclaim those first before touching +- * anonymous pages. +- */ +- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) +- sc->cache_trim_mode = 1; +- else +- sc->cache_trim_mode = 0; +- +- /* +- * Prevent the reclaimer from falling into the cache trap: as +- * cache pages start out inactive, every cache fault will tip +- * the scan balance towards the file LRU. And as the file LRU +- * shrinks, so does the window for rotation from references. 
+- * This means we have a runaway feedback loop where a tiny +- * thrashing file LRU becomes infinitely more attractive than +- * anon pages. Try to detect this based on file LRU size. +- */ +- if (!cgroup_reclaim(sc)) { +- unsigned long total_high_wmark = 0; +- unsigned long free, anon; +- int z; +- +- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); +- file = node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE); +- +- for (z = 0; z < MAX_NR_ZONES; z++) { +- struct zone *zone = &pgdat->node_zones[z]; +- if (!managed_zone(zone)) +- continue; +- +- total_high_wmark += high_wmark_pages(zone); +- } +- +- /* +- * Consider anon: if that's low too, this isn't a +- * runaway file reclaim problem, but rather just +- * extreme pressure. Reclaim as per usual then. +- */ +- anon = node_page_state(pgdat, NR_INACTIVE_ANON); +- +- sc->file_is_tiny = +- file + free <= total_high_wmark && +- !(sc->may_deactivate & DEACTIVATE_ANON) && +- anon >> sc->priority; +- } ++ prepare_scan_count(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + + +From patchwork Wed Aug 18 06:31:00 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443385 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 150A0C4320E + for ; Wed, 18 Aug 2021 06:31:21 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 96E6A60720 + for ; Wed, 18 Aug 2021 06:31:20 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 96E6A60720 +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 5695E6B007B; Wed, 18 Aug 2021 02:31:18 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 4C95A6B007D; Wed, 18 Aug 2021 02:31:18 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 2CDA18D0001; Wed, 18 Aug 2021 02:31:18 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0109.hostedemail.com + [216.40.44.109]) + by kanga.kvack.org (Postfix) with ESMTP id 0239A6B007B + for ; Wed, 18 Aug 2021 02:31:17 -0400 (EDT) +Received: from smtpin37.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay03.hostedemail.com (Postfix) with ESMTP id 9DBBA8249980 + for ; Wed, 18 Aug 2021 06:31:17 +0000 (UTC) +X-FDA: 78487229394.37.243B0BF +Received: from mail-yb1-f202.google.com (mail-yb1-f202.google.com + [209.85.219.202]) + by imf29.hostedemail.com (Postfix) with ESMTP id 43B37900802E + for ; Wed, 18 Aug 2021 06:31:17 +0000 (UTC) +Received: by mail-yb1-f202.google.com with SMTP id + d69-20020a25e6480000b02904f4a117bd74so1786127ybh.17 + for ; Tue, 17 Aug 2021 23:31:17 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + 
d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=KLByEW8oqSJ6WHKGE1qm3K6ThZaMDwxGAGCFyocHr5I=; + b=oSLNHRddKSi52S3NNpMxk7i0dMFCIRw/Ughyj9grAnePWCkCwCppAIjrxJ0L73mhlS + +wrylPcHwcxJpGFA2mG8qU5IgizoMse5rdRTnytJrgOy4uNilAu6ADLCuhKAvZz1rGjg + /fjzb0aI3dO+8gU3ssEOQHu1PTa6tNwzS3xhC9mbcQmrp8dDhoeq1VJFnoP4mxs17KTU + DRRxvKj6ZVAXrBe/bxyA4S3SMK70FfzB+Vw1R4JvLOXSy0oInNR024Qri2xeQvWQSQRB + QGFSOdNKyZnkhoJMzaS9/GqSLTRYg59ta/5CxuCfSkGoCUTqxE/Di/i4DFc+gOzYOcyZ + 57RQ== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=KLByEW8oqSJ6WHKGE1qm3K6ThZaMDwxGAGCFyocHr5I=; + b=JmOYqasbgRhMEog6OhY2Dz4pR+SIwaD2gVpW5IPjhL3yDp/Xa2ceSMjvQ3YfySP1CV + 9AQWe6GdGfY7VDws0RNcP+GQy2XPTiaPH9B892Zhe+iGq5a/Hmi3GRxzDIpZTChc6ZVO + /4FJlKyRiThwOq+44wH7sDD6x4HKl/WSSd3d/vHGpn4PjGrOPTFOwVeEd+2hR3AgM0VS + fadhxfwCDf69884NHArbvtBQA3wFquVFFFLDrgRiJHeE2tlN1tEDB5fnb3VKLOQ/Q8mg + fRons0RrQXenYmHptuplEyxFc7my9N/jiAxyNr5ZFkGcElMLRQA0Z+rG3Dm8k+EWhulf + IHUw== +X-Gm-Message-State: AOAM532doWUqf8fFpzk4OO5p6LCNKvsqoGzLw08lgrfdcjV+YUx/enTA + 7V59Vo99eeN6rgDcVZEwKYo1TbLQU99OxTB8Mc+VdocDFshhq3djSgk0DXoM+0r77AqZn6/sN/V + uteMn63fPsj6LoXB+jrjJsriywf3LOS+mG97Vt6itjZmUPE3iVxjp5yi9 +X-Google-Smtp-Source: + ABdhPJxWBteTbvBwVadJUVDS2TXnjjvk48ocUKWsqtHIFkmH4QUSDPet5WNamyiqbp6gJL2dSdW6Onc014o= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:4042:: with SMTP id + n63mr8942131yba.254.1629268276531; + Tue, 17 Aug 2021 23:31:16 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:31:00 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-5-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 04/11] mm: multigenerational lru: groundwork +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +Authentication-Results: imf29.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=oSLNHRdd; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf29.hostedemail.com: domain of + 3NKkcYQYKCAk738qjxpxxpun.lxvurw36-vvt4jlt.x0p@flex--yuzhao.bounces.google.com + designates 209.85.219.202 as permitted sender) + smtp.mailfrom=3NKkcYQYKCAk738qjxpxxpun.lxvurw36-vvt4jlt.x0p@flex--yuzhao.bounces.google.com +X-Stat-Signature: qxf8ht5ebkkgrypatqidrgos3oqwefkq +X-Rspamd-Queue-Id: 43B37900802E +X-Rspamd-Server: rspam05 +X-HE-Tag: 1629268277-97904 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +For each lruvec, evictable pages are divided into multiple +generations. The youngest generation number is stored in +lrugen->max_seq for both anon and file types as they are aged on an +equal footing. The oldest generation numbers are stored in +lrugen->min_seq[2] separately for anon and file types as clean file +pages can be evicted regardless of swap and writeback constraints. +These three variables are monotonically increasing. Generation numbers +are truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit +into page->flags. 
The sliding window technique is used to prevent +truncated generation numbers from overlapping. Each truncated +generation number is an index to +lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. + +Each generation is then divided into multiple tiers. Tiers represent +levels of usage from file descriptors only. Pages accessed N times via +file descriptors belong to tier order_base_2(N). Each generation +contains at most MAX_NR_TIERS tiers, and they require additional +MAX_NR_TIERS-2 bits in page->flags. In contrast to moving across +generations which requires list operations, moving across tiers only +involves operations on page->flags and therefore has a negligible +cost. A feedback loop modeled after the PID controller monitors +refault rates of all tiers and decides when to protect pages from +which tiers. + +The framework comprises two conceptually independent components: the +aging and the eviction, which can be invoked separately from user +space for the purpose of working set estimation and proactive reclaim. + +The aging produces young generations. Given an lruvec, the aging +traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan +PTEs for accessed pages (a mm_struct list is maintained for each +memcg). Upon finding one, the aging updates its generation number to +max_seq (modulo MAX_NR_GENS). After each round of traversal, the aging +increments max_seq. The aging is due when both min_seq[2] have caught +up with max_seq-1. + +The eviction consumes old generations. Given an lruvec, the eviction +scans pages on lrugen->lists indexed by anon and file min_seq[2] +(modulo MAX_NR_GENS). It first tries to select a type based on the +values of min_seq[2]. If they are equal, it selects the type that has +a lower refault rate. The eviction sorts a page according to its +updated generation number if the aging has found this page accessed. +It also moves a page to the next generation if this page is from an +upper tier that has a higher refault rate than the base tier. The +eviction increments min_seq[2] of a selected type when it finds +lrugen->lists indexed by min_seq[2] of this selected type are empty. 
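To make the bookkeeping above concrete, here is a minimal user-space sketch of the sliding-window arithmetic: a monotonically increasing sequence number is folded into a list index modulo MAX_NR_GENS, and the counter kept in page->flags is that index plus one so that zero can mean "not on a multigenerational lru list". This is an illustrative aside rather than part of the patch below; MAX_NR_GENS = 4 and the sample sequence numbers are assumptions chosen for the example, standing in for CONFIG_NR_LRU_GENS and live per-lruvec counters.

    #include <stdio.h>

    #define MAX_NR_GENS 4UL	/* assumption: stands in for CONFIG_NR_LRU_GENS */

    /* Fold a monotonically increasing seq into an index for lrugen->lists[]. */
    static unsigned long gen_from_seq(unsigned long seq)
    {
    	return seq % MAX_NR_GENS;
    }

    int main(void)
    {
    	unsigned long max_seq = 7;	/* youngest generation (example value) */
    	unsigned long min_seq = 5;	/* oldest generation still in the window */

    	printf("youngest list index: %lu\n", gen_from_seq(max_seq));	/* 3 */
    	printf("oldest  list index:  %lu\n", gen_from_seq(min_seq));	/* 1 */

    	/* page->flags stores gen + 1 while a page is on a list, 0 otherwise */
    	printf("counter in page->flags: %lu\n", gen_from_seq(max_seq) + 1);
    	return 0;
    }

The window slides as the aging increments max_seq and the eviction increments min_seq[2], so at most MAX_NR_GENS truncated values are in use at any time and the counter always fits in the order_base_2(MAX_NR_GENS+1) bits reserved in page->flags.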
+ +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + fs/fuse/dev.c | 3 +- + include/linux/cgroup.h | 15 +- + include/linux/mm.h | 2 + + include/linux/mm_inline.h | 201 ++++++++++++++++++ + include/linux/mmzone.h | 92 +++++++++ + include/linux/page-flags-layout.h | 19 +- + include/linux/page-flags.h | 4 +- + kernel/bounds.c | 3 + + kernel/cgroup/cgroup-internal.h | 1 - + mm/huge_memory.c | 3 +- + mm/mm_init.c | 6 +- + mm/mmzone.c | 2 + + mm/swapfile.c | 2 + + mm/vmscan.c | 329 ++++++++++++++++++++++++++++++ + 14 files changed, 669 insertions(+), 13 deletions(-) + +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index 1c8f79b3dd06..673d987652ee 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *page) + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | +- 1 << PG_waiters))) { ++ 1 << PG_waiters | ++ LRU_GEN_MASK | LRU_USAGE_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } +diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h +index 7bf60454a313..1ebc27c8fee7 100644 +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp) + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -707,6 +718,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 7ca22e6e694a..159b7c94e067 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1092,6 +1092,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); + #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) + #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) ++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) ++#define LRU_USAGE_PGOFF (LRU_GEN_PGOFF - LRU_USAGE_WIDTH) + + /* + * Define the bit shifts to access each section. 
For non-existent +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 355ea1ee32bd..19e722ec7cf3 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -79,11 +79,206 @@ static __always_inline enum lru_list page_lru(struct page *page) + return lru; + } + ++#ifdef CONFIG_LRU_GEN ++ ++#ifdef CONFIG_LRU_GEN_ENABLED ++DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return static_branch_likely(&lru_gen_static_key); ++} ++#else ++DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return static_branch_unlikely(&lru_gen_static_key); ++} ++#endif ++ ++/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ ++static inline int lru_gen_from_seq(unsigned long seq) ++{ ++ return seq % MAX_NR_GENS; ++} ++ ++/* Return a proper index regardless whether we keep a full history of stats. */ ++static inline int lru_hist_from_seq(int seq) ++{ ++ return seq % NR_STAT_GENS; ++} ++ ++/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */ ++static inline int lru_tier_from_usage(int usage) ++{ ++ VM_BUG_ON(usage > BIT(LRU_USAGE_WIDTH)); ++ ++ return order_base_2(usage + 1); ++} ++ ++/* The youngest and the second youngest generations are counted as active. */ ++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) ++{ ++ unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq); ++ ++ VM_BUG_ON(!max_seq); ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); ++} ++ ++/* Update the sizes of the multigenerational lru lists. */ ++static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, ++ int old_gen, int new_gen) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ enum lru_list lru = type * LRU_FILE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); ++ VM_BUG_ON(old_gen == -1 && new_gen == -1); ++ ++ if (old_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], ++ lrugen->sizes[old_gen][type][zone] - delta); ++ if (new_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], ++ lrugen->sizes[new_gen][type][zone] + delta); ++ ++ if (old_gen < 0) { ++ if (lru_gen_is_active(lruvec, new_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ return; ++ } ++ ++ if (new_gen < 0) { ++ if (lru_gen_is_active(lruvec, old_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, -delta); ++ return; ++ } ++ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); ++} ++ ++/* Add a page to one of the multigenerational lru lists. Return true on success. 
*/ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ if (PageUnevictable(page) || !lrugen->enabled[type]) ++ return false; ++ /* ++ * If a page shouldn't be considered for eviction, i.e., a page mapped ++ * upon fault during which the accessed bit is set, add it to the ++ * youngest generation. ++ * ++ * If a page can't be evicted immediately, i.e., an anon page not in ++ * swap cache or a dirty page pending writeback, add it to the second ++ * oldest generation. ++ * ++ * If a page could be evicted immediately, e.g., a clean page, add it to ++ * the oldest generation. ++ */ ++ if (PageActive(page)) ++ gen = lru_gen_from_seq(lrugen->max_seq); ++ else if ((!type && !PageSwapCache(page)) || ++ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) ++ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); ++ else ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); ++ ++ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, -1, gen); ++ if (reclaiming) ++ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ else ++ list_add(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ return true; ++} ++ ++/* Delete a page from one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ if (!(new_flags & LRU_GEN_MASK)) ++ return false; ++ ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ ++ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ if ((new_flags & LRU_TIER_FLAGS) != LRU_TIER_FLAGS) ++ new_flags &= ~(LRU_USAGE_MASK | LRU_TIER_FLAGS); ++ /* see the comment on PageReferenced()/PageReclaim() in shrink_page_list() */ ++ if (reclaiming) ++ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ++ else if (lru_gen_is_active(lruvec, gen)) ++ new_flags |= BIT(PG_active); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, gen, -1); ++ list_del(&page->lru); ++ ++ return true; ++} ++ ++#else /* CONFIG_LRU_GEN */ ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec) + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, false)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); + } +@@ -93,6 +288,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, 
true)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); + } +@@ -100,6 +298,9 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, + static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec) + { ++ if (lru_gen_del_page(page, lruvec, false)) ++ return; ++ + list_del(&page->lru); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index fcb535560028..d6c2c3a4ba43 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -294,6 +294,94 @@ enum lruvec_flags { + */ + }; + ++struct lruvec; ++ ++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) ++#define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF) ++ ++#ifdef CONFIG_LRU_GEN ++ ++/* ++ * For each lruvec, evictable pages are divided into multiple generations. The ++ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are ++ * monotonically increasing. The sliding window technique is used to track at ++ * most MAX_NR_GENS and at least MIN_NR_GENS generations. An offset within the ++ * window, AKA gen, indexes an array of per-type and per-zone lists for the ++ * corresponding generation. The counter in page->flags stores gen+1 while a ++ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. ++ */ ++#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++ ++/* ++ * Each generation is then divided into multiple tiers. Tiers represent levels ++ * of usage from file descriptors, i.e., mark_page_accessed(). In contrast to ++ * moving across generations which requires the lru lock, moving across tiers ++ * only involves an atomic operation on page->flags and therefore has a ++ * negligible cost. ++ * ++ * The purposes of tiers are to: ++ * 1) estimate whether pages accessed multiple times via file descriptors are ++ * more active than pages accessed only via page tables by separating the two ++ * access types into upper tiers and the base tier and comparing refault rates ++ * across tiers. ++ * 2) improve buffered io performance by deferring the protection of pages ++ * accessed multiple times until the eviction. That is the protection happens ++ * in the reclaim path, not the access path. ++ * ++ * Pages accessed N times via file descriptors belong to tier order_base_2(N). ++ * The base tier may be marked by PageReferenced(). All upper tiers are marked ++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are ++ * used to support more than one upper tier. ++ */ ++#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) ++#define LRU_TIER_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ ++/* Whether to keep historical stats for each generation. 
*/ ++#ifdef CONFIG_LRU_GEN_STATS ++#define NR_STAT_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++#else ++#define NR_STAT_GENS 1U ++#endif ++ ++struct lrugen { ++ /* the aging increments the max generation number */ ++ unsigned long max_seq; ++ /* the eviction increments the min generation numbers */ ++ unsigned long min_seq[ANON_AND_FILE]; ++ /* the birth time of each generation in jiffies */ ++ unsigned long timestamps[MAX_NR_GENS]; ++ /* the multigenerational lru lists */ ++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the sizes of the multigenerational lru lists in pages */ ++ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* to determine which type and its tiers to evict */ ++ atomic_long_t refaulted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t evicted[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the base tier isn't protected, hence the minus one */ ++ unsigned long protected[NR_STAT_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of evicted+protected */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* whether the multigenerational lru is enabled */ ++ bool enabled[ANON_AND_FILE]; ++}; ++ ++void lru_gen_init_lrugen(struct lruvec *lruvec); ++void lru_gen_set_state(bool enable, bool main, bool swap); ++ ++#else /* CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_lrugen(struct lruvec *lruvec) ++{ ++} ++ ++static inline void lru_gen_set_state(bool enable, bool main, bool swap) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct lruvec { + struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ +@@ -311,6 +399,10 @@ struct lruvec { + unsigned long refaults[ANON_AND_FILE]; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; ++#ifdef CONFIG_LRU_GEN ++ /* unevictable pages are on LRU_UNEVICTABLE */ ++ struct lrugen evictable; ++#endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h +index ef1e3e736e14..ce8d5732a3aa 100644 +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -26,6 +26,14 @@ + + #define ZONES_WIDTH ZONES_SHIFT + ++#ifdef CONFIG_LRU_GEN ++/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). 
*/ ++#define LRU_USAGE_WIDTH (CONFIG_TIERS_PER_GEN - 2) ++#else ++#define LRU_GEN_WIDTH 0 ++#define LRU_USAGE_WIDTH 0 ++#endif ++ + #ifdef CONFIG_SPARSEMEM + #include + #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) +@@ -55,7 +63,8 @@ + #define SECTIONS_WIDTH 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ ++ <= BITS_PER_LONG - NR_PAGEFLAGS + #define NODES_WIDTH NODES_SHIFT + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) + #error "Vmemmap: No space for nodes field in page flags" +@@ -89,8 +98,8 @@ + #define LAST_CPUPID_SHIFT 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ +- <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS + #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT + #else + #define LAST_CPUPID_WIDTH 0 +@@ -100,8 +109,8 @@ + #define LAST_CPUPID_NOT_IN_PAGE_FLAGS + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ +- > BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_USAGE_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS + #error "Not enough bits in page flags" + #endif + +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 5922031ffab6..0156ac5f08f0 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -848,7 +848,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_active | \ +- 1UL << PG_unevictable | __PG_MLOCKED) ++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + + /* + * Flags checked when a page is prepped for return by the page allocator. +@@ -859,7 +859,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) + * alloc-free cycle to prevent from reusing the page. 
+ */ + #define PAGE_FLAGS_CHECK_AT_PREP \ +- (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) ++ ((((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_USAGE_MASK) + + #define PAGE_FLAGS_PRIVATE \ + (1UL << PG_private | 1UL << PG_private_2) +diff --git a/kernel/bounds.c b/kernel/bounds.c +index 9795d75b09b2..aba13aa7336c 100644 +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -22,6 +22,9 @@ int main(void) + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + #endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); ++#ifdef CONFIG_LRU_GEN ++ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); ++#endif + /* End of constants */ + + return 0; +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index bfbeabc17a9d..bec59189e206 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -146,7 +146,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index afff3ac87067..d5ccbfb50352 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2390,7 +2390,8 @@ static void __split_huge_page_tail(struct page *head, int tail, + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_USAGE_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 9ddaf0e1b0ab..ef0deadb90a7 100644 +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_USAGE_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_USAGE_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +diff --git a/mm/mmzone.c b/mm/mmzone.c +index eb89d6e018e2..2055d66a7f22 100644 +--- a/mm/mmzone.c ++++ b/mm/mmzone.c +@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec) + + for_each_lru(lru) + INIT_LIST_HEAD(&lruvec->lists[lru]); ++ ++ lru_gen_init_lrugen(lruvec); + } + + #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +diff --git a/mm/swapfile.c b/mm/swapfile.c +index 1e07d1c776f2..19dacc4ae35e 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) + err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_set_state(false, false, true); + + out_dput: + filp_close(victim, NULL); +@@ -3343,6 +3344,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_set_state(true, false, true); + + error = 0; + goto out; +diff --git 
a/mm/vmscan.c b/mm/vmscan.c +index b6d14880bd76..a02b5ff37e31 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -49,6 +49,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -2731,6 +2732,334 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + } + } + ++#ifdef CONFIG_LRU_GEN ++ ++/* ++ * After a page is faulted in, the aging must scan it twice before the eviction ++ * can consider it. The first scan clears the accessed bit set during the ++ * initial fault. And the second scan makes sure it hasn't been used since the ++ * first scan. ++ */ ++#define MIN_NR_GENS 2 ++ ++#define MAX_BATCH_SIZE 8192 ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define DEFINE_MAX_SEQ(lruvec) \ ++ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) ++ ++#define DEFINE_MIN_SEQ(lruvec) \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE((lruvec)->evictable.min_seq[0]), \ ++ READ_ONCE((lruvec)->evictable.min_seq[1]), \ ++ } ++ ++#define for_each_type_zone(type, zone) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static int page_lru_gen(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static int page_lru_tier(struct page *page) ++{ ++ int usage; ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ usage = (flags & LRU_TIER_FLAGS) == LRU_TIER_FLAGS ? ++ ((flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF) + 1 : 0; ++ ++ return lru_tier_from_usage(usage); ++} ++ ++static int get_lo_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness) ++{ ++ return max_seq - max(min_seq[!swappiness], min_seq[1]) + 1; ++} ++ ++static int get_hi_wmark(unsigned long max_seq, unsigned long *min_seq, int swappiness) ++{ ++ return max_seq - min(min_seq[!swappiness], min_seq[1]) + 1; ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; ++} ++ ++static int get_swappiness(struct mem_cgroup *memcg) ++{ ++ return mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ? 
++ mem_cgroup_swappiness(memcg) : 0; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ return get_nr_gens(lruvec, 0) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS && ++ get_nr_gens(lruvec, 1) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 1) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); ++#else ++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); ++#endif ++ ++static DEFINE_MUTEX(lru_gen_state_mutex); ++static int lru_gen_nr_swapfiles; ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ enum lru_list lru; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for_each_evictable_lru(lru) { ++ type = is_file_lru(lru); ++ ++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ ++ VM_WARN_ON_ONCE(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); ++ } ++ ++ return true; ++} ++ ++static bool fill_lists(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ if (!lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page) != active, page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_lru_gen(page) >= 0, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_add_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_lists(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; ++ ++ if (lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ success = lru_gen_del_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++/* ++ * For file page tracking, we enable/disable it according to the main switch. ++ * For anon page tracking, we only enabled it when the main switch is on and ++ * there is at least one swapfile; we disable it when there are no swapfiles ++ * regardless of the value of the main switch. Otherwise, we will eventually ++ * reach the max size of the sliding window and have to call inc_min_seq(). 
++ */ ++void lru_gen_set_state(bool enable, bool main, bool swap) ++{ ++ struct mem_cgroup *memcg; ++ ++ mem_hotplug_begin(); ++ mutex_lock(&lru_gen_state_mutex); ++ cgroup_lock(); ++ ++ if (swap) { ++ if (enable) ++ swap = !lru_gen_nr_swapfiles++; ++ else ++ swap = !--lru_gen_nr_swapfiles; ++ } ++ ++ if (main && enable != lru_gen_enabled()) { ++ if (enable) ++ static_branch_enable(&lru_gen_static_key); ++ else ++ static_branch_disable(&lru_gen_static_key); ++ } else if (!swap || !lru_gen_enabled()) ++ goto unlock; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ ++ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ cgroup_unlock(); ++ mutex_unlock(&lru_gen_state_mutex); ++ mem_hotplug_done(); ++} ++ ++static int __meminit __maybe_unused mem_notifier(struct notifier_block *self, ++ unsigned long action, void *arg) ++{ ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ struct memory_notify *mn = arg; ++ int nid = mn->status_change_nid; ++ ++ if (nid == NUMA_NO_NODE) ++ return NOTIFY_DONE; ++ ++ pgdat = NODE_DATA(nid); ++ ++ if (action != MEM_GOING_ONLINE) ++ return NOTIFY_DONE; ++ ++ mutex_lock(&lru_gen_state_mutex); ++ cgroup_lock(); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ cgroup_unlock(); ++ mutex_unlock(&lru_gen_state_mutex); ++ ++ return NOTIFY_DONE; ++} ++ ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_lrugen(struct lruvec *lruvec) ++{ ++ int i; ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ ++ if (hotplug_memory_notifier(mem_notifier, 0)) ++ pr_err("lru_gen: failed to subscribe hotplug notifications\n"); ++ ++ return 0; ++}; ++/* ++ * We want to run as early as possible because debug code may call mm_alloc() ++ * and mmput(). Our only dependency mm_kobj is initialized one stage earlier. 
++ */ ++arch_initcall(init_lru_gen); ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; + +From patchwork Wed Aug 18 06:31:01 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443387 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 92D07C4338F + for ; Wed, 18 Aug 2021 06:31:23 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 3BCCD60720 + for ; Wed, 18 Aug 2021 06:31:23 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 3BCCD60720 +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 8EA5B6B007D; Wed, 18 Aug 2021 02:31:19 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 873738D0001; Wed, 18 Aug 2021 02:31:19 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 6C5676B0080; Wed, 18 Aug 2021 02:31:19 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0086.hostedemail.com + [216.40.44.86]) + by kanga.kvack.org (Postfix) with ESMTP id 4C3D66B007D + for ; Wed, 18 Aug 2021 02:31:19 -0400 (EDT) +Received: from smtpin38.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay04.hostedemail.com (Postfix) with ESMTP id 01DD522AC6 + for ; Wed, 18 Aug 2021 06:31:19 +0000 (UTC) +X-FDA: 78487229478.38.4FCB738 +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf11.hostedemail.com (Postfix) with ESMTP id A3FD6F0058B7 + for ; Wed, 18 Aug 2021 06:31:18 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + a62-20020a254d410000b0290592f360b0ccso1779558ybb.14 + for ; Tue, 17 Aug 2021 23:31:18 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=jfrjqXsmv61MIW7XVF0D5wfNnO+ULVoUNMa8niOlBGQ=; + b=kEmCqE0fGoy+x3LZ6IN9OQroYs/M7PGfxceaaMiw0N9yv+Yhl2PxOGZNo4Z1kuK4RA + oiDWUZPK+/InA0lpXD+L+PPIhmNjqLYh1Xk3UbIroWi5bpN8eAcHNrxeFGhRx2aBycXj + VvsILW8bwMP5asnncGzE95IcZRx0XNv4BsR2/n8Z3vWqguhsgzHtbujMCLFCHdCV1tJN + RWwoUL6R7nIm79J3XUy5ybYI0x4GV2eBXG8ogWZhhaMawVOtogbCVzfl7DYCiYJrpwxx + OWEiJrDJ0yD9o4gcd+H8rAfxuBiT4Nmxp3WRk5uxqqAZrqU7xHfnluKQbBQx7CF0stvo + Gcnw== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=jfrjqXsmv61MIW7XVF0D5wfNnO+ULVoUNMa8niOlBGQ=; + b=YuM6WAHkV6i5n+DFfdw4JW1PcazQZmmw09fr99EEwT4uo56N38/qxOYB6EivzV3Yca + LIjwHiJkI409RTARgEXNkmss984Cvajjpgyb+kUFCGxb3sV5VQGsK+p5lUtVI1CeTyEB 
+ /qFHPbpHR0ORA4hEH1W9YLQgJAOVf6Z5hR60gvoKuCT+XxOCmRl9ElgW0KERbAUqqyzr + xyLEJvDURuxcMwIUIMZmKr2y2oghZBJBS5HXujFM5kQC2y34XUOVcm5dmrRpQtUtLW84 + gVbijejCCMfwsz3h7B43zRnMeP6wKP+5ZNXnN3Vr3xvI8vK9qZjXyragXP5+r7jD8M0n + rR4w== +X-Gm-Message-State: AOAM531PtWv8ev2ddQvJWS3updl/wCzKANw4xc9A6vNLCfWKQjxR98zh + Z56/cGM0XJxLif31xAumwtHuRErxlQytFwwu57wOziMFRIFBGCEJnptI6Zo7y+zKmXMXhbDW4uH + 5jYT/bBml4jX2CneGWgZkqMhwI4wcJnUh8V/Z1IoOMF+CzK+G38utAG6m +X-Google-Smtp-Source: + ABdhPJxVcQalP9BDTyAjtd3xQFG6qKv0PFi+YwMJh1S+798Q+5pcLqa0LKr8Xp9guTw6VHKvQ9x4/v7ys9A= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:38d1:: with SMTP id + f200mr9696686yba.183.1629268277914; + Tue, 17 Aug 2021 23:31:17 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:31:01 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-6-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 05/11] mm: multigenerational lru: protection +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +Authentication-Results: imf11.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=kEmCqE0f; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf11.hostedemail.com: domain of + 3NakcYQYKCAo849rkyqyyqvo.mywvsx47-wwu5kmu.y1q@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3NakcYQYKCAo849rkyqyyqvo.mywvsx47-wwu5kmu.y1q@flex--yuzhao.bounces.google.com +X-Stat-Signature: imnj7fk4dgu1uknzz41wugrph3mjm1xd +X-Rspamd-Queue-Id: A3FD6F0058B7 +X-Rspamd-Server: rspam05 +X-HE-Tag: 1629268278-394211 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +The protection is based on page access types and patterns. There are +two access types: one via page tables and the other via file +descriptors. The protection of the former type is by design stronger +because: + 1) The uncertainty in determining the access patterns of the former + type is higher due to the coalesced nature of the accessed bit. + 2) The cost of evicting the former type is higher due to the TLB + flushes required and the likelihood of involving I/O. + 3) The penalty of under-protecting the former type is higher because + applications usually do not prepare themselves for major faults like + they do for blocked I/O. For example, client applications commonly + dedicate blocked I/O to separate threads to avoid UI janks that + negatively affect user experience. + +There are also two access patterns: one with temporal locality and the +other without. The latter pattern, e.g., random and sequential, needs +to be explicitly excluded to avoid weakening the protection of the +former pattern. Generally the former type follows the former pattern +unless MADV_SEQUENTIAL is specified and the latter type follows the +latter pattern unless outlying refaults have been observed. + +Upon faulting, a page is added to the youngest generation, which +provides the strongest protection as the eviction will not consider +this page before the aging has scanned it at least twice. The first +scan clears the accessed bit set during the initial fault. 
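(For illustration only: the placement rule sketched in this message is the one implemented
by lru_gen_add_page() in the mm_inline.h hunk earlier in this series. Below is a condensed,
standalone restatement of that decision; the function name, parameter names and the
stdbool.h include are illustrative and not part of the patch.)

	#include <stdbool.h>

	/* Sketch: which generation a page joins when it is added to an lru list. */
	static unsigned long pick_generation(bool accessed_on_fault, bool cannot_evict_now,
					     unsigned long max_seq, unsigned long min_seq)
	{
		if (accessed_on_fault)		/* PageActive(): mapped upon fault */
			return max_seq;		/* youngest generation, strongest protection */
		if (cannot_evict_now)		/* anon not in swap cache, or dirty/writeback under reclaim */
			return min_seq + 1;	/* second oldest generation */
		return min_seq;			/* oldest generation: clean, evictable immediately */
	}

(The eviction consumes generations starting from min_seq and working toward max_seq, which
is why landing in the youngest generation is the strongest form of protection.)
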
And the +second scan makes sure this page has not been used since the first +scan. A page from any other generations is brought back to the +youngest generation whenever the aging finds the accessed bit set on +any of the PTEs mapping this page. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. Pages accessed N times via file +descriptors belong to tier order_base_2(N). Each tier keeps track of +how many pages from it have refaulted. Tier 0 is the base tier and +pages from it are evicted unconditionally because there are no better +candidates. Pages from an upper tier are either evicted or moved to +the next generation, depending on whether this upper tier has a higher +refault rate than the base tier. This model has the following +advantages: + 1) It removes the cost in the buffered access path and reduces the + overall cost of protection because pages are conditionally protected + in the reclaim path. + 2) It takes mapped pages into account and avoids overprotecting + pages accessed multiple times via file descriptors. + 3 Additional tiers improve the protection of pages accessed more + than twice. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + include/linux/mm.h | 32 ++++++++++++ + include/linux/sched.h | 3 ++ + mm/memory.c | 7 +++ + mm/swap.c | 51 +++++++++++++++++- + mm/vmscan.c | 91 +++++++++++++++++++++++++++++++- + mm/workingset.c | 119 +++++++++++++++++++++++++++++++++++++++++- + 6 files changed, 298 insertions(+), 5 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 159b7c94e067..7a91518792ba 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1778,6 +1778,25 @@ void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); + void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); ++ ++static inline void task_enter_nonseq_fault(void) ++{ ++ WARN_ON(current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 1; ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++ WARN_ON(!current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 0; ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return current->in_nonseq_fault; ++} + #else + static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, +@@ -1799,6 +1818,19 @@ static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } + static inline void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) { } ++ ++static inline void task_enter_nonseq_fault(void) ++{ ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return false; ++} + #endif + + static inline void unmap_shared_mapping_range(struct address_space *mapping, +diff --git a/include/linux/sched.h b/include/linux/sched.h +index ec8d07d88641..fd41c9c86cd1 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -843,6 +843,9 @@ struct task_struct { + #ifdef CONFIG_MEMCG + unsigned in_user_fault:1; + #endif ++#ifdef CONFIG_MMU ++ unsigned in_nonseq_fault:1; ++#endif + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif +diff --git a/mm/memory.c b/mm/memory.c +index 2f96179db219..fa40a5b7a7a7 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -4752,6 +4752,7 @@ vm_fault_t 
handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs) + { + vm_fault_t ret; ++ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); + + __set_current_state(TASK_RUNNING); + +@@ -4773,11 +4774,17 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + ++ if (nonseq_fault) ++ task_enter_nonseq_fault(); ++ + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); + ++ if (nonseq_fault) ++ task_exit_nonseq_fault(); ++ + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* +diff --git a/mm/swap.c b/mm/swap.c +index 19600430e536..0d3fb2ee3fd6 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -411,6 +411,43 @@ static void __lru_cache_activate_page(struct page *page) + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void page_inc_usage(struct page *page) ++{ ++ unsigned long usage; ++ unsigned long old_flags, new_flags; ++ ++ if (PageUnevictable(page)) ++ return; ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & BIT(PG_referenced))) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ if (!(new_flags & BIT(PG_workingset))) { ++ new_flags |= BIT(PG_workingset); ++ continue; ++ } ++ ++ usage = new_flags & LRU_USAGE_MASK; ++ usage = min(usage + BIT(LRU_USAGE_PGOFF), LRU_USAGE_MASK); ++ ++ new_flags &= ~LRU_USAGE_MASK; ++ new_flags |= usage; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++} ++#else ++static void page_inc_usage(struct page *page) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. 
+ * +@@ -425,6 +462,11 @@ void mark_page_accessed(struct page *page) + { + page = compound_head(page); + ++ if (lru_gen_enabled()) { ++ page_inc_usage(page); ++ return; ++ } ++ + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { +@@ -468,6 +510,11 @@ void lru_cache_add(struct page *page) + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + ++ /* see the comment in lru_gen_add_page() */ ++ if (lru_gen_enabled() && !PageUnevictable(page) && ++ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) ++ SetPageActive(page); ++ + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); +@@ -569,7 +616,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) + + static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) + { +- if (PageActive(page) && !PageUnevictable(page)) { ++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + int nr_pages = thp_nr_pages(page); + + del_page_from_lru_list(page, lruvec); +@@ -684,7 +731,7 @@ void deactivate_file_page(struct page *page) + */ + void deactivate_page(struct page *page) + { +- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { ++ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); +diff --git a/mm/vmscan.c b/mm/vmscan.c +index a02b5ff37e31..788b4d1ce149 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1094,9 +1094,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; +- mem_cgroup_swapout(page, swap); ++ ++ /* get a shadow entry before page_memcg() is cleared */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(page, target_memcg); ++ mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page, swap, shadow); + xa_unlock_irqrestore(&mapping->i_pages, flags); + put_swap_page(page, swap); +@@ -2813,6 +2815,93 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) + get_nr_gens(lruvec, 1) <= MAX_NR_GENS; + } + ++/****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop modeled after the PID controller. Currently supports the ++ * proportional (P) and the integral (I) terms; the derivative (D) term can be ++ * added if necessary. The setpoint (SP) is the desired position; the process ++ * variable (PV) is the measured position. The error is the difference between ++ * the SP and the PV. A positive error results in a positive control output ++ * correction, which, in our case, is to allow eviction. ++ * ++ * The P term is the refault rate of the current generation being evicted. The I ++ * term is the exponential moving average of the refault rates of the previous ++ * generations, using the smoothing factor 1/2. ++ * ++ * Our goal is to make sure upper tiers have similar refault rates as the base ++ * tier. That is we try to be fair to all tiers by maintaining similar refault ++ * rates across them. 
++ */ ++struct controller_pos { ++ unsigned long refaulted; ++ unsigned long total; ++ int gain; ++}; ++ ++static void read_controller_pos(struct controller_pos *pos, struct lruvec *lruvec, ++ int type, int tier, int gain) ++{ ++ struct lrugen *lrugen = &lruvec->evictable; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_controller_pos(struct lruvec *lruvec, int gen, int type) ++{ ++ int tier; ++ int hist = lru_hist_from_seq(gen); ++ struct lrugen *lrugen = &lruvec->evictable; ++ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ if (!carryover && NR_STAT_GENS == 1) ++ return; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ ++ if (NR_STAT_GENS > 1) ++ continue; ++ } ++ ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++} ++ ++static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *pv) ++{ ++ /* ++ * Allow eviction if the PV has a limited number of refaulted pages or a ++ * lower refault rate than the SP. ++ */ ++ return pv->refaulted < SWAP_CLUSTER_MAX || ++ pv->refaulted * max(sp->total, 1UL) * sp->gain <= ++ sp->refaulted * max(pv->total, 1UL) * pv->gain; ++} ++ + /****************************************************************************** + * state change + ******************************************************************************/ +diff --git a/mm/workingset.c b/mm/workingset.c +index 5ba3e42446fa..75dbfba773a6 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,116 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static int page_get_usage(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_USAGE_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ /* see the comment on MAX_NR_TIERS */ ++ return flags & BIT(PG_workingset) ? ++ (flags & LRU_USAGE_MASK) >> LRU_USAGE_PGOFF : 0; ++} ++ ++/* Return a token to be stored in the shadow entry of a page being evicted. 
*/ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist, tier; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ int type = page_is_file_lru(page); ++ int usage = page_get_usage(page); ++ bool workingset = PageWorkingset(page); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_USAGE_WIDTH) | usage; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_usage(usage + workingset); ++ atomic_long_add(thp_nr_pages(page), &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); ++} ++ ++/* Count a refaulted page based on the token stored in its shadow entry. */ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, usage; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = page_is_file_lru(page); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ if (page_pgdat(page) != pgdat) ++ return; ++ ++ rcu_read_lock(); ++ memcg = page_memcg_rcu(page); ++ if (mem_cgroup_id(memcg) != memcg_id) ++ goto unlock; ++ ++ usage = token & (BIT(LRU_USAGE_WIDTH) - 1); ++ if (usage && !workingset) ++ goto unlock; ++ ++ token >>= LRU_USAGE_WIDTH; ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if (token != (min_seq & (EVICTION_MASK >> LRU_USAGE_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_usage(usage + workingset); ++ atomic_long_add(thp_nr_pages(page), &lrugen->refaulted[hist][type][tier]); ++ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type); ++ ++ /* ++ * Tiers don't offer any protection to pages accessed via page tables. ++ * That's what generations do. Tiers can't fully protect pages after ++ * their usage has exceeded the max value. Conservatively count these ++ * two conditions as stalls even though they might not indicate any real ++ * memory pressure. 
++ */ ++ if (task_in_nonseq_fault() || usage + workingset == BIT(LRU_USAGE_WIDTH)) { ++ SetPageWorkingset(page); ++ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else /* CONFIG_LRU_GEN */ ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +369,14 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } +@@ -296,7 +405,13 @@ void workingset_refault(struct page *page, void *shadow) + bool workingset; + int memcgid; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* + +From patchwork Wed Aug 18 06:31:02 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443389 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 3DA1FC432BE + for ; Wed, 18 Aug 2021 06:31:26 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id C9E496103A + for ; Wed, 18 Aug 2021 06:31:25 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org C9E496103A +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 454C36B007E; Wed, 18 Aug 2021 02:31:21 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 3DEF56B0080; Wed, 18 Aug 2021 02:31:21 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 27F2E6B0081; Wed, 18 Aug 2021 02:31:21 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0226.hostedemail.com + [216.40.44.226]) + by kanga.kvack.org (Postfix) with ESMTP id 0669D6B007E + for ; Wed, 18 Aug 2021 02:31:21 -0400 (EDT) +Received: from smtpin11.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay01.hostedemail.com (Postfix) with ESMTP id A3C7E1841C1EC + for ; Wed, 18 Aug 2021 06:31:20 +0000 (UTC) +X-FDA: 
78487229520.11.54AB6F9 +Received: from mail-il1-f202.google.com (mail-il1-f202.google.com + [209.85.166.202]) + by imf08.hostedemail.com (Postfix) with ESMTP id 5ACDF3004A58 + for ; Wed, 18 Aug 2021 06:31:20 +0000 (UTC) +Received: by mail-il1-f202.google.com with SMTP id + c20-20020a9294140000b02902141528bc7cso698606ili.3 + for ; Tue, 17 Aug 2021 23:31:20 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=aBs1pFrO53Rncja9fuL1uL4T+2CTQbWp/Gm5wFcNO54=; + b=LGz2Nl/Mf512SIdop9E9gI9OF+TdKt1NO0NGoDEE8mnBztbmkJs12hsmhp4vglamuo + kwiKQ2zz1tryY/kWMhRJ2lAkN2VVlWng+y8I9UKlUdW8DveB4he27WkolMLyEiki7AKE + 8kIj9ed9E8Nq/nNhZODdHzd0qRHAyGCLDhIv52y4Aqv92izbx46So60qHY3aHT9gRQPa + PBrNjB56MimjDIVp+AmFqcR1COdjH8QX+Tb8NHCz2zzq23z0QAkIsjS6YbANgjDjdMlJ + O5MnS1rvZWXtyQmrdePmMfGff2dNs2P74rnHoGqeBiwdv/53DWQDwlEqF1rr+oXEnHIL + Er7Q== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=aBs1pFrO53Rncja9fuL1uL4T+2CTQbWp/Gm5wFcNO54=; + b=ccQnJrqjc+Px59ahn30tVqxlwLL+/WUyTuUaS/iX3sNScM/Ko3BLid2e4srXqC0ERq + JvydTCtrqIHV58lNPdUvxwPaPrprpfsi3qAe0N2HHJGL2BIm+c2QOaV8HvFDwKaMVJM4 + UYD+LU2pILKA9dZhGKqVKaDWxKM+7Cvk2m3bQakDddDObqQrZMp+Dvihg7sdXIFkOSNA + x9kB0PYfTlZvX8nGhgQx8Evd2yWq717x+KXUS1q/h4ncNBJV46jNPqEqvpPtMpMzFzNo + 789TaLUSxksUVX4bU/v5seQqYVBrT82gBNmExxUT9jwX9EW+wzbzuM/qTJ+o72lah0M8 + +ARw== +X-Gm-Message-State: AOAM530ZdEdZ5pAeyqqqI9Bqgh0O+0ynkQYXkuziCf/B0ajH8122k5sT + gls/SCTqGymPpAciOm9cOckNCn6AyseMrpxVCnMqOhbOtimXLd9GCDZRlAhVvxfNd3G5rlx4FkU + s1F4KFfqscSOcvcR/Y5hS3eshWPNVSbwWr1xzeYofLjI1n9pm1x0JM4ai +X-Google-Smtp-Source: + ABdhPJxkf+A7lOf48PabGNLMm+/xDsY/XQj18H+ZNhHGpzdfJ357A44QxsN1scjme8OdHZpi43TRW9e6JvA= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a6b:2bd6:: with SMTP id + r205mr6000886ior.122.1629268279406; + Tue, 17 Aug 2021 23:31:19 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:31:02 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-7-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 06/11] mm: multigenerational lru: mm_struct list +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +X-Rspamd-Queue-Id: 5ACDF3004A58 +Authentication-Results: imf08.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b="LGz2Nl/M"; + spf=pass (imf08.hostedemail.com: domain of + 3N6kcYQYKCAwA6Btm0s00sxq.o0yxuz69-yyw7mow.03s@flex--yuzhao.bounces.google.com + designates 209.85.166.202 as permitted sender) + smtp.mailfrom=3N6kcYQYKCAwA6Btm0s00sxq.o0yxuz69-yyw7mow.03s@flex--yuzhao.bounces.google.com; + dmarc=pass (policy=reject) header.from=google.com +X-Rspamd-Server: rspam01 +X-Stat-Signature: 1z7x61aak98o6q9r3h9m8dhjfzp8w8cq +X-HE-Tag: 1629268280-900776 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +To scan PTEs for accessed pages, a mm_struct list is maintained for +each memcg. 
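(For illustration only: a rough usage sketch of the interface this patch adds. A per-node
walker pulls one mm_struct at a time from the per-memcg list via get_next_mm(); the caller
shown here is hypothetical, since the real walker arrives later in the series, and the
walk_page_range() comment only marks where the page table scan would go.)

	/* Hypothetical caller: drain one round of the per-memcg mm_struct list. */
	static bool sketch_walk_mm_list(struct mm_walk_args *args)
	{
		bool last = false;
		struct mm_struct *mm = NULL;

		do {
			last = get_next_mm(args, &mm);
			if (mm) {
				/* scan this mm's page tables, e.g. via walk_page_range() */
			}
			cond_resched();
		} while (mm);

		/* true: this walker was the last to finish for args->max_seq */
		return last;
	}
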
When multiple threads traverse the same memcg->mm_list, +each of them gets a unique mm_struct and therefore they can run +walk_page_range() concurrently to reach page tables of all processes +of this memcg. + +And to skip page tables of processes that have been sleeping since the +last walk, the usage of mm_struct is also tracked between context +switches. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + fs/exec.c | 2 + + include/linux/memcontrol.h | 6 + + include/linux/mm_types.h | 107 +++++++++++++ + kernel/exit.c | 1 + + kernel/fork.c | 10 ++ + kernel/kthread.c | 1 + + kernel/sched/core.c | 2 + + mm/memcontrol.c | 28 ++++ + mm/vmscan.c | 313 +++++++++++++++++++++++++++++++++++++ + 9 files changed, 470 insertions(+) + +diff --git a/fs/exec.c b/fs/exec.c +index 38f63451b928..7ead083bcb39 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1005,6 +1005,7 @@ static int exec_mmap(struct mm_struct *mm) + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; ++ lru_gen_add_mm(mm); + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1015,6 +1016,7 @@ static int exec_mmap(struct mm_struct *mm) + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + activate_mm(active_mm, mm); ++ lru_gen_switch_mm(active_mm, mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + tsk->mm->vmacache_seqnum = 0; +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index bfe5c486f4ad..5e223cecb5c2 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -230,6 +230,8 @@ struct obj_cgroup { + }; + }; + ++struct lru_gen_mm_list; ++ + /* + * The memory controller data structure. The memory controller controls both + * page cache and RSS per cgroup. We would eventually like to provide +@@ -349,6 +351,10 @@ struct mem_cgroup { + struct deferred_split deferred_split_queue; + #endif + ++#ifdef CONFIG_LRU_GEN ++ struct lru_gen_mm_list *mm_list; ++#endif ++ + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 52bbd2b7cb46..d9a2ba150ce8 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -15,6 +15,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -571,6 +573,22 @@ struct mm_struct { + + #ifdef CONFIG_IOMMU_SUPPORT + u32 pasid; ++#endif ++#ifdef CONFIG_LRU_GEN ++ struct { ++ /* the node of a global or per-memcg mm_struct list */ ++ struct list_head list; ++#ifdef CONFIG_MEMCG ++ /* points to the memcg of the owner task above */ ++ struct mem_cgroup *memcg; ++#endif ++ /* whether this mm_struct has been used since the last walk */ ++ nodemask_t nodes; ++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ /* the number of CPUs using this mm_struct */ ++ atomic_t nr_cpus; ++#endif ++ } lrugen; + #endif + } __randomize_layout; + +@@ -598,6 +616,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) + return (struct cpumask *)&mm->cpu_bitmap; + } + ++#ifdef CONFIG_LRU_GEN ++ ++void lru_gen_init_mm(struct mm_struct *mm); ++void lru_gen_add_mm(struct mm_struct *mm); ++void lru_gen_del_mm(struct mm_struct *mm); ++#ifdef CONFIG_MEMCG ++int lru_gen_alloc_mm_list(struct mem_cgroup *memcg); ++void lru_gen_free_mm_list(struct mem_cgroup *memcg); ++void lru_gen_migrate_mm(struct mm_struct *mm); ++#endif ++ ++/* Track the usage of each mm_struct so that we can skip inactive ones. 
*/ ++static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) ++{ ++ /* exclude init_mm, efi_mm, etc. */ ++ if (!core_kernel_data((unsigned long)old)) { ++ VM_BUG_ON(old == &init_mm); ++ ++ nodes_setall(old->lrugen.nodes); ++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ atomic_dec(&old->lrugen.nr_cpus); ++ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old); ++#endif ++ } else ++ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) || ++ READ_ONCE(old->lrugen.list.next), old); ++ ++ if (!core_kernel_data((unsigned long)new)) { ++ VM_BUG_ON(new == &init_mm); ++ ++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ atomic_inc(&new->lrugen.nr_cpus); ++ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new); ++#endif ++ } else ++ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) || ++ READ_ONCE(new->lrugen.list.next), new); ++} ++ ++/* Return whether this mm_struct is being used on any CPUs. */ ++static inline bool lru_gen_mm_is_active(struct mm_struct *mm) ++{ ++#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ return !cpumask_empty(mm_cpumask(mm)); ++#else ++ return atomic_read(&mm->lrugen.nr_cpus); ++#endif ++} ++ ++#else /* CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_add_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_del_mm(struct mm_struct *mm) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) ++{ ++ return 0; ++} ++ ++static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++} ++#endif ++ ++static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) ++{ ++} ++ ++static inline bool lru_gen_mm_is_active(struct mm_struct *mm) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/kernel/exit.c b/kernel/exit.c +index 9a89e7f36acb..c24d5ffae792 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm) + goto retry; + } + WRITE_ONCE(mm->owner, c); ++ lru_gen_migrate_mm(mm); + task_unlock(c); + put_task_struct(c); + } +diff --git a/kernel/fork.c b/kernel/fork.c +index bc94b2cc5995..e5f5dd5ac584 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -669,6 +669,7 @@ static void check_mm(struct mm_struct *mm) + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); + #endif ++ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm); + } + + #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +@@ -1066,6 +1067,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + goto fail_nocontext; + + mm->user_ns = get_user_ns(user_ns); ++ lru_gen_init_mm(mm); + return mm; + + fail_nocontext: +@@ -1108,6 +1110,7 @@ static inline void __mmput(struct mm_struct *mm) + } + if (mm->binfmt) + module_put(mm->binfmt->module); ++ lru_gen_del_mm(mm); + mmdrop(mm); + } + +@@ -2530,6 +2533,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) + get_task_struct(p); + } + ++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { ++ /* lock the task to synchronize with memcg migration */ ++ task_lock(p); ++ lru_gen_add_mm(p->mm); ++ task_unlock(p); ++ } ++ + wake_up_new_task(p); + + /* forking complete 
and child started to run, tell ptracer */ +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 5b37a8567168..fd827fdad26b 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -1361,6 +1361,7 @@ void kthread_use_mm(struct mm_struct *mm) + tsk->mm = mm; + membarrier_update_current_mm(mm); + switch_mm_irqs_off(active_mm, mm, tsk); ++ lru_gen_switch_mm(active_mm, mm); + local_irq_enable(); + task_unlock(tsk); + #ifdef finish_arch_post_lock_switch +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 20ffcc044134..eea1457704ed 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4665,6 +4665,7 @@ context_switch(struct rq *rq, struct task_struct *prev, + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ lru_gen_switch_mm(prev->active_mm, next->mm); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ +@@ -8391,6 +8392,7 @@ void idle_task_exit(void) + + if (mm != &init_mm) { + switch_mm(mm, &init_mm, current); ++ lru_gen_switch_mm(mm, &init_mm); + finish_arch_post_lock_switch(); + } + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 702a81dfe72d..8597992797d0 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5172,6 +5172,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) + for_each_node(node) + free_mem_cgroup_per_node_info(memcg, node); + free_percpu(memcg->vmstats_percpu); ++ lru_gen_free_mm_list(memcg); + kfree(memcg); + } + +@@ -5221,6 +5222,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) + if (alloc_mem_cgroup_per_node_info(memcg, node)) + goto fail; + ++ if (lru_gen_alloc_mm_list(memcg)) ++ goto fail; ++ + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; + +@@ -6182,6 +6186,29 @@ static void mem_cgroup_move_task(void) + } + #endif + ++#ifdef CONFIG_LRU_GEN ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *task = NULL; ++ ++ cgroup_taskset_for_each_leader(task, css, tset) ++ break; ++ ++ if (!task) ++ return; ++ ++ task_lock(task); ++ if (task->mm && task->mm->owner == task) ++ lru_gen_migrate_mm(task->mm); ++ task_unlock(task); ++} ++#else ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++#endif ++ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) + { + if (value == PAGE_COUNTER_MAX) +@@ -6523,6 +6550,7 @@ struct cgroup_subsys memory_cgrp_subsys = { + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, ++ .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 788b4d1ce149..15eadf2a135e 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2902,6 +2902,312 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos * + sp->refaulted * max(pv->total, 1UL) * pv->gain; + } + ++/****************************************************************************** ++ * mm_struct list ++ ******************************************************************************/ ++ ++enum { ++ MM_SCHED_ACTIVE, /* running processes */ ++ MM_SCHED_INACTIVE, /* sleeping processes */ ++ MM_LOCK_CONTENTION, /* lock contentions */ ++ MM_VMA_INTERVAL, /* VMAs within the range of each PUD/PMD/PTE */ ++ MM_LEAF_OTHER_NODE, /* entries not from the node under reclaim */ ++ MM_LEAF_OTHER_MEMCG, /* entries not from the memcg under reclaim */ 
++ MM_LEAF_OLD, /* old entries */ ++ MM_LEAF_YOUNG, /* young entries */ ++ MM_LEAF_DIRTY, /* dirty entries */ ++ MM_LEAF_HOLE, /* non-present entries */ ++ MM_NONLEAF_OLD, /* old non-leaf PMD entries */ ++ MM_NONLEAF_YOUNG, /* young non-leaf PMD entries */ ++ NR_MM_STATS ++}; ++ ++/* mnemonic codes for the stats above */ ++#define MM_STAT_CODES "aicvnmoydhlu" ++ ++struct lru_gen_mm_list { ++ /* the head of a global or per-memcg mm_struct list */ ++ struct list_head head; ++ /* protects the list */ ++ spinlock_t lock; ++ struct { ++ /* set to max_seq after each round of walk */ ++ unsigned long cur_seq; ++ /* the next mm_struct on the list to walk */ ++ struct list_head *iter; ++ /* to wait for the last walker to finish */ ++ struct wait_queue_head wait; ++ /* the number of concurrent walkers */ ++ int nr_walkers; ++ /* stats for debugging */ ++ unsigned long stats[NR_STAT_GENS][NR_MM_STATS]; ++ } nodes[0]; ++}; ++ ++static struct lru_gen_mm_list *global_mm_list; ++ ++static struct lru_gen_mm_list *alloc_mm_list(void) ++{ ++ int nid; ++ struct lru_gen_mm_list *mm_list; ++ ++ mm_list = kvzalloc(struct_size(mm_list, nodes, nr_node_ids), GFP_KERNEL); ++ if (!mm_list) ++ return NULL; ++ ++ INIT_LIST_HEAD(&mm_list->head); ++ spin_lock_init(&mm_list->lock); ++ ++ for_each_node(nid) { ++ mm_list->nodes[nid].cur_seq = MIN_NR_GENS; ++ mm_list->nodes[nid].iter = &mm_list->head; ++ init_waitqueue_head(&mm_list->nodes[nid].wait); ++ } ++ ++ return mm_list; ++} ++ ++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) ++{ ++#ifdef CONFIG_MEMCG ++ if (!mem_cgroup_disabled()) ++ return memcg ? memcg->mm_list : root_mem_cgroup->mm_list; ++#endif ++ VM_BUG_ON(memcg); ++ ++ return global_mm_list; ++} ++ ++void lru_gen_init_mm(struct mm_struct *mm) ++{ ++ INIT_LIST_HEAD(&mm->lrugen.list); ++#ifdef CONFIG_MEMCG ++ mm->lrugen.memcg = NULL; ++#endif ++#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ++ atomic_set(&mm->lrugen.nr_cpus, 0); ++#endif ++ nodes_clear(mm->lrugen.nodes); ++} ++ ++void lru_gen_add_mm(struct mm_struct *mm) ++{ ++ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ ++ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); ++#ifdef CONFIG_MEMCG ++ VM_BUG_ON_MM(mm->lrugen.memcg, mm); ++ WRITE_ONCE(mm->lrugen.memcg, memcg); ++#endif ++ spin_lock(&mm_list->lock); ++ list_add_tail(&mm->lrugen.list, &mm_list->head); ++ spin_unlock(&mm_list->lock); ++} ++ ++void lru_gen_del_mm(struct mm_struct *mm) ++{ ++ int nid; ++#ifdef CONFIG_MEMCG ++ struct lru_gen_mm_list *mm_list = get_mm_list(mm->lrugen.memcg); ++#else ++ struct lru_gen_mm_list *mm_list = get_mm_list(NULL); ++#endif ++ ++ spin_lock(&mm_list->lock); ++ ++ for_each_node(nid) { ++ if (mm_list->nodes[nid].iter != &mm->lrugen.list) ++ continue; ++ ++ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; ++ if (mm_list->nodes[nid].iter == &mm_list->head) ++ WRITE_ONCE(mm_list->nodes[nid].cur_seq, ++ mm_list->nodes[nid].cur_seq + 1); ++ } ++ ++ list_del_init(&mm->lrugen.list); ++ ++ spin_unlock(&mm_list->lock); ++ ++#ifdef CONFIG_MEMCG ++ mem_cgroup_put(mm->lrugen.memcg); ++ WRITE_ONCE(mm->lrugen.memcg, NULL); ++#endif ++} ++ ++#ifdef CONFIG_MEMCG ++int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) ++{ ++ if (mem_cgroup_disabled()) ++ return 0; ++ ++ memcg->mm_list = alloc_mm_list(); ++ ++ return memcg->mm_list ? 
0 : -ENOMEM; ++} ++ ++void lru_gen_free_mm_list(struct mem_cgroup *memcg) ++{ ++ kvfree(memcg->mm_list); ++ memcg->mm_list = NULL; ++} ++ ++void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++ struct mem_cgroup *memcg; ++ ++ lockdep_assert_held(&mm->owner->alloc_lock); ++ ++ if (mem_cgroup_disabled()) ++ return; ++ ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_task(mm->owner); ++ rcu_read_unlock(); ++ if (memcg == mm->lrugen.memcg) ++ return; ++ ++ VM_BUG_ON_MM(!mm->lrugen.memcg, mm); ++ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); ++ ++ lru_gen_del_mm(mm); ++ lru_gen_add_mm(mm); ++} ++ ++static bool mm_is_migrated(struct mm_struct *mm, struct mem_cgroup *memcg) ++{ ++ return READ_ONCE(mm->lrugen.memcg) != memcg; ++} ++#else ++static bool mm_is_migrated(struct mm_struct *mm, struct mem_cgroup *memcg) ++{ ++ return false; ++} ++#endif ++ ++struct mm_walk_args { ++ struct mem_cgroup *memcg; ++ unsigned long max_seq; ++ unsigned long start_pfn; ++ unsigned long end_pfn; ++ unsigned long next_addr; ++ int node_id; ++ int swappiness; ++ int batch_size; ++ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ int mm_stats[NR_MM_STATS]; ++ unsigned long bitmap[0]; ++}; ++ ++static void reset_mm_stats(struct lru_gen_mm_list *mm_list, bool last, ++ struct mm_walk_args *args) ++{ ++ int i; ++ int nid = args->node_id; ++ int hist = lru_hist_from_seq(args->max_seq); ++ ++ lockdep_assert_held(&mm_list->lock); ++ ++ for (i = 0; i < NR_MM_STATS; i++) { ++ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i], ++ mm_list->nodes[nid].stats[hist][i] + args->mm_stats[i]); ++ args->mm_stats[i] = 0; ++ } ++ ++ if (!last || NR_STAT_GENS == 1) ++ return; ++ ++ hist = lru_hist_from_seq(args->max_seq + 1); ++ for (i = 0; i < NR_MM_STATS; i++) ++ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i], 0); ++} ++ ++static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) ++{ ++ int type; ++ unsigned long size = 0; ++ ++ if (!lru_gen_mm_is_active(mm) && !node_isset(args->node_id, mm->lrugen.nodes)) ++ return true; ++ ++ if (mm_is_oom_victim(mm)) ++ return true; ++ ++ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { ++ size += type ? get_mm_counter(mm, MM_FILEPAGES) : ++ get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ } ++ ++ /* leave the legwork to the rmap if the mappings are too sparse */ ++ if (size < max(SWAP_CLUSTER_MAX, mm_pgtables_bytes(mm) / PAGE_SIZE)) ++ return true; ++ ++ return !mmget_not_zero(mm); ++} ++ ++/* To support multiple walkers that concurrently walk an mm_struct list. 
*/ ++static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter) ++{ ++ bool last = true; ++ struct mm_struct *mm = NULL; ++ int nid = args->node_id; ++ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); ++ ++ if (*iter) ++ mmput_async(*iter); ++ else if (args->max_seq <= READ_ONCE(mm_list->nodes[nid].cur_seq)) ++ return false; ++ ++ spin_lock(&mm_list->lock); ++ ++ VM_BUG_ON(args->max_seq > mm_list->nodes[nid].cur_seq + 1); ++ VM_BUG_ON(*iter && args->max_seq < mm_list->nodes[nid].cur_seq); ++ VM_BUG_ON(*iter && !mm_list->nodes[nid].nr_walkers); ++ ++ if (args->max_seq <= mm_list->nodes[nid].cur_seq) { ++ last = *iter; ++ goto done; ++ } ++ ++ if (mm_list->nodes[nid].iter == &mm_list->head) { ++ VM_BUG_ON(*iter || mm_list->nodes[nid].nr_walkers); ++ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; ++ } ++ ++ while (!mm && mm_list->nodes[nid].iter != &mm_list->head) { ++ mm = list_entry(mm_list->nodes[nid].iter, struct mm_struct, lrugen.list); ++ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; ++ if (should_skip_mm(mm, args)) ++ mm = NULL; ++ ++ args->mm_stats[mm ? MM_SCHED_ACTIVE : MM_SCHED_INACTIVE]++; ++ } ++ ++ if (mm_list->nodes[nid].iter == &mm_list->head) ++ WRITE_ONCE(mm_list->nodes[nid].cur_seq, ++ mm_list->nodes[nid].cur_seq + 1); ++done: ++ if (*iter && !mm) ++ mm_list->nodes[nid].nr_walkers--; ++ if (!*iter && mm) ++ mm_list->nodes[nid].nr_walkers++; ++ ++ last = last && !mm_list->nodes[nid].nr_walkers && ++ mm_list->nodes[nid].iter == &mm_list->head; ++ ++ reset_mm_stats(mm_list, last, args); ++ ++ spin_unlock(&mm_list->lock); ++ ++ *iter = mm; ++ if (mm) ++ node_clear(nid, mm->lrugen.nodes); ++ ++ return last; ++} ++ + /****************************************************************************** + * state change + ******************************************************************************/ +@@ -3135,6 +3441,13 @@ static int __init init_lru_gen(void) + { + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); ++ ++ if (mem_cgroup_disabled()) { ++ global_mm_list = alloc_mm_list(); ++ if (!global_mm_list) ++ return -ENOMEM; ++ } + + if (hotplug_memory_notifier(mem_notifier, 0)) + pr_err("lru_gen: failed to subscribe hotplug notifications\n"); + +From patchwork Wed Aug 18 06:31:03 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443391 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 358D8C4338F + for ; Wed, 18 Aug 2021 06:31:29 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id B57026103A + for ; Wed, 18 Aug 2021 06:31:28 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org B57026103A +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; 
spf=pass smtp.mailfrom=kvack.org
+Date: Wed, 18 Aug 2021 00:31:03 -0600
+In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com>
+Message-Id: <20210818063107.2696454-8-yuzhao@google.com>
+Mime-Version: 1.0
+References: <20210818063107.2696454-1-yuzhao@google.com>
+X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog
+Subject: [PATCH v4 07/11] mm: multigenerational lru: aging
+From: Yu Zhao
+To: linux-mm@kvack.org
+Cc: linux-kernel@vger.kernel.org, Hillf Danton ,
+ page-reclaim@google.com,
+ Yu Zhao , Konstantin Kharlamov
+Authentication-Results: imf03.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=svUvVh0W; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf03.hostedemail.com: domain of + 3OKkcYQYKCA0B7Cun1t11tyr.p1zyv07A-zzx8npx.14t@flex--yuzhao.bounces.google.com + designates 209.85.219.202 as permitted sender) + smtp.mailfrom=3OKkcYQYKCA0B7Cun1t11tyr.p1zyv07A-zzx8npx.14t@flex--yuzhao.bounces.google.com +X-HE-Tag: 1629268281-80077 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +The aging produces young generations. Given an lruvec, the aging +traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan +PTEs for accessed pages. Upon finding one, the aging updates its +generation number to max_seq (modulo MAX_NR_GENS). After each round of +traversal, the aging increments max_seq. The aging is due when both +min_seq[2] have caught up with max_seq-1. + +The aging uses the following optimizations when walking page tables: + 1) It skips page tables of processes that have been sleeping since + the last walk. + 2) It skips non-leaf PMD entries that have the accessed bit cleared + when CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y. + 3) It does not zigzag between a PGD table and the same PMD or PTE + table spanning multiple VMAs. In other words, it finishes all the + VMAs within the range of the same PMD or PTE table before it returns + to this PGD table. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + include/linux/memcontrol.h | 3 + + include/linux/mmzone.h | 11 + + include/linux/oom.h | 16 + + include/linux/swap.h | 1 + + mm/oom_kill.c | 4 +- + mm/rmap.c | 7 + + mm/swap.c | 4 +- + mm/vmscan.c | 903 +++++++++++++++++++++++++++++++++++++ + 8 files changed, 945 insertions(+), 4 deletions(-) + +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 5e223cecb5c2..657d94344dfc 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -1346,10 +1346,13 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) + + static inline void lock_page_memcg(struct page *page) + { ++ /* to match page_memcg_rcu() */ ++ rcu_read_lock(); + } + + static inline void unlock_page_memcg(struct page *page) + { ++ rcu_read_unlock(); + } + + static inline void mem_cgroup_handle_over_high(void) +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index d6c2c3a4ba43..b6005e881862 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -295,6 +295,7 @@ enum lruvec_flags { + }; + + struct lruvec; ++struct page_vma_mapped_walk; + + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_USAGE_MASK ((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF) +@@ -369,6 +370,7 @@ struct lrugen { + + void lru_gen_init_lrugen(struct lruvec *lruvec); + void lru_gen_set_state(bool enable, bool main, bool swap); ++void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw); + + #else /* CONFIG_LRU_GEN */ + +@@ -380,6 +382,10 @@ static inline void lru_gen_set_state(bool enable, bool main, bool swap) + { + } + ++static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + struct lruvec { +@@ -874,6 +880,8 @@ struct deferred_split { + }; + #endif + ++struct mm_walk_args; ++ + /* + * On NUMA machines, each NUMA node would have a pg_data_t to describe + * it's memory layout. 
On UMA machines there is a single pglist_data which +@@ -979,6 +987,9 @@ typedef struct pglist_data { + + unsigned long flags; + ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args *mm_walk_args; ++#endif + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ +diff --git a/include/linux/oom.h b/include/linux/oom.h +index 2db9a1432511..c4c8c7e71099 100644 +--- a/include/linux/oom.h ++++ b/include/linux/oom.h +@@ -57,6 +57,22 @@ struct oom_control { + extern struct mutex oom_lock; + extern struct mutex oom_adj_mutex; + ++#ifdef CONFIG_MMU ++extern struct task_struct *oom_reaper_list; ++extern struct wait_queue_head oom_reaper_wait; ++ ++static inline bool oom_reaping_in_progress(void) ++{ ++ /* racy check to see if oom reaping could be in progress */ ++ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); ++} ++#else ++static inline bool oom_reaping_in_progress(void) ++{ ++ return false; ++} ++#endif ++ + static inline void set_current_oom_origin(void) + { + current->signal->oom_flag_origin = true; +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 6f5a43251593..c838e67dfa3a 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -368,6 +368,7 @@ extern void lru_add_drain_all(void); + extern void rotate_reclaimable_page(struct page *page); + extern void deactivate_file_page(struct page *page); + extern void deactivate_page(struct page *page); ++extern void activate_page(struct page *page); + extern void mark_page_lazyfree(struct page *page); + extern void swap_setup(void); + +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index c729a4c4a1ac..eca484ee3a3d 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -507,8 +507,8 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) + * victim (if that is possible) to help the OOM killer to move on. 
+ */ + static struct task_struct *oom_reaper_th; +-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +-static struct task_struct *oom_reaper_list; ++DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); ++struct task_struct *oom_reaper_list; + static DEFINE_SPINLOCK(oom_reaper_lock); + + bool __oom_reap_task_mm(struct mm_struct *mm) +diff --git a/mm/rmap.c b/mm/rmap.c +index b9eb5c12f3fe..f4963d60ff68 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -72,6 +72,7 @@ + #include + #include + #include ++#include + + #include + +@@ -789,6 +790,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, + } + + if (pvmw.pte) { ++ /* the multigenerational lru exploits the spatial locality */ ++ if (lru_gen_enabled() && pte_young(*pvmw.pte) && ++ !(vma->vm_flags & VM_SEQ_READ)) { ++ lru_gen_scan_around(&pvmw); ++ referenced++; ++ } + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* +diff --git a/mm/swap.c b/mm/swap.c +index 0d3fb2ee3fd6..0315cfa9fa41 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -347,7 +347,7 @@ static bool need_activate_page_drain(int cpu) + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; + } + +-static void activate_page(struct page *page) ++void activate_page(struct page *page) + { + page = compound_head(page); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { +@@ -367,7 +367,7 @@ static inline void activate_page_drain(int cpu) + { + } + +-static void activate_page(struct page *page) ++void activate_page(struct page *page) + { + struct lruvec *lruvec; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 15eadf2a135e..757ba4f415cc 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -3208,6 +3210,883 @@ static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter) + return last; + } + ++/****************************************************************************** ++ * the aging ++ ******************************************************************************/ ++ ++static int page_update_gen(struct page *page, int gen) ++{ ++ unsigned long old_flags, new_flags; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & LRU_GEN_MASK)) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ new_flags &= ~(LRU_GEN_MASK | LRU_USAGE_MASK | LRU_TIER_FLAGS); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int old_gen, new_gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); ++ ++ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ /* page_update_gen() has updated this page? 
*/ ++ if (new_gen >= 0 && new_gen != old_gen) { ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ return; ++ } ++ ++ new_gen = (old_gen + 1) % MAX_NR_GENS; ++ ++ new_flags &= ~(LRU_GEN_MASK | LRU_USAGE_MASK | LRU_TIER_FLAGS); ++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ /* for rotate_reclaimable_page() */ ++ if (reclaiming) ++ new_flags |= BIT(PG_reclaim); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, old_gen, new_gen); ++ if (reclaiming) ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ else ++ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); ++} ++ ++static void update_batch_size(struct page *page, int old_gen, int new_gen, ++ struct mm_walk_args *args) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON(old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen >= MAX_NR_GENS); ++ ++ args->batch_size++; ++ ++ args->nr_pages[old_gen][type][zone] -= delta; ++ args->nr_pages[new_gen][type][zone] += delta; ++} ++ ++static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ if (!args->batch_size) ++ return; ++ ++ args->batch_size = 0; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ enum lru_list lru = type * LRU_FILE; ++ int total = args->nr_pages[gen][type][zone]; ++ ++ if (!total) ++ continue; ++ ++ args->nr_pages[gen][type][zone] = 0; ++ WRITE_ONCE(lrugen->sizes[gen][type][zone], ++ lrugen->sizes[gen][type][zone] + total); ++ ++ if (lru_gen_is_active(lruvec, gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, total); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) ++{ ++ struct address_space *mapping; ++ struct vm_area_struct *vma = walk->vma; ++ struct mm_walk_args *args = walk->private; ++ ++ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || ++ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) ++ return true; ++ ++ if (vma_is_anonymous(vma)) ++ return !args->swappiness; ++ ++ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) ++ return true; ++ ++ mapping = vma->vm_file->f_mapping; ++ if (!mapping->a_ops->writepage) ++ return true; ++ ++ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); ++} ++ ++/* ++ * Some userspace memory allocators create many single-page VMAs. So instead of ++ * returning back to the PGD table for each of such VMAs, we finish at least an ++ * entire PMD table and therefore avoid many zigzags. 
++ */ ++static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, ++ unsigned long *start, unsigned long *end) ++{ ++ unsigned long next = round_up(*end, size); ++ struct mm_walk_args *args = walk->private; ++ ++ VM_BUG_ON(mask & size); ++ VM_BUG_ON(*start >= *end); ++ VM_BUG_ON((next & mask) != (*start & mask)); ++ ++ while (walk->vma) { ++ if (next >= walk->vma->vm_end) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ if ((next & mask) != (walk->vma->vm_start & mask)) ++ return false; ++ ++ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ *start = max(next, walk->vma->vm_start); ++ next = (next | ~mask) + 1; ++ /* rounded-up boundaries can wrap to 0 */ ++ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end; ++ ++ args->mm_stats[MM_VMA_INTERVAL]++; ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pte_t *pte; ++ spinlock_t *ptl; ++ unsigned long addr; ++ int remote = 0; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pmd_leaf(*pmd)); ++ ++ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); ++ arch_enter_lazy_mmu_mode(); ++restart: ++ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ struct page *page; ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) { ++ args->mm_stats[MM_LEAF_HOLE]++; ++ continue; ++ } ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ if (!pte_young(pte[i])) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) { ++ args->mm_stats[MM_LEAF_OTHER_NODE]++; ++ remote++; ++ continue; ++ } ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != args->node_id) { ++ args->mm_stats[MM_LEAF_OTHER_NODE]++; ++ remote++; ++ continue; ++ } ++ ++ if (page_memcg_rcu(page) != args->memcg) { ++ args->mm_stats[MM_LEAF_OTHER_MEMCG]++; ++ continue; ++ } ++ ++ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); ++ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) ++ continue; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) { ++ set_page_dirty(page); ++ args->mm_stats[MM_LEAF_DIRTY]++; ++ } ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ } ++ ++ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) ++ goto restart; ++ ++ arch_leave_lazy_mmu_mode(); ++ pte_unmap_unlock(pte, ptl); ++ ++ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && !remote; ++} ++ ++/* ++ * We scan PMD entries in two passes. The first pass reaches to PTE tables and ++ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD ++ * entries and needs to take the PMD lock. 
++ */ ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ spinlock_t *ptl; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ start &= PUD_MASK; ++ pmd = pmd_offset(pud, start); ++ ptl = pmd_lock(walk->mm, pmd); ++ arch_enter_lazy_mmu_mode(); ++ ++ for_each_set_bit(i, args->bitmap, PTRS_PER_PMD) { ++ struct page *page; ++ unsigned long pfn = pmd_pfn(pmd[i]); ++ unsigned long addr = start + i * PMD_SIZE; ++ ++ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) { ++ args->mm_stats[MM_LEAF_HOLE]++; ++ continue; ++ } ++ ++ if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) ++ continue; ++ ++ if (!pmd_young(pmd[i])) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ if (!pmd_trans_huge(pmd[i])) { ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && ++ pmdp_test_and_clear_young(vma, addr, pmd + i)) ++ args->mm_stats[MM_NONLEAF_YOUNG]++; ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) { ++ args->mm_stats[MM_LEAF_OTHER_NODE]++; ++ continue; ++ } ++ ++ page = pfn_to_page(pfn); ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ if (page_to_nid(page) != args->node_id) { ++ args->mm_stats[MM_LEAF_OTHER_NODE]++; ++ continue; ++ } ++ ++ if (page_memcg_rcu(page) != args->memcg) { ++ args->mm_stats[MM_LEAF_OTHER_MEMCG]++; ++ continue; ++ } ++ ++ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); ++ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) ++ continue; ++ ++ if (pmd_dirty(pmd[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) { ++ set_page_dirty(page); ++ args->mm_stats[MM_LEAF_DIRTY]++; ++ } ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++ ++ bitmap_zero(args->bitmap, PTRS_PER_PMD); ++} ++#else ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++} ++#endif ++ ++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ unsigned long next; ++ unsigned long addr; ++ struct vm_area_struct *vma; ++ int leaf = 0; ++ int nonleaf = 0; ++ struct mm_walk_args *args = walk->private; ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ pmd = pmd_offset(pud, start & PUD_MASK); ++restart: ++ vma = walk->vma; ++ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { ++ pmd_t val = pmd_read_atomic(pmd + i); ++ ++ /* for pmd_read_atomic() */ ++ barrier(); ++ ++ next = pmd_addr_end(addr, end); ++ ++ if (!pmd_present(val)) { ++ args->mm_stats[MM_LEAF_HOLE]++; ++ continue; ++ } ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ if (pmd_trans_huge(val)) { ++ unsigned long pfn = pmd_pfn(val); ++ ++ if (is_huge_zero_pmd(val)) { ++ args->mm_stats[MM_LEAF_HOLE]++; ++ continue; ++ } ++ ++ if (!pmd_young(val)) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) { ++ args->mm_stats[MM_LEAF_OTHER_NODE]++; ++ continue; ++ } ++ ++ __set_bit(i, args->bitmap); ++ leaf++; ++ continue; ++ } ++#endif ++ ++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG ++ if 
(!pmd_young(val)) { ++ args->mm_stats[MM_NONLEAF_OLD]++; ++ continue; ++ } ++#endif ++ if (walk_pte_range(&val, addr, next, walk)) { ++ __set_bit(i, args->bitmap); ++ nonleaf++; ++ } ++ } ++ ++ if (leaf) { ++ walk_pmd_range_locked(pud, start, vma, walk); ++ leaf = nonleaf = 0; ++ } ++ ++ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) ++ goto restart; ++ ++ if (nonleaf) ++ walk_pmd_range_locked(pud, start, vma, walk); ++} ++ ++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pud_t *pud; ++ unsigned long addr; ++ unsigned long next; ++ struct mm_walk_args *args = walk->private; ++ ++ VM_BUG_ON(p4d_leaf(*p4d)); ++ ++ pud = pud_offset(p4d, start & P4D_MASK); ++restart: ++ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { ++ pud_t val = READ_ONCE(pud[i]); ++ ++ next = pud_addr_end(addr, end); ++ ++ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) ++ continue; ++ ++ walk_pmd_range(&val, addr, next, walk); ++ ++ if (args->batch_size >= MAX_BATCH_SIZE) { ++ end = (addr | ~PUD_MASK) + 1; ++ goto done; ++ } ++ } ++ ++ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) ++ goto restart; ++ ++ end = round_up(end, P4D_SIZE); ++done: ++ /* rounded-up boundaries can wrap to 0 */ ++ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0; ++ ++ return -EAGAIN; ++} ++ ++static void walk_mm(struct mm_walk_args *args, struct mm_struct *mm) ++{ ++ static const struct mm_walk_ops mm_walk_ops = { ++ .test_walk = should_skip_vma, ++ .p4d_entry = walk_pud_range, ++ }; ++ ++ int err; ++ struct mem_cgroup *memcg = args->memcg; ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(args->node_id)); ++ ++ args->next_addr = FIRST_USER_ADDRESS; ++ ++ do { ++ unsigned long start = args->next_addr; ++ unsigned long end = mm->highest_vm_end; ++ ++ err = -EBUSY; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_MEMCG ++ if (memcg && atomic_read(&memcg->moving_account)) { ++ args->mm_stats[MM_LOCK_CONTENTION]++; ++ goto contended; ++ } ++#endif ++ if (!mmap_read_trylock(mm)) { ++ args->mm_stats[MM_LOCK_CONTENTION]++; ++ goto contended; ++ } ++ ++ err = walk_page_range(mm, start, end, &mm_walk_ops, args); ++ ++ mmap_read_unlock(mm); ++ ++ reset_batch_size(lruvec, args); ++contended: ++ rcu_read_unlock(); ++ ++ cond_resched(); ++ } while (err == -EAGAIN && args->next_addr && ++ !mm_is_oom_victim(mm) && !mm_is_migrated(mm, memcg)); ++} ++ ++static struct mm_walk_args *alloc_mm_walk_args(int nid) ++{ ++ struct pglist_data *pgdat; ++ int size = sizeof(struct mm_walk_args); ++ ++ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) || ++ IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ size += sizeof(unsigned long) * BITS_TO_LONGS(PTRS_PER_PMD); ++ ++ if (!current_is_kswapd()) ++ return kvzalloc_node(size, GFP_KERNEL, nid); ++ ++ VM_BUG_ON(nid == NUMA_NO_NODE); ++ ++ pgdat = NODE_DATA(nid); ++ if (!pgdat->mm_walk_args) ++ pgdat->mm_walk_args = kvzalloc_node(size, GFP_KERNEL, nid); ++ ++ return pgdat->mm_walk_args; ++} ++ ++static void free_mm_walk_args(struct mm_walk_args *args) ++{ ++ if (!current_is_kswapd()) ++ kvfree(args); ++} ++ ++static bool inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ int gen, zone; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) ++ return true; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = 0; 
zone < MAX_NR_ZONES; zone++) { ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ page_inc_gen(page, lruvec, false); ++ ++ if (!--remaining) ++ return false; ++ } ++ ++ VM_BUG_ON(lrugen->sizes[gen][type][zone]); ++ } ++ ++ reset_controller_pos(lruvec, gen, type); ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++ ++ return true; ++} ++ ++static bool try_to_inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ int gen, zone; ++ bool success = false; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ while (get_nr_gens(lruvec, type) > MIN_NR_GENS) { ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ return success; ++ } ++ ++ reset_controller_pos(lruvec, gen, type); ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++ ++ success = true; ++ } ++ ++ return success; ++} ++ ++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (max_seq != lrugen->max_seq) ++ goto unlock; ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ if (try_to_inc_min_seq(lruvec, type)) ++ continue; ++ ++ while (!inc_min_seq(lruvec, type)) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq - 1); ++ for_each_type_zone(type, zone) { ++ enum lru_list lru = type * LRU_FILE; ++ long total = lrugen->sizes[gen][type][zone]; ++ ++ if (!total) ++ continue; ++ ++ WARN_ON_ONCE(total != (int)total); ++ ++ update_lru_size(lruvec, lru, zone, total); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -total); ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq + 1); ++ for_each_type_zone(type, zone) { ++ VM_BUG_ON(lrugen->sizes[gen][type][zone]); ++ VM_BUG_ON(!list_empty(&lrugen->lists[gen][type][zone])); ++ } ++ ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_controller_pos(lruvec, gen, type); ++ ++ WRITE_ONCE(lrugen->timestamps[gen], jiffies); ++ /* make sure all preceding modifications appear first */ ++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); ++unlock: ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++/* Main function used by the foreground, the background and the user-triggered aging. */ ++static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, ++ struct scan_control *sc, int swappiness) ++{ ++ bool last; ++ struct mm_walk_args *args; ++ struct mm_struct *mm = NULL; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ int nid = pgdat->node_id; ++ ++ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); ++ ++ /* ++ * If we are not from run_aging() and clearing the accessed bit may ++ * trigger page faults, then don't proceed to clearing all accessed ++ * PTEs. 
Instead, fallback to lru_gen_scan_around(), which only clears a ++ * handful of accessed PTEs. This is less efficient but causes fewer ++ * page faults on CPUs that don't have the capability. ++ */ ++ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young()) { ++ inc_max_seq(lruvec, max_seq); ++ return true; ++ } ++ ++ args = alloc_mm_walk_args(nid); ++ if (!args) ++ return false; ++ ++ args->memcg = memcg; ++ args->max_seq = max_seq; ++ args->start_pfn = pgdat->node_start_pfn; ++ args->end_pfn = pgdat_end_pfn(pgdat); ++ args->node_id = nid; ++ args->swappiness = swappiness; ++ ++ do { ++ last = get_next_mm(args, &mm); ++ if (mm) ++ walk_mm(args, mm); ++ ++ cond_resched(); ++ } while (mm); ++ ++ free_mm_walk_args(args); ++ ++ if (!last) { ++ /* don't wait unless we may have trouble reclaiming */ ++ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) ++ wait_event_killable(mm_list->nodes[nid].wait, ++ max_seq < READ_ONCE(lrugen->max_seq)); ++ ++ return max_seq < READ_ONCE(lrugen->max_seq); ++ } ++ ++ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); ++ ++ inc_max_seq(lruvec, max_seq); ++ /* either we see any waiters or they will see updated max_seq */ ++ if (wq_has_sleeper(&mm_list->nodes[nid].wait)) ++ wake_up_all(&mm_list->nodes[nid].wait); ++ ++ wakeup_flusher_threads(WB_REASON_VMSCAN); ++ ++ return true; ++} ++ ++/* Protect the working set accessed within the last N milliseconds. */ ++static unsigned long lru_gen_min_ttl; ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg; ++ ++ VM_BUG_ON(!current_is_kswapd()); ++ ++ if (sc->file_is_tiny && mutex_trylock(&oom_lock)) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ .order = sc->order, ++ }; ++ ++ /* to avoid overkilling */ ++ if (!oom_reaping_in_progress()) ++ out_of_memory(&oc); ++ ++ mutex_unlock(&oom_lock); ++ } ++ ++ if (READ_ONCE(lru_gen_min_ttl)) ++ sc->file_is_tiny = 1; ++ ++ if (!mem_cgroup_disabled() && !sc->force_deactivate) { ++ sc->force_deactivate = 1; ++ return; ++ } ++ ++ sc->force_deactivate = 0; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ int swappiness = get_swappiness(memcg); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (get_lo_wmark(max_seq, min_seq, swappiness) == MIN_NR_GENS) ++ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++} ++ ++#define NR_TO_SCAN (SWAP_CLUSTER_MAX * 2) ++#define SIZE_TO_SCAN (NR_TO_SCAN * PAGE_SIZE) ++ ++/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. 
*/ ++void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw) ++{ ++ int i; ++ pte_t *pte; ++ struct page *page; ++ int old_gen, new_gen; ++ unsigned long start; ++ unsigned long end; ++ unsigned long addr; ++ struct mem_cgroup *memcg = page_memcg(pvmw->page); ++ struct pglist_data *pgdat = page_pgdat(pvmw->page); ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ unsigned long bitmap[BITS_TO_LONGS(NR_TO_SCAN)] = {}; ++ ++ lockdep_assert_held(pvmw->ptl); ++ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); ++ ++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); ++ end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end); ++ ++ if (end - start > SIZE_TO_SCAN) { ++ if (pvmw->address - start < SIZE_TO_SCAN / 2) ++ end = start + SIZE_TO_SCAN; ++ else if (end - pvmw->address < SIZE_TO_SCAN / 2) ++ start = end - SIZE_TO_SCAN; ++ else { ++ start = pvmw->address - SIZE_TO_SCAN / 2; ++ end = pvmw->address + SIZE_TO_SCAN / 2; ++ } ++ } ++ ++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ new_gen = lru_gen_from_seq(READ_ONCE(lruvec->evictable.max_seq)); ++ ++ rcu_read_lock(); ++ arch_enter_lazy_mmu_mode(); ++ ++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) ++ continue; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ if (!pte_young(pte[i])) ++ continue; ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ continue; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != pgdat->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != memcg) ++ continue; ++ ++ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); ++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ continue; ++ ++ old_gen = page_lru_gen(page); ++ if (old_gen < 0) ++ SetPageReferenced(page); ++ else if (old_gen != new_gen) ++ __set_bit(i, bitmap); ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ set_page_dirty(page); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ rcu_read_unlock(); ++ ++ if (bitmap_weight(bitmap, NR_TO_SCAN) < PAGEVEC_SIZE) { ++ for_each_set_bit(i, bitmap, NR_TO_SCAN) ++ activate_page(pte_page(pte[i])); ++ return; ++ } ++ ++ lock_page_memcg(pvmw->page); ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ new_gen = lru_gen_from_seq(lruvec->evictable.max_seq); ++ ++ for_each_set_bit(i, bitmap, NR_TO_SCAN) { ++ page = compound_head(pte_page(pte[i])); ++ if (page_memcg_rcu(page) != memcg) ++ continue; ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ lru_gen_update_size(page, lruvec, old_gen, new_gen); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ unlock_page_memcg(pvmw->page); ++} ++ + /****************************************************************************** + * state change + ******************************************************************************/ +@@ -3392,9 +4271,18 @@ static int __meminit __maybe_unused mem_notifier(struct notifier_block *self, + + pgdat = NODE_DATA(nid); + ++ if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) { ++ free_mm_walk_args(pgdat->mm_walk_args); ++ pgdat->mm_walk_args = NULL; ++ return NOTIFY_DONE; ++ } ++ + if (action != MEM_GOING_ONLINE) + return NOTIFY_DONE; + ++ if (!WARN_ON_ONCE(pgdat->mm_walk_args)) ++ pgdat->mm_walk_args = alloc_mm_walk_args(NUMA_NO_NODE); ++ + 
mutex_lock(&lru_gen_state_mutex); + cgroup_lock(); + +@@ -3443,6 +4331,10 @@ static int __init init_lru_gen(void) + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + ++ VM_BUG_ON(PMD_SIZE / PAGE_SIZE != PTRS_PER_PTE); ++ VM_BUG_ON(PUD_SIZE / PMD_SIZE != PTRS_PER_PMD); ++ VM_BUG_ON(P4D_SIZE / PUD_SIZE != PTRS_PER_PUD); ++ + if (mem_cgroup_disabled()) { + global_mm_list = alloc_mm_list(); + if (!global_mm_list) +@@ -3460,6 +4352,12 @@ static int __init init_lru_gen(void) + */ + arch_initcall(init_lru_gen); + ++#else /* CONFIG_LRU_GEN */ ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4313,6 +5211,11 @@ static void age_active_anon(struct pglist_data *pgdat, + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } ++ + if (!total_swap_pages) + return; + + +From patchwork Wed Aug 18 06:31:04 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443393 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id D3048C4320A + for ; Wed, 18 Aug 2021 06:31:31 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 777DD6109F + for ; Wed, 18 Aug 2021 06:31:31 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 777DD6109F +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id DA3CF6B0081; Wed, 18 Aug 2021 02:31:23 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id D2DD26B0082; Wed, 18 Aug 2021 02:31:23 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id AE3B36B0083; Wed, 18 Aug 2021 02:31:23 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0107.hostedemail.com + [216.40.44.107]) + by kanga.kvack.org (Postfix) with ESMTP id 899016B0081 + for ; Wed, 18 Aug 2021 02:31:23 -0400 (EDT) +Received: from smtpin02.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay02.hostedemail.com (Postfix) with ESMTP id 4707A22873 + for ; Wed, 18 Aug 2021 06:31:23 +0000 (UTC) +X-FDA: 78487229646.02.8986346 +Received: from mail-yb1-f202.google.com (mail-yb1-f202.google.com + [209.85.219.202]) + by imf25.hostedemail.com (Postfix) with ESMTP id DC39CB0025A3 + for ; Wed, 18 Aug 2021 06:31:22 +0000 (UTC) +Received: by mail-yb1-f202.google.com with SMTP id + f8-20020a2585480000b02905937897e3daso1836069ybn.2 + for ; Tue, 17 Aug 2021 23:31:22 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + 
+Date: Wed, 18 Aug 2021 00:31:04 -0600
+In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com>
+Message-Id: <20210818063107.2696454-9-yuzhao@google.com>
+Mime-Version: 1.0
+References: <20210818063107.2696454-1-yuzhao@google.com>
+X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog
+Subject: [PATCH v4 08/11] mm: multigenerational lru: eviction
+From: Yu Zhao
+To: linux-mm@kvack.org
+Cc: linux-kernel@vger.kernel.org, Hillf Danton ,
+ page-reclaim@google.com,
+ Yu Zhao , Konstantin Kharlamov
+
+The eviction consumes old generations. Given an lruvec, the eviction
+scans pages on lrugen->lists indexed by anon and file min_seq[2]
+(modulo MAX_NR_GENS). It first tries to select a type based on the
+values of min_seq[2]. If they are equal, it selects the type that has
+a lower refault rate. The eviction sorts a page according to its
+updated generation number if the aging has found this page accessed.
+It also moves a page to the next generation if this page is from an
+upper tier that has a higher refault rate than the base tier. 
The +eviction increments min_seq[2] of a selected type when it finds +lrugen->lists indexed by min_seq[2] of this selected type are empty. + +With the aging and the eviction in place, implementing page reclaim +becomes quite straightforward: + 1) To reduce the latency, direct reclaim skips the aging unless both + min_seq[2] are equal to max_seq-1. Then it invokes the eviction. + 2) To avoid the aging in the direct reclaim path, kswapd invokes the + aging if either of min_seq[2] is equal to max_seq-1. Then it invokes + the eviction. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + mm/vmscan.c | 440 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 440 insertions(+) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 757ba4f415cc..2f1fffbd2d61 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1311,6 +1311,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* lru_gen_scan_around() has updated this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2447,6 +2452,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -4087,6 +4095,426 @@ void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw) + unlock_page_memcg(pvmw->page); + } + ++/****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_to_isolate) ++{ ++ bool success; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int tier = page_lru_tier(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON_PAGE(gen < 0, page); ++ VM_BUG_ON_PAGE(tier_to_isolate < 0, page); ++ ++ /* a lazy-free page that has been written into? */ ++ if (type && PageDirty(page) && PageAnon(page)) { ++ success = lru_gen_del_page(page, lruvec, false); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageSwapBacked(page); ++ add_page_to_lru_list_tail(page, lruvec); ++ return true; ++ } ++ ++ /* page_update_gen() has updated this page? 
*/ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ /* protect this page if its tier has a higher refault rate */ ++ if (tier_to_isolate < tier) { ++ int hist = lru_hist_from_seq(gen); ++ ++ page_inc_gen(page, lruvec, false); ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + thp_nr_pages(page)); ++ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type); ++ return true; ++ } ++ ++ /* mark this page for reclaim if it's pending writeback */ ++ if (PageWriteback(page) || (type && PageDirty(page))) { ++ page_inc_gen(page, lruvec, true); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ ++ if (!sc->may_unmap && page_mapped(page)) ++ return false; ++ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) ++ return false; ++ ++ if (!get_page_unless_zero(page)) ++ return false; ++ ++ if (!TestClearPageLRU(page)) { ++ put_page(page); ++ return false; ++ } ++ ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ ++ return true; ++} ++ ++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, long *nr_to_scan, ++ int type, int tier, struct list_head *list) ++{ ++ bool success; ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_BUG_ON(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return -ENOENT; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ scanned += delta; ++ ++ if (sort_page(page, lruvec, tier)) ++ sorted += delta; ++ else if (isolate_page(page, lruvec, sc)) { ++ list_add(&page->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&page->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining) ++ break; ++ ++ if (max(isolated, skipped) >= SWAP_CLUSTER_MAX) ++ break; ++ } ++ ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ ++ if (!remaining || isolated >= SWAP_CLUSTER_MAX) ++ break; ++ } ++ ++ success = try_to_inc_min_seq(lruvec, type); ++ ++ *nr_to_scan -= scanned; ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ if (isolated) ++ return isolated; ++ /* ++ * We may have trouble finding eligible pages due to reclaim_idx, ++ * may_unmap and may_writepage. 
The following check makes sure we won't ++ * be stuck if we aren't making enough progress. ++ */ ++ return !remaining || success || *nr_to_scan <= 0 ? 0 : -ENOENT; ++} ++ ++static int get_tier_to_isolate(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct controller_pos sp, pv; ++ ++ /* ++ * Ideally we don't want to evict upper tiers that have higher refault ++ * rates. However, we need to leave a margin for the fluctuations in ++ * refault rates. So we use a larger gain factor to make sure upper ++ * tiers are indeed more active. We choose 2 because the lowest upper ++ * tier would have twice of the refault rate of the base tier, according ++ * to their numbers of accesses. ++ */ ++ read_controller_pos(&sp, lruvec, type, 0, 1); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_controller_pos(&pv, lruvec, type, tier, 2); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_to_isolate) ++{ ++ int type, tier; ++ struct controller_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare the refault rates between the base tiers of anon and file to ++ * determine which type to evict. Also need to compare the refault rates ++ * of the upper tiers of the selected type with that of the base tier of ++ * the other type to determine which tier of the selected type to evict. ++ */ ++ read_controller_pos(&sp, lruvec, 0, 0, gain[0]); ++ read_controller_pos(&pv, lruvec, 1, 0, gain[1]); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_controller_pos(&sp, lruvec, !type, 0, gain[!type]); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_controller_pos(&pv, lruvec, type, tier, gain[type]); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_to_isolate = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ long *nr_to_scan, int *type_to_scan, struct list_head *list) ++{ ++ int i; ++ int type; ++ int isolated; ++ int tier = -1; ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (get_hi_wmark(max_seq, min_seq, swappiness) == MIN_NR_GENS) ++ return 0; ++ /* ++ * Try to select a type based on generations and swappiness, and if that ++ * fails, fall back to get_type_to_scan(). When anon and file are both ++ * available from the same generation, swappiness 200 is interpreted as ++ * anon first and swappiness 1 is interpreted as file first. ++ */ ++ if (!swappiness) ++ type = 1; ++ else if (min_seq[0] > min_seq[1]) ++ type = 1; ++ else if (min_seq[0] < min_seq[1]) ++ type = 0; ++ else if (swappiness == 1) ++ type = 1; ++ else if (swappiness == 200) ++ type = 0; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ if (tier == -1) ++ tier = get_tier_to_isolate(lruvec, type); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ isolated = scan_pages(lruvec, sc, nr_to_scan, type, tier, list); ++ if (isolated >= 0) ++ break; ++ ++ type = !type; ++ tier = get_tier_to_isolate(lruvec, type); ++ } ++ ++ if (isolated < 0) ++ isolated = *nr_to_scan = 0; ++ ++ *type_to_scan = type; ++ ++ return isolated; ++} ++ ++/* Main function used by the foreground, the background and the user-triggered eviction. 
*/ ++static bool evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ long *nr_to_scan) ++{ ++ int type; ++ int isolated; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct page *page; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ isolated = isolate_pages(lruvec, sc, swappiness, nr_to_scan, &type, &list); ++ VM_BUG_ON(list_empty(&list) == !!isolated); ++ ++ if (isolated) ++ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, isolated); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (!isolated) ++ goto done; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ /* ++ * We need to prevent rejected pages from being added back to the same ++ * lists they were isolated from. Otherwise we may risk looping on them ++ * forever. ++ */ ++ list_for_each_entry(page, &list, lru) { ++ if (!page_evictable(page)) ++ continue; ++ ++ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) ++ SetPageActive(page); ++ ++ ClearPageReferenced(page); ++ ClearPageWorkingset(page); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, -isolated); ++ ++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++done: ++ return *nr_to_scan > 0 && sc->nr_reclaimed < sc->nr_to_reclaim; ++} ++ ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int gen, type, zone; ++ int nr_gens; ++ long nr_to_scan = 0; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ for (type = !swappiness; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone <= sc->reclaim_idx; zone++) ++ nr_to_scan += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ } ++ } ++ ++ if (nr_to_scan <= 0) ++ return 0; ++ ++ nr_gens = get_hi_wmark(max_seq, min_seq, swappiness); ++ ++ if (current_is_kswapd()) { ++ gen = lru_gen_from_seq(max_seq - nr_gens + 1); ++ if (time_is_before_eq_jiffies(READ_ONCE(lrugen->timestamps[gen]) + ++ READ_ONCE(lru_gen_min_ttl))) ++ sc->file_is_tiny = 0; ++ ++ /* leave the work to lru_gen_age_node() */ ++ if (nr_gens == MIN_NR_GENS) ++ return 0; ++ ++ if (nr_to_scan >= sc->nr_to_reclaim) ++ sc->force_deactivate = 0; ++ } ++ ++ nr_to_scan = max(nr_to_scan >> sc->priority, (long)!mem_cgroup_online(memcg)); ++ if (!nr_to_scan || nr_gens > MIN_NR_GENS) ++ return nr_to_scan; ++ ++ /* move onto other memcgs if we haven't tried them all yet */ ++ if (!mem_cgroup_disabled() && !sc->force_deactivate) { ++ sc->skipped_deactivate = 1; ++ return 0; ++ } ++ ++ return try_to_inc_max_seq(lruvec, max_seq, sc, swappiness) ? 
nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ long scanned = 0; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ long nr_to_scan; ++ int swappiness = sc->may_swap ? get_swappiness(memcg) : 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness) - scanned; ++ if (nr_to_scan <= 0) ++ break; ++ ++ scanned += nr_to_scan; ++ ++ if (!evict_pages(lruvec, sc, swappiness, &nr_to_scan)) ++ break; ++ ++ scanned -= nr_to_scan; ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++} ++ + /****************************************************************************** + * state change + ******************************************************************************/ +@@ -4358,6 +4786,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + } + ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4371,6 +4803,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + struct blk_plug plug; + bool scan_adjusted; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -4837,6 +5274,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; + +From patchwork Wed Aug 18 06:31:05 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443395 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id B02A6C4338F + for ; Wed, 18 Aug 2021 06:31:34 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 514516103A + for ; Wed, 18 Aug 2021 06:31:34 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 514516103A +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 1BC966B0082; Wed, 18 Aug 2021 02:31:25 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 148886B0083; Wed, 18 Aug 2021 02:31:25 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 
EDB7E8D0001; Wed, 18 Aug 2021 02:31:24 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0122.hostedemail.com + [216.40.44.122]) + by kanga.kvack.org (Postfix) with ESMTP id CF7766B0082 + for ; Wed, 18 Aug 2021 02:31:24 -0400 (EDT) +Received: from smtpin27.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay01.hostedemail.com (Postfix) with ESMTP id 855BB1842C4BE + for ; Wed, 18 Aug 2021 06:31:24 +0000 (UTC) +X-FDA: 78487229688.27.63DECA5 +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf03.hostedemail.com (Postfix) with ESMTP id 4315030039A3 + for ; Wed, 18 Aug 2021 06:31:24 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + d69-20020a25e6480000b02904f4a117bd74so1786362ybh.17 + for ; Tue, 17 Aug 2021 23:31:24 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=Sa/+kI2QQ5yzW+73rZnyPuODQVSPnlfsCdgMQjH8k3w=; + b=p/kWK0lL+i7CF7DanrRtUfMJFmmz8fQT4ClShk8ocdN5YWEXJPS5ufC7DcnocXjud4 + mfpRwtLsUt85zDj/Vel9Sy3Jjj3hox0+z3qxcKB3JAPlTaBCLqAbVsujMvLixzDj8xpA + 7YbURlMIa/XunHEgChappMMLMe1W0v1tpJNhoqYo6dtXDFTS7fgcidPpm1UdfhuQt+Cm + GUlkTg8AKW4VeZqM8zlDD46cHZQg9YvXeKCIo/LYj1mk7JTr8d8wRdeibTuEBDC5GgLg + sFPtmD9SiU8NbM6KRor52pD3S3prQeHrpScJtPebeGIPNqCVE2YafoCfIG0lG5wFHL7H + fyuw== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=Sa/+kI2QQ5yzW+73rZnyPuODQVSPnlfsCdgMQjH8k3w=; + b=Gg7WAcEuFJjIqPfsy7dFA0EDBhB5GOqUbL1z47Xgiv6tAvHtQYeEpHw/2v7Qk6EDhM + 1iywY6ESesPeqxD6G2GGyUfugp4PX7XyqXOEY5B4MfArZOzGvOU6ekMimKd5D3TGuzTN + ePo2+D6fFIfIAfgKxKnJZKujnhNJ0yD2Jhe+WGXh2SHNS56BnVHx2yxD5IdhV2k0Nspv + BGrwr2gyRBJ8gx06nEkvT8WeVSYjyUIJDBrJ9jRkVDd0SYA84cpn3f+gv7REeB/VLDLj + LVNyAUEq2MiS4JEz08SATFl5J0NPssI6PJfyri9CexnXs+ocn+mpSpMPxSartPFAXaJ2 + /g3g== +X-Gm-Message-State: AOAM530qMYJEMg8VzkiRKR3/s4rbAsfzh6ygUrzbXFUvWV7MtfR834TG + TxDICFg/pD3oJcIYpJwVj/+Pe4uO+2a1Z6zz2rMrP82K1q7L/8xe47r0rR9M2Z6pJGogycICk2k + kp6VBJ+DYiHYUCOZswEW6sPpaLibf7LdCP/fggG/QnDkqn4FBNJPTJoAe +X-Google-Smtp-Source: + ABdhPJz8Y4PWzvx36wd+Xufm5SH2b/iFjdnqIog9P1g4iCjFBZjzAPAITo1V9qboEamgEU0upnvDj06FKHU= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:bc82:: with SMTP id + e2mr8957312ybk.307.1629268283542; + Tue, 17 Aug 2021 23:31:23 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:31:05 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-10-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 09/11] mm: multigenerational lru: user interface +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +X-Rspamd-Queue-Id: 4315030039A3 +Authentication-Results: imf03.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b="p/kWK0lL"; + spf=pass (imf03.hostedemail.com: domain of + 3O6kcYQYKCBAEAFxq4w44w1u.s421y3AD-220Bqs0.47w@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3O6kcYQYKCBAEAFxq4w44w1u.s421y3AD-220Bqs0.47w@flex--yuzhao.bounces.google.com; + dmarc=pass (policy=reject) 
header.from=google.com +X-Rspamd-Server: rspam01 +X-Stat-Signature: 4pwa8jkozqtc6scuyzxpkd8sxheuqpu6 +X-HE-Tag: 1629268284-50209 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add /sys/kernel/mm/lru_gen/enabled to enable and disable the +multigenerational lru at runtime. + +Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a +given number of milliseconds. The OOM killer is invoked if this +working set cannot be kept in memory. + +Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and +invoke the aging and the eviction. This file has the following output: + memcg memcg_id memcg_path + node node_id + min_gen birth_time anon_size file_size + ... + max_gen birth_time anon_size file_size + +min_gen is the oldest generation number and max_gen is the youngest +generation number. birth_time is in milliseconds. anon_size and +file_size are in pages. + +This file takes the following input: + + memcg_id node_id max_gen [swappiness] + - memcg_id node_id min_gen [swappiness] [nr_to_reclaim] + +The first command line invokes the aging, which scans PTEs for +accessed pages and then creates the next generation max_gen+1. A swap +file and a non-zero swappiness, which overrides vm.swappiness, are +required to scan PTEs mapping anon pages. The second command line +invokes the eviction, which evicts generations less than or equal to +min_gen. min_gen should be less than max_gen-1 as max_gen and +max_gen-1 are not fully aged and therefore cannot be evicted. +nr_to_reclaim can be used to limit the number of pages to evict. +Multiple command lines are supported, as is concatenation with +delimiters "," and ";". + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + include/linux/nodemask.h | 1 + + mm/vmscan.c | 412 +++++++++++++++++++++++++++++++++++++++ + 2 files changed, 413 insertions(+) + +diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h +index 567c3ddba2c4..90840c459abc 100644 +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state) + #define first_online_node 0 + #define first_memory_node 0 + #define next_online_node(nid) (MAX_NUMNODES) ++#define next_memory_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1U + #define nr_online_nodes 1U + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 2f1fffbd2d61..c6d539a73d00 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -52,6 +52,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -4732,6 +4734,410 @@ static int __meminit __maybe_unused mem_notifier(struct notifier_block *self, + return NOTIFY_DONE; + } + ++/****************************************************************************** ++ * sysfs interface ++ ******************************************************************************/ ++ ++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++} ++ ++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ unsigned int msecs; ++ ++ if (kstrtouint(buf, 10, &msecs)) ++ return -EINVAL; ++ ++ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( ++ min_ttl_ms, 0644, show_min_ttl, store_min_ttl ++); ++ 
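/*
 * Editor's aside, not part of the upstream patch: the min_ttl_ms
 * attribute defined just above accepts a value in milliseconds from
 * user space and stores it internally as jiffies.  A minimal,
 * hypothetical userspace sketch follows; the sysfs path comes from the
 * "lru_gen" attribute group created later in this patch, and the value
 * 2000 is only an example:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/sys/kernel/mm/lru_gen/min_ttl_ms", "w");
 *
 *		if (!f)
 *			return 1;
 *		// protect roughly the last 2 seconds of working set
 *		fprintf(f, "%u\n", 2000);
 *		return fclose(f) ? 1 : 0;
 *	}
 */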
++static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled()); ++} ++ ++static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ int enable; ++ ++ if (kstrtoint(buf, 10, &enable)) ++ return -EINVAL; ++ ++ lru_gen_set_state(enable, true, false); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR( ++ enabled, 0644, show_enable, store_enable ++); ++ ++static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_min_ttl_attr.attr, ++ &lru_gen_enabled_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group lru_gen_attr_group = { ++ .name = "lru_gen", ++ .attrs = lru_gen_attrs, ++}; ++ ++/****************************************************************************** ++ * debugfs interface ++ ******************************************************************************/ ++ ++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct mem_cgroup *memcg; ++ loff_t nr_to_skip = *pos; ++ ++ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); ++ if (!m->private) ++ return ERR_PTR(-ENOMEM); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ if (!nr_to_skip--) ++ return mem_cgroup_lruvec(memcg, NODE_DATA(nid)); ++ } ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ return NULL; ++} ++ ++static void lru_gen_seq_stop(struct seq_file *m, void *v) ++{ ++ if (!IS_ERR_OR_NULL(v)) ++ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); ++ ++ kvfree(m->private); ++ m->private = NULL; ++} ++ ++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ int nid = lruvec_pgdat(v)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(v); ++ ++ ++*pos; ++ ++ nid = next_memory_node(nid); ++ if (nid == MAX_NUMNODES) { ++ memcg = mem_cgroup_iter(NULL, memcg, NULL); ++ if (!memcg) ++ return NULL; ++ ++ nid = first_memory_node; ++ } ++ ++ return mem_cgroup_lruvec(memcg, NODE_DATA(nid)); ++} ++ ++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, ++ unsigned long max_seq, unsigned long *min_seq, ++ unsigned long seq) ++{ ++ int i; ++ int type, tier; ++ int hist = lru_hist_from_seq(seq); ++ struct lrugen *lrugen = &lruvec->evictable; ++ int nid = lruvec_pgdat(lruvec)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ seq_printf(m, " %10d", tier); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ unsigned long n[3] = {}; ++ ++ if (seq == max_seq) { ++ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); ++ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); ++ ++ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); ++ } else if (seq == min_seq[type] || NR_STAT_GENS > 1) { ++ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); ++ ++ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); ++ } else ++ seq_puts(m, " 0 0 0 "); ++ } ++ seq_putc(m, '\n'); ++ } ++ ++ seq_puts(m, " "); ++ for (i = 0; i < NR_MM_STATS; i++) { ++ if (i == 6) ++ seq_puts(m, "\n "); ++ ++ if (seq == max_seq && NR_STAT_GENS == 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(mm_list->nodes[nid].stats[hist][i]), ++ toupper(MM_STAT_CODES[i])); ++ else if (seq != max_seq && 
NR_STAT_GENS > 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(mm_list->nodes[nid].stats[hist][i]), ++ MM_STAT_CODES[i]); ++ else ++ seq_puts(m, " 0 "); ++ } ++ seq_putc(m, '\n'); ++} ++ ++static int lru_gen_seq_show(struct seq_file *m, void *v) ++{ ++ unsigned long seq; ++ bool full = !debugfs_real_fops(m->file)->write; ++ struct lruvec *lruvec = v; ++ struct lrugen *lrugen = &lruvec->evictable; ++ int nid = lruvec_pgdat(lruvec)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (nid == first_memory_node) { ++ const char *path = memcg ? m->private : ""; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); ++#endif ++ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); ++ } ++ ++ seq_printf(m, " node %5d\n", nid); ++ ++ if (!full) ++ seq = min(min_seq[0], min_seq[1]); ++ else if (max_seq >= MAX_NR_GENS) ++ seq = max_seq - MAX_NR_GENS + 1; ++ else ++ seq = 0; ++ ++ for (; seq <= max_seq; seq++) { ++ int gen, type, zone; ++ unsigned int msecs; ++ ++ gen = lru_gen_from_seq(seq); ++ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); ++ ++ seq_printf(m, " %10lu %10u", seq, msecs); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ long size = 0; ++ ++ if (seq < min_seq[type]) { ++ seq_puts(m, " -0 "); ++ continue; ++ } ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ ++ seq_printf(m, " %10lu ", max(size, 0L)); ++ } ++ ++ seq_putc(m, '\n'); ++ ++ if (full) ++ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); ++ } ++ ++ return 0; ++} ++ ++static const struct seq_operations lru_gen_seq_ops = { ++ .start = lru_gen_seq_start, ++ .stop = lru_gen_seq_stop, ++ .next = lru_gen_seq_next, ++ .show = lru_gen_seq_show, ++}; ++ ++static int run_aging(struct lruvec *lruvec, unsigned long seq, int swappiness) ++{ ++ struct scan_control sc = {}; ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq == max_seq) ++ try_to_inc_max_seq(lruvec, max_seq, &sc, swappiness); ++ ++ return seq > max_seq ? 
-EINVAL : 0; ++} ++ ++static int run_eviction(struct lruvec *lruvec, unsigned long seq, int swappiness, ++ unsigned long nr_to_reclaim) ++{ ++ unsigned int flags; ++ struct blk_plug plug; ++ int err = -EINTR; ++ long nr_to_scan = LONG_MAX; ++ struct scan_control sc = { ++ .nr_to_reclaim = nr_to_reclaim, ++ .may_writepage = 1, ++ .may_unmap = 1, ++ .may_swap = 1, ++ .reclaim_idx = MAX_NR_ZONES - 1, ++ .gfp_mask = GFP_KERNEL, ++ }; ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq >= max_seq - 1) ++ return -EINVAL; ++ ++ flags = memalloc_noreclaim_save(); ++ ++ blk_start_plug(&plug); ++ ++ while (!signal_pending(current)) { ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < min(min_seq[!swappiness], min_seq[swappiness < 200]) || ++ !evict_pages(lruvec, &sc, swappiness, &nr_to_scan)) { ++ err = 0; ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ memalloc_noreclaim_restore(flags); ++ ++ return err; ++} ++ ++static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, ++ int swappiness, unsigned long nr_to_reclaim) ++{ ++ struct lruvec *lruvec; ++ int err = -EINVAL; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (!mem_cgroup_disabled()) { ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_id(memcg_id); ++#ifdef CONFIG_MEMCG ++ if (memcg && !css_tryget(&memcg->css)) ++ memcg = NULL; ++#endif ++ rcu_read_unlock(); ++ ++ if (!memcg) ++ goto done; ++ } ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto done; ++ ++ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) ++ goto done; ++ ++ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); ++ ++ if (swappiness == -1) ++ swappiness = get_swappiness(memcg); ++ else if (swappiness > 200U) ++ goto done; ++ ++ switch (cmd) { ++ case '+': ++ err = run_aging(lruvec, seq, swappiness); ++ break; ++ case '-': ++ err = run_eviction(lruvec, seq, swappiness, nr_to_reclaim); ++ break; ++ } ++done: ++ mem_cgroup_put(memcg); ++ ++ return err; ++} ++ ++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, ++ size_t len, loff_t *pos) ++{ ++ void *buf; ++ char *cur, *next; ++ int err = 0; ++ ++ buf = kvmalloc(len + 1, GFP_USER); ++ if (!buf) ++ return -ENOMEM; ++ ++ if (copy_from_user(buf, src, len)) { ++ kvfree(buf); ++ return -EFAULT; ++ } ++ ++ next = buf; ++ next[len] = '\0'; ++ ++ while ((cur = strsep(&next, ",;\n"))) { ++ int n; ++ int end; ++ char cmd; ++ unsigned int memcg_id; ++ unsigned int nid; ++ unsigned long seq; ++ unsigned int swappiness = -1; ++ unsigned long nr_to_reclaim = -1; ++ ++ cur = skip_spaces(cur); ++ if (!*cur) ++ continue; ++ ++ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, ++ &seq, &end, &swappiness, &end, &nr_to_reclaim, &end); ++ if (n < 4 || cur[end]) { ++ err = -EINVAL; ++ break; ++ } ++ ++ err = run_cmd(cmd, memcg_id, nid, seq, swappiness, nr_to_reclaim); ++ if (err) ++ break; ++ } ++ ++ kvfree(buf); ++ ++ return err ? 
: len; ++} ++ ++static int lru_gen_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &lru_gen_seq_ops); ++} ++ ++static const struct file_operations lru_gen_rw_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .write = lru_gen_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations lru_gen_ro_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -4772,6 +5178,12 @@ static int __init init_lru_gen(void) + if (hotplug_memory_notifier(mem_notifier, 0)) + pr_err("lru_gen: failed to subscribe hotplug notifications\n"); + ++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) ++ pr_err("lru_gen: failed to create sysfs group\n"); ++ ++ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); ++ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); ++ + return 0; + }; + /* + +From patchwork Wed Aug 18 06:31:06 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443397 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 3FCB3C4320E + for ; Wed, 18 Aug 2021 06:31:37 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id EA34560720 + for ; Wed, 18 Aug 2021 06:31:36 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org EA34560720 +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 9FABE6B0083; Wed, 18 Aug 2021 02:31:26 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 9A9E36B0085; Wed, 18 Aug 2021 02:31:26 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 824228D0001; Wed, 18 Aug 2021 02:31:26 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0096.hostedemail.com + [216.40.44.96]) + by kanga.kvack.org (Postfix) with ESMTP id 61AC46B0083 + for ; Wed, 18 Aug 2021 02:31:26 -0400 (EDT) +Received: from smtpin18.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay03.hostedemail.com (Postfix) with ESMTP id 0C6338249980 + for ; Wed, 18 Aug 2021 06:31:26 +0000 (UTC) +X-FDA: 78487229772.18.1012126 +Received: from mail-yb1-f202.google.com (mail-yb1-f202.google.com + [209.85.219.202]) + by imf11.hostedemail.com (Postfix) with ESMTP id C16AFF0058AD + for ; Wed, 18 Aug 2021 06:31:25 +0000 (UTC) +Received: by mail-yb1-f202.google.com with SMTP id + j9-20020a2581490000b02905897d81c63fso1816744ybm.8 + for ; Tue, 17 Aug 2021 23:31:25 -0700 
(PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=ObPkY95BgccMVOXBvY8W42OB1HqUP7YDieqWubLPiME=; + b=Jtstdp0V+Xh88MSFciGZx/6Crwh6uax0pLuw3AXs+D9oj1KbSd4HzqBIam2qrHZk5g + s6jTCXgbdB9PNkW+XseCkH6f/SPnpj+XnPohW32F/qWDIdj7SKz1f0lnCEZ8LmwdB716 + Rvc1f56BY2VSkf0TJZ2KH2yJfDtj+1gU2XMmm4u5Bt6jYwhCgsspJjhwVZKFfJUTrCz/ + XZVzMwPQYhql8Y8pTREUZGN5i0txpFG5vvusnI8qF9fCezySVG3pknbNJwrstN6Q+nbI + 0vRgf5epo1QPD4Pm/Aj6L2XrjBdYdKaX+yhcs/uRqo5Y444dTY1fVQPIgzKndV2ESXHr + EBPA== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=ObPkY95BgccMVOXBvY8W42OB1HqUP7YDieqWubLPiME=; + b=Cfb3bIYH/X5ffmWyrrvJ2r6l88JBTlwNYhVYwZ1kPPhLPvLtOHCo5g1OyHk4RuJDN6 + hihtvhEx/uo6wAcvAdslydHefgcyhQwm3+XmVsTDvb++XMubb0RLNzFmLGFA7GntEgBh + zas0bdN4o3+Xp8Ki1j2+plHhpjLcGFLEdKL6NQkj4bBL3vIAHKOiFFBuDpbcfjMja6ei + p4+GFLpRAa35tNqgvbMu4TGFF9zSAexU/cjDG2RsDXxqNh7/OwPa0yFv7DQvNxrDhcvj + gmYB0AQTUKQLcnCqyKndANtpbLv8CW52/RPOHz6dE4jRbP8+fpdiHtpB56QZK/o2r/w5 + 8fbA== +X-Gm-Message-State: AOAM530uSIuKV+NjYSJ8uzL68ywRxE5GTZdZPQXM2wJuA/CHQhqQz2Kx + 6MVKq2J6YSR/BSYgSG8cZiBDUmMp/AnKKypt4zWQhD+HM9KlaiEkqlXgimpcBbrRhjd5kaRQT3T + 32mvdtUDTygsupZBOsJzuL08fYBnijlBa6pvVwjEKfj2QTa0k6HX1Qqty +X-Google-Smtp-Source: + ABdhPJxfsOO100ggYeO0sY++gqlc3MsdnvEUTquZH6o+BnI8n97BCwdldFsnuDGXBQoFBIYNUCH/1sFG8Qo= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:42d8:: with SMTP id + p207mr9419451yba.270.1629268285077; + Tue, 17 Aug 2021 23:31:25 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:31:06 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-11-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 10/11] mm: multigenerational lru: Kconfig +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +Authentication-Results: imf11.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=Jtstdp0V; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf11.hostedemail.com: domain of + 3PakcYQYKCBIGCHzs6y66y3w.u64305CF-442Dsu2.69y@flex--yuzhao.bounces.google.com + designates 209.85.219.202 as permitted sender) + smtp.mailfrom=3PakcYQYKCBIGCHzs6y66y3w.u64305CF-442Dsu2.69y@flex--yuzhao.bounces.google.com +X-Stat-Signature: wbnonwezckp6pkry7sy33qudztsfo9q4 +X-Rspamd-Queue-Id: C16AFF0058AD +X-Rspamd-Server: rspam05 +X-HE-Tag: 1629268285-115850 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add configuration options for the multigenerational lru. 
+ +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 59 insertions(+) + +diff --git a/mm/Kconfig b/mm/Kconfig +index 40a9bfcd5062..4cd257cfdf84 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -889,4 +889,63 @@ config IO_MAPPING + config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + ++# the multigenerational lru { ++config LRU_GEN ++ bool "Multigenerational LRU" ++ depends on MMU ++ # the following options may leave not enough spare bits in page->flags ++ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) ++ help ++ A high performance LRU implementation to heavily overcommit workloads ++ that are not IO bound. See Documentation/vm/multigen_lru.rst for ++ details. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces a small per-process and per-memcg and per-node memory ++ overhead. ++ ++config LRU_GEN_ENABLED ++ bool "Turn on by default" ++ depends on LRU_GEN ++ help ++ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option ++ changes it to 1. ++ ++ Warning: the default value is the fast path. See ++ Documentation/static-keys.txt for details. ++ ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ This option keeps full stats for each generation, which can be read ++ from /sys/kernel/debug/lru_gen_full. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces an additional small per-process and per-memcg and ++ per-node memory overhead. ++ ++config NR_LRU_GENS ++ int "Max number of generations" ++ depends on LRU_GEN ++ range 4 31 ++ default 7 ++ help ++ This will use order_base_2(N+1) spare bits from page flags. ++ ++ Warning: do not use numbers larger than necessary because each ++ generation introduces a small per-node and per-memcg memory overhead. ++ ++config TIERS_PER_GEN ++ int "Number of tiers per generation" ++ depends on LRU_GEN ++ range 2 5 ++ default 4 ++ help ++ This will use N-2 spare bits from page flags. ++ ++ Larger values generally offer better protection to active pages under ++ heavy buffered I/O workloads. 
++# } ++ + endmenu + +From patchwork Wed Aug 18 06:31:07 2021 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12443399 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +X-Spam-Level: +X-Spam-Status: No, score=-26.3 required=3.0 tests=BAYES_00,DKIMWL_WL_MED, + DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS, + INCLUDES_CR_TRAILER,INCLUDES_PATCH,MAILING_LIST_MULTI,SPF_HELO_NONE,SPF_PASS, + URIBL_BLOCKED,USER_AGENT_GIT,USER_IN_DEF_DKIM_WL autolearn=ham + autolearn_force=no version=3.4.0 +Received: from mail.kernel.org (mail.kernel.org [198.145.29.99]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 85412C4338F + for ; Wed, 18 Aug 2021 06:31:39 +0000 (UTC) +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by mail.kernel.org (Postfix) with ESMTP id 37A5B60F11 + for ; Wed, 18 Aug 2021 06:31:39 +0000 (UTC) +DMARC-Filter: OpenDMARC Filter v1.4.1 mail.kernel.org 37A5B60F11 +Authentication-Results: mail.kernel.org; + dmarc=fail (p=reject dis=none) header.from=google.com +Authentication-Results: mail.kernel.org; spf=pass smtp.mailfrom=kvack.org +Received: by kanga.kvack.org (Postfix) + id 25A286B0085; Wed, 18 Aug 2021 02:31:28 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 1E4B48D0001; Wed, 18 Aug 2021 02:31:28 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id F29206B0088; Wed, 18 Aug 2021 02:31:27 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from forelay.hostedemail.com (smtprelay0215.hostedemail.com + [216.40.44.215]) + by kanga.kvack.org (Postfix) with ESMTP id D31CC6B0085 + for ; Wed, 18 Aug 2021 02:31:27 -0400 (EDT) +Received: from smtpin11.hostedemail.com (10.5.19.251.rfc1918.com + [10.5.19.251]) + by forelay02.hostedemail.com (Postfix) with ESMTP id 65DCA22892 + for ; Wed, 18 Aug 2021 06:31:27 +0000 (UTC) +X-FDA: 78487229814.11.5C300F7 +Received: from mail-yb1-f202.google.com (mail-yb1-f202.google.com + [209.85.219.202]) + by imf05.hostedemail.com (Postfix) with ESMTP id 22BC45048BB2 + for ; Wed, 18 Aug 2021 06:31:27 +0000 (UTC) +Received: by mail-yb1-f202.google.com with SMTP id + n20-20020a2540140000b0290593b8e64cd5so1837294yba.3 + for ; Tue, 17 Aug 2021 23:31:26 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20161025; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc; + bh=EuYCgt8+lkHBd5kOQmKG+gfowm77wjwHGDYaQfqgXkM=; + b=jf1KXu7fmEUdMsKv8qqPS1uXlv+56ikFPzF13yc3+KTvUt6asr/dfm42U7m8z2oaxW + A1lZnhfuZsNz6idvEFIx7MwSYBYmByXzQK4ED92Tl/aOYre4fO0pStwZP5hfQyZoLhpA + k9RuTVA9AcmArHPO54uF/Ki4EABqeALUSuj9BtbZVk/q1wrblw+DakX+xFVyH0ZPllpN + QRr9BLXmVbCFxjf9LCMyU2/3AuDTtwNHaELFoGy21fgiwitZOhlfWs7IBfT24zSAzv5z + gcDc1UyinDFGNwilfZtrE/BXLAPyfDw73xKZU00G6uLKrvxEGHYLjgWMad1/KSypqjGn + s8Lg== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20161025; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc; + bh=EuYCgt8+lkHBd5kOQmKG+gfowm77wjwHGDYaQfqgXkM=; + b=bpiAyfCtnbVkWWkC4NcCa/hvNnmnf6xRF1m0fFRGnwecJGS0w262LZqzVbxoeQBvSm + MWqCNMWz/r+IAMkMEnQriPzrzmN5bnLpGek7cCxpZ5uS+GUOPfa8fvupyhsNoqY4vaUE + CcWqxTi3vSEKb+HmcenIywGxM+NysnOi20GQPMwwzeOzpzL344XNS8cGV+twx2ABb929 + KedIbHHb4lNSlhnFA1CeIfWDBCTFTKFMl+gOM5r6Nr7wnUZSyDlJdG7QjNAWvGL+IV6M + 
09U73b0C1/UxR3+TG0Aq2tol5eGmRa6VWIKC5/1loFYTZVMCsBmBO4sR/Yhp2G+Aa7wp + Rvyg== +X-Gm-Message-State: AOAM533+43noaAkuihpXxsnRGizyoA+MqzrgGJmzbTF/yoZ5rDVRSEmI + 1XV/a6O4TzpPo4fCtQk7U2V6pdMwTmWsijUigwaqACx8vsoKbQSYJqYccH9apUFjPaz4mGEGngr + 0ADDt2XZ8kuTST9dnWeM/OyHpR43CeWBegLqpSeX4M+VQWUTTmbgwdNMZ +X-Google-Smtp-Source: + ABdhPJyHuC5mbVB/nEzN5pzEvd35CiT+uZVTZdGa9/B0v3WwM0r7rwsMuOQ4SlTfuOuU4UXhfMr9DYHhRjw= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:41f0:f89:87cd:8bd0]) + (user=yuzhao job=sendgmr) by 2002:a25:f310:: with SMTP id + c16mr8599656ybs.464.1629268286452; + Tue, 17 Aug 2021 23:31:26 -0700 (PDT) +Date: Wed, 18 Aug 2021 00:31:07 -0600 +In-Reply-To: <20210818063107.2696454-1-yuzhao@google.com> +Message-Id: <20210818063107.2696454-12-yuzhao@google.com> +Mime-Version: 1.0 +References: <20210818063107.2696454-1-yuzhao@google.com> +X-Mailer: git-send-email 2.33.0.rc1.237.g0d66db33f3-goog +Subject: [PATCH v4 11/11] mm: multigenerational lru: documentation +From: Yu Zhao +To: linux-mm@kvack.org +Cc: linux-kernel@vger.kernel.org, Hillf Danton , + page-reclaim@google.com, + Yu Zhao , Konstantin Kharlamov +Authentication-Results: imf05.hostedemail.com; + dkim=pass header.d=google.com header.s=20161025 header.b=jf1KXu7f; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf05.hostedemail.com: domain of + 3PqkcYQYKCBMHDI0t7z77z4x.v75416DG-553Etv3.7Az@flex--yuzhao.bounces.google.com + designates 209.85.219.202 as permitted sender) + smtp.mailfrom=3PqkcYQYKCBMHDI0t7z77z4x.v75416DG-553Etv3.7Az@flex--yuzhao.bounces.google.com +X-Stat-Signature: c4nq3g7ct4oiq961sdqddouod8zp6x84 +X-Rspamd-Queue-Id: 22BC45048BB2 +X-Rspamd-Server: rspam05 +X-HE-Tag: 1629268287-160661 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add Documentation/vm/multigen_lru.rst. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +--- + Documentation/vm/index.rst | 1 + + Documentation/vm/multigen_lru.rst | 134 ++++++++++++++++++++++++++++++ + 2 files changed, 135 insertions(+) + create mode 100644 Documentation/vm/multigen_lru.rst + +diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst +index eff5fbd492d0..c353b3f55924 100644 +--- a/Documentation/vm/index.rst ++++ b/Documentation/vm/index.rst +@@ -17,6 +17,7 @@ various features of the Linux memory management + + swap_numa + zswap ++ multigen_lru + + Kernel developers MM documentation + ================================== +diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst +new file mode 100644 +index 000000000000..adedff5319d9 +--- /dev/null ++++ b/Documentation/vm/multigen_lru.rst +@@ -0,0 +1,134 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++===================== ++Multigenerational LRU ++===================== ++ ++Quick Start ++=========== ++Build Configurations ++-------------------- ++:Required: Set ``CONFIG_LRU_GEN=y``. ++ ++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by ++ default. ++ ++Runtime Configurations ++---------------------- ++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the ++ feature was not turned on by default. ++ ++:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to ++ protect the working set of ``N`` milliseconds. The OOM killer is ++ invoked if this working set cannot be kept in memory. ++ ++:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature ++ is turned on. 
This file has the following output: ++ ++:: ++ ++ memcg memcg_id memcg_path ++ node node_id ++ min_gen birth_time anon_size file_size ++ ... ++ max_gen birth_time anon_size file_size ++ ++``min_gen`` is the oldest generation number and ``max_gen`` is the ++youngest generation number. ``birth_time`` is in milliseconds. ++``anon_size`` and ``file_size`` are in pages. ++ ++Phones/Laptops/Workstations ++--------------------------- ++No additional configurations required. ++ ++Servers/Data Centers ++-------------------- ++:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a ++ larger number. ++ ++:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger ++ number. ++ ++:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. ++ ++:Working set estimation: Write ``+ memcg_id node_id max_gen ++ [swappiness]`` to ``/sys/kernel/debug/lru_gen`` to invoke the aging, ++ which scans PTEs for accessed pages and then creates the next ++ generation ``max_gen+1``. A swap file and a non-zero ``swappiness``, ++ which overrides ``vm.swappiness``, are required to scan PTEs mapping ++ anon pages. ++ ++:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] ++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the ++ eviction, which evicts generations less than or equal to ``min_gen``. ++ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and ++ ``max_gen-1`` are not fully aged and therefore cannot be evicted. ++ ``nr_to_reclaim`` can be used to limit the number of pages to evict. ++ Multiple command lines are supported, so does concatenation with ++ delimiters ``,`` and ``;``. ++ ++Framework ++========= ++For each ``lruvec``, evictable pages are divided into multiple ++generations. The youngest generation number is stored in ++``lrugen->max_seq`` for both anon and file types as they are aged on ++an equal footing. The oldest generation numbers are stored in ++``lrugen->min_seq[2]`` separately for anon and file types as clean ++file pages can be evicted regardless of swap and writeback ++constraints. These three variables are monotonically increasing. ++Generation numbers are truncated into ++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into ++``page->flags``. The sliding window technique is used to prevent ++truncated generation numbers from overlapping. Each truncated ++generation number is an index to an array of per-type and per-zone ++lists ``lrugen->lists``. ++ ++Each generation is then divided into multiple tiers. Tiers represent ++levels of usage from file descriptors only. Pages accessed ``N`` times ++via file descriptors belong to tier ``order_base_2(N)``. Each ++generation contains at most ``CONFIG_TIERS_PER_GEN`` tiers, and they ++require additional ``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. ++In contrast to moving across generations which requires list ++operations, moving across tiers only involves operations on ++``page->flags`` and therefore has a negligible cost. A feedback loop ++modeled after the PID controller monitors refault rates of all tiers ++and decides when to protect pages from which tiers. ++ ++The framework comprises two conceptually independent components: the ++aging and the eviction, which can be invoked separately from user ++space for the purpose of working set estimation and proactive reclaim. ++ ++Aging ++----- ++The aging produces young generations. 
Given an ``lruvec``, the aging ++traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` ++to scan PTEs for accessed pages (a ``mm_struct`` list is maintained ++for each ``memcg``). Upon finding one, the aging updates its ++generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). ++After each round of traversal, the aging increments ``max_seq``. The ++aging is due when both ``min_seq[2]`` have caught up with ++``max_seq-1``. ++ ++Eviction ++-------- ++The eviction consumes old generations. Given an ``lruvec``, the ++eviction scans pages on the per-zone lists indexed by anon and file ++``min_seq[2]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to ++select a type based on the values of ``min_seq[2]``. If they are ++equal, it selects the type that has a lower refault rate. The eviction ++sorts a page according to its updated generation number if the aging ++has found this page accessed. It also moves a page to the next ++generation if this page is from an upper tier that has a higher ++refault rate than the base tier. The eviction increments ++``min_seq[2]`` of a selected type when it finds all the per-zone lists ++indexed by ``min_seq[2]`` of this selected type are empty. ++ ++To-do List ++========== ++KVM Optimization ++---------------- ++Support shadow page table walk. ++ ++NUMA Optimization ++----------------- ++Optimize page table walk for NUMA. diff --git a/sys-kernel/pinephone-sources/files/d1d849cae12db71aa81ceedaedc1b17a34790367.patch b/sys-kernel/pinephone-sources/files/d1d849cae12db71aa81ceedaedc1b17a34790367.patch new file mode 100644 index 0000000..cfb8de7 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/d1d849cae12db71aa81ceedaedc1b17a34790367.patch @@ -0,0 +1,377 @@ +From d1d849cae12db71aa81ceedaedc1b17a34790367 Mon Sep 17 00:00:00 2001 +From: Samuel Holland +Date: Sat, 19 Jun 2021 18:36:05 -0500 +Subject: [PATCH] Input: kb151 - Add a driver for the KB151 keyboard + +This keyboard is found in the official Pine64 PinePhone keyboard case. +It is connected over I2C and runs a libre firmware. + +Signed-off-by: Samuel Holland +--- + .../dts/allwinner/sun50i-a64-pinephone.dtsi | 64 +++++ + drivers/input/keyboard/Kconfig | 10 + + drivers/input/keyboard/Makefile | 1 + + drivers/input/keyboard/kb151.c | 246 ++++++++++++++++++ + 4 files changed, 321 insertions(+) + create mode 100644 drivers/input/keyboard/kb151.c + +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +index 4ede9fe66020c9..0bdc6eceec6099 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +@@ -551,6 +551,70 @@ + /* Connected to pogo pins (external spring based pinheader for user addons) */ + &i2c2 { + status = "okay"; ++ ++ keyboard@15 { ++ compatible = "pine64,kb151"; ++ reg = <0x15>; ++ interrupt-parent = <&r_pio>; ++ interrupts = <0 12 IRQ_TYPE_EDGE_FALLING>; /* PL12 */ ++ keypad,num-rows = <6>; ++ keypad,num-columns = <12>; ++ linux,keymap = ; ++ wakeup-source; ++ }; + }; + + &i2s2 { +diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig +index 40a070a2e7f5b7..0259e9133f4692 100644 +--- a/drivers/input/keyboard/Kconfig ++++ b/drivers/input/keyboard/Kconfig +@@ -353,6 +353,16 @@ config KEYBOARD_HP7XX + To compile this driver as a module, choose M here: the + module will be called jornada720_kbd. 
+ ++config KEYBOARD_KB151 ++ tristate "Pine64 KB151 Keyboard" ++ depends on I2C ++ select CRC8 ++ select INPUT_MATRIXKMAP ++ help ++ Say Y here to enable support for the KB151 keyboard used in the ++ Pine64 PinePhone keyboard case. This driver supports the FLOSS ++ firmware available at https://megous.com/git/pinephone-keyboard/ ++ + config KEYBOARD_LM8323 + tristate "LM8323 keypad chip" + depends on I2C +diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile +index 1d689fdd5c00f9..87fda7b961913a 100644 +--- a/drivers/input/keyboard/Makefile ++++ b/drivers/input/keyboard/Makefile +@@ -33,6 +33,7 @@ obj-$(CONFIG_KEYBOARD_IMX) += imx_keypad.o + obj-$(CONFIG_KEYBOARD_IMX_SC_KEY) += imx_sc_key.o + obj-$(CONFIG_KEYBOARD_HP6XX) += jornada680_kbd.o + obj-$(CONFIG_KEYBOARD_HP7XX) += jornada720_kbd.o ++obj-$(CONFIG_KEYBOARD_KB151) += kb151.o + obj-$(CONFIG_KEYBOARD_LKKBD) += lkkbd.o + obj-$(CONFIG_KEYBOARD_LM8323) += lm8323.o + obj-$(CONFIG_KEYBOARD_LM8333) += lm8333.o +diff --git a/drivers/input/keyboard/kb151.c b/drivers/input/keyboard/kb151.c +new file mode 100644 +index 00000000000000..595275d4f9d96f +--- /dev/null ++++ b/drivers/input/keyboard/kb151.c +@@ -0,0 +1,246 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++// ++// Copyright (C) 2021 Samuel Holland ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define KB151_CRC8_POLYNOMIAL 0x07 ++ ++#define KB151_DEVICE_ID_HI 0x00 ++#define KB151_DEVICE_ID_HI_VALUE 0x4b ++#define KB151_DEVICE_ID_LO 0x01 ++#define KB151_DEVICE_ID_LO_VALUE 0x42 ++#define KB151_FW_REVISION 0x02 ++#define KB151_FW_FEATURES 0x03 ++#define KB151_MATRIX_SIZE 0x06 ++#define KB151_SCAN_CRC 0x07 ++#define KB151_SCAN_DATA 0x08 ++#define KB151_SYS_CONFIG 0x20 ++#define KB151_SYS_CONFIG_DISABLE_SCAN BIT(0) ++ ++struct kb151 { ++ struct input_dev *input; ++ u8 crc_table[CRC8_TABLE_SIZE]; ++ u8 row_shift; ++ u8 rows; ++ u8 cols; ++ u8 buf_swap; ++ u8 buf[]; ++}; ++ ++static void kb151_update(struct i2c_client *client) ++{ ++ struct kb151 *kb151 = i2c_get_clientdata(client); ++ unsigned short *keymap = kb151->input->keycode; ++ struct device *dev = &client->dev; ++ size_t buf_len = kb151->cols + 1; ++ u8 *old_buf = kb151->buf; ++ u8 *new_buf = kb151->buf; ++ int col, crc, ret, row; ++ ++ if (kb151->buf_swap) ++ old_buf += buf_len; ++ else ++ new_buf += buf_len; ++ ++ ret = i2c_smbus_read_i2c_block_data(client, KB151_SCAN_CRC, ++ buf_len, new_buf); ++ if (ret != buf_len) { ++ dev_err(dev, "Failed to read scan data: %d\n", ret); ++ return; ++ } ++ ++ dev_info(dev, "%02x | %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x", ++ new_buf[0], new_buf[1], new_buf[2], new_buf[3], new_buf[4], new_buf[5], ++ new_buf[6], new_buf[7], new_buf[8], new_buf[9], new_buf[10], new_buf[11], ++ new_buf[12]); ++ crc = crc8(kb151->crc_table, new_buf + 1, kb151->cols, CRC8_INIT_VALUE); ++ if (crc != new_buf[0]) { ++ dev_err(dev, "Bad scan data (%02x != %02x)\n", ++ crc, new_buf[0]); ++ return; ++ } ++ dev_info(dev, "Good scan data (%02x == %02x)\n", ++ crc, new_buf[0]); ++ ++ for (col = 0; col < kb151->cols; ++col) { ++ u8 old = *(++old_buf); ++ u8 new = *(++new_buf); ++ u8 changed = old ^ new; ++ ++ for (row = 0; row < kb151->rows; ++row) { ++ int code = MATRIX_SCAN_CODE(row, col, kb151->row_shift); ++ u8 pressed = new & BIT(row); ++ ++ if (!(changed & BIT(row))) ++ continue; ++ ++ dev_dbg(&client->dev, "row %u col %u %sed\n", ++ row, col, pressed ? 
"press" : "releas"); ++ input_report_key(kb151->input, keymap[code], pressed); ++ } ++ } ++ input_sync(kb151->input); ++ ++ kb151->buf_swap = !kb151->buf_swap; ++} ++ ++static int kb151_open(struct input_dev *input) ++{ ++ struct i2c_client *client = input_get_drvdata(input); ++ struct device *dev = &client->dev; ++ int ret, val; ++ ++ ret = i2c_smbus_read_byte_data(client, KB151_SYS_CONFIG); ++ if (ret < 0) { ++ dev_err(dev, "Failed to read config: %d\n", ret); ++ return ret; ++ } ++ ++ val = ret & ~KB151_SYS_CONFIG_DISABLE_SCAN; ++ ret = i2c_smbus_write_byte_data(client, KB151_SYS_CONFIG, val); ++ if (ret) { ++ dev_err(dev, "Failed to write config: %d\n", ret); ++ return ret; ++ } ++ ++ kb151_update(client); ++ ++ enable_irq(client->irq); ++ ++ return 0; ++} ++ ++static void kb151_close(struct input_dev *input) ++{ ++ struct i2c_client *client = input_get_drvdata(input); ++ struct device *dev = &client->dev; ++ int ret, val; ++ ++ disable_irq(client->irq); ++ ++ ret = i2c_smbus_read_byte_data(client, KB151_SYS_CONFIG); ++ if (ret < 0) { ++ dev_err(dev, "Failed to read config: %d\n", ret); ++ return; ++ } ++ ++ val = ret | KB151_SYS_CONFIG_DISABLE_SCAN; ++ ret = i2c_smbus_write_byte_data(client, KB151_SYS_CONFIG, val); ++ if (ret) { ++ dev_err(dev, "Failed to write config: %d\n", ret); ++ } ++} ++ ++static irqreturn_t kb151_irq_thread(int irq, void *data) ++{ ++ struct i2c_client *client = data; ++ ++ kb151_update(client); ++ ++ return IRQ_HANDLED; ++} ++ ++static int kb151_probe(struct i2c_client *client) ++{ ++ struct device *dev = &client->dev; ++ u8 info[KB151_MATRIX_SIZE + 1]; ++ unsigned int kb_rows, kb_cols; ++ unsigned int rows, cols; ++ struct kb151 *kb151; ++ int ret; ++ ++ ret = i2c_smbus_read_i2c_block_data(client, 0, sizeof(info), info); ++ if (ret != sizeof(info)) ++ return ret; ++ ++ if (info[KB151_DEVICE_ID_HI] != KB151_DEVICE_ID_HI_VALUE || ++ info[KB151_DEVICE_ID_LO] != KB151_DEVICE_ID_LO_VALUE) ++ return -ENODEV; ++ ++ dev_info(dev, "Found KB151 with firmware %d.%d (features=%#x)\n", ++ info[KB151_FW_REVISION] >> 4, ++ info[KB151_FW_REVISION] & 0xf, ++ info[KB151_FW_FEATURES]); ++ ++ ret = matrix_keypad_parse_properties(dev, &rows, &cols); ++ if (ret) ++ return ret; ++ ++ kb_rows = info[KB151_MATRIX_SIZE] & 0xf; ++ kb_cols = info[KB151_MATRIX_SIZE] >> 4; ++ if (rows > kb_rows || cols != kb_cols) { ++ dev_err(dev, "Keyboard matrix is %ux%u, but key map is %ux%u\n", ++ kb_rows, kb_cols, rows, cols); ++ return -EINVAL; ++ } ++ ++ /* Allocate two buffers, and include space for the CRC. 
*/ ++ kb151 = devm_kzalloc(dev, struct_size(kb151, buf, 2 * (cols + 1)), GFP_KERNEL); ++ if (!kb151) ++ return -ENOMEM; ++ ++ i2c_set_clientdata(client, kb151); ++ ++ crc8_populate_msb(kb151->crc_table, KB151_CRC8_POLYNOMIAL); ++ ++ kb151->row_shift = get_count_order(cols); ++ kb151->rows = rows; ++ kb151->cols = cols; ++ ++ kb151->input = devm_input_allocate_device(dev); ++ if (!kb151->input) ++ return -ENOMEM; ++ ++ input_set_drvdata(kb151->input, client); ++ ++ kb151->input->name = client->name; ++ kb151->input->phys = "kb151/input0"; ++ kb151->input->id.bustype = BUS_I2C; ++ kb151->input->open = kb151_open; ++ kb151->input->close = kb151_close; ++ ++ __set_bit(EV_REP, kb151->input->evbit); ++ ++ ret = matrix_keypad_build_keymap(NULL, NULL, rows, cols, ++ NULL, kb151->input); ++ if (ret) ++ return dev_err_probe(dev, ret, "Failed to build keymap\n"); ++ ++ ret = devm_request_threaded_irq(dev, client->irq, ++ NULL, kb151_irq_thread, ++ IRQF_ONESHOT | IRQF_NO_AUTOEN, ++ client->name, client); ++ if (ret) ++ return dev_err_probe(dev, ret, "Failed to request IRQ\n"); ++ ++ ret = input_register_device(kb151->input); ++ if (ret) ++ return dev_err_probe(dev, ret, "Failed to register input\n"); ++ ++ return 0; ++} ++ ++static const struct of_device_id kb151_of_match[] = { ++ { .compatible = "pine64,kb151" }, ++ { } ++}; ++MODULE_DEVICE_TABLE(of, kb151_of_match); ++ ++static struct i2c_driver kb151_driver = { ++ .probe_new = kb151_probe, ++ .driver = { ++ .name = "kb151", ++ .of_match_table = kb151_of_match, ++ }, ++}; ++module_i2c_driver(kb151_driver); ++ ++MODULE_AUTHOR("Samuel Holland "); ++MODULE_DESCRIPTION("Pine64 KB151 keyboard driver"); ++MODULE_LICENSE("GPL"); diff --git a/sys-kernel/pinephone-sources/pinephone-sources-5.14.1.ebuild b/sys-kernel/pinephone-sources/pinephone-sources-5.14.2.ebuild similarity index 89% rename from sys-kernel/pinephone-sources/pinephone-sources-5.14.1.ebuild rename to sys-kernel/pinephone-sources/pinephone-sources-5.14.2.ebuild index 8f221cc..7d53352 100644 --- a/sys-kernel/pinephone-sources/pinephone-sources-5.14.1.ebuild +++ b/sys-kernel/pinephone-sources/pinephone-sources-5.14.2.ebuild @@ -9,7 +9,7 @@ ETYPE="sources" inherit kernel-2 detect_version -KEYWORDS="~alpha ~amd64 ~arm ~arm64 ~hppa ~ia64 ~mips ~ppc ~ppc64 ~s390 ~sparc ~x86" +KEYWORDS="~arm64" # Copyright 1999-2021 Gentoo Authors # Distributed under the terms of the GNU General Public License v2 @@ -33,7 +33,14 @@ PATCHES=( ${FILESDIR}/0011-dts-pinetab-hardcode-mmc-numbers.patch ${FILESDIR}/0107-quirk-kernel-org-bug-210681-firmware_rome_error.patch ${FILESDIR}/0177-leds-gpio-make-max_brightness-configurable.patch - ${FILESDIR}/panic-led-5.12.patch + ${FILESDIR}/panic-led-5.12.patch + + # keyboard + ${FILESDIR}/d1d849cae12db71aa81ceedaedc1b17a34790367.patch + ${FILESDIR}/2423aac2d6f5db55da99e11fd799ee66fe6f54c6.patch + + # LRU + ${FILESDIR}/Multigenerational-LRU-Framework.patch ) src_prepare() { @@ -60,3 +67,4 @@ pkg_postinst() { pkg_postrm() { kernel-2_pkg_postrm } +
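Editor's usage note (an illustration, not part of the ebuild or of the patches it applies): once a kernel built from these sources has CONFIG_LRU_GEN enabled and the feature turned on through /sys/kernel/mm/lru_gen/enabled, the debugfs interface added by the multigenerational LRU patches can be driven from user space. The sketch below follows the command format documented in those patches: "+ memcg_id node_id max_gen [swappiness]" invokes the aging and "- memcg_id node_id min_gen [swappiness] [nr_to_reclaim]" invokes the eviction. The memcg id, node id, generation numbers, swappiness and page count used here are hypothetical; real values should be read from /sys/kernel/debug/lru_gen first.

/* Illustrative sketch: drive the lru_gen debugfs interface from user space. */
#include <stdio.h>

static int lru_gen_write(const char *cmd)
{
	FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");
	int ret = 0;

	if (!f)
		return -1;
	if (fputs(cmd, f) < 0)
		ret = -1;
	if (fclose(f))		/* the kernel reports a rejected command here */
		ret = -1;
	return ret;
}

int main(void)
{
	/* Aging: scan PTEs and create generation max_gen+1 (hypothetical ids). */
	if (lru_gen_write("+ 1 0 7 100\n"))
		return 1;

	/* Eviction: evict generations <= 5, at most 4096 pages, swappiness 100. */
	return lru_gen_write("- 1 0 5 100 4096\n") ? 1 : 0;
}

The same commands can of course be issued with a plain shell redirection; the C form is used here only to stay consistent with the language of the sources above.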