diff --git a/sys-kernel/pinephone-sources/Manifest b/sys-kernel/pinephone-sources/Manifest index 2adad09..3a33b67 100644 --- a/sys-kernel/pinephone-sources/Manifest +++ b/sys-kernel/pinephone-sources/Manifest @@ -1,4 +1 @@ -DIST all-5.18.2.patch 16908878 BLAKE2B 6eae9430fff2702af7fde8cc16bffcccb100b02ed155c88e886b557d5dd26f3b091f0255ef96accbc512bb70875c4b1bd9af1b089a9439b59618dbb839014371 SHA512 2f4fca905f8bc721bc5d5fc027bbd01b8af559d806df3180141b7705538d8fdd4d65aab1fa6c5bca58e8b2423904098ef73afa43b5b700c24074347c15691b03 -DIST genpatches-5.18-4.base.tar.xz 51808 BLAKE2B f9a33dabb82687789f6f778b1eea29e29d1ff8601954f6125530cd98a88b9469c3255c0858c767806580b018be9a4bde23805240d266f0b8cb52a3031bbc836c SHA512 7ddcff3c89328b0fa1c25fffd595f13b2ab9569d9387a503937c7f58f44e0ab4fdda53b260a476316c85e9459a439be03cf875683b594a6a3b10c4320d2a274e -DIST genpatches-5.18-4.extras.tar.xz 3924 BLAKE2B 9d555b98ccc35b3d42caf7fabd46756b18ca121a80b041a1a44799f283cd6062e0023618cf94867baebb7ab9c71b8a812f75beded436eaaab79ac52674e8242e SHA512 d9b75f29090f45b801fb94f7bca4d5ef9458d243ea5719059a427f45ab045be1696856dae9c47b43f08c70024de5e29e08bcf78179f684cbec797063be97987a -DIST linux-5.18.tar.xz 129790264 BLAKE2B e2745a69eb70169e90505a9318a3993046eab3020496eecde7d8352ecda0eb71a25b21becf7ce93fc593507dce7d1cd61b94ddcdf82b3094d79c0d3d48508eeb SHA512 dbbc9d1395898a498fa4947fceda1781344fa5d360240f753810daa4fa88e519833e2186c4e582a8f1836e6413e9e85f6563c7770523b704e8702d67622f98b5 +DIST orange-pi-5.19-20220802-0940.tar.gz 214990340 BLAKE2B 9bbadd06a8d160d716838d709f7ca6adb6143cb2205337940fb2d4607f0b806400cc77fb4abd36856844536b0a4ced92737658fc7af60d10f141a21116d66eed SHA512 04d46f6065a138d3b206937fada3990f823a1937c14812bada6512d04ebf1c7634cdea0a57611066bd2b4951a38c8e354b187bffe2ca738f2fe2a3f50d922dc2 diff --git a/sys-kernel/pinephone-sources/files/0101-arm64-dts-pinephone-drop-modem-power-node.patch b/sys-kernel/pinephone-sources/files/0101-arm64-dts-pinephone-drop-modem-power-node.patch new file mode 100644 index 0000000..b90eced --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0101-arm64-dts-pinephone-drop-modem-power-node.patch @@ -0,0 +1,175 @@ +From 602d05e416ae0d0fba3022fa2c3d195164b406c6 Mon Sep 17 00:00:00 2001 +From: Clayton Craft +Date: Wed, 16 Dec 2020 20:16:14 -0800 +Subject: [PATCH] dts: pinephone: drop modem-power node + +--- + .../allwinner/sun50i-a64-pinephone-1.0.dts | 26 +++--------------- + .../allwinner/sun50i-a64-pinephone-1.1.dts | 27 +++---------------- + .../allwinner/sun50i-a64-pinephone-1.2.dts | 27 +++---------------- + .../dts/allwinner/sun50i-a64-pinephone.dtsi | 12 +++++++++ + 4 files changed, 24 insertions(+), 68 deletions(-) + +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts +index a21c6d78a..7f0cfdafe 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.0.dts +@@ -86,28 +86,6 @@ ®_drivevbus { + status = "okay"; + }; + +-&uart3 { +- modem { +- compatible = "quectel,eg25"; +- char-device-name = "modem-power"; +- +- power-supply = <®_vbat_bb>; /* PL7 */ +- +- enable-gpios = <&pio 7 8 GPIO_ACTIVE_LOW>; /* PH8 */ +- reset-gpios = <&pio 2 4 GPIO_ACTIVE_HIGH>; /* PC4 */ +- pwrkey-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */ +- +- sleep-gpios = <&pio 7 7 GPIO_ACTIVE_HIGH>; /* PH7 */ +- wakeup-gpios = <&pio 1 2 GPIO_ACTIVE_HIGH>; /* PB2-RI */ +- +- cts-gpios = <&pio 3 5 GPIO_ACTIVE_HIGH>; /* PD5-CTS */ +- dtr-gpios = <&r_pio 0 6 
GPIO_ACTIVE_HIGH>; /* PL6-DTR */ +- rts-gpios = <&pio 3 4 GPIO_ACTIVE_HIGH>; /* PD4-RTS */ +- +- quectel,qdai = "1,1,0,1,0,0,1,1"; +- }; +-}; +- + &usbphy { + usb-role-switch; + +@@ -118,6 +96,10 @@ usb0_drd_sw: endpoint { + }; + }; + ++&ring_indicator { ++ gpios = <&pio 1 2 GPIO_ACTIVE_LOW>; /* PB2 */ ++}; ++ + &sgm3140 { + enable-gpios = <&pio 2 3 GPIO_ACTIVE_HIGH>; /* PC3 */ + flash-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* PD24 */ +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts +index 61ff56b17..5e85ddc12 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.1.dts +@@ -109,34 +109,15 @@ ®_drivevbus { + status = "okay"; + }; + ++&ring_indicator { ++ gpios = <&pio 1 2 GPIO_ACTIVE_LOW>; /* PB2 */ ++}; ++ + &sgm3140 { + enable-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* PD24 */ + flash-gpios = <&pio 2 3 GPIO_ACTIVE_HIGH>; /* PC3 */ + }; + +-&uart3 { +- modem { +- compatible = "quectel,eg25"; +- char-device-name = "modem-power"; +- +- power-supply = <®_vbat_bb>; /* PL7 */ +- +- enable-gpios = <&pio 7 8 GPIO_ACTIVE_LOW>; /* PH8 */ +- reset-gpios = <&pio 2 4 GPIO_ACTIVE_HIGH>; /* PC4 */ +- pwrkey-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */ +- //status-pwrkey-multiplexed; /* status acts as pwrkey */ +- +- sleep-gpios = <&pio 7 7 GPIO_ACTIVE_HIGH>; /* PH7 */ +- wakeup-gpios = <&pio 1 2 GPIO_ACTIVE_HIGH>; /* PB2-RI */ +- +- dtr-gpios = <&r_pio 0 6 GPIO_ACTIVE_HIGH>; /* PL6-DTR */ +- cts-gpios = <&pio 3 5 GPIO_ACTIVE_HIGH>; /* PD5-CTS */ +- rts-gpios = <&pio 3 4 GPIO_ACTIVE_HIGH>; /* PD4-RTS */ +- +- quectel,qdai = "1,1,0,1,0,0,1,1"; +- }; +-}; +- + &usbphy { + usb-role-switch; + +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts +index fe7d567a8..f4b9b0991 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone-1.2.dts +@@ -101,34 +101,15 @@ ®_anx1v0 { + enable-active-high; + }; + ++&ring_indicator { ++ gpios = <&r_pio 0 6 GPIO_ACTIVE_LOW>; /* PL6 */ ++}; ++ + &sgm3140 { + enable-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* PD24 */ + flash-gpios = <&pio 2 3 GPIO_ACTIVE_HIGH>; /* PC3 */ + }; + +-&uart3 { +- modem { +- compatible = "quectel,eg25"; +- char-device-name = "modem-power"; +- +- power-supply = <®_vbat_bb>; /* PL7 */ +- +- enable-gpios = <&pio 7 8 GPIO_ACTIVE_LOW>; /* PH8 */ +- reset-gpios = <&pio 2 4 GPIO_ACTIVE_HIGH>; /* PC4 */ +- status-gpios = <&pio 7 9 GPIO_ACTIVE_HIGH>; /* PH9 */ +- pwrkey-gpios = <&pio 1 3 GPIO_ACTIVE_HIGH>; /* PB3 */ +- +- host-ready-gpios = <&pio 7 7 GPIO_ACTIVE_HIGH>; /* PH7 */ +- wakeup-gpios = <&r_pio 0 6 GPIO_ACTIVE_HIGH>; /* PL6-RI */ +- +- dtr-gpios = <&pio 1 2 GPIO_ACTIVE_HIGH>; /* PB2-DTR */ +- cts-gpios = <&pio 3 5 GPIO_ACTIVE_HIGH>; /* PD5-CTS */ +- rts-gpios = <&pio 3 4 GPIO_ACTIVE_HIGH>; /* PD4-RTS */ +- +- quectel,qdai = "1,1,0,1,0,0,1,1"; +- }; +-}; +- + &usbphy { + usb-role-switch; + +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +index 346113382..7b48126d1 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +@@ -192,6 +192,17 @@ ec25_codec: ec25-codec { + sound-name-prefix = "Modem"; + }; + ++ gpio-keys { ++ compatible = "gpio-keys"; ++ ++ ring_indicator: ring-indicator { ++ 
label = "Ring Indicator"; ++ linux,can-disable; ++ linux,code = ; ++ wakeup-source; ++ }; ++ }; ++ + i2c_csi: i2c-csi { + compatible = "i2c-gpio"; + sda-gpios = <&pio 4 13 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; /* PE13 */ +@@ -264,6 +275,7 @@ reg_usb_5v: usb-5v { + reg_vbat_bb: vbat-bb { + compatible = "regulator-fixed"; + regulator-name = "vbat-bb"; ++ regulator-always-on; + regulator-min-microvolt = <3500000>; + regulator-max-microvolt = <3500000>; + gpio = <&r_pio 0 7 GPIO_ACTIVE_HIGH>; /* PL7 */ +-- +2.31.1 + diff --git a/sys-kernel/pinephone-sources/files/0102-arm64-dts-pinephone-pro-remove-modem-node.patch b/sys-kernel/pinephone-sources/files/0102-arm64-dts-pinephone-pro-remove-modem-node.patch new file mode 100644 index 0000000..24be3b4 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0102-arm64-dts-pinephone-pro-remove-modem-node.patch @@ -0,0 +1,86 @@ +From 60d8aedea7c8c390ee744730ab3e565ea84496fb Mon Sep 17 00:00:00 2001 +From: Danct12 +Date: Fri, 10 Dec 2021 23:01:34 +0700 +Subject: [PATCH] arm64: dts: rk3399-pinephone-pro: Remove modem node + +Since we don't use modem-power driver, this can be removed +for eg25-manager. +--- + .../dts/rockchip/rk3399-pinephone-pro.dts | 40 +------------------ + 1 file changed, 2 insertions(+), 38 deletions(-) + +diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts +index 61c990764..13141c643 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts +@@ -326,6 +326,7 @@ vcc_4g_5v: vcc-4g-5v { + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + vin-supply = <&vcc5v0_sys>; ++ regulator-always-on; + }; + + vcc_4g: vcc-4g { +@@ -338,6 +339,7 @@ vcc_4g: vcc-4g { + regulator-min-microvolt = <3800000>; + regulator-max-microvolt = <3800000>; + vin-supply = <&vcc_sysin>; ++ regulator-always-on; + }; + + vcc1v8_codec: vcc1v8-codec-regulator { +@@ -1058,31 +1060,6 @@ mipi_in_panel: endpoint { + + &uart3 { + status = "okay"; +- +- modem { +- compatible = "quectel,eg25"; +- char-device-name = "modem-power"; +- +- pinctrl-names = "default"; +- pinctrl-0 = <&modem_control_pins>; +- +- power-supply = <&vcc_4g>; +- vbus-supply = <&vcc_4g_5v>; +- +- enable-gpios = <&gpio0 RK_PB0 GPIO_ACTIVE_HIGH>; // W_DISABLE# +- reset-gpios = <&gpio3 RK_PB0 GPIO_ACTIVE_HIGH>; +- status-gpios = <&gpio3 RK_PA6 GPIO_ACTIVE_HIGH>; +- pwrkey-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>; +- +- host-ready-gpios = <&gpio0 RK_PB4 GPIO_ACTIVE_HIGH>; // apready +- wakeup-gpios = <&gpio0 RK_PA1 GPIO_ACTIVE_HIGH>; // ri +- +- dtr-gpios = <&gpio0 RK_PA3 GPIO_ACTIVE_HIGH>; +- cts-gpios = <&gpio3 RK_PC0 GPIO_ACTIVE_HIGH>; +- rts-gpios = <&gpio3 RK_PC1 GPIO_ACTIVE_HIGH>; +- +- quectel,qdai = "3,0,0,4,0,0,1,1"; +- }; + }; + + &pmu_io_domains { +@@ -1153,19 +1130,6 @@ vcc_4g_5v_en: vcc-4g-5v-en-pin { + vcc_4g_en: vcc-4g-en-pin { + rockchip,pins = <4 RK_PC7 RK_FUNC_GPIO &pcfg_pull_none>; + }; +- +- modem_control_pins: modem-control-pins { +- rockchip,pins = +- <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_none>, +- <3 RK_PB0 RK_FUNC_GPIO &pcfg_pull_none>, +- <3 RK_PA6 RK_FUNC_GPIO &pcfg_pull_none>, +- <0 RK_PB5 RK_FUNC_GPIO &pcfg_pull_none>, +- <0 RK_PB4 RK_FUNC_GPIO &pcfg_pull_none>, +- <0 RK_PA1 RK_FUNC_GPIO &pcfg_pull_none>, +- <0 RK_PA3 RK_FUNC_GPIO &pcfg_pull_none>, +- <3 RK_PC0 RK_FUNC_GPIO &pcfg_pull_none>, +- <3 RK_PC1 RK_FUNC_GPIO &pcfg_pull_none>; +- }; + }; + + pmic { +-- +2.34.1 + diff --git 
a/sys-kernel/pinephone-sources/files/0103-ccu-sun50i-a64-reparent-clocks-to-lower-speed-oscillator.patch b/sys-kernel/pinephone-sources/files/0103-ccu-sun50i-a64-reparent-clocks-to-lower-speed-oscillator.patch new file mode 100644 index 0000000..2a16042 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0103-ccu-sun50i-a64-reparent-clocks-to-lower-speed-oscillator.patch @@ -0,0 +1,50 @@ +diff --git a/drivers/clk/sunxi-ng/ccu-sun50i-a64.c b/drivers/clk/sunxi-ng/ccu-sun50i-a64.c +index 149cfde817cb..0399d8714fd0 100644 +--- a/drivers/clk/sunxi-ng/ccu-sun50i-a64.c ++++ b/drivers/clk/sunxi-ng/ccu-sun50i-a64.c +@@ -984,6 +984,8 @@ static int sun50i_a64_ccu_probe(struct p + if (IS_ERR(reg)) + return PTR_ERR(reg); + ++ platform_set_drvdata(pdev, reg); ++ + /* Force the pll-audio variable divider to 3 */ + val = readl(reg + SUN50I_A64_PLL_AUDIO_REG); + val &= ~GENMASK(19, 16); +@@ -1031,12 +1031,36 @@ static const struct of_device_id sun50i_ + { } + }; + ++#define USBPHY_CFG_REG 0x0cc ++ ++static int sun50i_a64_ccu_suspend(struct device *dev) ++{ ++ void __iomem *reg = dev_get_drvdata(dev); ++ ++ writel(readl(reg + USBPHY_CFG_REG) | 0xa00000, reg + USBPHY_CFG_REG); ++ ++ return 0; ++} ++ ++static int sun50i_a64_ccu_resume(struct device *dev) ++{ ++ void __iomem *reg = dev_get_drvdata(dev); ++ ++ writel(readl(reg + USBPHY_CFG_REG) & ~0xa00000, reg + USBPHY_CFG_REG); ++ ++ return 0; ++} ++ ++static SIMPLE_DEV_PM_OPS(sun50i_a64_ccu_pm_ops, ++ sun50i_a64_ccu_suspend, sun50i_a64_ccu_resume); ++ + static struct platform_driver sun50i_a64_ccu_driver = { + .probe = sun50i_a64_ccu_probe, + .driver = { + .name = "sun50i-a64-ccu", + .suppress_bind_attrs = true, + .of_match_table = sun50i_a64_ccu_ids, ++ .pm = &sun50i_a64_ccu_pm_ops, + }, + }; + builtin_platform_driver(sun50i_a64_ccu_driver); diff --git a/sys-kernel/pinephone-sources/files/0104-quirk-kernel-org-bug-210681-firmware_rome_error.patch b/sys-kernel/pinephone-sources/files/0104-quirk-kernel-org-bug-210681-firmware_rome_error.patch new file mode 100644 index 0000000..7f46da7 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0104-quirk-kernel-org-bug-210681-firmware_rome_error.patch @@ -0,0 +1,12 @@ +diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c +index 03b83aa91277..dfc6c7d1b0e7 100644 +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -4070,6 +4070,7 @@ static int btusb_setup_qca(struct hci_dev *hdev) + } + if (!info) { + bt_dev_err(hdev, "don't support firmware rome 0x%x", ver_rom); ++ if (ver_rom & ~0xffffU) return 0; + return -ENODEV; + } + diff --git a/sys-kernel/pinephone-sources/files/0105-leds-gpio-make-max_brightness-configurable.patch b/sys-kernel/pinephone-sources/files/0105-leds-gpio-make-max_brightness-configurable.patch new file mode 100644 index 0000000..e844fce --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0105-leds-gpio-make-max_brightness-configurable.patch @@ -0,0 +1,49 @@ +From cb408fb65a08bd45543724c1e9b8f38ae1bebc4a Mon Sep 17 00:00:00 2001 +From: Arnaud Ferraris +Date: Tue, 4 Aug 2020 15:12:59 +0200 +Subject: [PATCH 177/183] leds-gpio: make max_brightness configurable + +--- + drivers/leds/leds-gpio.c | 4 ++++ + include/linux/leds.h | 3 ++- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c +index 93f5b1b60fde..f8483fab1164 100644 +--- a/drivers/leds/leds-gpio.c ++++ b/drivers/leds/leds-gpio.c +@@ -108,6 +108,8 @@ static int create_gpio_led(const struct gpio_led *template, + if (ret < 0) + return ret; + 
++ led_dat->cdev.max_brightness = template->max_brightness; ++ + if (template->name) { + led_dat->cdev.name = template->name; + ret = devm_led_classdev_register(parent, &led_dat->cdev); +@@ -177,6 +179,8 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev) + if (fwnode_property_present(child, "panic-indicator")) + led.panic_indicator = 1; + ++ fwnode_property_read_u32(child, "max-brightness", &led.max_brightness); ++ + ret = create_gpio_led(&led, led_dat, dev, child, NULL); + if (ret < 0) { + fwnode_handle_put(child); +diff --git a/include/linux/leds.h b/include/linux/leds.h +index 6a8d6409c993..99a80092114d 100644 +--- a/include/linux/leds.h ++++ b/include/linux/leds.h +@@ -513,7 +513,8 @@ typedef int (*gpio_blink_set_t)(struct gpio_desc *desc, int state, + struct gpio_led { + const char *name; + const char *default_trigger; +- unsigned gpio; ++ unsigned gpio; ++ unsigned max_brightness; + unsigned active_low : 1; + unsigned retain_state_suspended : 1; + unsigned panic_indicator : 1; +-- +2.30.0 + diff --git a/sys-kernel/pinephone-sources/files/0106-panic-led.patch b/sys-kernel/pinephone-sources/files/0106-panic-led.patch new file mode 100644 index 0000000..2bb4843 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/0106-panic-led.patch @@ -0,0 +1,12 @@ +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +index 1c555456b..05fab5d79 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +@@ -78,6 +78,7 @@ green { + }; + + led-2 { ++ linux,default-trigger = "panic"; + function = LED_FUNCTION_INDICATOR; + color = ; + gpios = <&pio 3 19 GPIO_ACTIVE_HIGH>; /* PD19 */ diff --git a/sys-kernel/pinephone-sources/files/1500_XATTR_USER_PREFIX.patch b/sys-kernel/pinephone-sources/files/1500_XATTR_USER_PREFIX.patch new file mode 100644 index 0000000..245dcc2 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/1500_XATTR_USER_PREFIX.patch @@ -0,0 +1,67 @@ +From: Anthony G. Basile + +This patch adds support for a restricted user-controlled namespace on +tmpfs filesystem used to house PaX flags. The namespace must be of the +form user.pax.* and its value cannot exceed a size of 8 bytes. + +This is needed even on all Gentoo systems so that XATTR_PAX flags +are preserved for users who might build packages using portage on +a tmpfs system with a non-hardened kernel and then switch to a +hardened kernel with XATTR_PAX enabled. + +The namespace is added to any user with Extended Attribute support +enabled for tmpfs. Users who do not enable xattrs will not have +the XATTR_PAX flags preserved. + +diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h +index 1590c49..5eab462 100644 +--- a/include/uapi/linux/xattr.h ++++ b/include/uapi/linux/xattr.h +@@ -73,5 +73,9 @@ + #define XATTR_POSIX_ACL_DEFAULT "posix_acl_default" + #define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT + ++/* User namespace */ ++#define XATTR_PAX_PREFIX XATTR_USER_PREFIX "pax." 
++#define XATTR_PAX_FLAGS_SUFFIX "flags" ++#define XATTR_NAME_PAX_FLAGS XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX + + #endif /* _UAPI_LINUX_XATTR_H */ +--- a/mm/shmem.c 2020-05-04 15:30:27.042035334 -0400 ++++ b/mm/shmem.c 2020-05-04 15:34:57.013881725 -0400 +@@ -3238,6 +3238,14 @@ static int shmem_xattr_handler_set(const + struct shmem_inode_info *info = SHMEM_I(inode); + + name = xattr_full_name(handler, name); ++ ++ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { ++ if (strcmp(name, XATTR_NAME_PAX_FLAGS)) ++ return -EOPNOTSUPP; ++ if (size > 8) ++ return -EINVAL; ++ } ++ + return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + } + +@@ -3253,6 +3261,12 @@ static const struct xattr_handler shmem_ + .set = shmem_xattr_handler_set, + }; + ++static const struct xattr_handler shmem_user_xattr_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = shmem_xattr_handler_get, ++ .set = shmem_xattr_handler_set, ++}; ++ + static const struct xattr_handler *shmem_xattr_handlers[] = { + #ifdef CONFIG_TMPFS_POSIX_ACL + &posix_acl_access_xattr_handler, +@@ -3260,6 +3274,7 @@ static const struct xattr_handler *shmem + #endif + &shmem_security_xattr_handler, + &shmem_trusted_xattr_handler, ++ &shmem_user_xattr_handler, + NULL + }; + diff --git a/sys-kernel/pinephone-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch b/sys-kernel/pinephone-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch new file mode 100644 index 0000000..e8c3015 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/1510_fs-enable-link-security-restrictions-by-default.patch @@ -0,0 +1,17 @@ +--- a/fs/namei.c 2022-01-23 13:02:27.876558299 -0500 ++++ b/fs/namei.c 2022-03-06 12:47:39.375719693 -0500 +@@ -1020,10 +1020,10 @@ static inline void put_link(struct namei + path_put(&last->link); + } + +-static int sysctl_protected_symlinks __read_mostly; +-static int sysctl_protected_hardlinks __read_mostly; +-static int sysctl_protected_fifos __read_mostly; +-static int sysctl_protected_regular __read_mostly; ++static int sysctl_protected_symlinks __read_mostly = 1; ++static int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 1; ++int sysctl_protected_regular __read_mostly = 1; + + #ifdef CONFIG_SYSCTL + static struct ctl_table namei_sysctls[] = { diff --git a/sys-kernel/pinephone-sources/files/1700_sparc-address-warray-bound-warnings.patch b/sys-kernel/pinephone-sources/files/1700_sparc-address-warray-bound-warnings.patch new file mode 100644 index 0000000..f939355 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/1700_sparc-address-warray-bound-warnings.patch @@ -0,0 +1,17 @@ +--- a/arch/sparc/mm/init_64.c 2022-05-24 16:48:40.749677491 -0400 ++++ b/arch/sparc/mm/init_64.c 2022-05-24 16:55:15.511356945 -0400 +@@ -3052,11 +3052,11 @@ static inline resource_size_t compute_ke + static void __init kernel_lds_init(void) + { + code_resource.start = compute_kern_paddr(_text); +- code_resource.end = compute_kern_paddr(_etext - 1); ++ code_resource.end = compute_kern_paddr(_etext) - 1; + data_resource.start = compute_kern_paddr(_etext); +- data_resource.end = compute_kern_paddr(_edata - 1); ++ data_resource.end = compute_kern_paddr(_edata) - 1; + bss_resource.start = compute_kern_paddr(__bss_start); +- bss_resource.end = compute_kern_paddr(_end - 1); ++ bss_resource.end = compute_kern_paddr(_end) - 1; + } + + static int __init report_memory(void) diff --git 
a/sys-kernel/pinephone-sources/files/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch b/sys-kernel/pinephone-sources/files/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch new file mode 100644 index 0000000..394ad48 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch @@ -0,0 +1,37 @@ +The encryption is only mandatory to be enforced when both sides are using +Secure Simple Pairing and this means the key size check makes only sense +in that case. + +On legacy Bluetooth 2.0 and earlier devices like mice the encryption was +optional and thus causing an issue if the key size check is not bound to +using Secure Simple Pairing. + +Fixes: d5bb334a8e17 ("Bluetooth: Align minimum encryption key size for LE and BR/EDR connections") +Signed-off-by: Marcel Holtmann +Cc: stable@vger.kernel.org +--- + net/bluetooth/hci_conn.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c +index 3cf0764d5793..7516cdde3373 100644 +--- a/net/bluetooth/hci_conn.c ++++ b/net/bluetooth/hci_conn.c +@@ -1272,8 +1272,13 @@ int hci_conn_check_link_mode(struct hci_conn *conn) + return 0; + } + +- if (hci_conn_ssp_enabled(conn) && +- !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) ++ /* If Secure Simple Pairing is not enabled, then legacy connection ++ * setup is used and no encryption or key sizes can be enforced. ++ */ ++ if (!hci_conn_ssp_enabled(conn)) ++ return 1; ++ ++ if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) + return 0; + + /* The minimum encryption key size needs to be enforced by the +-- +2.20.1 diff --git a/sys-kernel/pinephone-sources/files/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch b/sys-kernel/pinephone-sources/files/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch new file mode 100644 index 0000000..4335685 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch @@ -0,0 +1,30 @@ +From dc328d75a6f37f4ff11a81ae16b1ec88c3197640 Mon Sep 17 00:00:00 2001 +From: Mike Pagano +Date: Mon, 23 Mar 2020 08:20:06 -0400 +Subject: [PATCH 1/1] This driver requires REGMAP_I2C to build. Select it by + default in Kconfig. Reported at gentoo bugzilla: + https://bugs.gentoo.org/710790 +Cc: mpagano@gentoo.org + +Reported-by: Phil Stracchino + +Signed-off-by: Mike Pagano +--- + drivers/hwmon/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig +index 47ac20aee06f..530b4f29ba85 100644 +--- a/drivers/hwmon/Kconfig ++++ b/drivers/hwmon/Kconfig +@@ -1769,6 +1769,7 @@ config SENSORS_TMP421 + config SENSORS_TMP513 + tristate "Texas Instruments TMP513 and compatibles" + depends on I2C ++ select REGMAP_I2C + help + If you say yes here you get support for Texas Instruments TMP512, + and TMP513 temperature and power supply sensor chips. +-- +2.24.1 + diff --git a/sys-kernel/pinephone-sources/files/2920_sign-file-patch-for-libressl.patch b/sys-kernel/pinephone-sources/files/2920_sign-file-patch-for-libressl.patch new file mode 100644 index 0000000..e6ec017 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/2920_sign-file-patch-for-libressl.patch @@ -0,0 +1,16 @@ +--- a/scripts/sign-file.c 2020-05-20 18:47:21.282820662 -0400 ++++ b/scripts/sign-file.c 2020-05-20 18:48:37.991081899 -0400 +@@ -41,9 +41,10 @@ + * signing with anything other than SHA1 - so we're stuck with that if such is + * the case. 
+ */ +-#if defined(LIBRESSL_VERSION_NUMBER) || \ +- OPENSSL_VERSION_NUMBER < 0x10000000L || \ +- defined(OPENSSL_NO_CMS) ++#if defined(OPENSSL_NO_CMS) || \ ++ ( defined(LIBRESSL_VERSION_NUMBER) \ ++ && (LIBRESSL_VERSION_NUMBER < 0x3010000fL) ) || \ ++ OPENSSL_VERSION_NUMBER < 0x10000000L + #define USE_PKCS7 + #endif + #ifndef USE_PKCS7 diff --git a/sys-kernel/pinephone-sources/files/3000_Support-printing-firmware-info.patch b/sys-kernel/pinephone-sources/files/3000_Support-printing-firmware-info.patch new file mode 100644 index 0000000..a630cfb --- /dev/null +++ b/sys-kernel/pinephone-sources/files/3000_Support-printing-firmware-info.patch @@ -0,0 +1,14 @@ +--- a/drivers/base/firmware_loader/main.c 2021-08-24 15:42:07.025482085 -0400 ++++ b/drivers/base/firmware_loader/main.c 2021-08-24 15:44:40.782975313 -0400 +@@ -809,6 +809,11 @@ _request_firmware(const struct firmware + + ret = _request_firmware_prepare(&fw, name, device, buf, size, + offset, opt_flags); ++ ++#ifdef CONFIG_GENTOO_PRINT_FIRMWARE_INFO ++ printk(KERN_NOTICE "Loading firmware: %s\n", name); ++#endif ++ + if (ret <= 0) /* error or already assigned */ + goto out; + diff --git a/sys-kernel/pinephone-sources/files/4567_distro-Gentoo-Kconfig.patch b/sys-kernel/pinephone-sources/files/4567_distro-Gentoo-Kconfig.patch new file mode 100644 index 0000000..0a38098 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/4567_distro-Gentoo-Kconfig.patch @@ -0,0 +1,341 @@ +--- a/Kconfig 2022-05-11 13:20:07.110347567 -0400 ++++ b/Kconfig 2022-05-11 13:21:12.127174393 -0400 +@@ -30,3 +30,5 @@ source "lib/Kconfig" + source "lib/Kconfig.debug" + + source "Documentation/Kconfig" ++ ++source "distro/Kconfig" +--- /dev/null 2022-05-10 13:47:17.750578524 -0400 ++++ b/distro/Kconfig 2022-05-11 13:21:20.540529032 -0400 +@@ -0,0 +1,290 @@ ++menu "Gentoo Linux" ++ ++config GENTOO_LINUX ++ bool "Gentoo Linux support" ++ ++ default y ++ ++ select CPU_FREQ_DEFAULT_GOV_SCHEDUTIL ++ ++ help ++ In order to boot Gentoo Linux a minimal set of config settings needs to ++ be enabled in the kernel; to avoid the users from having to enable them ++ manually as part of a Gentoo Linux installation or a new clean config, ++ we enable these config settings by default for convenience. ++ ++ See the settings that become available for more details and fine-tuning. ++ ++config GENTOO_LINUX_UDEV ++ bool "Linux dynamic and persistent device naming (userspace devfs) support" ++ ++ depends on GENTOO_LINUX ++ default y if GENTOO_LINUX ++ ++ select DEVTMPFS ++ select TMPFS ++ select UNIX ++ ++ select MMU ++ select SHMEM ++ ++ help ++ In order to boot Gentoo Linux a minimal set of config settings needs to ++ be enabled in the kernel; to avoid the users from having to enable them ++ manually as part of a Gentoo Linux installation or a new clean config, ++ we enable these config settings by default for convenience. ++ ++ Currently this only selects TMPFS, DEVTMPFS and their dependencies. ++ TMPFS is enabled to maintain a tmpfs file system at /dev/shm, /run and ++ /sys/fs/cgroup; DEVTMPFS to maintain a devtmpfs file system at /dev. ++ ++ Some of these are critical files that need to be available early in the ++ boot process; if not available, it causes sysfs and udev to malfunction. ++ ++ To ensure Gentoo Linux boots, it is best to leave this setting enabled; ++ if you run a custom setup, you could consider whether to disable this. 
++ ++config GENTOO_LINUX_PORTAGE ++ bool "Select options required by Portage features" ++ ++ depends on GENTOO_LINUX ++ default y if GENTOO_LINUX ++ ++ select CGROUPS ++ select NAMESPACES ++ select IPC_NS ++ select NET_NS ++ select PID_NS ++ select SYSVIPC ++ select USER_NS ++ select UTS_NS ++ ++ help ++ This enables options required by various Portage FEATURES. ++ Currently this selects: ++ ++ CGROUPS (required for FEATURES=cgroup) ++ IPC_NS (required for FEATURES=ipc-sandbox) ++ NET_NS (required for FEATURES=network-sandbox) ++ PID_NS (required for FEATURES=pid-sandbox) ++ SYSVIPC (required by IPC_NS) ++ ++ ++ It is highly recommended that you leave this enabled as these FEATURES ++ are, or will soon be, enabled by default. ++ ++menu "Support for init systems, system and service managers" ++ visible if GENTOO_LINUX ++ ++config GENTOO_LINUX_INIT_SCRIPT ++ bool "OpenRC, runit and other script based systems and managers" ++ ++ default y if GENTOO_LINUX ++ ++ depends on GENTOO_LINUX ++ ++ select BINFMT_SCRIPT ++ select CGROUPS ++ select EPOLL ++ select FILE_LOCKING ++ select INOTIFY_USER ++ select SIGNALFD ++ select TIMERFD ++ ++ help ++ The init system is the first thing that loads after the kernel booted. ++ ++ These config settings allow you to select which init systems to support; ++ instead of having to select all the individual settings all over the ++ place, these settings allows you to select all the settings at once. ++ ++ This particular setting enables all the known requirements for OpenRC, ++ runit and similar script based systems and managers. ++ ++ If you are unsure about this, it is best to leave this setting enabled. ++ ++config GENTOO_LINUX_INIT_SYSTEMD ++ bool "systemd" ++ ++ default n ++ ++ depends on GENTOO_LINUX && GENTOO_LINUX_UDEV ++ ++ select AUTOFS_FS ++ select BLK_DEV_BSG ++ select BPF_SYSCALL ++ select CGROUP_BPF ++ select CGROUPS ++ select CRYPTO_HMAC ++ select CRYPTO_SHA256 ++ select CRYPTO_USER_API_HASH ++ select DEVPTS_MULTIPLE_INSTANCES ++ select DMIID if X86_32 || X86_64 || X86 ++ select EPOLL ++ select FANOTIFY ++ select FHANDLE ++ select FILE_LOCKING ++ select INOTIFY_USER ++ select IPV6 ++ select KCMP ++ select NET ++ select NET_NS ++ select PROC_FS ++ select SECCOMP if HAVE_ARCH_SECCOMP ++ select SECCOMP_FILTER if HAVE_ARCH_SECCOMP_FILTER ++ select SIGNALFD ++ select SYSFS ++ select TIMERFD ++ select TMPFS_POSIX_ACL ++ select TMPFS_XATTR ++ ++ select ANON_INODES ++ select BLOCK ++ select EVENTFD ++ select FSNOTIFY ++ select INET ++ select NLATTR ++ ++ help ++ The init system is the first thing that loads after the kernel booted. ++ ++ These config settings allow you to select which init systems to support; ++ instead of having to select all the individual settings all over the ++ place, these settings allows you to select all the settings at once. ++ ++ This particular setting enables all the known requirements for systemd; ++ it also enables suggested optional settings, as the package suggests to. ++ ++endmenu ++ ++menuconfig GENTOO_KERNEL_SELF_PROTECTION ++ bool "Kernel Self Protection Project" ++ depends on GENTOO_LINUX ++ help ++ Recommended Kernel settings based on the suggestions from the Kernel Self Protection Project ++ See: https://kernsec.org/wiki/index.php/Kernel_Self_Protection_Project/Recommended_Settings ++ Note, there may be additional settings for which the CONFIG_ setting is invisible in menuconfig due ++ to unmet dependencies. 
Search for GENTOO_KERNEL_SELF_PROTECTION_COMMON and search for ++ GENTOO_KERNEL_SELF_PROTECTION_{X86_64, ARM64, X86_32, ARM} for dependency information on your ++ specific architecture. ++ Note 2: Please see the URL above for numeric settings, e.g. CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 ++ for X86_64 ++ ++if GENTOO_KERNEL_SELF_PROTECTION ++config GENTOO_KERNEL_SELF_PROTECTION_COMMON ++ bool "Enable Kernel Self Protection Project Recommendations" ++ ++ depends on GENTOO_LINUX && !ACPI_CUSTOM_METHOD && !COMPAT_BRK && !PROC_KCORE && !COMPAT_VDSO && !KEXEC && !HIBERNATION && !LEGACY_PTYS && !X86_X32 && !MODIFY_LDT_SYSCALL && GCC_PLUGINS && !IOMMU_DEFAULT_DMA_LAZY && !IOMMU_DEFAULT_PASSTHROUGH && IOMMU_DEFAULT_DMA_STRICT ++ ++ select BUG ++ select STRICT_KERNEL_RWX ++ select DEBUG_WX ++ select STACKPROTECTOR ++ select STACKPROTECTOR_STRONG ++ select STRICT_DEVMEM if DEVMEM=y ++ select IO_STRICT_DEVMEM if DEVMEM=y ++ select SYN_COOKIES ++ select DEBUG_CREDENTIALS ++ select DEBUG_NOTIFIERS ++ select DEBUG_LIST ++ select DEBUG_SG ++ select HARDENED_USERCOPY if HAVE_HARDENED_USERCOPY_ALLOCATOR=y ++ select KFENCE if HAVE_ARCH_KFENCE && (!SLAB || SLUB) ++ select RANDOMIZE_KSTACK_OFFSET_DEFAULT if HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET && (INIT_STACK_NONE || !CC_IS_CLANG || CLANG_VERSION>=140000) ++ select SCHED_CORE if SCHED_SMT ++ select BUG_ON_DATA_CORRUPTION ++ select SCHED_STACK_END_CHECK ++ select SECCOMP if HAVE_ARCH_SECCOMP ++ select SECCOMP_FILTER if HAVE_ARCH_SECCOMP_FILTER ++ select SECURITY_YAMA ++ select SLAB_FREELIST_RANDOM ++ select SLAB_FREELIST_HARDENED ++ select SHUFFLE_PAGE_ALLOCATOR ++ select SLUB_DEBUG ++ select PAGE_POISONING ++ select PAGE_POISONING_NO_SANITY ++ select PAGE_POISONING_ZERO ++ select INIT_ON_ALLOC_DEFAULT_ON ++ select INIT_ON_FREE_DEFAULT_ON ++ select REFCOUNT_FULL ++ select FORTIFY_SOURCE ++ select SECURITY_DMESG_RESTRICT ++ select PANIC_ON_OOPS ++ select GCC_PLUGIN_LATENT_ENTROPY ++ select GCC_PLUGIN_STRUCTLEAK ++ select GCC_PLUGIN_STRUCTLEAK_BYREF_ALL ++ select GCC_PLUGIN_RANDSTRUCT ++ select GCC_PLUGIN_RANDSTRUCT_PERFORMANCE ++ select ZERO_CALL_USED_REGS if CC_HAS_ZERO_CALL_USED_REGS ++ ++ help ++ Search for GENTOO_KERNEL_SELF_PROTECTION_{X86_64, ARM64, X86_32, ARM} for dependency ++ information on your specific architecture. Note 2: Please see the URL above for ++ numeric settings, e.g. 
CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 for X86_64 ++ ++config GENTOO_KERNEL_SELF_PROTECTION_X86_64 ++ bool "X86_64 KSPP Settings" if GENTOO_KERNEL_SELF_PROTECTION_COMMON ++ ++ depends on !X86_MSR && X86_64 && GENTOO_KERNEL_SELF_PROTECTION ++ default n ++ ++ select RANDOMIZE_BASE ++ select RANDOMIZE_MEMORY ++ select RELOCATABLE ++ select LEGACY_VSYSCALL_NONE ++ select PAGE_TABLE_ISOLATION ++ select GCC_PLUGIN_STACKLEAK ++ select VMAP_STACK ++ ++ ++config GENTOO_KERNEL_SELF_PROTECTION_ARM64 ++ bool "ARM64 KSPP Settings" ++ ++ depends on ARM64 ++ default n ++ ++ select RANDOMIZE_BASE ++ select RELOCATABLE ++ select ARM64_SW_TTBR0_PAN ++ select CONFIG_UNMAP_KERNEL_AT_EL0 ++ select GCC_PLUGIN_STACKLEAK ++ select VMAP_STACK ++ ++config GENTOO_KERNEL_SELF_PROTECTION_X86_32 ++ bool "X86_32 KSPP Settings" ++ ++ depends on !X86_MSR && !MODIFY_LDT_SYSCALL && !M486 && X86_32 ++ default n ++ ++ select HIGHMEM64G ++ select X86_PAE ++ select RANDOMIZE_BASE ++ select RELOCATABLE ++ select PAGE_TABLE_ISOLATION ++ ++config GENTOO_KERNEL_SELF_PROTECTION_ARM ++ bool "ARM KSPP Settings" ++ ++ depends on !OABI_COMPAT && ARM ++ default n ++ ++ select VMSPLIT_3G ++ select STRICT_MEMORY_RWX ++ select CPU_SW_DOMAIN_PAN ++ ++endif ++ ++config GENTOO_PRINT_FIRMWARE_INFO ++ bool "Print firmware information that the kernel attempts to load" ++ ++ depends on GENTOO_LINUX ++ default y ++ ++ help ++ Enable this option to print information about firmware that the kernel ++ is attempting to load. This information can be accessible via the ++ dmesg command-line utility ++ ++ See the settings that become available for more details and fine-tuning. ++ ++endmenu +diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig +index 9e921fc72..f29bc13fa 100644 +--- a/security/selinux/Kconfig ++++ b/security/selinux/Kconfig +@@ -26,6 +26,7 @@ config SECURITY_SELINUX_BOOTPARAM + config SECURITY_SELINUX_DISABLE + bool "NSA SELinux runtime disable" + depends on SECURITY_SELINUX ++ depends on !GENTOO_KERNEL_SELF_PROTECTION + select SECURITY_WRITABLE_HOOKS + default n + help +-- +2.31.1 + +From bd3ff0b16792c18c0614c2b95e148943209f460a Mon Sep 17 00:00:00 2001 +From: Georgy Yakovlev +Date: Tue, 8 Jun 2021 13:59:57 -0700 +Subject: [PATCH 2/2] set DEFAULT_MMAP_MIN_ADDR by default + +--- + mm/Kconfig | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/mm/Kconfig b/mm/Kconfig +index 24c045b24..e13fc740c 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -321,6 +321,8 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU ++ default 65536 if ( X86_64 || X86_32 || PPC64 || IA64 ) && GENTOO_KERNEL_SELF_PROTECTION ++ default 32768 if ( ARM64 || ARM ) && GENTOO_KERNEL_SELF_PROTECTION + default 4096 + help + This is the portion of low virtual memory which should be protected +-- +2.31.1 +``` diff --git a/sys-kernel/pinephone-sources/files/5010_enable-cpu-optimizations-universal.patch b/sys-kernel/pinephone-sources/files/5010_enable-cpu-optimizations-universal.patch new file mode 100644 index 0000000..b9c03cb --- /dev/null +++ b/sys-kernel/pinephone-sources/files/5010_enable-cpu-optimizations-universal.patch @@ -0,0 +1,675 @@ +From b5892719c43f739343c628e3d357471a3bdaa368 Mon Sep 17 00:00:00 2001 +From: graysky +Date: Tue, 15 Mar 2022 05:58:43 -0400 +Subject: [PATCH] more uarches for kernel 5.17+ +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +FEATURES +This patch adds additional CPU options to the Linux kernel accessible under: + 
Processor type and features ---> + Processor family ---> + +With the release of gcc 11.1 and clang 12.0, several generic 64-bit levels are +offered which are good for supported Intel or AMD CPUs: +• x86-64-v2 +• x86-64-v3 +• x86-64-v4 + +Users of glibc 2.33 and above can see which level is supported by current +hardware by running: + /lib/ld-linux-x86-64.so.2 --help | grep supported + +Alternatively, compare the flags from /proc/cpuinfo to this list.[1] + +CPU-specific microarchitectures include: +• AMD Improved K8-family +• AMD K10-family +• AMD Family 10h (Barcelona) +• AMD Family 14h (Bobcat) +• AMD Family 16h (Jaguar) +• AMD Family 15h (Bulldozer) +• AMD Family 15h (Piledriver) +• AMD Family 15h (Steamroller) +• AMD Family 15h (Excavator) +• AMD Family 17h (Zen) +• AMD Family 17h (Zen 2) +• AMD Family 19h (Zen 3)† +• Intel Silvermont low-power processors +• Intel Goldmont low-power processors (Apollo Lake and Denverton) +• Intel Goldmont Plus low-power processors (Gemini Lake) +• Intel 1st Gen Core i3/i5/i7 (Nehalem) +• Intel 1.5 Gen Core i3/i5/i7 (Westmere) +• Intel 2nd Gen Core i3/i5/i7 (Sandybridge) +• Intel 3rd Gen Core i3/i5/i7 (Ivybridge) +• Intel 4th Gen Core i3/i5/i7 (Haswell) +• Intel 5th Gen Core i3/i5/i7 (Broadwell) +• Intel 6th Gen Core i3/i5/i7 (Skylake) +• Intel 6th Gen Core i7/i9 (Skylake X) +• Intel 8th Gen Core i3/i5/i7 (Cannon Lake) +• Intel 10th Gen Core i7/i9 (Ice Lake) +• Intel Xeon (Cascade Lake) +• Intel Xeon (Cooper Lake)* +• Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)* +• Intel 3rd Gen 10nm++ Xeon (Sapphire Rapids)‡ +• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)‡ +• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)‡ + +Notes: If not otherwise noted, gcc >=9.1 is required for support. + *Requires gcc >=10.1 or clang >=10.0 + †Required gcc >=10.3 or clang >=12.0 + ‡Required gcc >=11.1 or clang >=12.0 + +It also offers to compile passing the 'native' option which, "selects the CPU +to generate code for at compilation time by determining the processor type of +the compiling machine. Using -march=native enables all instruction subsets +supported by the local machine and will produce code optimized for the local +machine under the constraints of the selected instruction set."[2] + +Users of Intel CPUs should select the 'Intel-Native' option and users of AMD +CPUs should select the 'AMD-Native' option. + +MINOR NOTES RELATING TO INTEL ATOM PROCESSORS +This patch also changes -march=atom to -march=bonnell in accordance with the +gcc v4.9 changes. Upstream is using the deprecated -match=atom flags when I +believe it should use the newer -march=bonnell flag for atom processors.[3] + +It is not recommended to compile on Atom-CPUs with the 'native' option.[4] The +recommendation is to use the 'atom' option instead. + +BENEFITS +Small but real speed increases are measurable using a make endpoint comparing +a generic kernel to one built with one of the respective microarchs. + +See the following experimental evidence supporting this statement: +https://github.com/graysky2/kernel_gcc_patch + +REQUIREMENTS +linux version 5.17+ +gcc version >=9.0 or clang version >=9.0 + +ACKNOWLEDGMENTS +This patch builds on the seminal work by Jeroen.[5] + +REFERENCES +1. https://gitlab.com/x86-psABIs/x86-64-ABI/-/commit/77566eb03bc6a326811cb7e9 +2. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options +3. https://bugzilla.kernel.org/show_bug.cgi?id=77461 +4. https://github.com/graysky2/kernel_gcc_patch/issues/15 +5. 
http://www.linuxforge.net/docs/linux/linux-gcc.php + +Signed-off-by: graysky +--- + arch/x86/Kconfig.cpu | 332 ++++++++++++++++++++++++++++++-- + arch/x86/Makefile | 40 +++- + arch/x86/include/asm/vermagic.h | 66 +++++++ + 3 files changed, 424 insertions(+), 14 deletions(-) + +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index 542377cd419d..22b919cdb6d1 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -157,7 +157,7 @@ config MPENTIUM4 + + + config MK6 +- bool "K6/K6-II/K6-III" ++ bool "AMD K6/K6-II/K6-III" + depends on X86_32 + help + Select this for an AMD K6-family processor. Enables use of +@@ -165,7 +165,7 @@ config MK6 + flags to GCC. + + config MK7 +- bool "Athlon/Duron/K7" ++ bool "AMD Athlon/Duron/K7" + depends on X86_32 + help + Select this for an AMD Athlon K7-family processor. Enables use of +@@ -173,12 +173,98 @@ config MK7 + flags to GCC. + + config MK8 +- bool "Opteron/Athlon64/Hammer/K8" ++ bool "AMD Opteron/Athlon64/Hammer/K8" + help + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + ++config MK8SSE3 ++ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" ++ help ++ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MK10 ++ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" ++ help ++ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, ++ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. ++ Enables use of some extended instructions, and passes appropriate ++ optimization flags to GCC. ++ ++config MBARCELONA ++ bool "AMD Barcelona" ++ help ++ Select this for AMD Family 10h Barcelona processors. ++ ++ Enables -march=barcelona ++ ++config MBOBCAT ++ bool "AMD Bobcat" ++ help ++ Select this for AMD Family 14h Bobcat processors. ++ ++ Enables -march=btver1 ++ ++config MJAGUAR ++ bool "AMD Jaguar" ++ help ++ Select this for AMD Family 16h Jaguar processors. ++ ++ Enables -march=btver2 ++ ++config MBULLDOZER ++ bool "AMD Bulldozer" ++ help ++ Select this for AMD Family 15h Bulldozer processors. ++ ++ Enables -march=bdver1 ++ ++config MPILEDRIVER ++ bool "AMD Piledriver" ++ help ++ Select this for AMD Family 15h Piledriver processors. ++ ++ Enables -march=bdver2 ++ ++config MSTEAMROLLER ++ bool "AMD Steamroller" ++ help ++ Select this for AMD Family 15h Steamroller processors. ++ ++ Enables -march=bdver3 ++ ++config MEXCAVATOR ++ bool "AMD Excavator" ++ help ++ Select this for AMD Family 15h Excavator processors. ++ ++ Enables -march=bdver4 ++ ++config MZEN ++ bool "AMD Zen" ++ help ++ Select this for AMD Family 17h Zen processors. ++ ++ Enables -march=znver1 ++ ++config MZEN2 ++ bool "AMD Zen 2" ++ help ++ Select this for AMD Family 17h Zen 2 processors. ++ ++ Enables -march=znver2 ++ ++config MZEN3 ++ bool "AMD Zen 3" ++ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ help ++ Select this for AMD Family 19h Zen 3 processors. ++ ++ Enables -march=znver3 ++ + config MCRUSOE + bool "Crusoe" + depends on X86_32 +@@ -270,7 +356,7 @@ config MPSC + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. + + config MCORE2 +- bool "Core 2/newer Xeon" ++ bool "Intel Core 2" + help + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and +@@ -278,6 +364,8 @@ config MCORE2 + family in /proc/cpuinfo. 
Newer ones have 6 and older ones 15 + (not a typo) + ++ Enables -march=core2 ++ + config MATOM + bool "Intel Atom" + help +@@ -287,6 +375,182 @@ config MATOM + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. + ++config MNEHALEM ++ bool "Intel Nehalem" ++ select X86_P6_NOP ++ help ++ ++ Select this for 1st Gen Core processors in the Nehalem family. ++ ++ Enables -march=nehalem ++ ++config MWESTMERE ++ bool "Intel Westmere" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Westmere formerly Nehalem-C family. ++ ++ Enables -march=westmere ++ ++config MSILVERMONT ++ bool "Intel Silvermont" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Silvermont platform. ++ ++ Enables -march=silvermont ++ ++config MGOLDMONT ++ bool "Intel Goldmont" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Goldmont platform including Apollo Lake and Denverton. ++ ++ Enables -march=goldmont ++ ++config MGOLDMONTPLUS ++ bool "Intel Goldmont Plus" ++ select X86_P6_NOP ++ help ++ ++ Select this for the Intel Goldmont Plus platform including Gemini Lake. ++ ++ Enables -march=goldmont-plus ++ ++config MSANDYBRIDGE ++ bool "Intel Sandy Bridge" ++ select X86_P6_NOP ++ help ++ ++ Select this for 2nd Gen Core processors in the Sandy Bridge family. ++ ++ Enables -march=sandybridge ++ ++config MIVYBRIDGE ++ bool "Intel Ivy Bridge" ++ select X86_P6_NOP ++ help ++ ++ Select this for 3rd Gen Core processors in the Ivy Bridge family. ++ ++ Enables -march=ivybridge ++ ++config MHASWELL ++ bool "Intel Haswell" ++ select X86_P6_NOP ++ help ++ ++ Select this for 4th Gen Core processors in the Haswell family. ++ ++ Enables -march=haswell ++ ++config MBROADWELL ++ bool "Intel Broadwell" ++ select X86_P6_NOP ++ help ++ ++ Select this for 5th Gen Core processors in the Broadwell family. ++ ++ Enables -march=broadwell ++ ++config MSKYLAKE ++ bool "Intel Skylake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake family. ++ ++ Enables -march=skylake ++ ++config MSKYLAKEX ++ bool "Intel Skylake X" ++ select X86_P6_NOP ++ help ++ ++ Select this for 6th Gen Core processors in the Skylake X family. ++ ++ Enables -march=skylake-avx512 ++ ++config MCANNONLAKE ++ bool "Intel Cannon Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 8th Gen Core processors ++ ++ Enables -march=cannonlake ++ ++config MICELAKE ++ bool "Intel Ice Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for 10th Gen Core processors in the Ice Lake family. ++ ++ Enables -march=icelake-client ++ ++config MCASCADELAKE ++ bool "Intel Cascade Lake" ++ select X86_P6_NOP ++ help ++ ++ Select this for Xeon processors in the Cascade Lake family. ++ ++ Enables -march=cascadelake ++ ++config MCOOPERLAKE ++ bool "Intel Cooper Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ select X86_P6_NOP ++ help ++ ++ Select this for Xeon processors in the Cooper Lake family. ++ ++ Enables -march=cooperlake ++ ++config MTIGERLAKE ++ bool "Intel Tiger Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) ++ select X86_P6_NOP ++ help ++ ++ Select this for third-generation 10 nm process processors in the Tiger Lake family. 
++ ++ Enables -march=tigerlake ++ ++config MSAPPHIRERAPIDS ++ bool "Intel Sapphire Rapids" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for third-generation 10 nm process processors in the Sapphire Rapids family. ++ ++ Enables -march=sapphirerapids ++ ++config MROCKETLAKE ++ bool "Intel Rocket Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for eleventh-generation processors in the Rocket Lake family. ++ ++ Enables -march=rocketlake ++ ++config MALDERLAKE ++ bool "Intel Alder Lake" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ select X86_P6_NOP ++ help ++ ++ Select this for twelfth-generation processors in the Alder Lake family. ++ ++ Enables -march=alderlake ++ + config GENERIC_CPU + bool "Generic-x86-64" + depends on X86_64 +@@ -294,6 +558,50 @@ config GENERIC_CPU + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + ++config GENERIC_CPU2 ++ bool "Generic-x86-64-v2" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64 CPU. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v2. ++ ++config GENERIC_CPU3 ++ bool "Generic-x86-64-v3" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64-v3 CPU with v3 instructions. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v3. ++ ++config GENERIC_CPU4 ++ bool "Generic-x86-64-v4" ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 ++ help ++ Generic x86-64 CPU with v4 instructions. ++ Run equally well on all x86-64 CPUs with min support of x86-64-v4. ++ ++config MNATIVE_INTEL ++ bool "Intel-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for AMD CPUs. Intel Only! ++ ++ Enables -march=native ++ ++config MNATIVE_AMD ++ bool "AMD-Native optimizations autodetected by the compiler" ++ help ++ ++ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects ++ the optimum settings to use based on your processor. Do NOT use this ++ for Intel CPUs. AMD Only! 
++ ++ Enables -march=native ++ + endchoice + + config X86_GENERIC +@@ -318,7 +626,7 @@ config X86_INTERNODE_CACHE_SHIFT + config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || MPSC +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 || GENERIC_CPU3 || GENERIC_CPU4 + default "4" if MELAN || M486SX || M486 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + +@@ -336,11 +644,11 @@ config X86_ALIGNMENT_16 + + config X86_INTEL_USERCOPY + def_bool y +- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL + + config X86_USE_PPRO_CHECKSUM + def_bool y +- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD + + # + # P6_NOPs are a relatively minor optimization that require a family >= +@@ -356,26 +664,26 @@ config X86_USE_PPRO_CHECKSUM + config X86_P6_NOP + def_bool y + depends on X86_64 +- depends on (MCORE2 || MPENTIUM4 || MPSC) ++ depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL) + + config X86_TSC + def_bool y +- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 ++ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || 
MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD) || X86_64 + + config X86_CMPXCHG64 + def_bool y +- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 ++ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD + + # this should be set for all -march=.. options where the compiler + # generates cmov. + config X86_CMOV + def_bool y +- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) ++ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD) + + config X86_MINIMUM_CPU_FAMILY + int + default "64" if X86_64 +- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) ++ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MNATIVE_INTEL || MNATIVE_AMD) + default "5" if X86_32 && X86_CMPXCHG64 + default "4" + +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index e84cdd409b64..7d3bbf060079 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -131,8 +131,44 @@ else + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) 
+= -march=k8 + cflags-$(CONFIG_MPSC) += -march=nocona +- cflags-$(CONFIG_MCORE2) += -march=core2 +- cflags-$(CONFIG_MATOM) += -march=atom ++ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 ++ cflags-$(CONFIG_MK10) += -march=amdfam10 ++ cflags-$(CONFIG_MBARCELONA) += -march=barcelona ++ cflags-$(CONFIG_MBOBCAT) += -march=btver1 ++ cflags-$(CONFIG_MJAGUAR) += -march=btver2 ++ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 ++ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm ++ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm ++ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm ++ cflags-$(CONFIG_MZEN) += -march=znver1 ++ cflags-$(CONFIG_MZEN2) += -march=znver2 ++ cflags-$(CONFIG_MZEN3) += -march=znver3 ++ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ cflags-$(CONFIG_MCORE2) += -march=core2 ++ cflags-$(CONFIG_MNEHALEM) += -march=nehalem ++ cflags-$(CONFIG_MWESTMERE) += -march=westmere ++ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont ++ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont ++ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus ++ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge ++ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge ++ cflags-$(CONFIG_MHASWELL) += -march=haswell ++ cflags-$(CONFIG_MBROADWELL) += -march=broadwell ++ cflags-$(CONFIG_MSKYLAKE) += -march=skylake ++ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 ++ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake ++ cflags-$(CONFIG_MICELAKE) += -march=icelake-client ++ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake ++ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake ++ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake ++ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids ++ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake ++ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake ++ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 ++ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 ++ cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 + cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + KBUILD_CFLAGS += $(cflags-y) + +diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h +index 75884d2cdec3..4e6a08d4c7e5 100644 +--- a/arch/x86/include/asm/vermagic.h ++++ b/arch/x86/include/asm/vermagic.h +@@ -17,6 +17,48 @@ + #define MODULE_PROC_FAMILY "586MMX " + #elif defined CONFIG_MCORE2 + #define MODULE_PROC_FAMILY "CORE2 " ++#elif defined CONFIG_MNATIVE_INTEL ++#define MODULE_PROC_FAMILY "NATIVE_INTEL " ++#elif defined CONFIG_MNATIVE_AMD ++#define MODULE_PROC_FAMILY "NATIVE_AMD " ++#elif defined CONFIG_MNEHALEM ++#define MODULE_PROC_FAMILY "NEHALEM " ++#elif defined CONFIG_MWESTMERE ++#define MODULE_PROC_FAMILY "WESTMERE " ++#elif defined CONFIG_MSILVERMONT ++#define MODULE_PROC_FAMILY "SILVERMONT " ++#elif defined CONFIG_MGOLDMONT ++#define MODULE_PROC_FAMILY "GOLDMONT " ++#elif defined CONFIG_MGOLDMONTPLUS ++#define MODULE_PROC_FAMILY "GOLDMONTPLUS " ++#elif defined CONFIG_MSANDYBRIDGE ++#define MODULE_PROC_FAMILY "SANDYBRIDGE " ++#elif defined CONFIG_MIVYBRIDGE ++#define MODULE_PROC_FAMILY "IVYBRIDGE " ++#elif defined CONFIG_MHASWELL ++#define MODULE_PROC_FAMILY "HASWELL " ++#elif defined CONFIG_MBROADWELL ++#define MODULE_PROC_FAMILY "BROADWELL " ++#elif defined CONFIG_MSKYLAKE ++#define MODULE_PROC_FAMILY "SKYLAKE " ++#elif defined CONFIG_MSKYLAKEX ++#define MODULE_PROC_FAMILY "SKYLAKEX " ++#elif defined CONFIG_MCANNONLAKE ++#define MODULE_PROC_FAMILY "CANNONLAKE " ++#elif defined 
CONFIG_MICELAKE ++#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MCASCADELAKE ++#define MODULE_PROC_FAMILY "CASCADELAKE " ++#elif defined CONFIG_MCOOPERLAKE ++#define MODULE_PROC_FAMILY "COOPERLAKE " ++#elif defined CONFIG_MTIGERLAKE ++#define MODULE_PROC_FAMILY "TIGERLAKE " ++#elif defined CONFIG_MSAPPHIRERAPIDS ++#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS " ++#elif defined CONFIG_ROCKETLAKE ++#define MODULE_PROC_FAMILY "ROCKETLAKE " ++#elif defined CONFIG_MALDERLAKE ++#define MODULE_PROC_FAMILY "ALDERLAKE " + #elif defined CONFIG_MATOM + #define MODULE_PROC_FAMILY "ATOM " + #elif defined CONFIG_M686 +@@ -35,6 +77,30 @@ + #define MODULE_PROC_FAMILY "K7 " + #elif defined CONFIG_MK8 + #define MODULE_PROC_FAMILY "K8 " ++#elif defined CONFIG_MK8SSE3 ++#define MODULE_PROC_FAMILY "K8SSE3 " ++#elif defined CONFIG_MK10 ++#define MODULE_PROC_FAMILY "K10 " ++#elif defined CONFIG_MBARCELONA ++#define MODULE_PROC_FAMILY "BARCELONA " ++#elif defined CONFIG_MBOBCAT ++#define MODULE_PROC_FAMILY "BOBCAT " ++#elif defined CONFIG_MBULLDOZER ++#define MODULE_PROC_FAMILY "BULLDOZER " ++#elif defined CONFIG_MPILEDRIVER ++#define MODULE_PROC_FAMILY "PILEDRIVER " ++#elif defined CONFIG_MSTEAMROLLER ++#define MODULE_PROC_FAMILY "STEAMROLLER " ++#elif defined CONFIG_MJAGUAR ++#define MODULE_PROC_FAMILY "JAGUAR " ++#elif defined CONFIG_MEXCAVATOR ++#define MODULE_PROC_FAMILY "EXCAVATOR " ++#elif defined CONFIG_MZEN ++#define MODULE_PROC_FAMILY "ZEN " ++#elif defined CONFIG_MZEN2 ++#define MODULE_PROC_FAMILY "ZEN2 " ++#elif defined CONFIG_MZEN3 ++#define MODULE_PROC_FAMILY "ZEN3 " + #elif defined CONFIG_MELAN + #define MODULE_PROC_FAMILY "ELAN " + #elif defined CONFIG_MCRUSOE +-- +2.35.1 + diff --git a/sys-kernel/pinephone-sources/files/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch b/sys-kernel/pinephone-sources/files/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch new file mode 100644 index 0000000..610cfe8 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch @@ -0,0 +1,9956 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index cc3ea8febc62..ab4c5a35b999 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -5299,6 +5299,12 @@ + sa1100ir [NET] + See drivers/net/irda/sa1100_ir.c. + ++ sched_timeslice= ++ [KNL] Time slice in ms for Project C BMQ/PDS scheduler. ++ Format: integer 2, 4 ++ Default: 4 ++ See Documentation/scheduler/sched-BMQ.txt ++ + sched_verbose [KNL] Enables verbose scheduler debug messages. + + schedstats= [KNL,X86] Enable or disable scheduled statistics. +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index ddccd1077462..e24781970a3d 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1524,3 +1524,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ/PDS CPU scheduler only. This determines what type of yield calls ++to sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. 
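A minimal sketch of the dispatch these yield_type values imply, written against the documented semantics rather than copied from the patch; sched_yield_deboost() is an assumed placeholder name, while requeue_task(), task_sched_prio_idx() and rq->skip are names that do appear later in alt_core.c in this same patch (the actual sched_yield handling is not shown in this excerpt):

/*
 * Sketch only: how sched_yield() can branch on the yield_type sysctl
 * described above.  0: do nothing, 1: deboost and requeue (default),
 * 2: mark the task to be skipped on the next pick.
 */
static void yield_type_dispatch(struct rq *rq, struct task_struct *p)
{
	if (sched_yield_type == 0)		/* 0: no yield */
		return;

	if (sched_yield_type == 1) {		/* 1: deboost and requeue (default) */
		sched_yield_deboost(p);		/* placeholder: drop the task's priority boost */
		requeue_task(p, rq, task_sched_prio_idx(p, rq));
		return;
	}

	rq->skip = p;				/* 2: skip this task once in rq_runnable_task() */
}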
+diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. ++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. 
The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 8dfa36a99c74..46397c606e01 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/sched.h b/include/linux/sched.h +index c46f3a63b758..7c65e6317d97 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -751,8 +751,14 @@ struct task_struct { + unsigned int ptrace; + + #ifdef CONFIG_SMP +- int on_cpu; + struct __call_single_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) ++ int on_cpu; ++#endif ++ ++#ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -766,6 +772,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_ALT */ + #endif + int on_rq; + +@@ -774,6 +781,20 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_ALT ++ u64 last_ran; ++ s64 time_slice; ++ int sq_idx; ++ struct list_head sq_node; ++#ifdef CONFIG_SCHED_BMQ ++ int boost_prio; ++#endif /* CONFIG_SCHED_BMQ */ ++#ifdef CONFIG_SCHED_PDS ++ u64 deadline; ++#endif /* CONFIG_SCHED_PDS */ ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_ALT */ + struct sched_entity se; + struct sched_rt_entity rt; + struct sched_dl_entity dl; +@@ -784,6 +805,7 @@ struct task_struct { + unsigned long 
core_cookie; + unsigned int core_occupation; + #endif ++#endif /* !CONFIG_SCHED_ALT */ + + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; +@@ -1517,6 +1539,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_ALT ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_ALT */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 7c83d4d5a971..fa30f98cb2be 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,24 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_ALT ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHED_BMQ ++#define __tsk_deadline(p) (0UL) ++#endif ++ ++#ifdef CONFIG_SCHED_PDS ++#define __tsk_deadline(p) ((((u64) ((p)->prio))<<56) | (p)->deadline) ++#endif ++ ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -21,6 +40,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_ALT */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..6af9ae681116 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -18,6 +18,32 @@ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++#ifdef CONFIG_SCHED_ALT ++ ++/* Undefine MAX_PRIO and DEFAULT_PRIO */ ++#undef MAX_PRIO ++#undef DEFAULT_PRIO ++ ++/* +/- priority levels from the base priority */ ++#ifdef CONFIG_SCHED_BMQ ++#define MAX_PRIORITY_ADJ (7) ++ ++#define MIN_NORMAL_PRIO (MAX_RT_PRIO) ++#define MAX_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH) ++#define DEFAULT_PRIO (MIN_NORMAL_PRIO + NICE_WIDTH / 2) ++#endif ++ ++#ifdef CONFIG_SCHED_PDS ++#define MAX_PRIORITY_ADJ (0) ++ ++#define MIN_NORMAL_PRIO (128) ++#define NORMAL_PRIO_NUM (64) ++#define MAX_PRIO (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM) ++#define DEFAULT_PRIO (MAX_PRIO - NICE_WIDTH / 2) ++#endif ++ ++#endif /* CONFIG_SCHED_ALT */ ++ + /* + * Convert user-nice values [ -20 ... 0 ... 
19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..0a7565d0d3cf 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_ALT + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 56cffe42abbc..e020fc572b22 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -233,7 +233,8 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) + + #endif /* !CONFIG_SMP */ + +-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && \ ++ !defined(CONFIG_SCHED_ALT) + extern void rebuild_sched_domains_energy(void); + #else + static inline void rebuild_sched_domains_energy(void) +diff --git a/init/Kconfig b/init/Kconfig +index c7900e8975f1..d2b593e3807d 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -812,6 +812,7 @@ menu "Scheduler features" + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_ALT + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -858,6 +859,35 @@ config UCLAMP_BUCKETS_COUNT + + If in doubt, use the default value. + ++menuconfig SCHED_ALT ++ bool "Alternative CPU Schedulers" ++ default y ++ help ++ This feature enable alternative CPU scheduler" ++ ++if SCHED_ALT ++ ++choice ++ prompt "Alternative CPU Scheduler" ++ default SCHED_BMQ ++ ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++config SCHED_PDS ++ bool "PDS CPU scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler. ++ ++endchoice ++ ++endif ++ + endmenu + + # +@@ -911,6 +941,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT ++ depends on !SCHED_ALT + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -1003,6 +1034,7 @@ config FAIR_GROUP_SCHED + depends on CGROUP_SCHED + default CGROUP_SCHED + ++if !SCHED_ALT + config CFS_BANDWIDTH + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" + depends on FAIR_GROUP_SCHED +@@ -1025,6 +1057,7 @@ config RT_GROUP_SCHED + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.rst for more information. 
+ ++endif #!SCHED_ALT + endif #CGROUP_SCHED + + config UCLAMP_TASK_GROUP +@@ -1268,6 +1301,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_ALT + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 73cc8f03511a..2d0bad762895 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -75,9 +75,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_ALT ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, +@@ -88,6 +94,17 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_ALT ++ .sq_node = LIST_HEAD_INIT(init_task.sq_node), ++#ifdef CONFIG_SCHED_BMQ ++ .boost_prio = 0, ++ .sq_idx = 15, ++#endif ++#ifdef CONFIG_SCHED_PDS ++ .deadline = 0, ++#endif ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -95,6 +112,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index c2f1fd95a821..41654679b1b2 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -117,7 +117,7 @@ config PREEMPT_DYNAMIC + + config SCHED_CORE + bool "Core Scheduling for SMT" +- depends on SCHED_SMT ++ depends on SCHED_SMT && !SCHED_ALT + help + This option permits Core Scheduling, a means of coordinated task + selection across SMT siblings. When enabled -- see +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 71a418858a5e..7e3016873db1 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -704,7 +704,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
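The scheduler hunks above all pivot on CONFIG_SCHED_ALT, and the init/init_task.c change swaps the boot task's priorities over to the BMQ values defined in the include/linux/sched/prio.h hunk earlier. A standalone back-of-the-envelope check of those numbers, assuming the mainline constants MAX_RT_PRIO == 100 and NICE_WIDTH == 40 (neither is touched by this diff):

#include <stdio.h>

#define MAX_RT_PRIO	100	/* assumed mainline value, not part of this diff */
#define NICE_WIDTH	40	/* assumed mainline value, not part of this diff */

/* CONFIG_SCHED_BMQ definitions from the sched/prio.h hunk */
#define MAX_PRIORITY_ADJ	7
#define MIN_NORMAL_PRIO		(MAX_RT_PRIO)
#define MAX_PRIO		(MIN_NORMAL_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO		(MIN_NORMAL_PRIO + NICE_WIDTH / 2)

int main(void)
{
	/* init_task under CONFIG_SCHED_ALT with BMQ selected */
	printf(".static_prio         = %d\n", DEFAULT_PRIO);			/* 120 */
	printf(".prio / .normal_prio = %d\n", DEFAULT_PRIO + MAX_PRIORITY_ADJ);	/* 127 */
	/* the #else (CFS) branch, MAX_PRIO - 20, also evaluates to 120 here */
	printf("CFS branch .prio     = %d\n", MAX_PRIO - 20);			/* 120 */
	return 0;
}

So with these constants the boot task starts at static priority 120 and an effective priority of 127 under BMQ, while the unmodified CFS initializer yields 120 for all three fields.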
+@@ -1100,7 +1100,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 164ed9ef77a3..c974a84b056f 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -150,7 +150,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 64c938ce36fe..a353f7ef5392 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -124,7 +124,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -145,7 +145,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 7779ee8abc2a..5b9893cdfb1b 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -300,21 +300,25 @@ static __always_inline void + waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) + { + waiter->prio = __waiter_prio(task); +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + } + + /* + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = __tsk_deadline(p) } + + static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) + { ++#ifdef CONFIG_SCHED_PDS ++ return (left->deadline < right->deadline); ++#else + if (left->prio < right->prio) + return 1; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. +@@ -323,16 +327,22 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return dl_time_before(left->deadline, right->deadline); ++#endif + + return 0; ++#endif + } + + static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) + { ++#ifdef CONFIG_SCHED_PDS ++ return (left->deadline == right->deadline); ++#else + if (left->prio != right->prio) + return 0; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. 
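The waiter-ordering changes above reduce PDS priority inheritance to a single u64 comparison because __tsk_deadline(), defined in the include/linux/sched/deadline.h hunk earlier, packs the task's prio into the top 8 bits and its deadline into the remaining 56. A small standalone demonstration of that ordering property; the helper mirrors the macro and assumes, as the macro implicitly does, that deadlines stay below 2^56:

#include <stdio.h>
#include <stdint.h>

/* Mirrors __tsk_deadline() under CONFIG_SCHED_PDS: prio in bits 63..56,
 * deadline in bits 55..0 (assumed to fit). */
static uint64_t pds_key(unsigned int prio, uint64_t deadline)
{
	return ((uint64_t)prio << 56) | deadline;
}

int main(void)
{
	/* A numerically lower prio always sorts first, whatever the deadlines... */
	printf("%d\n", pds_key(10, 900) < pds_key(20, 100));	/* prints 1 */
	/* ...and equal prios fall back to the earlier deadline, */
	printf("%d\n", pds_key(20, 100) < pds_key(20, 900));	/* prints 1 */
	/* which is exactly what rt_mutex_waiter_less() now tests in one compare. */
	return 0;
}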
+@@ -341,8 +351,10 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; ++#endif + + return 1; ++#endif + } + + static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 976092b7bd45..31d587c16ec1 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -28,7 +28,12 @@ endif + # These compilation units have roughly the same size and complexity - so their + # build parallelizes well and finishes roughly at once: + # ++ifdef CONFIG_SCHED_ALT ++obj-y += alt_core.o ++obj-$(CONFIG_SCHED_DEBUG) += alt_debug.o ++else + obj-y += core.o + obj-y += fair.o ++endif + obj-y += build_policy.o + obj-y += build_utility.o +diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c +new file mode 100644 +index 000000000000..d0ab41c4d9ad +--- /dev/null ++++ b/kernel/sched/alt_core.c +@@ -0,0 +1,7807 @@ ++/* ++ * kernel/sched/alt_core.c ++ * ++ * Core alternative kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#define CREATE_TRACE_POINTS ++#include ++#undef CREATE_TRACE_POINTS ++ ++#include "sched.h" ++ ++#include "pelt.h" ++ ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++/* ++ * Export tracepoints that act as a bare tracehook (ie: have no trace event ++ * associated with them) to allow external modules to probe them. ++ */ ++EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); ++ ++#ifdef CONFIG_SCHED_DEBUG ++#define sched_feat(x) (1) ++/* ++ * Print a warning if need_resched is set for the given duration (if ++ * LATENCY_WARN is enabled). ++ * ++ * If sysctl_resched_latency_warn_once is set, only one warning will be shown ++ * per boot. 
++ */ ++__read_mostly int sysctl_resched_latency_warn_ms = 100; ++__read_mostly int sysctl_resched_latency_warn_once = 1; ++#else ++#define sched_feat(x) (0) ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#define ALT_SCHED_VERSION "v5.19-r0" ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ ++u64 sched_timeslice_ns __read_mostly = (4 << 20); ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq, int idx); ++ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq.h" ++#endif ++#ifdef CONFIG_SCHED_PDS ++#include "pds.h" ++#endif ++ ++static int __init sched_timeslice(char *str) ++{ ++ int timeslice_ms; ++ ++ get_option(&str, ×lice_ms); ++ if (2 != timeslice_ms) ++ timeslice_ms = 4; ++ sched_timeslice_ns = timeslice_ms << 20; ++ sched_timeslice_imp(timeslice_ms); ++ ++ return 0; ++} ++early_param("sched_timeslice", sched_timeslice); ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 << 10) ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_topo_end_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++#endif ++static cpumask_t sched_rq_watermark[SCHED_QUEUE_BITS] ____cacheline_aligned_in_smp; ++ ++/* sched_queue related functions */ ++static inline void sched_queue_init(struct sched_queue *q) ++{ ++ int i; ++ ++ bitmap_zero(q->bitmap, SCHED_QUEUE_BITS); ++ for(i = 0; i < SCHED_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++/* ++ * Init idle task and put into queue structure of rq ++ * IMPORTANT: may be called multiple times for a single cpu ++ */ ++static inline void sched_queue_init_idle(struct sched_queue *q, ++ struct task_struct *idle) ++{ ++ idle->sq_idx = IDLE_TASK_SCHED_PRIO; ++ INIT_LIST_HEAD(&q->heads[idle->sq_idx]); ++ list_add(&idle->sq_node, &q->heads[idle->sq_idx]); ++} ++ ++/* water mark related functions */ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = last_wm; i > watermark; i--) ++ cpumask_clear_cpu(cpu, sched_rq_watermark + SCHED_QUEUE_BITS - i); ++#ifdef CONFIG_SCHED_SMT ++ if (static_branch_likely(&sched_smt_present) && ++ IDLE_TASK_SCHED_PRIO == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = watermark; i > last_wm; i--) ++ cpumask_set_cpu(cpu, sched_rq_watermark + SCHED_QUEUE_BITS - i); ++#ifdef CONFIG_SCHED_SMT ++ if (static_branch_likely(&sched_smt_present) && ++ IDLE_TASK_SCHED_PRIO == watermark) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), sched_rq_watermark); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++ } ++#endif ++} ++ ++/* ++ * This routine assume that the idle task always in queue ++ */ ++static inline struct task_struct *sched_rq_first_task(struct rq *rq) ++{ ++ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_QUEUE_BITS); ++ const struct list_head *head = &rq->queue.heads[sched_prio2idx(idx, rq)]; ++ ++ return list_first_entry(head, struct task_struct, sq_node); ++} ++ ++static inline struct task_struct * ++sched_rq_next_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->sq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->sq_node, head)) { ++ idx = find_next_bit(rq->queue.bitmap, SCHED_QUEUE_BITS, ++ sched_idx2prio(idx, rq) + 1); ++ head = &rq->queue.heads[sched_prio2idx(idx, rq)]; ++ ++ return list_first_entry(head, struct task_struct, sq_node); ++ } ++ ++ return list_next_entry(p, sq_node); ++} ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = sched_rq_first_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = sched_rq_next_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Serialization rules: ++ * ++ * Lock order: ++ * ++ * p->pi_lock ++ * 
rq->lock ++ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) ++ * ++ * rq1->lock ++ * rq2->lock where: rq1 < rq2 ++ * ++ * Regular state: ++ * ++ * Normal scheduling state is serialized by rq->lock. __schedule() takes the ++ * local CPU's rq->lock, it optionally removes the task from the runqueue and ++ * always looks at the local rq data structures to find the most eligible task ++ * to run next. ++ * ++ * Task enqueue is also under rq->lock, possibly taken from another CPU. ++ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to ++ * the local CPU to avoid bouncing the runqueue state around [ see ++ * ttwu_queue_wakelist() ] ++ * ++ * Task wakeup, specifically wakeups that involve migration, are horribly ++ * complicated to avoid having to take two rq->locks. ++ * ++ * Special state: ++ * ++ * System-calls and anything external will use task_rq_lock() which acquires ++ * both p->pi_lock and rq->lock. As a consequence the state they change is ++ * stable while holding either lock: ++ * ++ * - sched_setaffinity()/ ++ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed ++ * - set_user_nice(): p->se.load, p->*prio ++ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, ++ * p->se.load, p->rt_priority, ++ * p->dl.dl_{runtime, deadline, period, flags, bw, density} ++ * - sched_setnuma(): p->numa_preferred_nid ++ * - sched_move_task()/ ++ * cpu_cgroup_fork(): p->sched_task_group ++ * - uclamp_update_active() p->uclamp* ++ * ++ * p->state <- TASK_*: ++ * ++ * is changed locklessly using set_current_state(), __set_current_state() or ++ * set_special_state(), see their respective comments, or by ++ * try_to_wake_up(). This latter uses p->pi_lock to serialize against ++ * concurrent self. ++ * ++ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: ++ * ++ * is set by activate_task() and cleared by deactivate_task(), under ++ * rq->lock. Non-zero indicates the task is runnable, the special ++ * ON_RQ_MIGRATING state is used for migration without holding both ++ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). ++ * ++ * p->on_cpu <- { 0, 1 }: ++ * ++ * is set by prepare_task() and cleared by finish_task() such that it will be ++ * set before p is scheduled-in and cleared after p is scheduled-out, both ++ * under rq->lock. Non-zero indicates the task is running on its CPU. ++ * ++ * [ The astute reader will observe that it is possible for two tasks on one ++ * CPU to have ->on_cpu = 1 at the same time. ] ++ * ++ * task_cpu(p): is changed by set_task_cpu(), the rules are: ++ * ++ * - Don't call set_task_cpu() on a blocked task: ++ * ++ * We don't care what CPU we're not running on, this simplifies hotplug, ++ * the CPU assignment of blocked tasks isn't required to be valid. ++ * ++ * - for try_to_wake_up(), called under p->pi_lock: ++ * ++ * This allows try_to_wake_up() to only take one rq->lock, see its comment. 
++ * ++ * - for migration called under rq->lock: ++ * [ see task_on_rq_migrating() in task_rq_lock() ] ++ * ++ * o move_queued_task() ++ * o detach_task() ++ * ++ * - for migration called under double_rq_lock(): ++ * ++ * o __migrate_swap_task() ++ * o push_rt_task() / pull_rt_task() ++ * o push_dl_task() / pull_dl_task() ++ * o dl_task_offline_migration() ++ * ++ */ ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. 
++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++static inline void ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(&rq->lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); ++} ++ ++void raw_spin_rq_lock_nested(struct rq *rq, int subclass) ++{ ++ raw_spinlock_t *lock; ++ ++ /* Matches synchronize_rcu() in __sched_core_enable() */ ++ preempt_disable(); ++ ++ for (;;) { ++ lock = __rq_lockp(rq); ++ raw_spin_lock_nested(lock, subclass); ++ if (likely(lock == __rq_lockp(rq))) { ++ /* preempt_count *MUST* be > 1 */ ++ preempt_enable_no_resched(); ++ return; ++ } ++ raw_spin_unlock(lock); ++ } ++} ++ ++void raw_spin_rq_unlock(struct rq *rq) ++{ ++ raw_spin_unlock(rq_lockp(rq)); ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_time_edge(rq); ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * RQ Load update routine ++ */ ++#define RQ_LOAD_HISTORY_BITS (sizeof(s32) * 8ULL) ++#define RQ_UTIL_SHIFT (8) ++#define RQ_LOAD_HISTORY_TO_UTIL(l) (((l) >> (RQ_LOAD_HISTORY_BITS - 1 - RQ_UTIL_SHIFT)) & 0xff) ++ ++#define LOAD_BLOCK(t) ((t) >> 17) ++#define LOAD_HALF_BLOCK(t) ((t) >> 16) ++#define BLOCK_MASK(t) ((t) & ((0x01 << 18) - 1)) ++#define LOAD_BLOCK_BIT(b) (1UL << (RQ_LOAD_HISTORY_BITS - 1 - (b))) ++#define CURRENT_LOAD_BIT LOAD_BLOCK_BIT(0) ++ ++static inline void rq_load_update(struct rq *rq) ++{ ++ u64 time = rq->clock; ++ u64 delta = min(LOAD_BLOCK(time) - LOAD_BLOCK(rq->load_stamp), ++ RQ_LOAD_HISTORY_BITS - 1); ++ u64 prev = !!(rq->load_history & CURRENT_LOAD_BIT); ++ u64 curr = !!rq->nr_running; ++ ++ if (delta) { ++ rq->load_history = rq->load_history >> delta; ++ ++ if (delta < RQ_UTIL_SHIFT) { ++ rq->load_block += (~BLOCK_MASK(rq->load_stamp)) * prev; ++ if (!!LOAD_HALF_BLOCK(rq->load_block) ^ curr) ++ rq->load_history ^= LOAD_BLOCK_BIT(delta); ++ } ++ ++ rq->load_block = BLOCK_MASK(time) * prev; ++ } else { ++ rq->load_block += (time - rq->load_stamp) * prev; ++ } ++ if (prev ^ curr) ++ rq->load_history ^= CURRENT_LOAD_BIT; ++ rq->load_stamp = time; ++} ++ ++unsigned long rq_load_util(struct rq *rq, unsigned long max) ++{ ++ return RQ_LOAD_HISTORY_TO_UTIL(rq->load_history) * (max >> RQ_UTIL_SHIFT); ++} ++ ++#ifdef CONFIG_SMP ++unsigned long sched_cpu_util(int cpu, unsigned long max) ++{ ++ return rq_load_util(cpu_rq(cpu), max); ++} ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_FREQ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. 
++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++#ifdef CONFIG_SMP ++ rq_load_update(rq); ++#endif ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++#ifdef CONFIG_SMP ++ rq_load_update(rq); ++#endif ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++bool sched_task_on_rq(struct task_struct *p) ++{ ++ return task_on_rq_queued(p); ++} ++ ++unsigned long get_wchan(struct task_struct *p) ++{ ++ unsigned long ip = 0; ++ unsigned int state; ++ ++ if (!p || p == current) ++ return 0; ++ ++ /* Only get wchan if task is blocked and we can keep it that way. */ ++ raw_spin_lock_irq(&p->pi_lock); ++ state = READ_ONCE(p->__state); ++ smp_rmb(); /* see try_to_wake_up() */ ++ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) ++ ip = __get_wchan(p); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ return ip; ++} ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++#define __SCHED_DEQUEUE_TASK(p, rq, flags) \ ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ ++ sched_info_dequeue(rq, p); \ ++ \ ++ list_del(&p->sq_node); \ ++ if (list_empty(&rq->queue.heads[p->sq_idx])) \ ++ clear_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); ++ ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ ++ sched_info_enqueue(rq, p); \ ++ psi_enqueue(p, flags); \ ++ \ ++ p->sq_idx = task_sched_prio_idx(p, rq); \ ++ list_add_tail(&p->sq_node, &rq->queue.heads[p->sq_idx]); \ ++ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); ++ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_DEQUEUE_TASK(p, rq, flags); ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_ENQUEUE_TASK(p, rq, flags); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq, int 
idx) ++{ ++ lockdep_assert_held(&rq->lock); ++ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ list_del(&p->sq_node); ++ list_add_tail(&p->sq_node, &rq->queue.heads[idx]); ++ if (idx != p->sq_idx) { ++ if (list_empty(&rq->queue.heads[p->sq_idx])) ++ clear_bit(sched_idx2prio(p->sq_idx, rq), ++ rq->queue.bitmap); ++ p->sq_idx = idx; ++ set_bit(sched_idx2prio(p->sq_idx, rq), rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * it's already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. 
++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) {} ++ ++void select_nohz_load_balancer(int stop_tick) {} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct cpumask *mask; ++ const struct cpumask *hk_mask; ++ ++ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); ++ ++ for (mask = per_cpu(sched_cpu_topo_masks, cpu) + 1; ++ mask < per_cpu(sched_cpu_topo_end_mask, cpu); mask++) ++ for_each_cpu_and(i, mask, hk_mask) ++ if (!idle_cpu(i)) ++ return i; ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); ++ cpu = default_cpu; ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++static inline void wake_up_idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(rq->idle)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++ ++static void nohz_csd_func(void *info) ++{ ++ struct rq *rq = info; ++ int cpu = cpu_of(rq); ++ unsigned int flags; ++ ++ /* ++ * Release the rq::nohz_csd. ++ */ ++ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); ++ WARN_ON(!(flags & NOHZ_KICK_MASK)); ++ ++ rq->idle_balance = idle_cpu(cpu); ++ if (rq->idle_balance && !need_resched()) { ++ rq->nohz_idle_balance = flags; ++ raise_softirq_irqoff(SCHED_SOFTIRQ); ++ } ++} ++ ++#endif /* CONFIG_NO_HZ_COMMON */ ++#endif /* CONFIG_SMP */ ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (sched_rq_first_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. 
++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * Alt schedule FW doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time = rq->hrtick_time; ++ ++ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ ++ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ if (rq == this_rq()) ++ __hrtick_restart(rq); ++ else ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int __normal_prio(int policy, int rt_prio, int static_prio) ++{ ++ return rt_policy(policy) ? (MAX_RT_PRIO - 1 - rt_prio) : ++ static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the expected normal priority: i.e. priority ++ * without taking RT-inheritance into account. Might be ++ * boosted by interactivity modifiers. Changes upon fork, ++ * setprio syscalls, and whenever the interactivity ++ * estimator recalculates. ++ */ ++static inline int normal_prio(struct task_struct *p) ++{ ++ return __normal_prio(p->policy, p->rt_priority, p->static_prio); ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. 
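hrtick_start() above clamps every requested slice to at least 10000ns before arming the per-runqueue hrtimer. The same clamp-then-arm shape can be sketched in userspace with a timerfd on CLOCK_MONOTONIC; arm_slice_timer() and min_slice_ns below are illustrative names, not anything defined by this patch:

#include <stdint.h>
#include <sys/timerfd.h>
#include <time.h>

/* Sketch: arm a one-shot monotonic timer, never for less than 10us,
 * mirroring the "don't schedule slices shorter than 10000ns" rule. */
static int arm_slice_timer(int tfd, uint64_t delay_ns)
{
	const uint64_t min_slice_ns = 10000;
	struct itimerspec its = { { 0, 0 }, { 0, 0 } };

	if (delay_ns < min_slice_ns)
		delay_ns = min_slice_ns;

	its.it_value.tv_sec  = delay_ns / 1000000000ULL;
	its.it_value.tv_nsec = delay_ns % 1000000000ULL;

	/* tfd comes from timerfd_create(CLOCK_MONOTONIC, 0) */
	return timerfd_settime(tfd, 0, &its, NULL);
}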
++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. ++ */ ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT * p->in_iowait); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++} ++ ++static inline bool is_migration_disabled(struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->migration_disabled; ++#else ++ return false; ++#endif ++} ++ ++#define SCA_CHECK 0x01 ++#define SCA_USER 0x08 ++ ++#ifdef CONFIG_SMP ++ ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ unsigned int state = READ_ONCE(p->__state); ++ ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++ ++ WARN_ON_ONCE(is_migration_disabled(p)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++#define MDF_FORCE_ENABLED 0x80 ++ ++static void ++__do_set_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ /* ++ * This here violates the locking rules for affinity, since we're only ++ * supposed to change these variables while holding both rq->lock and ++ * p->pi_lock. ++ * ++ * HOWEVER, it magically works, because ttwu() is the only code that ++ * accesses these variables under p->pi_lock and only does so after ++ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() ++ * before finish_task(). ++ * ++ * XXX do further audits, this smells like something putrid. 
++ */ ++ SCHED_WARN_ON(!p->on_cpu); ++ p->cpus_ptr = new_mask; ++} ++ ++void migrate_disable(void) ++{ ++ struct task_struct *p = current; ++ int cpu; ++ ++ if (p->migration_disabled) { ++ p->migration_disabled++; ++ return; ++ } ++ ++ preempt_disable(); ++ cpu = smp_processor_id(); ++ if (cpumask_test_cpu(cpu, &p->cpus_mask)) { ++ cpu_rq(cpu)->nr_pinned++; ++ p->migration_disabled = 1; ++ p->migration_flags &= ~MDF_FORCE_ENABLED; ++ ++ /* ++ * Violates locking rules! see comment in __do_set_cpus_ptr(). ++ */ ++ if (p->cpus_ptr == &p->cpus_mask) ++ __do_set_cpus_ptr(p, cpumask_of(cpu)); ++ } ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(migrate_disable); ++ ++void migrate_enable(void) ++{ ++ struct task_struct *p = current; ++ ++ if (0 == p->migration_disabled) ++ return; ++ ++ if (p->migration_disabled > 1) { ++ p->migration_disabled--; ++ return; ++ } ++ ++ if (WARN_ON_ONCE(!p->migration_disabled)) ++ return; ++ ++ /* ++ * Ensure stop_task runs either before or after this, and that ++ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). ++ */ ++ preempt_disable(); ++ /* ++ * Assumption: current should be running on allowed cpu ++ */ ++ WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &p->cpus_mask)); ++ if (p->cpus_ptr != &p->cpus_mask) ++ __do_set_cpus_ptr(p, &p->cpus_mask); ++ /* ++ * Mustn't clear migration_disabled() until cpus_ptr points back at the ++ * regular cpus_mask, otherwise things that race (eg. ++ * select_fallback_rq) get confused. ++ */ ++ barrier(); ++ p->migration_disabled = 0; ++ this_rq()->nr_pinned--; ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(migrate_enable); ++ ++static inline bool rq_has_pinned_tasks(struct rq *rq) ++{ ++ return rq->nr_pinned; ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ /* When not in the task's cpumask, no point in looking further. */ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ /* migrate_disabled() must be allowed to finish. */ ++ if (is_migration_disabled(p)) ++ return cpu_online(cpu); ++ ++ /* Non kernel threads are not allowed during either online or offline. */ ++ if (!(p->flags & PF_KTHREAD)) ++ return cpu_active(cpu) && task_cpu_possible(cpu, p); ++ ++ /* KTHREAD_IS_PER_CPU is always allowed. */ ++ if (kthread_is_per_cpu(p)) ++ return cpu_online(cpu); ++ ++ /* Regular kernel threads don't get to stay during offline. */ ++ if (cpu_dying(cpu)) ++ return false; ++ ++ /* But are allowed during online. */ ++ return cpu_online(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. 
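migrate_disable()/migrate_enable() above form a nestable guard: only the outermost disable pins the task to its current CPU, and only the outermost enable undoes it. A rough userspace analogue using the ordinary affinity syscalls and made-up names (cpu_pin/cpu_unpin) is sketched below; unlike the kernel version it is only best-effort, since the thread may still migrate between sched_getcpu() and sched_setaffinity():

#define _GNU_SOURCE
#include <sched.h>

static __thread int pin_depth;
static __thread cpu_set_t saved_mask;

/* Outermost call saves the affinity mask and pins to the current CPU. */
static void cpu_pin(void)
{
	cpu_set_t one;

	if (pin_depth++)
		return;                         /* nested: already pinned */

	sched_getaffinity(0, sizeof(saved_mask), &saved_mask);
	CPU_ZERO(&one);
	CPU_SET(sched_getcpu(), &one);
	sched_setaffinity(0, sizeof(one), &one);
}

/* Outermost enable restores whatever affinity the thread had before. */
static void cpu_unpin(void)
{
	if (--pin_depth)
		return;                         /* still nested */

	sched_setaffinity(0, sizeof(saved_mask), &saved_mask);
}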
++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ update_sched_rq_watermark(rq); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ sched_task_sanity_check(p, rq); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ unsigned long flags; ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_save(flags); ++ /* ++ * We need to explicitly wake pending tasks before running ++ * __migrate_task() such that we will not miss enforcing cpus_ptr ++ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. ++ */ ++ flush_smp_call_function_queue(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. 
++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++static void ++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ lockdep_assert_held(&p->pi_lock); ++ set_cpus_allowed_common(p, new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++} ++ ++int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, ++ int node) ++{ ++ if (!src->user_cpus_ptr) ++ return 0; ++ ++ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); ++ if (!dst->user_cpus_ptr) ++ return -ENOMEM; ++ ++ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); ++ return 0; ++} ++ ++static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) ++{ ++ struct cpumask *user_mask = NULL; ++ ++ swap(p->user_cpus_ptr, user_mask); ++ ++ return user_mask; ++} ++ ++void release_user_cpus_ptr(struct task_struct *p) ++{ ++ kfree(clear_user_cpus_ptr(p)); ++} ++ ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. 
++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || READ_ONCE(p->__state) == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. 
*/ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (is_cpu_allowed(p, dest_cpu)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (cpuset_cpus_allowed_fallback(p)) { ++ state = possible; ++ break; ++ } ++ fallthrough; ++ case possible: ++ /* ++ * XXX When called from select_task_rq() we only ++ * hold p->pi_lock and again violate locking order. ++ * ++ * More yuck to audit. ++ */ ++ do_set_cpus_allowed(p, task_cpu_possible_mask(p)); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_active_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, sched_rq_watermark) || ++ cpumask_and(&tmp, &chk_mask, ++ sched_rq_watermark + SCHED_QUEUE_BITS - 1 - task_sched_prio(p))) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ static struct lock_class_key stop_pi_lock; ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ ++ /* ++ * The PI code calls rt_mutex_setprio() with ->pi_lock held to ++ * adjust the effective priority of a task. As a result, ++ * rt_mutex_setprio() can trigger (RT) balancing operations, ++ * which can then trigger wakeups of the stop thread to push ++ * around the current task. ++ * ++ * The stop task itself will never be part of the PI-chain, it ++ * never blocks, therefore that ->pi_lock recursion is safe. ++ * Tell lockdep about this by placing the stop->pi_lock in its ++ * own class. ++ */ ++ lockdep_set_class(&stop->pi_lock, &stop_pi_lock); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++static int affine_move_task(struct rq *rq, struct task_struct *p, int dest_cpu, ++ raw_spinlock_t *lock, unsigned long irq_flags) ++{ ++ /* Can the task run on the task's current CPU? 
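sched_set_stop_task() above dresses the per-CPU stopper up as a SCHED_FIFO task via sched_setscheduler_nocheck(). The userspace-facing equivalent is pthread_setschedparam() (or sched_setscheduler()), which needs CAP_SYS_NICE; make_thread_fifo() below is just an illustrative wrapper:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

/* Give a thread the highest FIFO priority (requires CAP_SYS_NICE). */
static int make_thread_fifo(pthread_t thread)
{
	struct sched_param sp = {
		.sched_priority = sched_get_priority_max(SCHED_FIFO),
	};
	int err = pthread_setschedparam(thread, SCHED_FIFO, &sp);

	if (err)
		fprintf(stderr, "pthread_setschedparam: %d\n", err);
	return err;
}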
If so, we're done */ ++ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { ++ if (p->migration_disabled) { ++ if (likely(p->cpus_ptr != &p->cpus_mask)) ++ __do_set_cpus_ptr(p, &p->cpus_mask); ++ p->migration_disabled = 0; ++ p->migration_flags |= MDF_FORCE_ENABLED; ++ /* When p is migrate_disabled, rq->lock should be held */ ++ rq->nr_pinned--; ++ } ++ ++ if (task_running(p) || READ_ONCE(p->__state) == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ } ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); ++ return 0; ++} ++ ++static int __set_cpus_allowed_ptr_locked(struct task_struct *p, ++ const struct cpumask *new_mask, ++ u32 flags, ++ struct rq *rq, ++ raw_spinlock_t *lock, ++ unsigned long irq_flags) ++{ ++ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool kthread = p->flags & PF_KTHREAD; ++ struct cpumask *user_mask = NULL; ++ int dest_cpu; ++ int ret = 0; ++ ++ if (kthread || is_migration_disabled(p)) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs, ++ * however, during cpu-hot-unplug, even these might get pushed ++ * away if not KTHREAD_IS_PER_CPU. ++ * ++ * Specifically, migration_disabled() tasks must not fail the ++ * cpumask_any_and_distribute() pick below, esp. so on ++ * SCA_MIGRATE_ENABLE, otherwise we'll not call ++ * set_cpus_allowed_common() and actually reset p->cpus_ptr. ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (flags & SCA_USER) ++ user_mask = clear_user_cpus_ptr(p); ++ ++ ret = affine_move_task(rq, p, dest_cpu, lock, irq_flags); ++ ++ kfree(user_mask); ++ ++ return ret; ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); ++ ++ return ret; ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. 
++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, u32 flags) ++{ ++ unsigned long irq_flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, irq_flags); ++ rq = __task_access_lock(p, &lock); ++ ++ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, lock, irq_flags); ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, 0); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++/* ++ * Change a given task's CPU affinity to the intersection of its current ++ * affinity mask and @subset_mask, writing the resulting mask to @new_mask ++ * and pointing @p->user_cpus_ptr to a copy of the old mask. ++ * If the resulting mask is empty, leave the affinity unchanged and return ++ * -EINVAL. ++ */ ++static int restrict_cpus_allowed_ptr(struct task_struct *p, ++ struct cpumask *new_mask, ++ const struct cpumask *subset_mask) ++{ ++ struct cpumask *user_mask = NULL; ++ unsigned long irq_flags; ++ raw_spinlock_t *lock; ++ struct rq *rq; ++ int err; ++ ++ if (!p->user_cpus_ptr) { ++ user_mask = kmalloc(cpumask_size(), GFP_KERNEL); ++ if (!user_mask) ++ return -ENOMEM; ++ } ++ ++ raw_spin_lock_irqsave(&p->pi_lock, irq_flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { ++ err = -EINVAL; ++ goto err_unlock; ++ } ++ ++ /* ++ * We're about to butcher the task affinity, so keep track of what ++ * the user asked for in case we're able to restore it later on. ++ */ ++ if (user_mask) { ++ cpumask_copy(user_mask, p->cpus_ptr); ++ p->user_cpus_ptr = user_mask; ++ } ++ ++ /*return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);*/ ++ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, lock, irq_flags); ++ ++err_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); ++ kfree(user_mask); ++ return err; ++} ++ ++/* ++ * Restrict the CPU affinity of task @p so that it is a subset of ++ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the ++ * old affinity mask. If the resulting mask is empty, we warn and walk ++ * up the cpuset hierarchy until we find a suitable mask. ++ */ ++void force_compatible_cpus_allowed_ptr(struct task_struct *p) ++{ ++ cpumask_var_t new_mask; ++ const struct cpumask *override_mask = task_cpu_possible_mask(p); ++ ++ alloc_cpumask_var(&new_mask, GFP_KERNEL); ++ ++ /* ++ * __migrate_task() can fail silently in the face of concurrent ++ * offlining of the chosen destination CPU, so take the hotplug ++ * lock to ensure that the migration succeeds. ++ */ ++ cpus_read_lock(); ++ if (!cpumask_available(new_mask)) ++ goto out_set_mask; ++ ++ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) ++ goto out_free_mask; ++ ++ /* ++ * We failed to find a valid subset of the affinity mask for the ++ * task, so override it based on its cpuset hierarchy. 
++ */ ++ cpuset_cpus_allowed(p, new_mask); ++ override_mask = new_mask; ++ ++out_set_mask: ++ if (printk_ratelimit()) { ++ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", ++ task_pid_nr(p), p->comm, ++ cpumask_pr_args(override_mask)); ++ } ++ ++ WARN_ON(set_cpus_allowed_ptr(p, override_mask)); ++out_free_mask: ++ cpus_read_unlock(); ++ free_cpumask_var(new_mask); ++} ++ ++static int ++__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); ++ ++/* ++ * Restore the affinity of a task @p which was previously restricted by a ++ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) ++ * @p->user_cpus_ptr. ++ * ++ * It is the caller's responsibility to serialise this with any calls to ++ * force_compatible_cpus_allowed_ptr(@p). ++ */ ++void relax_compatible_cpus_allowed_ptr(struct task_struct *p) ++{ ++ struct cpumask *user_mask = p->user_cpus_ptr; ++ unsigned long flags; ++ ++ /* ++ * Try to restore the old affinity mask. If this fails, then ++ * we free the mask explicitly to avoid it being inherited across ++ * a subsequent fork(). ++ */ ++ if (!user_mask || !__sched_setaffinity(p, user_mask)) ++ return; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ user_mask = clear_user_cpus_ptr(p); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ kfree(user_mask); ++} ++ ++#else /* CONFIG_SMP */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, u32 flags) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++ ++static inline bool rq_has_pinned_tasks(struct rq *rq) ++{ ++ return false; ++} ++ ++#endif /* !CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ __schedstat_inc(p->stats.nr_wakeups_local); ++ } else { ++ /** Alt schedule FW ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++ __schedstat_inc(p->stats.nr_wakeups); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ check_preempt_curr(rq); ++ WRITE_ONCE(p->__state, TASK_RUNNING); ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ if ( ++#ifdef CONFIG_SMP ++ !(wake_flags & WF_MIGRATED) && ++#endif ++ p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++/* ++ * Consider @p being inside a wait loop: ++ * ++ * for (;;) { ++ * set_current_state(TASK_UNINTERRUPTIBLE); ++ * ++ * if (CONDITION) ++ * break; ++ * ++ * schedule(); ++ * } ++ * __set_current_state(TASK_RUNNING); ++ * ++ * between set_current_state() and schedule(). In this case @p is still ++ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in ++ * an atomic manner. ++ * ++ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq ++ * then schedule() must still happen and p->state can be changed to ++ * TASK_RUNNING. 
Otherwise we lost the race, schedule() has happened, and we ++ * need to do a full wakeup with enqueue. ++ * ++ * Returns: %true when the wakeup is done, ++ * %false otherwise. ++ */ ++static int ttwu_runnable(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ /* check_preempt_curr() may use rq clock */ ++ update_rq_clock(rq); ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups. ++ * Races such that false-negatives are possible, since they ++ * are shorter lived that false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ update_rq_clock(rq); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { ++ if (WARN_ON_ONCE(p->on_cpu)) ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) ++ set_task_cpu(p, cpu_of(rq)); ++ ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); ++ } ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if ++ * necessary. The wakee CPU on receipt of the IPI will queue the task ++ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost ++ * of the wakeup instead of the waker. ++ */ ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); ++ ++ WRITE_ONCE(rq->ttwu_pending, 1); ++ __smp_call_single_queue(cpu, &p->wake_entry.llist); ++} ++ ++static inline bool ttwu_queue_cond(int cpu, int wake_flags) ++{ ++ /* ++ * Do not complicate things with the async wake_list while the CPU is ++ * in hotplug state. ++ */ ++ if (!cpu_active(cpu)) ++ return false; ++ ++ /* ++ * If the CPU does not share cache, then queue the task on the ++ * remote rqs wakelist to avoid accessing remote data. ++ */ ++ if (!cpus_share_cache(smp_processor_id(), cpu)) ++ return true; ++ ++ /* ++ * If the task is descheduling and the only running task on the ++ * CPU then use the wakelist to offload the task activation to ++ * the soon-to-be-idle CPU as the current CPU is likely busy. ++ * nr_running is checked to avoid unnecessary task stacking. 
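ttwu_runnable() above handles exactly the race sketched in the wait-loop comment: the sleeper re-checks its condition every time it comes back from schedule(), so a wakeup that lands between the check and the sleep is never lost. The classic userspace counterpart is a condition-variable wait loop; condition, wait_for_condition() and signal_condition() below are illustrative names only:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool condition;

/* Waiter: re-check CONDITION under the lock on every wakeup. */
static void wait_for_condition(void)
{
	pthread_mutex_lock(&lock);
	while (!condition)                  /* cf. for (;;) { ... schedule(); } */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

/* Waker: set CONDITION first, then wake -- cf. try_to_wake_up(). */
static void signal_condition(void)
{
	pthread_mutex_lock(&lock);
	condition = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}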
++ */ ++ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) ++ return true; ++ ++ return false; ++} ++ ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { ++ if (WARN_ON_ONCE(cpu == smp_processor_id())) ++ return false; ++ ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ __ttwu_queue_wakelist(p, cpu, wake_flags); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ resched_curr(rq); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ if (this_cpu == that_cpu) ++ return true; ++ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#else /* !CONFIG_SMP */ ++ ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (ttwu_queue_wakelist(p, cpu, wake_flags)) ++ return; ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Invoked from try_to_wake_up() to check whether the task can be woken up. ++ * ++ * The caller holds p::pi_lock if p != current or has preemption ++ * disabled when p == current. ++ * ++ * The rules of PREEMPT_RT saved_state: ++ * ++ * The related locking code always holds p::pi_lock when updating ++ * p::saved_state, which means the code is fully serialized in both cases. ++ * ++ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other ++ * bits set. This allows to distinguish all wakeup scenarios. ++ */ ++static __always_inline ++bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) ++{ ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { ++ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && ++ state != TASK_RTLOCK_WAIT); ++ } ++ ++ if (READ_ONCE(p->__state) & state) { ++ *success = 1; ++ return true; ++ } ++ ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * Saved state preserves the task state across blocking on ++ * an RT lock. If the state matches, set p::saved_state to ++ * TASK_RUNNING, but do not wake the task because it waits ++ * for a lock wakeup. Also indicate success because from ++ * the regular waker's point of view this has succeeded. ++ * ++ * After acquiring the lock the task will restore p::__state ++ * from p::saved_state which ensures that the regular ++ * wakeup is not lost. The restore will also set ++ * p::saved_state to TASK_RUNNING so any further tests will ++ * not result in false positives vs. @success ++ */ ++ if (p->saved_state & state) { ++ p->saved_state = TASK_RUNNING; ++ *success = 1; ++ } ++#endif ++ return false; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. 
++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) -- finish_task() ++ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Conceptually does: ++ * ++ * If (@state & @p->state) @p->state = TASK_RUNNING. ++ * ++ * If the task was not queued/runnable, also place it back on a runqueue. ++ * ++ * This function is atomic against schedule() which would dequeue the task. ++ * ++ * It issues a full memory barrier before accessing @p->state, see the comment ++ * with set_current_state(). ++ * ++ * Uses p->pi_lock to serialize against concurrent wake-ups. ++ * ++ * Relies on p->pi_lock stabilizing: ++ * - p->sched_class ++ * - p->cpus_ptr ++ * - p->sched_task_group ++ * in order to do migration, see its use of select_task_rq()/set_task_cpu(). ++ * ++ * Tries really hard to only take one task_rq(p)->lock for performance. ++ * Takes rq->lock in: ++ * - ttwu_runnable() -- old rq, unavoidable, see comment there; ++ * - ttwu_queue() -- new rq, for enqueue of the task; ++ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. 
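The BLOCKING ordering above boils down to a release store in finish_task() paired with an acquire load in try_to_wake_up(): whoever observes on_cpu == 0 with acquire semantics also observes every store made before the release. The same pairing, written in portable C11 atomics with made-up demo_* names, looks like this:

#include <stdatomic.h>
#include <stdbool.h>

struct demo_task {
	int state;                          /* written before the release store */
	_Atomic bool on_cpu;
};

/* Producer side, cf. finish_task(): publish state, then release on_cpu. */
static void demo_finish_task(struct demo_task *t)
{
	t->state = 42;
	atomic_store_explicit(&t->on_cpu, false, memory_order_release);
}

/* Consumer side, cf. smp_cond_load_acquire() in try_to_wake_up(): once
 * on_cpu reads false with acquire ordering, state is guaranteed to read 42. */
static int demo_wait_for_descheduled(struct demo_task *t)
{
	while (atomic_load_explicit(&t->on_cpu, memory_order_acquire))
		;                               /* spin, like the kernel does */
	return t->state;
}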
++ * ++ * As a consequence we race really badly with just about everything. See the ++ * many memory barriers and their comments for details. ++ * ++ * Return: %true if @p->state changes (an actual wakeup was done), ++ * %false otherwise. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_runnable()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!ttwu_state_match(p, state, &success)) ++ goto out; ++ ++ trace_sched_waking(p); ++ WRITE_ONCE(p->__state, TASK_RUNNING); ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with smp_store_mb() ++ * in set_current_state() that the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!ttwu_state_match(p, state, &success)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). ++ */ ++ smp_rmb(); ++ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure ++ * schedule()'s deactivate_task() has 'happened' and p will no longer ++ * care about it's own p->state. See the comment in __schedule(). ++ */ ++ smp_acquire__after_ctrl_dep(); ++ ++ /* ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq ++ * == 0), which means we need to do an enqueue, change p->state to ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the ++ * enqueue, such as ttwu_queue_wakelist(). 
++ */ ++ WRITE_ONCE(p->__state, TASK_WAKING); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, considering queueing p on the remote CPUs wake_list ++ * which potentially sends an IPI instead of spinning on p->on_cpu to ++ * let the waker make forward progress. This is safe because IRQs are ++ * disabled and the IPI will deliver after on_cpu is cleared. ++ * ++ * Ensure we load task_cpu(p) after p->on_cpu: ++ * ++ * set_task_cpu(p, cpu); ++ * STORE p->cpu = @cpu ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock ++ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) ++ * STORE p->on_cpu = 1 LOAD p->cpu ++ * ++ * to ensure we observe the correct CPU on which the task is currently ++ * scheduling. ++ */ ++ if (smp_load_acquire(&p->on_cpu) && ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) ++ goto unlock; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until it's done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ sched_task_ttwu(p); ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else ++ cpu = task_cpu(p); ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, task_cpu(p), wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * task_call_func - Invoke a function on task in fixed state ++ * @p: Process for which the function is to be invoked, can be @current. ++ * @func: Function to invoke. ++ * @arg: Argument to function. ++ * ++ * Fix the task in it's current state by avoiding wakeups and or rq operations ++ * and call @func(@arg) on it. This function can use ->on_rq and task_curr() ++ * to work out what the state is, if required. Given that @func can be invoked ++ * with a runqueue lock held, it had better be quite lightweight. ++ * ++ * Returns: ++ * Whatever @func returns ++ */ ++int task_call_func(struct task_struct *p, task_call_f func, void *arg) ++{ ++ struct rq *rq = NULL; ++ unsigned int state; ++ struct rq_flags rf; ++ int ret; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, rf.flags); ++ ++ state = READ_ONCE(p->__state); ++ ++ /* ++ * Ensure we load p->on_rq after p->__state, otherwise it would be ++ * possible to, falsely, observe p->on_rq == 0. ++ * ++ * See try_to_wake_up() for a longer comment. ++ */ ++ smp_rmb(); ++ ++ /* ++ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when ++ * the task is blocked. Make sure to check @state since ttwu() can drop ++ * locks at the end, see ttwu_queue_wakelist(). 
++ */ ++ if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq) ++ rq = __task_rq_lock(p, &rf); ++ ++ /* ++ * At this point the task is pinned; either: ++ * - blocked and we're holding off wakeups (pi->lock) ++ * - woken, and we're holding off enqueue (rq->lock) ++ * - queued, and we're holding off schedule (rq->lock) ++ * - running, and we're holding off de-schedule (rq->lock) ++ * ++ * The called function (@func) can use: task_curr(), p->on_rq and ++ * p->__state to differentiate between these states. ++ */ ++ ret = func(p, arg); ++ ++ if (rq) ++ __task_rq_unlock(rq, &rf); ++ ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); ++ return ret; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ * ++ * __sched_fork() is basic setup used by init_idle() too: ++ */ ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ p->on_rq = 0; ++ p->on_cpu = 0; ++ p->utime = 0; ++ p->stime = 0; ++ p->sched_time = 0; ++ ++#ifdef CONFIG_SCHEDSTATS ++ /* Even if schedstat is disabled, there should not be garbage */ ++ memset(&p->stats, 0, sizeof(p->stats)); ++#endif ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++#ifdef CONFIG_SMP ++ p->wake_entry.u_flags = CSD_TYPE_TTWU; ++#endif ++} ++ ++/* ++ * fork()/clone()-time setup: ++ */ ++int sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ __sched_fork(clone_flags, p); ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->__state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = p->static_prio; ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ /* ++ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly ++ * required yet, but lockdep gets upset if rules are violated. 
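The sched_reset_on_fork branch in sched_fork() above is what userspace opts into with the SCHED_RESET_ON_FORK flag: the caller keeps its realtime policy, but its children revert to SCHED_NORMAL at the default priority. A small sketch (needs CAP_SYS_NICE; the flag value comes from the kernel UAPI headers):

#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000  /* from the kernel UAPI headers */
#endif

static int run_rt_without_inheriting(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	pid_t child;

	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp)) {
		perror("sched_setscheduler");
		return -1;
	}

	child = fork();
	if (child < 0)
		return -1;
	if (child == 0) {
		/* here sched_getscheduler(0) reports SCHED_OTHER again */
		_exit(0);
	}
	return 0;
}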
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ resched_curr(rq); ++ } ++ sched_task_fork(p, rq); ++ raw_spin_unlock(&rq->lock); ++ ++ rseq_migrate(p); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++void sched_post_fork(struct task_struct *p) ++{ ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ if (!strcmp(str, "enable")) { ++ set_schedstats(true); ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ set_schedstats(false); ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++#ifdef CONFIG_PROC_SYSCTL ++static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++ ++static struct ctl_table sched_core_sysctls[] = { ++ { ++ .procname = "sched_schedstats", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sysctl_schedstats, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ {} ++}; ++static int __init sched_core_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", sched_core_sysctls); ++ return 0; ++} ++late_initcall(sched_core_sysctl_init); ++#endif /* CONFIG_PROC_SYSCTL */ ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ WRITE_ONCE(p->__state, TASK_RUNNING); ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ rseq_migrate(p); ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. 
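The sysctl table registered above surfaces as /proc/sys/kernel/sched_schedstats, with schedstats=enable/disable as the boot-time equivalent. A short sketch for flipping it from a C program, assuming root:

#include <stdio.h>

/* Write the new value, then read back what the kernel actually holds. */
static int schedstats_sysctl(int enable)
{
	const char *path = "/proc/sys/kernel/sched_schedstats";
	FILE *f = fopen(path, "w");
	int cur = -1;

	if (!f)
		return -1;                      /* no permission or no schedstats */
	fprintf(f, "%d\n", !!enable);
	fclose(f);

	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%d", &cur) != 1)
			cur = -1;
		fclose(f);
	}
	return cur;
}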
++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ * ++ * See the ttwu() WF_ON_CPU case and its ordering comment. ++ */ ++ WRITE_ONCE(next->on_cpu, 1); ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * This must be the very last reference to @prev from this CPU. After ++ * p->on_cpu is cleared, the task can be moved to a different CPU. We ++ * must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++static void do_balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++ void (*func)(struct rq *rq); ++ struct callback_head *next; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ while (head) { ++ func = (void (*)(struct rq *))head->func; ++ next = head->next; ++ head->next = NULL; ++ head = next; ++ ++ func(rq); ++ } ++} ++ ++static void balance_push(struct rq *rq); ++ ++/* ++ * balance_push_callback is a right abuse of the callback interface and plays ++ * by significantly different rules. ++ * ++ * Where the normal balance_callback's purpose is to be ran in the same context ++ * that queued it (only later, when it's safe to drop rq->lock again), ++ * balance_push_callback is specifically targeted at __schedule(). ++ * ++ * This abuse is tolerated because it places all the unlikely/odd cases behind ++ * a single test, namely: rq->balance_callback == NULL. ++ */ ++struct callback_head balance_push_callback = { ++ .next = NULL, ++ .func = (void (*)(struct callback_head *))balance_push, ++}; ++ ++static inline struct callback_head * ++__splice_balance_callbacks(struct rq *rq, bool split) ++{ ++ struct callback_head *head = rq->balance_callback; ++ ++ if (likely(!head)) ++ return NULL; ++ ++ lockdep_assert_rq_held(rq); ++ /* ++ * Must not take balance_push_callback off the list when ++ * splice_balance_callbacks() and balance_callbacks() are not ++ * in the same rq->lock section. ++ * ++ * In that case it would be possible for __schedule() to interleave ++ * and observe the list empty. ++ */ ++ if (split && head == &balance_push_callback) ++ head = NULL; ++ else ++ rq->balance_callback = NULL; ++ ++ return head; ++} ++ ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq) ++{ ++ return __splice_balance_callbacks(rq, true); ++} ++ ++static void __balance_callbacks(struct rq *rq) ++{ ++ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); ++} ++ ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++ unsigned long flags; ++ ++ if (unlikely(head)) { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ do_balance_callbacks(rq, head); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++} ++ ++#else ++ ++static inline void __balance_callbacks(struct rq *rq) ++{ ++} ++ ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq) ++{ ++ return NULL; ++} ++ ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++} ++ ++#endif ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ __balance_callbacks(rq); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/* ++ * NOP if the arch has not defined these: ++ */ ++ ++#ifndef prepare_arch_switch ++# define 
prepare_arch_switch(next) do { } while (0) ++#endif ++ ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++static inline void kmap_local_sched_out(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_out(); ++#endif ++} ++ ++static inline void kmap_local_sched_in(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_in(); ++#endif ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ kmap_local_sched_out(); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ unsigned int prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. 
++ */ ++ prev_state = READ_ONCE(prev->__state); ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ tick_nohz_task_switch(); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ /* ++ * kmap_local_sched_out() is invoked with rq::lock held and ++ * interrupts disabled. There is no requirement for that, but the ++ * sched out code does not have an interrupt enabled section. ++ * Restoring the maps on sched in does not require interrupts being ++ * disabled either. ++ */ ++ kmap_local_sched_in(); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop_sched(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). 
*/ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned int nr_running(void) ++{ ++ unsigned int i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned int nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how it's mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned int nr_iowait(void) ++{ ++ unsigned int i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * sched_exec - execve() is a valuable balancing opportunity, because at ++ * this point the task has the smallest effective memory and cache ++ * footprint. 
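The IO-wait discussion above can be made concrete with a toy model: two tasks go to sleep on I/O while last running on CPU 0, both CPUs then sit idle, yet only CPU 0's idle time is charged as iowait. The array and loop below are purely illustrative, not the kernel's accounting code:

#include <stdio.h>

int main(void)
{
        /* Two idle CPUs; two tasks blocked on I/O while last running on CPU 0. */
        int nr_iowait_cpu[2] = { 2, 0 };
        int cpus_charged = 0, i;

        for (i = 0; i < 2; i++)
                if (nr_iowait_cpu[i])
                        cpus_charged++;         /* this CPU's idle time becomes iowait */

        /* One CPU-worth of iowait is reported, although faster storage could
         * have kept both CPUs busy: the lower bound the comment describes. */
        printf("CPUs charged as iowait: %d of 2\n", cpus_charged);
        return 0;
}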
++ */ ++void sched_exec(void) ++{ ++} ++ ++#endif ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ cgroup_account_cputime(p, ns); ++ account_group_exec_runtime(p, ns); ++ ++ p->time_slice -= ns; ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ update_curr(rq, p); ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks have less than RESCHED_NS of time slice left they will be ++ * rescheduled. 
++ */ ++ if (p->time_slice >= RESCHED_NS) ++ return; ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++static u64 cpu_resched_latency(struct rq *rq) ++{ ++ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); ++ u64 resched_latency, now = rq_clock(rq); ++ static bool warned_once; ++ ++ if (sysctl_resched_latency_warn_once && warned_once) ++ return 0; ++ ++ if (!need_resched() || !latency_warn_ms) ++ return 0; ++ ++ if (system_state == SYSTEM_BOOTING) ++ return 0; ++ ++ if (!rq->last_seen_need_resched_ns) { ++ rq->last_seen_need_resched_ns = now; ++ rq->ticks_without_resched = 0; ++ return 0; ++ } ++ ++ rq->ticks_without_resched++; ++ resched_latency = now - rq->last_seen_need_resched_ns; ++ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) ++ return 0; ++ ++ warned_once = true; ++ ++ return resched_latency; ++} ++ ++static int __init setup_resched_latency_warn_ms(char *str) ++{ ++ long val; ++ ++ if ((kstrtol(str, 0, &val))) { ++ pr_warn("Unable to set resched_latency_warn_ms\n"); ++ return 1; ++ } ++ ++ sysctl_resched_latency_warn_ms = val; ++ return 1; ++} ++__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); ++#else ++static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ u64 resched_latency; ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ scheduler_task_tick(rq); ++ if (sched_feat(LATENCY_WARN)) ++ resched_latency = cpu_resched_latency(rq); ++ calc_global_load_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ if (sched_feat(LATENCY_WARN) && resched_latency) ++ resched_latency_warn(cpu, resched_latency); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static inline int sg_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ cpumask_t tmp; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* _something_ may have changed the task, double check again */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask) && ++ !is_migration_disabled(p)) { ++ int cpu = cpu_of(rq); ++ int dcpu = __best_mask_cpu(&tmp, per_cpu(sched_cpu_llc_mask, cpu)); ++ rq = move_queued_task(rq, p, dcpu); ++ } ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* sg_balance_trigger - trigger slibing group balance for @cpu */ ++static inline int sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq= cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ int res; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return 0; ++ curr = rq->curr; ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ ++ !is_migration_disabled(curr) && (!rq->active_balance); ++ ++ if (res) ++ rq->active_balance = 1; ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (res) ++ stop_one_cpu_nowait(cpu, sg_balance_cpu_stop, curr, ++ &rq->active_balance_work); ++ return res; ++} ++ ++/* ++ * sg_balance - slibing 
group balance check for run queue @rq ++ */ ++static inline void sg_balance(struct rq *rq) ++{ ++ cpumask_t chk; ++ int cpu = cpu_of(rq); ++ ++ /* exit when cpu is offline */ ++ if (unlikely(!rq->online)) ++ return; ++ ++ /* ++ * Only cpu in slibing idle group will do the checking and then ++ * find potential cpus which can migrate the current running task ++ */ ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && ++ cpumask_andnot(&chk, cpu_online_mask, sched_rq_watermark) && ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_mask)) { ++ int i; ++ ++ for_each_cpu_wrap(i, &chk, cpu) { ++ if (cpumask_subset(cpu_smt_mask(i), &chk) && ++ sg_balance_trigger(i)) ++ return; ++ } ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ scheduler_task_tick(rq); ++ ++ calc_load_nohz_remote(rq); ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
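The hotplug handshake for the remote tick rides entirely on that tri-state atomic: the work item re-queues itself only while the state still reads RUNNING, and a single decrement retires an OFFLINING CPU back to OFFLINE. A compact model of the transition, using a plain-C fetch_add_unless stand-in rather than the kernel's atomic_fetch_add_unless():

#include <stdio.h>

enum { OFFLINE, OFFLINING, RUNNING };           /* mirrors TICK_SCHED_REMOTE_* */

/* Stand-in: add 'a' unless the value equals 'u'; return the old value. */
static int fetch_add_unless(int *v, int a, int u)
{
        int old = *v;

        if (old != u)
                *v += a;
        return old;
}

int main(void)
{
        int state = RUNNING;

        /* Tick side: requeue the work only while the CPU is still RUNNING. */
        printf("requeue: %d\n", fetch_add_unless(&state, -1, RUNNING) == RUNNING);

        state = OFFLINING;                      /* the stop path, per the diagram above */
        fetch_add_unless(&state, -1, RUNNING);  /* next tick retires the state...       */
        printf("offline now: %d\n", state == OFFLINE);  /* ...and does not requeue      */
        return 0;
}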
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_TYPE_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_TYPE_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? 
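preempt_latency_start() and preempt_latency_stop() above fire only on the outermost transition: the increment passed in equals the counter value only when the count has just left zero, or is just about to return to it. A small user-space analogue of that nesting rule; all names here are invented for illustration:

#include <stdio.h>

static int count;                                  /* stand-in for preempt_count() */

static void latency_start(int val) { if (count == val) printf("start timing\n"); }
static void latency_stop(int val)  { if (count == val) printf("stop timing\n"); }

static void toy_preempt_disable(void)
{
        count += 1;                                /* __preempt_count_add()  */
        latency_start(1);                          /* fires only when 0 -> 1 */
}

static void toy_preempt_enable(void)
{
        latency_stop(1);                           /* fires only when 1 -> 0 */
        count -= 1;                                /* __preempt_count_sub()  */
}

int main(void)
{
        toy_preempt_disable();                     /* outermost: starts timing */
        toy_preempt_disable();                     /* nested: silent           */
        toy_preempt_enable();                      /* nested: silent           */
        toy_preempt_enable();                      /* outermost: stops timing  */
        return 0;
}

Only the first disable and the last enable print, which is exactly when the latency tracer should start and stop timing.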
++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++ ++ if (task_scs_end_corrupted(prev)) ++ panic("corrupted shadow stack detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ SCHED_WARN_ON(ct_state() == CONTEXT_USER); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * Compile time debug macro ++ * #define ALT_SCHED_DEBUG ++ */ ++ ++#ifdef ALT_SCHED_DEBUG ++void alt_sched_debug(void) ++{ ++ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", ++ sched_rq_pending_mask.bits[0], ++ sched_rq_watermark[0].bits[0], ++ sched_sg_idle_mask.bits[0]); ++} ++#else ++inline void alt_sched_debug(void) {} ++#endif ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32U) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) ++{ ++ struct task_struct *p, *skip = rq->curr; ++ int nr_migrated = 0; ++ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); ++ ++ while (skip != rq->idle && nr_tries && ++ (p = sched_rq_next_task(skip, rq)) != rq->idle) { ++ skip = sched_rq_next_task(p, rq); ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { ++ __SCHED_DEQUEUE_TASK(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ sched_task_sanity_check(p, dest_rq); ++ 
__SCHED_ENQUEUE_TASK(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int take_other_rq_tasks(struct rq *rq, int cpu) ++{ ++ struct cpumask *topo_mask, *end_mask; ++ ++ if (unlikely(!rq->online)) ++ return 0; ++ ++ if (cpumask_empty(&sched_rq_pending_mask)) ++ return 0; ++ ++ topo_mask = per_cpu(sched_cpu_topo_masks, cpu) + 1; ++ end_mask = per_cpu(sched_cpu_topo_end_mask, cpu); ++ do { ++ int i; ++ for_each_cpu_and(i, &sched_rq_pending_mask, topo_mask) { ++ int nr_migrated; ++ struct rq *src_rq; ++ ++ src_rq = cpu_rq(i); ++ if (!do_raw_spin_trylock(&src_rq->lock)) ++ continue; ++ spin_acquire(&src_rq->lock.dep_map, ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { ++ src_rq->nr_running -= nr_migrated; ++ if (src_rq->nr_running < 2) ++ cpumask_clear_cpu(i, &sched_rq_pending_mask); ++ ++ rq->nr_running += nr_migrated; ++ if (rq->nr_running > 1) ++ cpumask_set_cpu(cpu, &sched_rq_pending_mask); ++ ++ cpufreq_update_util(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ return 1; ++ } ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ } ++ } while (++topo_mask < end_mask); ++ ++ return 0; ++} ++#endif ++ ++/* ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. ++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (unlikely(rq->idle == p)) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) ++ time_slice_expired(p, rq); ++} ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ rq->skip = NULL; ++ schedstat_inc(rq->sched_goidle); ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = rq_runnable_task(rq); ++#endif ++ } ++ rq->skip = NULL; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ return next; ++ } ++ ++ next = sched_rq_first_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ schedstat_inc(rq->sched_goidle); ++ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = sched_rq_first_task(rq); ++#endif ++ } ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, ++ * next);*/ ++ return next; ++} ++ ++/* ++ * Constants for the sched_mode argument of __schedule(). ++ * ++ * The mode argument allows RT enabled kernels to differentiate a ++ * preemption from blocking on an 'sleeping' spin/rwlock. Note that ++ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to ++ * optimize the AND operation out and just check for zero. ++ */ ++#define SM_NONE 0x0 ++#define SM_PREEMPT 0x1 ++#define SM_RTLOCK_WAIT 0x2 ++ ++#ifndef CONFIG_PREEMPT_RT ++# define SM_MASK_PREEMPT (~0U) ++#else ++# define SM_MASK_PREEMPT SM_PREEMPT ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. 
TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(unsigned int sched_mode) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ unsigned long prev_state; ++ struct rq *rq; ++ int cpu; ++ int deactivated = 0; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, !!sched_mode); ++ ++ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(!!sched_mode); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(): ++ * ++ * __set_current_state(@state) signal_wake_up() ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) ++ * wake_up_state(p, state) ++ * LOCK rq->lock LOCK p->pi_state ++ * smp_mb__after_spinlock() smp_mb__after_spinlock() ++ * if (signal_pending_state()) if (p->state & @state) ++ * ++ * Also, the membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ /* ++ * We must load prev->state once (task_struct::state is volatile), such ++ * that we form a control dependency vs deactivate_task() below. ++ */ ++ prev_state = READ_ONCE(prev->__state); ++ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { ++ if (signal_pending_state(prev_state, prev)) { ++ WRITE_ONCE(prev->__state, TASK_RUNNING); ++ } else { ++ prev->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev->flags & PF_FROZEN); ++ ++ if (prev->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ /* ++ * __schedule() ttwu() ++ * prev_state = prev->state; if (p->on_rq && ...) ++ * if (prev_state) goto out; ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); ++ * p->state = TASK_WAKING ++ * ++ * Where __schedule() and ttwu() have matching control dependencies. ++ * ++ * After this, schedule() must not care about p->state any more. 
++ */ ++ sched_task_deactivate(prev, rq); ++ deactivate_task(prev, rq); ++ deactivated = 1; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++#ifdef CONFIG_SCHED_DEBUG ++ rq->last_seen_need_resched_ns = 0; ++#endif ++ ++ if (likely(prev != next)) { ++ if (deactivated) ++ update_sched_rq_watermark(rq); ++ next->last_ran = rq->clock_task; ++ rq->last_ts_switch = rq->clock; ++ ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. ++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++ } else { ++ __balance_callbacks(rq); ++ raw_spin_unlock_irq(&rq->lock); ++ } ++ ++#ifdef CONFIG_SCHED_SMT ++ sg_balance(rq); ++#endif ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ ++ __schedule(SM_NONE); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ unsigned int task_flags; ++ ++ if (task_is_running(tsk)) ++ return; ++ ++ task_flags = tsk->flags; ++ /* ++ * If a worker goes to sleep, notify and ask workqueue whether it ++ * wants to wake up a task to maintain concurrency. ++ */ ++ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (task_flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ blk_flush_plug(tsk->plug, true); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(SM_NONE); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. 
++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->__state); ++ do { ++ __schedule(SM_NONE); ++ } while (need_resched()); ++} ++ ++#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++void __sched notrace schedule_rtlock(void) ++{ ++ do { ++ preempt_disable(); ++ __schedule(SM_RTLOCK_WAIT); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++} ++NOKPROBE_SYMBOL(schedule_rtlock); ++#endif ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(SM_PREEMPT); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. 
++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++#ifdef CONFIG_PREEMPT_DYNAMIC ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) ++#ifndef preempt_schedule_dynamic_enabled ++#define preempt_schedule_dynamic_enabled preempt_schedule ++#define preempt_schedule_dynamic_disabled NULL ++#endif ++DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); ++EXPORT_STATIC_CALL_TRAMP(preempt_schedule); ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) ++static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); ++void __sched notrace dynamic_preempt_schedule(void) ++{ ++ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) ++ return; ++ preempt_schedule(); ++} ++NOKPROBE_SYMBOL(dynamic_preempt_schedule); ++EXPORT_SYMBOL(dynamic_preempt_schedule); ++#endif ++#endif ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. 
++ */ ++ prev_ctx = exception_enter(); ++ __schedule(SM_PREEMPT); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#ifdef CONFIG_PREEMPT_DYNAMIC ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) ++#ifndef preempt_schedule_notrace_dynamic_enabled ++#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace ++#define preempt_schedule_notrace_dynamic_disabled NULL ++#endif ++DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); ++EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) ++static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); ++void __sched notrace dynamic_preempt_schedule_notrace(void) ++{ ++ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) ++ return; ++ preempt_schedule_notrace(); ++} ++NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); ++EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); ++#endif ++#endif ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(SM_PREEMPT); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct task_struct *p, struct rq *rq) ++{ ++ int idx; ++ ++ /* Trigger resched if task sched_prio has been modified. */ ++ if (task_on_rq_queued(p) && (idx = task_sched_prio_idx(p, rq)) != p->sq_idx) { ++ requeue_task(p, rq, idx); ++ check_preempt_curr(rq); ++ } ++} ++ ++static void __setscheduler_prio(struct task_struct *p, int prio) ++{ ++ p->prio = prio; ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. 
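rt_mutex_setprio() below applies the rule from __rt_effective_prio() above: a lock owner runs at the numerically smaller (more urgent) of its own priority and its top donor's priority, and falls back to its own priority once the donor goes away. A stand-alone illustration with sample values (a nice-0 task sits at kernel prio 120; the donor value 5 is arbitrary):

#include <stdio.h>

/* Kernel priorities are inverted: numerically lower means more urgent. */
static int effective_prio(int own_prio, const int *donor_prio)
{
        if (donor_prio && *donor_prio < own_prio)  /* the min() from __rt_effective_prio() */
                return *donor_prio;
        return own_prio;
}

int main(void)
{
        int rt_waiter_prio = 5;                    /* arbitrary RT donor */

        printf("boosted:   %d\n", effective_prio(120, &rt_waiter_prio)); /* nice-0 owner -> 5 */
        printf("deboosted: %d\n", effective_prio(120, NULL));            /* no donor -> 120   */
        return 0;
}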
++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guarantees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ ++ __setscheduler_prio(p, prio); ++ ++ check_task_changed(p, rq); ++out_unlock: ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ ++ __balance_callbacks(rq); ++ __task_access_unlock(p, lock); ++ ++ preempt_enable(); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it won't have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ ++ check_task_changed(p, rq); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. 
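can_nice() above compares against RLIMIT_NICE on the inverted "rlimit style" scale its comment describes, where nice 19 maps to 1 and nice -20 maps to 40. The helper below infers that conversion from those endpoints (the formula 20 - nice is an assumption consistent with them, not copied from the kernel) and replays the permission check in user space:

#include <stdio.h>

/* Assumed from the bracketed ranges in the comment: nice 19 -> 1, nice -20 -> 40. */
static int nice_to_rlimit(int nice) { return 20 - nice; }

static int can_nice(int nice, int rlimit_nice, int has_cap_sys_nice)
{
        return nice_to_rlimit(nice) <= rlimit_nice || has_cap_sys_nice;
}

int main(void)
{
        printf("nice -1 with RLIMIT_NICE=21: %d\n", can_nice(-1, 21, 0));  /* allowed */
        printf("nice -2 with RLIMIT_NICE=21: %d\n", can_nice(-2, 21, 0));  /* denied  */
        return 0;
}

With RLIMIT_NICE set to 21, a task may lower its nice value to -1 on its own, but -2 already requires CAP_SYS_NICE.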
++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * ++ * sched policy return value kernel prio user prio/nice ++ * ++ * (BMQ)normal, batch, idle[0 ... 53] [100 ... 139] 0/[-20 ... 19]/[-7 ... 7] ++ * (PDS)normal, batch, idle[0 ... 39] 100 0/[-20 ... 19] ++ * fifo, rr [-1 ... -100] [99 ... 0] [0 ... 99] ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO) ? p->prio - MAX_RT_PRIO : ++ task_sched_prio_normal(p, task_rq(p)); ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->curr != rq->idle) ++ return 0; ++ ++ if (rq->nr_running) ++ return 0; ++ ++#ifdef CONFIG_SMP ++ if (rq->ttwu_pending) ++ return 0; ++#endif ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. 
++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int oldpolicy = -1, policy = attr->sched_policy; ++ int retval, newprio; ++ struct callback_head *head; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. 
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ newprio = __normal_prio(policy, attr->sched_priority, NICE_TO_PRIO(attr->sched_nice)); ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ newprio = rt_effective_prio(p, newprio); ++ } ++ ++ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { ++ __setscheduler_params(p, attr); ++ __setscheduler_prio(p, newprio); ++ } ++ ++ check_task_changed(p, rq); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ head = splice_balance_callbacks(rq); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ /* Run balance callbacks after we've adjusted the PI chain: */ ++ balance_callbacks(rq, head); ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Use sched_set_fifo(), read its comment. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
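Not part of the patch: a minimal userspace sketch of the sched_setscheduler(2) interface documented above, assuming the caller holds CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO; otherwise the permission checks in __sched_setscheduler() return EPERM.

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* A non-zero priority is only valid together with SCHED_FIFO/SCHED_RR. */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy: %d (SCHED_FIFO is %d)\n", sched_getscheduler(0), SCHED_FIFO);
        return 0;
}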
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr_nocheck); ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++ ++/* ++ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally ++ * incapable of resource management, which is the one thing an OS really should ++ * be doing. ++ * ++ * This is of course the reason it is limited to privileged users only. ++ * ++ * Worse still; it is fundamentally impossible to compose static priority ++ * workloads. You cannot take two correctly working static prio workloads ++ * and smash them together and still expect them to work. ++ * ++ * For this reason 'all' FIFO tasks the kernel creates are basically at: ++ * ++ * MAX_RT_PRIO / 2 ++ * ++ * The administrator _MUST_ configure the system, the kernel simply doesn't ++ * know enough information to make a sensible choice. ++ */ ++void sched_set_fifo(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo); ++ ++/* ++ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. ++ */ ++void sched_set_fifo_low(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = 1 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo_low); ++ ++void sched_set_normal(struct task_struct *p, int nice) ++{ ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ .sched_nice = nice, ++ }; ++ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_normal); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
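Not part of the patch: a hedged in-kernel sketch of how a driver might use the sched_set_fifo() helper above instead of inventing its own RT priority. The thread function and names are hypothetical.

#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_thread_fn(void *data)
{
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);
        return 0;
}

static struct task_struct *demo_start_rt_worker(void)
{
        struct task_struct *tsk = kthread_run(demo_thread_fn, NULL, "demo_rt");

        if (!IS_ERR(tsk))
                sched_set_fifo(tsk);    /* ends up at MAX_RT_PRIO / 2, see above */
        return tsk;
}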
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. 
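Not part of the patch: a userspace sketch for the sys_sched_setattr() entry point above. glibc versions contemporary with this kernel ship no wrapper, so the raw syscall is used and struct sched_attr is declared locally with the SCHED_ATTR_SIZE_VER0 layout.

#define _GNU_SOURCE
#include <sched.h>              /* SCHED_BATCH */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr {             /* first published UAPI layout, 48 bytes */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct sched_attr attr = {
                .size         = sizeof(attr),
                .sched_policy = SCHED_BATCH,
                .sched_nice   = 10,
        };

        /* The flags argument must be 0, as checked in sys_sched_setattr(). */
        if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
                perror("sched_setattr");
                return 1;
        }
        return 0;
}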
++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. 
++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ kattr.sched_flags &= SCHED_FLAG_ALL; ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++static int ++__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) ++{ ++ int retval; ++ cpumask_var_t cpus_allowed, new_mask; ++ ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); ++ if (retval) ++ goto out_free_new_mask; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ ++out_free_new_mask: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ retval = -EPERM; ++ goto out_put_task; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_put_task; ++ ++ retval = __sched_setaffinity(p, in_mask); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
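Not part of the patch: a userspace sketch of the affinity interface handled above. It pins the calling thread to CPU 0 and then reads the effective mask back, which reflects any cpuset restriction applied in __sched_setaffinity().

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);
        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) == 0)
                printf("allowed CPUs: %d\n", CPU_COUNT(&set));
        return 0;
}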
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) ++ do_sched_yield_type_1(current, rq); ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ preempt_disable(); ++ raw_spin_unlock_irq(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. If there are no ++ * other threads running on this CPU then this function will return. ++ * ++ * Return: 0. ++ */ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) ++int __sched __cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ /* ++ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick ++ * whether the current CPU is in an RCU read-side critical section, ++ * so the tick can report quiescent states even for CPUs looping ++ * in kernel context. In contrast, in non-preemptible kernels, ++ * RCU readers leave no in-memory hints, which means that CPU-bound ++ * processes executing in kernel context might never report an ++ * RCU quiescent state. Therefore, the following code causes ++ * cond_resched() to report a quiescent state, but only when RCU ++ * is in urgent need of one. 
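Not part of the patch: a hypothetical kernel-side loop of the kind __cond_resched() exists for, a long CPU-bound walk in process context that offers scheduling (and, without CONFIG_PREEMPT_RCU, RCU quiescent-state) opportunities.

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/types.h>

static u64 demo_checksum(const u8 *buf, size_t len)
{
        u64 sum = 0;
        size_t i;

        for (i = 0; i < len; i++) {
                sum += buf[i];
                if ((i & 0xffff) == 0)
                        cond_resched(); /* may end up in __cond_resched() above */
        }
        return sum;
}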
++ */ ++#ifndef CONFIG_PREEMPT_RCU ++ rcu_all_qs(); ++#endif ++ return 0; ++} ++EXPORT_SYMBOL(__cond_resched); ++#endif ++ ++#ifdef CONFIG_PREEMPT_DYNAMIC ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) ++#define cond_resched_dynamic_enabled __cond_resched ++#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) ++DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); ++EXPORT_STATIC_CALL_TRAMP(cond_resched); ++ ++#define might_resched_dynamic_enabled __cond_resched ++#define might_resched_dynamic_disabled ((void *)&__static_call_return0) ++DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); ++EXPORT_STATIC_CALL_TRAMP(might_resched); ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) ++static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); ++int __sched dynamic_cond_resched(void) ++{ ++ if (!static_branch_unlikely(&sk_dynamic_cond_resched)) ++ return 0; ++ return __cond_resched(); ++} ++EXPORT_SYMBOL(dynamic_cond_resched); ++ ++static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); ++int __sched dynamic_might_resched(void) ++{ ++ if (!static_branch_unlikely(&sk_dynamic_might_resched)) ++ return 0; ++ return __cond_resched(); ++} ++EXPORT_SYMBOL(dynamic_might_resched); ++#endif ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (!_cond_resched()) ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++int __cond_resched_rwlock_read(rwlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held_read(lock); ++ ++ if (rwlock_needbreak(lock) || resched) { ++ read_unlock(lock); ++ if (!_cond_resched()) ++ cpu_relax(); ++ ret = 1; ++ read_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_rwlock_read); ++ ++int __cond_resched_rwlock_write(rwlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held_write(lock); ++ ++ if (rwlock_needbreak(lock) || resched) { ++ write_unlock(lock); ++ if (!_cond_resched()) ++ cpu_relax(); ++ ret = 1; ++ write_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_rwlock_write); ++ ++#ifdef CONFIG_PREEMPT_DYNAMIC ++ ++#ifdef CONFIG_GENERIC_ENTRY ++#include ++#endif ++ ++/* ++ * SC:cond_resched ++ * SC:might_resched ++ * SC:preempt_schedule ++ * SC:preempt_schedule_notrace ++ * SC:irqentry_exit_cond_resched ++ * ++ * ++ * NONE: ++ * cond_resched <- __cond_resched ++ * might_resched <- RET0 ++ * preempt_schedule <- NOP ++ * preempt_schedule_notrace <- NOP ++ * irqentry_exit_cond_resched <- NOP ++ * ++ * VOLUNTARY: ++ * cond_resched <- __cond_resched ++ * might_resched <- __cond_resched ++ * preempt_schedule <- NOP ++ * preempt_schedule_notrace <- NOP ++ * irqentry_exit_cond_resched <- NOP ++ * ++ * FULL: ++ * cond_resched <- RET0 ++ * might_resched <- RET0 ++ * preempt_schedule <- preempt_schedule ++ * preempt_schedule_notrace <- preempt_schedule_notrace ++ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched ++ */ ++ ++enum { ++ 
preempt_dynamic_undefined = -1, ++ preempt_dynamic_none, ++ preempt_dynamic_voluntary, ++ preempt_dynamic_full, ++}; ++ ++int preempt_dynamic_mode = preempt_dynamic_undefined; ++ ++int sched_dynamic_mode(const char *str) ++{ ++ if (!strcmp(str, "none")) ++ return preempt_dynamic_none; ++ ++ if (!strcmp(str, "voluntary")) ++ return preempt_dynamic_voluntary; ++ ++ if (!strcmp(str, "full")) ++ return preempt_dynamic_full; ++ ++ return -EINVAL; ++} ++ ++#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) ++#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) ++#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) ++#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) ++#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) ++#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) ++#else ++#error "Unsupported PREEMPT_DYNAMIC mechanism" ++#endif ++ ++void sched_dynamic_update(int mode) ++{ ++ /* ++ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in ++ * the ZERO state, which is invalid. ++ */ ++ preempt_dynamic_enable(cond_resched); ++ preempt_dynamic_enable(might_resched); ++ preempt_dynamic_enable(preempt_schedule); ++ preempt_dynamic_enable(preempt_schedule_notrace); ++ preempt_dynamic_enable(irqentry_exit_cond_resched); ++ ++ switch (mode) { ++ case preempt_dynamic_none: ++ preempt_dynamic_enable(cond_resched); ++ preempt_dynamic_disable(might_resched); ++ preempt_dynamic_disable(preempt_schedule); ++ preempt_dynamic_disable(preempt_schedule_notrace); ++ preempt_dynamic_disable(irqentry_exit_cond_resched); ++ pr_info("Dynamic Preempt: none\n"); ++ break; ++ ++ case preempt_dynamic_voluntary: ++ preempt_dynamic_enable(cond_resched); ++ preempt_dynamic_enable(might_resched); ++ preempt_dynamic_disable(preempt_schedule); ++ preempt_dynamic_disable(preempt_schedule_notrace); ++ preempt_dynamic_disable(irqentry_exit_cond_resched); ++ pr_info("Dynamic Preempt: voluntary\n"); ++ break; ++ ++ case preempt_dynamic_full: ++ preempt_dynamic_disable(cond_resched); ++ preempt_dynamic_disable(might_resched); ++ preempt_dynamic_enable(preempt_schedule); ++ preempt_dynamic_enable(preempt_schedule_notrace); ++ preempt_dynamic_enable(irqentry_exit_cond_resched); ++ pr_info("Dynamic Preempt: full\n"); ++ break; ++ } ++ ++ preempt_dynamic_mode = mode; ++} ++ ++static int __init setup_preempt_mode(char *str) ++{ ++ int mode = sched_dynamic_mode(str); ++ if (mode < 0) { ++ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); ++ return 0; ++ } ++ ++ sched_dynamic_update(mode); ++ return 1; ++} ++__setup("preempt=", setup_preempt_mode); ++ ++static void __init preempt_dynamic_init(void) ++{ ++ if (preempt_dynamic_mode == preempt_dynamic_undefined) { ++ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { ++ sched_dynamic_update(preempt_dynamic_none); ++ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { ++ sched_dynamic_update(preempt_dynamic_voluntary); ++ } else { ++ /* Default static call setting, nothing to do */ ++ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); ++ preempt_dynamic_mode = preempt_dynamic_full; ++ pr_info("Dynamic Preempt: full\n"); ++ } ++ } ++} ++ ++#define PREEMPT_MODEL_ACCESSOR(mode) \ ++ bool preempt_model_##mode(void) \ ++ { \ ++ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ ++ return preempt_dynamic_mode == preempt_dynamic_##mode; \ ++ } \ ++ EXPORT_SYMBOL_GPL(preempt_model_##mode) ++ ++PREEMPT_MODEL_ACCESSOR(none); ++PREEMPT_MODEL_ACCESSOR(voluntary); ++PREEMPT_MODEL_ACCESSOR(full); ++ 
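Not part of the patch: a hypothetical in-kernel consumer of the preempt_model_*() accessors generated above. The model itself is chosen at boot with the preempt=none|voluntary|full parameter parsed by setup_preempt_mode(); the function name and pr_info strings here are made up.

#include <linux/printk.h>
#include <linux/sched.h>

static void demo_report_preempt_model(void)
{
        if (preempt_model_full())
                pr_info("demo: full dynamic preemption\n");
        else if (preempt_model_voluntary())
                pr_info("demo: voluntary preemption points only\n");
        else if (preempt_model_none())
                pr_info("demo: no forced preemption\n");
}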
++#else /* !CONFIG_PREEMPT_DYNAMIC */ ++ ++static inline void preempt_dynamic_init(void) { } ++ ++#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, it's already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In Alt schedule FW, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_flush_plug(current->plug, true); ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_RT_PRIO - 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. 
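Not part of the patch: a userspace sketch querying the per-policy priority ranges reported by sched_get_priority_max() above and sched_get_priority_min() below, i.e. 1..MAX_RT_PRIO-1 for the RT policies and 0 for everything else.

#include <sched.h>
#include <stdio.h>

int main(void)
{
        printf("SCHED_FIFO:  %d..%d\n",
               sched_get_priority_min(SCHED_FIFO),
               sched_get_priority_max(SCHED_FIFO));
        printf("SCHED_OTHER: %d..%d\n",
               sched_get_priority_min(SCHED_OTHER),
               sched_get_priority_max(SCHED_OTHER));
        return 0;
}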
++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ alt_sched_debug(); ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(sched_timeslice_ns); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); ++ ++ if (task_is_running(p)) ++ pr_cont(" running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", ++ free, task_pid_nr(p), ppid, ++ read_task_thread_flags(p)); ++ ++ print_worker_info(KERN_INFO, p); ++ print_stop_info(KERN_INFO, p); ++ show_stack(p, NULL, KERN_INFO); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ unsigned int state = READ_ONCE(p->__state); ++ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned int state_filter) ++{ ++ struct task_struct *g, *p; ++ ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
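Not part of the patch: a userspace sketch for the sched_rr_get_interval() path above. Under this scheduler the reported slice is the global sched_timeslice_ns for any task rather than a per-task round-robin quantum.

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("time slice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}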
++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: Alt schedule FW should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: CPU the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void __init init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ __sched_fork(0, idle); ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->__state = TASK_RUNNING; ++ /* ++ * PF_KTHREAD should already be set at this point; regardless, make it ++ * look like a proper per-CPU kthread. ++ */ ++ idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; ++ kthread_set_per_cpu(idle, cpu); ++ ++ sched_queue_init_idle(&rq->queue, idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. 
++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++ ++static int __balance_push_cpu_stop(void *arg) ++{ ++ struct task_struct *p = arg; ++ struct rq *rq = this_rq(); ++ struct rq_flags rf; ++ int cpu; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ rq_lock(rq, &rf); ++ ++ update_rq_clock(rq); ++ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) { ++ cpu = select_fallback_rq(rq->cpu, p); ++ rq = __migrate_task(rq, p, cpu); ++ } ++ ++ rq_unlock(rq, &rf); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ put_task_struct(p); ++ ++ return 0; ++} ++ ++static DEFINE_PER_CPU(struct cpu_stop_work, push_work); ++ ++/* ++ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only ++ * effective when the hotplug motion is down. ++ */ ++static void balance_push(struct rq *rq) ++{ ++ struct task_struct *push_task = rq->curr; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ /* ++ * Ensure the thing is persistent until balance_push_set(.on = false); ++ */ ++ rq->balance_callback = &balance_push_callback; ++ ++ /* ++ * Only active while going offline and when invoked on the outgoing ++ * CPU. ++ */ ++ if (!cpu_dying(rq->cpu) || rq != this_rq()) ++ return; ++ ++ /* ++ * Both the cpu-hotplug and stop task are in this case and are ++ * required to complete the hotplug process. ++ */ ++ if (kthread_is_per_cpu(push_task) || ++ is_migration_disabled(push_task)) { ++ ++ /* ++ * If this is the idle task on the outgoing CPU try to wake ++ * up the hotplug control thread which might wait for the ++ * last task to vanish. The rcuwait_active() check is ++ * accurate here because the waiter is pinned on this CPU ++ * and can't obviously be running in parallel. ++ * ++ * On RT kernels this also has to check whether there are ++ * pinned and scheduled out tasks on the runqueue. They ++ * need to leave the migrate disabled section first. ++ */ ++ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && ++ rcuwait_active(&rq->hotplug_wait)) { ++ raw_spin_unlock(&rq->lock); ++ rcuwait_wake_up(&rq->hotplug_wait); ++ raw_spin_lock(&rq->lock); ++ } ++ return; ++ } ++ ++ get_task_struct(push_task); ++ /* ++ * Temporarily drop rq->lock such that we can wake-up the stop task. ++ * Both preemption and IRQs are still disabled. ++ */ ++ raw_spin_unlock(&rq->lock); ++ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, ++ this_cpu_ptr(&push_work)); ++ /* ++ * At this point need_resched() is true and we'll take the loop in ++ * schedule(). The next pick is obviously going to be the stop task ++ * which kthread_is_per_cpu() and will push this task away. ++ */ ++ raw_spin_lock(&rq->lock); ++} ++ ++static void balance_push_set(int cpu, bool on) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (on) { ++ WARN_ON_ONCE(rq->balance_callback); ++ rq->balance_callback = &balance_push_callback; ++ } else if (rq->balance_callback == &balance_push_callback) { ++ rq->balance_callback = NULL; ++ } ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++/* ++ * Invoked from a CPUs hotplug control thread after the CPU has been marked ++ * inactive. All tasks which are not per CPU kernel threads are either ++ * pushed off this CPU now via balance_push() or placed on a different CPU ++ * during wakeup. Wait until the CPU is quiescent. 
++ */ ++static void balance_hotplug_wait(void) ++{ ++ struct rq *rq = this_rq(); ++ ++ rcuwait_wait_event(&rq->hotplug_wait, ++ rq->nr_running == 1 && !rq_has_pinned_tasks(rq), ++ TASK_UNINTERRUPTIBLE); ++} ++ ++#else ++ ++static void balance_push(struct rq *rq) ++{ ++} ++ ++static void balance_push_set(int cpu, bool on) ++{ ++} ++ ++static inline void balance_hotplug_wait(void) ++{ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* ++ * Clear the balance_push callback and prepare to schedule ++ * regular tasks. ++ */ ++ balance_push_set(cpu, false); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ ++ /* ++ * From this point forward, this CPU will refuse to run any task that ++ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively ++ * push those tasks away until this gets cleared, see ++ * sched_cpu_dying(). ++ */ ++ balance_push_set(cpu, true); ++ ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Specifically, we rely on ttwu to no longer target this CPU, see ++ * ttwu_queue_cond() and is_cpu_allowed(). 
++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ update_rq_clock(rq); ++ set_rq_offline(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ balance_push_set(cpu, false); ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ ++/* ++ * Invoked immediately before the stopper thread is invoked to bring the ++ * CPU down completely. At this point all per CPU kthreads except the ++ * hotplug thread (current) and the stopper thread (inactive) have been ++ * either parked or have been unbound from the outgoing CPU. Ensure that ++ * any of those which might be on the way out are gone. ++ * ++ * If after this point a bound task is being woken on this CPU then the ++ * responsible hotplug callback has failed to do it's job. ++ * sched_cpu_dying() will catch it with the appropriate fireworks. ++ */ ++int sched_cpu_wait_empty(unsigned int cpu) ++{ ++ balance_hotplug_wait(); ++ return 0; ++} ++ ++/* ++ * Since this CPU is going 'away' for a while, fold any nr_active delta we ++ * might have. Called from the CPU stopper task after ensuring that the ++ * stopper is the last running task on the CPU, so nr_active count is ++ * stable. We need to take the teardown thread which is calling this into ++ * account, so we hand in adjust = 1 to the load calculation. ++ * ++ * Also see the comment "Global load-average calculations". 
++ */ ++static void calc_load_migrate(struct rq *rq) ++{ ++ long delta = calc_load_fold_active(rq, 1); ++ ++ if (delta) ++ atomic_long_add(delta, &calc_load_tasks); ++} ++ ++static void dump_rq_tasks(struct rq *rq, const char *loglvl) ++{ ++ struct task_struct *g, *p; ++ int cpu = cpu_of(rq); ++ ++ lockdep_assert_held(&rq->lock); ++ ++ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); ++ for_each_process_thread(g, p) { ++ if (task_cpu(p) != cpu) ++ continue; ++ ++ if (!task_on_rq_queued(p)) ++ continue; ++ ++ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); ++ } ++} ++ ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_tick_stop(cpu); ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { ++ WARN(true, "Dying CPU not properly vacated!"); ++ dump_rq_tasks(rq, KERN_WARNING); ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ calc_load_migrate(rq); ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ /* init topo masks */ ++ tmp = per_cpu(sched_cpu_topo_masks, cpu); ++ ++ cpumask_copy(tmp, cpumask_of(cpu)); ++ tmp++; ++ cpumask_copy(tmp, cpu_possible_mask); ++ per_cpu(sched_cpu_llc_mask, cpu) = tmp; ++ per_cpu(sched_cpu_topo_end_mask, cpu) = ++tmp; ++ /*per_cpu(sd_llc_id, cpu) = cpu;*/ ++ } ++} ++ ++#define TOPOLOGY_CPUMASK(name, mask, last)\ ++ if (cpumask_and(topo, topo, mask)) { \ ++ cpumask_copy(topo, mask); \ ++ printk(KERN_INFO "sched: cpu#%02d topo: 0x%08lx - "#name, \ ++ cpu, (topo++)->bits[0]); \ ++ } \ ++ if (!last) \ ++ cpumask_complement(topo, mask) ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *topo; ++ ++ for_each_online_cpu(cpu) { ++ /* take chance to reset time slice for idle tasks */ ++ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; ++ ++ topo = per_cpu(sched_cpu_topo_masks, cpu) + 1; ++ ++ cpumask_complement(topo, cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); ++#endif ++ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); ++ per_cpu(sched_cpu_llc_mask, cpu) = topo; ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); ++ ++ per_cpu(sched_cpu_topo_end_mask, cpu) = topo; ++ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", ++ cpu, per_cpu(sd_llc_id, cpu), ++ (int) (per_cpu(sched_cpu_llc_mask, cpu) - ++ per_cpu(sched_cpu_topo_masks, cpu))); ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) ++ BUG(); ++ current->flags &= ~PF_NO_SETAFFINITY; ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct 
cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++#ifdef CONFIG_FAIR_GROUP_SCHED ++ unsigned long shares; ++#endif ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ printk(KERN_INFO ALT_SCHED_VERSION_MSG); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < SCHED_QUEUE_BITS; i++) ++ cpumask_copy(sched_rq_watermark + i, cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ sched_queue_init(&rq->queue); ++ rq->watermark = IDLE_TASK_SCHED_PRIO; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++ ++#ifdef CONFIG_NO_HZ_COMMON ++ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); ++#endif ++ rq->balance_callback = &balance_push_callback; ++#ifdef CONFIG_HOTPLUG_CPU ++ rcuwait_init(&rq->hotplug_wait); ++#endif ++#endif /* CONFIG_SMP */ ++ rq->nr_switches = 0; ++ ++ hrtick_rq_init(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * The idle task doesn't need the kthread struct to function, but it ++ * is dressed up as a per-CPU kthread and thus needs to play the part ++ * if we want to avoid special-casing it in code that deals with per-CPU ++ * kthreads. ++ */ ++ WARN_ON(!set_kthread_struct(current)); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ balance_push_set(smp_processor_id(), false); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ psi_init(); ++ ++ preempt_dynamic_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ ++void __might_sleep(const char *file, int line) ++{ ++ unsigned int state = get_current_state(); ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%x set at [<%p>] %pS\n", state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ __might_resched(file, line, 0); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) ++{ ++ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) ++ return; ++ ++ if (preempt_count() == preempt_offset) ++ return; ++ ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, ip); ++} ++ ++static inline bool resched_offsets_ok(unsigned int offsets) ++{ ++ unsigned int nested = preempt_count(); ++ ++ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; ++ ++ return nested == offsets; ++} ++ ++void __might_resched(const char *file, int line, unsigned int offsets) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((resched_offsets_ok(offsets) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ pr_err("BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ pr_err("preempt_count: %x, expected: %x\n", preempt_count(), ++ offsets & MIGHT_RESCHED_PREEMPT_MASK); ++ ++ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { ++ pr_err("RCU nest depth: %d, expected: %u\n", ++ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); ++ } ++ ++ if (task_stack_end_corrupted(current)) ++ pr_emerg("Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ ++ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, ++ preempt_disable_ip); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(__might_resched); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++#ifdef CONFIG_SMP ++void __cant_migrate(const char *file, int line) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (is_migration_disabled(current)) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > 0) ++ return; ++ ++ if (current->migration_flags & MDF_FORCE_ENABLED) ++ return; ++ ++ if 
(time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), is_migration_disabled(current), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_migrate); ++#endif ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ schedstat_set(p->stats.wait_start, 0); ++ schedstat_set(p->stats.sleep_start, 0); ++ schedstat_set(p->stats.block_start, 0); ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++static void sched_unregister_group(struct task_group *tg) ++{ ++ /* ++ * We have to wait for yet another RCU grace period to expire, as ++ * print_cfs_stats() might run concurrently. 
++ */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_unregister_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs: */ ++ sched_unregister_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete: */ ++ call_rcu(&tg->rcu, sched_unregister_group_rcu); ++} ++ ++void sched_release_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_release_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_unregister_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++static DEFINE_MUTEX(shares_mutex); ++ ++int sched_group_set_shares(struct task_group *tg, unsigned long shares) ++{ ++ /* ++ * We can't change the weight of the root cgroup. 
++ */ ++ if (&root_task_group == tg) ++ return -EINVAL; ++ ++ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); ++ ++ mutex_lock(&shares_mutex); ++ if (tg->shares == shares) ++ goto done; ++ ++ tg->shares = shares; ++done: ++ mutex_unlock(&shares_mutex); ++ return 0; ++} ++ ++static int cpu_shares_write_u64(struct cgroup_subsys_state *css, ++ struct cftype *cftype, u64 shareval) ++{ ++ if (shareval > scale_load_down(ULONG_MAX)) ++ shareval = MAX_SHARES; ++ return sched_group_set_shares(css_tg(css), scale_load(shareval)); ++} ++ ++static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ return (u64) scale_load_down(tg->shares); ++} ++#endif ++ ++static struct cftype cpu_legacy_files[] = { ++#ifdef CONFIG_FAIR_GROUP_SCHED ++ { ++ .name = "shares", ++ .read_u64 = cpu_shares_read_u64, ++ .write_u64 = cpu_shares_write_u64, ++ }, ++#endif ++ { } /* Terminate */ ++}; ++ ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c +new file mode 100644 +index 000000000000..1212a031700e +--- /dev/null ++++ b/kernel/sched/alt_debug.c +@@ -0,0 +1,31 @@ ++/* ++ * kernel/sched/alt_debug.c ++ * ++ * Print the alt scheduler debugging details ++ * ++ * Author: Alfred Chen ++ * Date : 2020 ++ */ ++#include "sched.h" ++ ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) 
\ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} +diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h +new file mode 100644 +index 000000000000..a181bf9ce57d +--- /dev/null ++++ b/kernel/sched/alt_sched.h +@@ -0,0 +1,645 @@ ++#ifndef ALT_SCHED_H ++#define ALT_SCHED_H ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "../workqueue_internal.h" ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_BMQ ++/* bits: ++ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ ++#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) ++#endif ++ ++#ifdef CONFIG_SCHED_PDS ++/* bits: RT(0-99), reserved(100-127), NORMAL_PRIO_NUM, cpu idle task */ ++#define SCHED_BITS (MIN_NORMAL_PRIO + NORMAL_PRIO_NUM + 1) ++#endif /* CONFIG_SCHED_PDS */ ++ ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++extern void resched_latency_warn(int cpu, u64 latency); ++#else ++# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) ++static inline void resched_latency_warn(int cpu, u64 latency) {} ++#endif ++ ++/* ++ * Increase resolution of nice-level calculations for 64-bit architectures. ++ * The extra resolution improves shares distribution and load balancing of ++ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup ++ * hierarchies, especially on larger systems. This is not a user-visible change ++ * and does not change the user-interface for setting shares/weights. ++ * ++ * We increase resolution only if we have enough bits to allow this increased ++ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit ++ * are pretty high and the returns do not justify the increased costs. ++ * ++ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to ++ * increase coverage and consistency always enable it on 64-bit platforms. ++ */ ++#ifdef CONFIG_64BIT ++# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) ++# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) ++# define scale_load_down(w) \ ++({ \ ++ unsigned long __w = (w); \ ++ if (__w) \ ++ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ ++ __w; \ ++}) ++#else ++# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) ++# define scale_load(w) (w) ++# define scale_load_down(w) (w) ++#endif ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED ++#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD ++ ++/* ++ * A weight of 0 or 1 can cause arithmetics problems. ++ * A weight of a cfs_rq is the sum of weights of which entities ++ * are queued on this cfs_rq, so a weight of a entity should not be ++ * too large, so as the shares value of a task group. ++ * (The default weight is 1024 - so there's no practical ++ * limitation from this.) 
++ */ ++#define MIN_SHARES (1UL << 1) ++#define MAX_SHARES (1UL << 18) ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++#define WF_ON_CPU 0x08 /* Wakee is on_rq */ ++ ++#define SCHED_QUEUE_BITS (SCHED_BITS - 1) ++ ++struct sched_queue { ++ DECLARE_BITMAP(bitmap, SCHED_QUEUE_BITS); ++ struct list_head heads[SCHED_BITS]; ++}; ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++ struct sched_queue queue; ++#ifdef CONFIG_SCHED_PDS ++ u64 time_edge; ++#endif ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_SCHED_DEBUG ++ u64 last_seen_need_resched_ns; ++ int ticks_without_resched; ++#endif ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ unsigned int ttwu_pending; ++ unsigned char nohz_idle_balance; ++ unsigned char idle_balance; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++ struct callback_head *balance_callback; ++#ifdef CONFIG_HOTPLUG_CPU ++ struct rcuwait hotplug_wait; ++#endif ++ unsigned int nr_pinned; ++ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* For genenal cpu load util */ ++ s32 load_history; ++ u64 load_block; ++ u64 load_stamp; ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned int nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++ ktime_t hrtick_time; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++ ++#ifdef CONFIG_NO_HZ_COMMON ++#ifdef CONFIG_SMP ++ call_single_data_t nohz_csd; ++#endif ++ atomic_t nohz_flags; ++#endif /* CONFIG_NO_HZ_COMMON */ ++}; ++ ++extern unsigned long rq_load_util(struct rq *rq, unsigned long max); ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern bool sched_smp_initialized; ++ ++enum { ++ ITSELF_LEVEL_SPACE_HOLDER, ++#ifdef CONFIG_SCHED_SMT ++ SMT_LEVEL_SPACE_HOLDER, ++#endif ++ COREGROUP_LEVEL_SPACE_HOLDER, ++ CORE_LEVEL_SPACE_HOLDER, ++ OTHER_LEVEL_SPACE_HOLDER, ++ NR_CPU_AFFINITY_LEVELS ++}; ++ ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_LEVELS], sched_cpu_topo_masks); ++DECLARE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++ ++static inline int ++__best_mask_cpu(const cpumask_t *cpumask, const cpumask_t *mask) ++{ ++ int cpu; ++ ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *mask) ++{ ++ return __best_mask_cpu(mask, per_cpu(sched_cpu_topo_masks, cpu)); ++} ++ ++extern void flush_smp_call_function_queue(void); ++ ++#else /* !CONFIG_SMP */ ++static inline void flush_smp_call_function_queue(void) { } ++#endif ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ 
need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_lock(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(&rq->lock); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline void ++rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline raw_spinlock_t *__rq_lockp(struct rq *rq) ++{ ++ return &rq->lock; ++} ++ ++static inline raw_spinlock_t *rq_lockp(struct rq *rq) ++{ ++ return __rq_lockp(rq); ++} ++ ++static inline void lockdep_assert_rq_held(struct rq *rq) ++{ ++ lockdep_assert_held(__rq_lockp(rq)); ++} ++ ++extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); ++extern void raw_spin_rq_unlock(struct rq *rq); ++ ++static inline void raw_spin_rq_lock(struct rq *rq) ++{ ++ raw_spin_rq_lock_nested(rq, 0); ++} ++ ++static inline void raw_spin_rq_lock_irq(struct rq *rq) ++{ ++ local_irq_disable(); ++ raw_spin_rq_lock(rq); ++} ++ ++static inline void raw_spin_rq_unlock_irq(struct rq *rq) ++{ ++ raw_spin_rq_unlock(rq); ++ local_irq_enable(); ++} ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern int task_running_nice(struct task_struct *p); ++ ++extern struct static_key_false sched_schedstats; ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_NO_HZ_COMMON ++#define NOHZ_BALANCE_KICK_BIT 0 ++#define NOHZ_STATS_KICK_BIT 1 ++ ++#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) ++#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) ++ ++#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) ++ ++#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) ++ ++/* TODO: needed? 
++extern void nohz_balance_exit_idle(struct rq *rq); ++#else ++static inline void nohz_balance_exit_idle(struct rq *rq) { } ++*/ ++#endif ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern void swake_up_all_locked(struct swait_queue_head *q); ++extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#ifdef CONFIG_PREEMPT_DYNAMIC ++extern int preempt_dynamic_mode; ++extern int sched_dynamic_mode(const char *str); ++extern void sched_dynamic_update(int mode); ++#endif ++ ++static inline void nohz_run_idle_balance(int cpu) { } ++ ++static inline ++unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, ++ struct task_struct *p) ++{ ++ return util; ++} ++ ++static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } ++ ++#endif /* ALT_SCHED_H */ +diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h +new file mode 100644 +index 000000000000..66b77291b9d0 +--- /dev/null ++++ b/kernel/sched/bmq.h +@@ -0,0 +1,110 @@ ++#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" ++ ++/* ++ * BMQ only routines ++ */ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (sched_timeslice_ns >>\ ++ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++/* ++ * Common interfaces ++ */ ++static inline void sched_timeslice_imp(const int timeslice_ms) {} ++ ++static inline int ++task_sched_prio_normal(const struct task_struct *p, const struct rq *rq) ++{ ++ return p->prio + p->boost_prio - MAX_RT_PRIO; ++} ++ ++static inline int task_sched_prio(const struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; ++} ++ ++static inline int ++task_sched_prio_idx(const struct task_struct *p, const struct rq *rq) ++{ ++ return task_sched_prio(p); ++} ++ ++static inline int sched_prio2idx(int prio, struct rq *rq) ++{ ++ return prio; ++} ++ ++static inline int sched_idx2prio(int idx, struct rq *rq) ++{ ++ return idx; ++} ++ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = sched_timeslice_ns; ++ ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq, task_sched_prio_idx(p, rq)); ++ } ++} ++ ++static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) {} ++ ++inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++static void sched_task_fork(struct task_struct *p, struct rq *rq) ++{ ++ p->boost_prio = MAX_PRIORITY_ADJ; ++} ++ ++static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) ++{ ++ p->boost_prio = MAX_PRIORITY_ADJ; ++} ++ ++#ifdef CONFIG_SMP ++static inline void sched_task_ttwu(struct task_struct *p) ++{ ++ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) ++ boost_task(p); ++} ++#endif ++ ++static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) ++{ ++ if (rq_switch_time(rq) < boost_threshold(p)) ++ boost_task(p); ++} ++ ++static inline void update_rq_time_edge(struct rq *rq) {} +diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c +index d9dc9ab3773f..71a25540d65e 100644 +--- a/kernel/sched/build_policy.c ++++ b/kernel/sched/build_policy.c +@@ -42,13 +42,19 @@ + + #include "idle.c" + ++#ifndef CONFIG_SCHED_ALT + #include "rt.c" ++#endif + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + # include "cpudeadline.c" ++#endif + # include "pelt.c" + #endif + + #include "cputime.c" +-#include "deadline.c" + ++#ifndef CONFIG_SCHED_ALT ++#include "deadline.c" ++#endif +diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c +index 99bdd96f454f..23f80a86d2d7 100644 +--- a/kernel/sched/build_utility.c ++++ b/kernel/sched/build_utility.c +@@ -85,7 +85,9 @@ + + #ifdef CONFIG_SMP + # include "cpupri.c" ++#ifndef CONFIG_SCHED_ALT + # include "stop_task.c" ++#endif + # include "topology.c" + #endif + +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 3dbf351d12d5..b2590f961139 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -160,9 +160,14 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) + unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); + + sg_cpu->max = max; ++#ifndef CONFIG_SCHED_ALT + sg_cpu->bw_dl = cpu_bw_dl(rq); + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max, + FREQUENCY_UTIL, NULL); ++#else ++ sg_cpu->bw_dl = 0; ++ sg_cpu->util = rq_load_util(rq, max); ++#endif /* CONFIG_SCHED_ALT */ + } + + /** +@@ -306,8 +311,10 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) + { ++#ifndef CONFIG_SCHED_ALT + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) + sg_cpu->sg_policy->limits_changed = true; ++#endif + } + + static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, +@@ -607,6 +614,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = 
sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -839,7 +847,9 @@ cpufreq_governor_init(schedutil_gov); + #ifdef CONFIG_ENERGY_MODEL + static void rebuild_sd_workfn(struct work_struct *work) + { ++#ifndef CONFIG_SCHED_ALT + rebuild_sched_domains_energy(); ++#endif /* CONFIG_SCHED_ALT */ + } + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 78a233d43757..b3bbc87d4352 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + task_group_account_field(p, CPUTIME_NICE, cputime); + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -611,7 +611,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + if (task_cputime(p, &cputime.utime, &cputime.stime)) +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index bb3d63bdf4ae..4e1680785704 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -7,6 +7,7 @@ + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar + */ + ++#ifndef CONFIG_SCHED_ALT + /* + * This allows printing both to /proc/sched_debug and + * to the console +@@ -215,6 +216,7 @@ static const struct file_operations sched_scaling_fops = { + }; + + #endif /* SMP */ ++#endif /* !CONFIG_SCHED_ALT */ + + #ifdef CONFIG_PREEMPT_DYNAMIC + +@@ -278,6 +280,7 @@ static const struct file_operations sched_dynamic_fops = { + + #endif /* CONFIG_PREEMPT_DYNAMIC */ + ++#ifndef CONFIG_SCHED_ALT + __read_mostly bool sched_debug_verbose; + + static const struct seq_operations sched_debug_sops; +@@ -293,6 +296,7 @@ static const struct file_operations sched_debug_fops = { + .llseek = seq_lseek, + .release = seq_release, + }; ++#endif /* !CONFIG_SCHED_ALT */ + + static struct dentry *debugfs_sched; + +@@ -302,12 +306,15 @@ static __init int sched_init_debug(void) + + debugfs_sched = debugfs_create_dir("sched", NULL); + ++#ifndef CONFIG_SCHED_ALT + debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); + debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose); ++#endif /* !CONFIG_SCHED_ALT */ + #ifdef CONFIG_PREEMPT_DYNAMIC + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); 
+ #endif + ++#ifndef CONFIG_SCHED_ALT + debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); + debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); + debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); +@@ -336,11 +343,13 @@ static __init int sched_init_debug(void) + #endif + + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); ++#endif /* !CONFIG_SCHED_ALT */ + + return 0; + } + late_initcall(sched_init_debug); + ++#ifndef CONFIG_SCHED_ALT + #ifdef CONFIG_SMP + + static cpumask_var_t sd_sysctl_cpus; +@@ -1067,6 +1076,7 @@ void proc_sched_set_task(struct task_struct *p) + memset(&p->stats, 0, sizeof(p->stats)); + #endif + } ++#endif /* !CONFIG_SCHED_ALT */ + + void resched_latency_warn(int cpu, u64 latency) + { +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 328cccbee444..aef991facc79 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -400,6 +400,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * idle-task scheduling class. + */ +@@ -521,3 +522,4 @@ DEFINE_SCHED_CLASS(idle) = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h +new file mode 100644 +index 000000000000..56a649d02e49 +--- /dev/null ++++ b/kernel/sched/pds.h +@@ -0,0 +1,127 @@ ++#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" ++ ++static int sched_timeslice_shift = 22; ++ ++#define NORMAL_PRIO_MOD(x) ((x) & (NORMAL_PRIO_NUM - 1)) ++ ++/* ++ * Common interfaces ++ */ ++static inline void sched_timeslice_imp(const int timeslice_ms) ++{ ++ if (2 == timeslice_ms) ++ sched_timeslice_shift = 21; ++} ++ ++static inline int ++task_sched_prio_normal(const struct task_struct *p, const struct rq *rq) ++{ ++ s64 delta = p->deadline - rq->time_edge + NORMAL_PRIO_NUM - NICE_WIDTH; ++ ++ if (WARN_ONCE(delta > NORMAL_PRIO_NUM - 1, ++ "pds: task_sched_prio_normal() delta %lld\n", delta)) ++ return NORMAL_PRIO_NUM - 1; ++ ++ return (delta < 0) ? 0 : delta; ++} ++ ++static inline int task_sched_prio(const struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO) ? p->prio : ++ MIN_NORMAL_PRIO + task_sched_prio_normal(p, task_rq(p)); ++} ++ ++static inline int ++task_sched_prio_idx(const struct task_struct *p, const struct rq *rq) ++{ ++ return (p->prio < MAX_RT_PRIO) ? p->prio : MIN_NORMAL_PRIO + ++ NORMAL_PRIO_MOD(task_sched_prio_normal(p, rq) + rq->time_edge); ++} ++ ++static inline int sched_prio2idx(int prio, struct rq *rq) ++{ ++ return (IDLE_TASK_SCHED_PRIO == prio || prio < MAX_RT_PRIO) ? prio : ++ MIN_NORMAL_PRIO + NORMAL_PRIO_MOD((prio - MIN_NORMAL_PRIO) + ++ rq->time_edge); ++} ++ ++static inline int sched_idx2prio(int idx, struct rq *rq) ++{ ++ return (idx < MAX_RT_PRIO) ? 
idx : MIN_NORMAL_PRIO + ++ NORMAL_PRIO_MOD((idx - MIN_NORMAL_PRIO) + NORMAL_PRIO_NUM - ++ NORMAL_PRIO_MOD(rq->time_edge)); ++} ++ ++static inline void sched_renew_deadline(struct task_struct *p, const struct rq *rq) ++{ ++ if (p->prio >= MAX_RT_PRIO) ++ p->deadline = (rq->clock >> sched_timeslice_shift) + ++ p->static_prio - (MAX_PRIO - NICE_WIDTH); ++} ++ ++int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio > DEFAULT_PRIO); ++} ++ ++static inline void update_rq_time_edge(struct rq *rq) ++{ ++ struct list_head head; ++ u64 old = rq->time_edge; ++ u64 now = rq->clock >> sched_timeslice_shift; ++ u64 prio, delta; ++ ++ if (now == old) ++ return; ++ ++ delta = min_t(u64, NORMAL_PRIO_NUM, now - old); ++ INIT_LIST_HEAD(&head); ++ ++ for_each_set_bit(prio, &rq->queue.bitmap[2], delta) ++ list_splice_tail_init(rq->queue.heads + MIN_NORMAL_PRIO + ++ NORMAL_PRIO_MOD(prio + old), &head); ++ ++ rq->queue.bitmap[2] = (NORMAL_PRIO_NUM == delta) ? 0UL : ++ rq->queue.bitmap[2] >> delta; ++ rq->time_edge = now; ++ if (!list_empty(&head)) { ++ u64 idx = MIN_NORMAL_PRIO + NORMAL_PRIO_MOD(now); ++ struct task_struct *p; ++ ++ list_for_each_entry(p, &head, sq_node) ++ p->sq_idx = idx; ++ ++ list_splice(&head, rq->queue.heads + idx); ++ rq->queue.bitmap[2] |= 1UL; ++ } ++} ++ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = sched_timeslice_ns; ++ sched_renew_deadline(p, rq); ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq, task_sched_prio_idx(p, rq)); ++} ++ ++static inline void sched_task_sanity_check(struct task_struct *p, struct rq *rq) ++{ ++ u64 max_dl = rq->time_edge + NICE_WIDTH - 1; ++ if (unlikely(p->deadline > max_dl)) ++ p->deadline = max_dl; ++} ++ ++static void sched_task_fork(struct task_struct *p, struct rq *rq) ++{ ++ sched_renew_deadline(p, rq); ++} ++ ++static inline void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) ++{ ++ time_slice_expired(p, rq); ++} ++ ++#ifdef CONFIG_SMP ++static inline void sched_task_ttwu(struct task_struct *p) {} ++#endif ++static inline void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index 0f310768260c..bd38bf738fe9 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -266,6 +266,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * sched_entity: + * +@@ -383,8 +384,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + /* + * thermal: + * +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index 4ff2ed4f8fa1..226eeed61318 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,13 +1,15 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_ALT + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + int update_thermal_load_avg(u64 now, struct rq *rq, u64 
capacity); + + static inline u64 thermal_load_avg(struct rq *rq) +@@ -44,6 +46,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg) + return PELT_MIN_DIVIDER + avg->period_contrib; + } + ++#ifndef CONFIG_SCHED_ALT + static inline void cfs_se_util_change(struct sched_avg *avg) + { + unsigned int enqueued; +@@ -155,9 +158,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_ALT */ + + #else + ++#ifndef CONFIG_SCHED_ALT + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -175,6 +180,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 47b89a0fc6e5..de2641a32c22 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -5,6 +5,10 @@ + #ifndef _KERNEL_SCHED_SCHED_H + #define _KERNEL_SCHED_SCHED_H + ++#ifdef CONFIG_SCHED_ALT ++#include "alt_sched.h" ++#else ++ + #include + #include + #include +@@ -3116,4 +3120,9 @@ extern int sched_dynamic_mode(const char *str); + extern void sched_dynamic_update(int mode); + #endif + ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_ALT */ + #endif /* _KERNEL_SCHED_SCHED_H */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 857f837f52cb..5486c63e4790 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -125,8 +125,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -143,6 +145,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -171,6 +174,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h +index baa839c1ba96..15238be0581b 100644 +--- a/kernel/sched/stats.h ++++ b/kernel/sched/stats.h +@@ -89,6 +89,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt + + #endif /* CONFIG_SCHEDSTATS */ + ++#ifndef CONFIG_SCHED_ALT + #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity_stats { + struct sched_entity se; +@@ -105,6 +106,7 @@ __schedstats_from_se(struct sched_entity *se) + #endif + return &task_of(se)->stats; + } ++#endif /* CONFIG_SCHED_ALT */ + + #ifdef CONFIG_PSI + /* +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 05b6c2ad90b9..480ef393b3c9 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -3,6 +3,7 @@ + * Scheduler topology setup/handling methods + */ + ++#ifndef CONFIG_SCHED_ALT + DEFINE_MUTEX(sched_domains_mutex); + + /* Protected by sched_domains_mutex: */ +@@ -1413,8 +1414,10 @@ static void asym_cpu_capacity_scan(void) + */ + + static int default_relax_domain_level = -1; ++#endif /* CONFIG_SCHED_ALT */ + int sched_domain_level_max; + ++#ifndef CONFIG_SCHED_ALT + static int __init setup_relax_domain_level(char *str) + { + if (kstrtoint(str, 0, &default_relax_domain_level)) +@@ -1647,6 +1650,7 @@ sd_init(struct sched_domain_topology_level 
*tl, + + return sd; + } ++#endif /* CONFIG_SCHED_ALT */ + + /* + * Topology list, bottom-up. +@@ -1683,6 +1687,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) + sched_domain_topology_saved = NULL; + } + ++#ifndef CONFIG_SCHED_ALT + #ifdef CONFIG_NUMA + + static const struct cpumask *sd_numa_mask(int cpu) +@@ -2638,3 +2643,15 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + mutex_unlock(&sched_domains_mutex); + } ++#else /* CONFIG_SCHED_ALT */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{} ++ ++#ifdef CONFIG_NUMA ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++#endif +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 35d034219513..23719c728677 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -86,6 +86,10 @@ + + /* Constants used for minimum and maximum */ + ++#ifdef CONFIG_SCHED_ALT ++extern int sched_yield_type; ++#endif ++ + #ifdef CONFIG_PERF_EVENTS + static const int six_hundred_forty_kb = 640 * 1024; + #endif +@@ -1590,6 +1594,7 @@ int proc_do_static_key(struct ctl_table *table, int write, + } + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_ALT + #ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing", +@@ -1601,6 +1606,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_FOUR, + }, + #endif /* CONFIG_NUMA_BALANCING */ ++#endif /* !CONFIG_SCHED_ALT */ + { + .procname = "panic", + .data = &panic_timeout, +@@ -1902,6 +1908,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_ALT ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_TWO, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 0ea8702eb516..a27a0f3a654d 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2088,8 +2088,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + int ret = 0; + u64 slack; + ++#ifndef CONFIG_SCHED_ALT + slack = current->timer_slack_ns; + if (dl_task(current) || rt_task(current)) ++#endif + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index cb925e8ef9a8..67d823510f5c 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -223,7 +223,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -866,6 +866,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_ALT + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -873,6 +874,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int 
signo, bool rt, bool hard) + { +@@ -900,8 +902,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -915,7 +919,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1151,8 +1155,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index a2d301f58ced..2ccdede8585c 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1143,10 +1143,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_ALT ++ /* No deadline on BMQ/PDS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/sys-kernel/pinephone-sources/files/5021_BMQ-and-PDS-gentoo-defaults.patch b/sys-kernel/pinephone-sources/files/5021_BMQ-and-PDS-gentoo-defaults.patch new file mode 100644 index 0000000..6b2049d --- /dev/null +++ b/sys-kernel/pinephone-sources/files/5021_BMQ-and-PDS-gentoo-defaults.patch @@ -0,0 +1,13 @@ +--- a/init/Kconfig 2022-07-07 13:22:00.698439887 -0400 ++++ b/init/Kconfig 2022-07-07 13:23:45.152333576 -0400 +@@ -874,8 +874,9 @@ config UCLAMP_BUCKETS_COUNT + If in doubt, use the default value. 
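The sysctl hunk above registers a "yield_type" knob for the alternative schedulers that the SCHED_ALT Kconfig entry below gates. What can be read off the patch itself: the entry lands in kern_table, so it shows up as /proc/sys/kernel/yield_type, it is a single int with mode 0644, and proc_dointvec_minmax with SYSCTL_ZERO/SYSCTL_TWO rejects writes outside 0..2. The snippet below is a minimal userspace sketch, assuming a kernel built with CONFIG_SCHED_ALT=y from this patch set; the meaning of the individual values is defined by the BMQ/PDS code and is not asserted here.

/* yield_type.c - read and optionally set /proc/sys/kernel/yield_type.
 * Build: cc -o yield_type yield_type.c
 * Usage: ./yield_type          print the current value
 *        ./yield_type 0|1|2    set a new value (needs root)
 */
#include <stdio.h>
#include <stdlib.h>

#define YIELD_TYPE_PATH "/proc/sys/kernel/yield_type"

int main(int argc, char **argv)
{
	FILE *f;
	int val;

	if (argc > 1) {
		val = atoi(argv[1]);
		/* proc_dointvec_minmax rejects values outside SYSCTL_ZERO..SYSCTL_TWO */
		if (val < 0 || val > 2) {
			fprintf(stderr, "yield_type must be 0, 1 or 2\n");
			return 1;
		}
		f = fopen(YIELD_TYPE_PATH, "w");
		if (!f) {
			perror("fopen (CONFIG_SCHED_ALT enabled? running as root?)");
			return 1;
		}
		fprintf(f, "%d\n", val);
		fclose(f);
	}

	f = fopen(YIELD_TYPE_PATH, "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("kernel.yield_type = %d\n", val);
	fclose(f);
	return 0;
}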
+ + menuconfig SCHED_ALT ++ depends on X86_64 + bool "Alternative CPU Schedulers" +- default y ++ default n + help + This feature enable alternative CPU scheduler" + diff --git a/sys-kernel/pinephone-sources/files/Multi-Gen-LRU-Framework.patch b/sys-kernel/pinephone-sources/files/Multi-Gen-LRU-Framework.patch new file mode 100644 index 0000000..cf365f8 --- /dev/null +++ b/sys-kernel/pinephone-sources/files/Multi-Gen-LRU-Framework.patch @@ -0,0 +1,8901 @@ +From patchwork Wed Jul 6 22:00:10 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908719 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 94C1DC43334 + for ; Wed, 6 Jul 2022 22:07:01 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 3052B6B0072; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 2B5426B0073; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 1A27C6B0074; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com + [216.40.44.11]) + by kanga.kvack.org (Postfix) with ESMTP id 0BC686B0072 + for ; Wed, 6 Jul 2022 18:07:01 -0400 (EDT) +Received: from smtpin02.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay10.hostedemail.com (Postfix) with ESMTP id CA2056A6 + for ; Wed, 6 Jul 2022 22:07:00 +0000 (UTC) +X-FDA: 79658061000.02.A339B96 +Received: from mail-io1-f74.google.com (mail-io1-f74.google.com + [209.85.166.74]) + by imf05.hostedemail.com (Postfix) with ESMTP id 4E8D2100017 + for ; Wed, 6 Jul 2022 22:07:00 +0000 (UTC) +Received: by mail-io1-f74.google.com with SMTP id + bw12-20020a056602398c00b00675895c2e24so8731074iob.19 + for ; Wed, 06 Jul 2022 15:07:00 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=qRI8PXbLcM+5CLpEdu5Szvo90bsJIGjJE2jS009gkGk=; + b=ZmyxY1Zw8XSvfRWkKAW+f4mUNqqtO18FFYBy2MotiZryXwyz9ItbUh9iu4txbliGWV + 2zSpKFQCiNnOAlQ6EcsvQBLjKhLO02wKW9+/0P3DsfIXA4cNhb908dXECrznSmVA8Pnr + F13ODZZAGss1dN9dP7/zz2TweJvGgqjzlw8hpy3C9EXhkGdCEVfFUX5sYsFwHF6ph62j + YFYkt0yEeDGZ6BSKwot0UC5ZcUyd9AqPFg+XD4PWIlU21bbWaLA6eIQAr/1vyvoOUESY + RP+ZlS9AQ2JVmz3TDo8SyWa829c8OgLjNn28DmB38A4um5Ju0lB8q6j6sdVFGsj5iEvp + AFww== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=qRI8PXbLcM+5CLpEdu5Szvo90bsJIGjJE2jS009gkGk=; + b=AOUJsEidUKQJJt8SToVFGAPr3C12fTJoJElAAFE7KxYD410S32sDmau0kREt/LiYkv + dUGJshmCRHkkY36SzjWitE/xBZ1tllr3SMm51k1bORtnqWEcusXlr0UtsGdinQK9ILBh + K59jgkQzsGwJHEpe+Ll9kADhZ7o8oLcSMLrrgD3Fnx7oEtfuAHMKEcBlBkSgMPppXUTd + ulnN01wxGMVZTYZ/yA1sJwjTalfTQgS1jWfhZp1m3A1fJTF4eeWoX2ceAvv5rGOLAiJM + SW/ePavunPzzOpIPmO+dbtG0EWRHGTxBR65eiy6ov/69KgUE7bGwub0yYiOXQKWohY5Z + W3FQ== +X-Gm-Message-State: AJIora8WGdQPm0mObzFDWcXDHpPyK5u2HGFLGwaXGZ5Jsb3uO01h6wcU + F87blpjQgVwnSA0QtoTcFD+utCSKlvM= +X-Google-Smtp-Source: + AGRyM1sDSDGcRnDMwsrjVWQBGg0nLWGv8i6pEMJ10oDW9TLUnp/+bD9AQNZ2+emMVfEkLEmXVjNXufDKAmU= 
+X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a05:6638:14cd:b0:33c:c00e:898d with SMTP id + l13-20020a05663814cd00b0033cc00e898dmr26357876jak.143.1657144852078; Wed, 06 + Jul 2022 15:00:52 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:10 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-2-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 01/14] mm: x86, arm64: add arch_has_hw_pte_young() +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Barry Song , Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657145220; a=rsa-sha256; + cv=none; + b=VumvfKCjx5tf93BL/O1DNNiONuUUMaZYR4iOhULdFR4P8YOdhpBtrpKwBsHGR4wUqyMcvI + ToLran37owHd2V3ShTKRPwSH8VjFvggnLlLoA19COIyGitTG9II71uvoVW/BX9CNy0fyvU + cjZkFbkAV2gw14xwh4oA0dBJXiv4wcs= +ARC-Authentication-Results: i=1; + imf05.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=ZmyxY1Zw; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf05.hostedemail.com: domain of + 3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com + designates 209.85.166.74 as permitted sender) + smtp.mailfrom=3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657145220; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=qRI8PXbLcM+5CLpEdu5Szvo90bsJIGjJE2jS009gkGk=; + b=qXdH1Ee5JE3ufkBF1syfLTJ4Hf4+XbhNy8Ep7CdbOWtn0impShoppleSgAJd0DjZcGtBPd + BrCXlkc1QnMUpwyPi5WEIjJZZLPAAkBIhwltXoG15zc7F1kIblfi2GpbrcQSpycZKhMp2a + awra7JeixwgTaauxTH0OVnzltL0UkbU= +X-Stat-Signature: y7hoskbhfp1nq6ugnwo8zwjg458t1yhh +X-Rspam-User: +X-Rspamd-Server: rspam12 +X-Rspamd-Queue-Id: 4E8D2100017 +Authentication-Results: imf05.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=ZmyxY1Zw; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf05.hostedemail.com: domain of + 3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com + designates 209.85.166.74 as permitted sender) + smtp.mailfrom=3FAbGYgYKCFULHM4xB3BB381.zB985AHK-997Ixz7.BE3@flex--yuzhao.bounces.google.com +X-HE-Tag: 1657145220-618745 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Some architectures automatically set 
the accessed bit in PTEs, e.g., +x86 and arm64 v8.2. On architectures that do not have this capability, +clearing the accessed bit in a PTE usually triggers a page fault +following the TLB miss of this PTE (to emulate the accessed bit). + +Being aware of this capability can help make better decisions, e.g., +whether to spread the work out over a period of time to reduce bursty +page faults when trying to clear the accessed bit in many PTEs. + +Note that theoretically this capability can be unreliable, e.g., +hotplugged CPUs might be different from builtin ones. Therefore it +should not be used in architecture-independent code that involves +correctness, e.g., to determine whether TLB flushes are required (in +combination with the accessed bit). + +Signed-off-by: Yu Zhao +Reviewed-by: Barry Song +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Acked-by: Will Deacon +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + arch/arm64/include/asm/pgtable.h | 15 ++------------- + arch/x86/include/asm/pgtable.h | 6 +++--- + include/linux/pgtable.h | 13 +++++++++++++ + mm/memory.c | 14 +------------- + 4 files changed, 19 insertions(+), 29 deletions(-) + +diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h +index 0b6632f18364..c46399c0500c 100644 +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -1066,24 +1066,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, + * page after fork() + CoW for pfn mappings. We don't always have a + * hardware-managed access flag on arm64. + */ +-static inline bool arch_faults_on_old_pte(void) +-{ +- /* The register read below requires a stable CPU to make any sense */ +- cant_migrate(); +- +- return !cpu_has_hw_af(); +-} +-#define arch_faults_on_old_pte arch_faults_on_old_pte ++#define arch_has_hw_pte_young cpu_has_hw_af + + /* + * Experimentally, it's cheap to set the access flag in hardware and we + * benefit from prefaulting mappings as 'old' to start with. + */ +-static inline bool arch_wants_old_prefaulted_pte(void) +-{ +- return !arch_faults_on_old_pte(); +-} +-#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte ++#define arch_wants_old_prefaulted_pte cpu_has_hw_af + + static inline bool pud_sect_supported(void) + { +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 44e2d6f1dbaa..dc5f7d8ef68a 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1431,10 +1431,10 @@ static inline bool arch_has_pfn_modify_check(void) + return boot_cpu_has_bug(X86_BUG_L1TF); + } + +-#define arch_faults_on_old_pte arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) ++#define arch_has_hw_pte_young arch_has_hw_pte_young ++static inline bool arch_has_hw_pte_young(void) + { +- return false; ++ return true; + } + + #ifdef CONFIG_PAGE_TABLE_CHECK +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index 3cdc16cfd867..8eee31bc9bde 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #endif + ++#ifndef arch_has_hw_pte_young ++/* ++ * Return whether the accessed bit is supported on the local CPU. 
++ * ++ * This stub assumes accessing through an old PTE triggers a page fault. ++ * Architectures that automatically set the access bit should overwrite it. ++ */ ++static inline bool arch_has_hw_pte_young(void) ++{ ++ return false; ++} ++#endif ++ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR + static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, +diff --git a/mm/memory.c b/mm/memory.c +index 7a089145cad4..49500390b91b 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -125,18 +125,6 @@ int randomize_va_space __read_mostly = + 2; + #endif + +-#ifndef arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) +-{ +- /* +- * Those arches which don't have hw access flag feature need to +- * implement their own helper. By default, "true" means pagefault +- * will be hit on old pte. +- */ +- return true; +-} +-#endif +- + #ifndef arch_wants_old_prefaulted_pte + static inline bool arch_wants_old_prefaulted_pte(void) + { +@@ -2862,7 +2850,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. + */ +- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { ++ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + +From patchwork Wed Jul 6 22:00:11 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908700 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id CAC87C433EF + for ; Wed, 6 Jul 2022 22:00:56 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 154F66B0073; Wed, 6 Jul 2022 18:00:56 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 0B6EE6B0074; Wed, 6 Jul 2022 18:00:56 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id E729E6B0075; Wed, 6 Jul 2022 18:00:55 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0016.hostedemail.com + [216.40.44.16]) + by kanga.kvack.org (Postfix) with ESMTP id D9E9C6B0073 + for ; Wed, 6 Jul 2022 18:00:55 -0400 (EDT) +Received: from smtpin22.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay06.hostedemail.com (Postfix) with ESMTP id 9F16534610 + for ; Wed, 6 Jul 2022 22:00:55 +0000 (UTC) +X-FDA: 79658045670.22.8A546BF +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf12.hostedemail.com (Postfix) with ESMTP id 0D92C40033 + for ; Wed, 6 Jul 2022 22:00:53 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + m68-20020a253f47000000b006683bd91962so12445167yba.0 + for ; Wed, 06 Jul 2022 15:00:53 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=Yske1o/9q5kvxCT6Do7fK+m0Z7RMKAIlwEA5dqQMV6o=; + b=ScyLwm63xSUVYY78eVpIKf7E4l6uHPJ8SKqWyYLqNgfcQS9rJpZhYXa+GvIYC8VFxz + 2VFStSncvwevlF5a8SeHX4Xsz1oxV5uuYYiB5ijS1hgFnqmnWUZ92SAkit2dsdOrKkVm + doRskpr19skWYdTit7iDaFWDHSkEjmp1FnyOwnhb4K1iob0FZUGliEmOjr11tQKlaxMl + 
A7gk8PUbqgtBAB5FxJW674j5ErsQXUNEF0mV9mDiI18iHiW2zTe0Jvp4coFt/YGkO03P + +mGZgU80OTVBNdIcmd9CUSdknj31pHlFfc27NA1Hoqf7YpOu3eL0SW+Jp946t/R7w6FH + wLdA== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=Yske1o/9q5kvxCT6Do7fK+m0Z7RMKAIlwEA5dqQMV6o=; + b=XVonL9oPc/dGS0Lj1bwrmAxlSwptN2oDguArTP7th8VxXdXwHpn2oGKmURoazynnWW + GaxAg33Dr4knllzhF6wCdcowLA++/AgQbdQfMwZEbDkgdPMiKz+9twLafdDp2twVELPc + mZFyE0neVCe8OAOes5N5stgxrIPJyGN+cmejA7EFYbUXD5yKaVHVWEbZ1DKvs+vkVfYH + 4I7Mc++TN9sTNUODcCZv7eNmy5ddfKdhs8ZEqmBzkQQl+6Nyi2IUxEa3YeftDVQx6pqJ + 7oPE/pTcwcjKcRm4Bn+MZj1FE7of9UClcR0Wd4ZoxSRmvPtCnOiV9G4yZyDcGkpaBUWh + +quA== +X-Gm-Message-State: AJIora81vFHYbn7du42CA/tgPDfWrrUd4KS2ldCuwHG08ccHdOGjEtup + i9hDsoVZxoG9FtGAgLFxbBinusC9kmE= +X-Google-Smtp-Source: + AGRyM1uPiyB7rEJDdAsYAVesh6XcxF7m4/NOwgKHx35NtLh0WZv9A8PqKLV1Gu8X5xOooB/DS/0V1C1QSZU= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a25:390:0:b0:66e:b9c7:b46c with SMTP id + 138-20020a250390000000b0066eb9c7b46cmr172371ybd.505.1657144853349; Wed, 06 + Jul 2022 15:00:53 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:11 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-3-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Barry Song , Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144854; a=rsa-sha256; + cv=none; + b=ba2dVYmnfxk8QTCdMBgkSodQNf9QrUzHc+vrIrNl2fGKaUM0VC5LUOkOS0Uam92Z/fgIw+ + J3iBf4wOdpf9YxVZLpvnO/CvPz7LzU7dbaCIsHjkTYZyjSGj5b5H8veJBlUQe2PyEhqktl + KdZlmcrPxuSkAHBseFs2D8j/Mhzx2nw= +ARC-Authentication-Results: i=1; + imf12.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=ScyLwm63; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf12.hostedemail.com: domain of + 3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144854; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + 
bh=Yske1o/9q5kvxCT6Do7fK+m0Z7RMKAIlwEA5dqQMV6o=; + b=TMdhE0VPqYoVfu/UiVKCUJJu+4spbPZ1vrUaXbX8Pa9OP/6dtZAs1KcJMK2kLgdUvhO8E9 + UX8x+Y/myW5EAlPfC2BrKGzFE9TjBUVGeJYESpLIZg2lf658PqCu5GAkoM5vFZRBG80nvF + ObTkOxqzk+IkvR3PYDfWaVzYP1bgmkU= +Authentication-Results: imf12.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=ScyLwm63; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf12.hostedemail.com: domain of + 3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3FQbGYgYKCFYMIN5yC4CC492.0CA96BIL-AA8Jy08.CF4@flex--yuzhao.bounces.google.com +X-Stat-Signature: u9s859meeaiiqe5mxswoozqtkc4fepwx +X-Rspamd-Queue-Id: 0D92C40033 +X-Rspamd-Server: rspam05 +X-Rspam-User: +X-HE-Tag: 1657144853-88353 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Some architectures support the accessed bit in non-leaf PMD entries, +e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it +as part of linear address translation [1]. Page table walkers that +clear the accessed bit may use this capability to reduce their search +space. + +Note that: +1. Although an inline function is preferable, this capability is added + as a configuration option for consistency with the existing macros. +2. Due to the little interest in other varieties, this capability was + only tested on Intel and AMD CPUs. + +Thanks to the following developers for their efforts [2][3]. + Randy Dunlap + Stephen Rothwell + +[1]: Intel 64 and IA-32 Architectures Software Developer's Manual + Volume 3 (June 2021), section 4.8 +[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/ +[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/ + +Signed-off-by: Yu Zhao +Reviewed-by: Barry Song +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + arch/Kconfig | 8 ++++++++ + arch/x86/Kconfig | 1 + + arch/x86/include/asm/pgtable.h | 3 ++- + arch/x86/mm/pgtable.c | 5 ++++- + include/linux/pgtable.h | 4 ++-- + 5 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index fcf9a41a4ef5..eaeec187bd6a 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1403,6 +1403,14 @@ config DYNAMIC_SIGFRAME + config HAVE_ARCH_NODE_DEV_GROUP + bool + ++config ARCH_HAS_NONLEAF_PMD_YOUNG ++ bool ++ help ++ Architectures that select this option are capable of setting the ++ accessed bit in non-leaf PMD entries when using them as part of linear ++ address translations. Page table walkers that clear the accessed bit ++ may use this capability to reduce their search space. 
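To make the intended use concrete: a page-table walker that clears the accessed bit can test the non-leaf PMD first and skip the 512 PTEs underneath it when the bit is clear. The sketch below is an illustration only, not code from this series; pmdp_test_and_clear_young() and CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG are taken from the patches, while the surrounding helper and its name are assumptions.

/*
 * Hypothetical helper: decide whether the PTE table under a non-leaf PMD is
 * worth scanning. With CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG (e.g., on x86), the
 * CPU sets the accessed bit in the non-leaf PMD when walking through it, so a
 * clear bit means none of the PTEs below can have been used since the last
 * clearing and the whole table can be skipped.
 */
static bool pmd_table_maybe_young(struct vm_area_struct *vma,
				  unsigned long addr, pmd_t *pmdp)
{
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
	if (!pmdp_test_and_clear_young(vma, addr, pmdp))
		return false;	/* prune: skip all 512 PTEs in this table */
#endif
	return true;		/* otherwise every PTE has to be inspected */
}

Without the capability, the helper degrades to "always scan", which is exactly the search-space reduction the help text above refers to.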
++ + source "kernel/gcov/Kconfig" + + source "scripts/gcc-plugins/Kconfig" +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index be0b95e51df6..5715111abe13 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -85,6 +85,7 @@ config X86 + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_DEVMAP if X86_64 + select ARCH_HAS_PTE_SPECIAL ++ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2 + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 + select ARCH_HAS_COPY_MC if X86_64 + select ARCH_HAS_SET_MEMORY +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index dc5f7d8ef68a..5059799bebe3 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) + + static inline int pmd_bad(pmd_t pmd) + { +- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != ++ (_KERNPG_TABLE & ~_PAGE_ACCESSED); + } + + static inline unsigned long pages_to_mb(unsigned long npg) +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index a932d7712d85..8525f2876fb4 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, + return ret; + } + +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { +@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, + + return ret; + } ++#endif ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) + { +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index 8eee31bc9bde..9c57c5cc49c2 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + #endif + + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + BUILD_BUG(); + return 0; + } +-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ + #endif + + #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH + +From patchwork Wed Jul 6 22:00:12 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908701 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 05277CCA481 + for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 7E4186B0074; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 76ECC6B0075; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 524486B0078; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org 
+Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com + [216.40.44.11]) + by kanga.kvack.org (Postfix) with ESMTP id 3E4B36B0074 + for ; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay06.hostedemail.com (Postfix) with ESMTP id 0F88934416 + for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) +X-FDA: 79658045754.31.374F01B +Received: from mail-yw1-f201.google.com (mail-yw1-f201.google.com + [209.85.128.201]) + by imf23.hostedemail.com (Postfix) with ESMTP id 7B5CF140071 + for ; Wed, 6 Jul 2022 22:00:55 +0000 (UTC) +Received: by mail-yw1-f201.google.com with SMTP id + 00721157ae682-31c9a49a1a8so63943167b3.9 + for ; Wed, 06 Jul 2022 15:00:55 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=xRTO2a+j5NrVJKtKeScRWRKKBSTrjcMS5t6hiKMzj7E=; + b=jaYbReZJ8uDDLbii1xwhzvdsu6n9p9fFeOoX3rMWV4HRFwikqu+fxkANqP9J1hGdR2 + NeJtlffRYWnnwdndS5aG1Db183fv4nEfSDNZk5Aw1GhS0DDV+irZrJ4sR+RBQ0mlRL0F + PCWg0VVitxpZ5yzJzYAkEO4uHOjww0Tjni9prrUmk4iDUdAeuQHZsQYSGRbR+cGm4i8w + k7/vbxWbkPS/YQ/tq51SCEZjr+bTsFRcUYhsaDMMVhgqvpvMmhh84viZjp9G9W/MZCVp + lhJy7B/1ym1XZ9aYTn0gi9sgQDfh0ksvuw/1a5ib9CO1DG9/pvF0LoK/EKm8nNJ/pZyy + kAfA== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=xRTO2a+j5NrVJKtKeScRWRKKBSTrjcMS5t6hiKMzj7E=; + b=idSd06MTounv54fIacoxrI0eQJJSRslStkggX8APKQWu6w4H6jnTCQKfjZteTt+gtC + gAOrWhpfQgtNIatPBQliE6gNhZivKVIon5hkSCuin7ffPXxx98Zr4xXmYyyQFey9eZEP + bYnELAeg+MpzknUvWf0bHlIZA6PwqWxz1eWAYDCV4dPEApBGDNhC+aYkMFf9EqdJdmDs + 9FEeSbXKN25LUaLiOpKh0js2kdqX4Aukk0uqlH6XX+ZNfNNe0hkK/+OhTievM+jCt5TK + p/+QW8aqDk5Qbxin3B3Tq7oWWNyUaM/L/r7o7nuPHq1VWWbGPqj0YyKzEZLz9+OsyhJD + mEbg== +X-Gm-Message-State: AJIora8tXy61OEQgPe+vKTfstDk3naLrxXDSbXHWqk7IopwvdFEH4Cef + nkdLlvLaW7zOebfRuKKHBEK+WcRl9M8= +X-Google-Smtp-Source: + AGRyM1slRLC7YoaOAJX582u+iAc9V/TxbhI1Hoxnov8FBUQD9MbHWxCLjXxj9TQ7JPp2TjaJTr9hNbuL1Ok= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a05:6902:1206:b0:66e:6e93:366c with SMTP id + s6-20020a056902120600b0066e6e93366cmr11968955ybu.59.1657144854840; Wed, 06 + Jul 2022 15:00:54 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:12 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-4-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 03/14] mm/vmscan.c: refactor shrink_node() +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Barry Song , Miaohe Lin , + Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + 
Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144855; a=rsa-sha256; + cv=none; + b=C5Z9j3O02mimmj7Fw9J8fsWNjRE+Y/gPxdz9C+kmFqM/2BgzBkrX0GAW0hgI7dHEM8924m + HqfJJSqt0XV/+xhpN2q4jK0T+02nB4EEXRK//o2vKS+/FvUwh/ucVjQrLyDiacFK43TXI6 + NfK2zkhL3Ol0W61EEn2HSK05MjQkolc= +ARC-Authentication-Results: i=1; + imf23.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=jaYbReZJ; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf23.hostedemail.com: domain of + 3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com + designates 209.85.128.201 as permitted sender) + smtp.mailfrom=3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144855; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=xRTO2a+j5NrVJKtKeScRWRKKBSTrjcMS5t6hiKMzj7E=; + b=8oACpQ7ksfTXalrWOXw+va/sQyeVexNkPm99hYfcs2rccyQJRt+TczrAEF/1Dx80ZM4U1c + tQU/+fYGnG8sEjmePqrrHye0U6E7JvxV6YqmuCDVUUaIEEgYqMC0KfEir3FNalMA6JhauV + vCylGdyHJmqBqvDVl9PD0HpFhXgtW3U= +Authentication-Results: imf23.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=jaYbReZJ; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf23.hostedemail.com: domain of + 3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com + designates 209.85.128.201 as permitted sender) + smtp.mailfrom=3FgbGYgYKCFcNJO6zD5DD5A3.1DBA7CJM-BB9Kz19.DG5@flex--yuzhao.bounces.google.com +X-Stat-Signature: ogjqd7prxk1deq49x3znr781rgid8amh +X-Rspamd-Queue-Id: 7B5CF140071 +X-Rspamd-Server: rspam05 +X-Rspam-User: +X-HE-Tag: 1657144855-543148 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +This patch refactors shrink_node() to improve readability for the +upcoming changes to mm/vmscan.c. + +Signed-off-by: Yu Zhao +Reviewed-by: Barry Song +Reviewed-by: Miaohe Lin +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + mm/vmscan.c | 198 +++++++++++++++++++++++++++------------------------- + 1 file changed, 104 insertions(+), 94 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index f7d9a683e3a7..fddb9bd3c6c2 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2670,6 +2670,109 @@ enum scan_balance { + SCAN_FILE, + }; + ++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long file; ++ struct lruvec *target_lruvec; ++ ++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); ++ ++ /* ++ * Flush the memory cgroup stats, so that we read accurate per-memcg ++ * lruvec stats for heuristics. ++ */ ++ mem_cgroup_flush_stats(); ++ ++ /* ++ * Determine the scan balance between anon and file LRUs. 
++ */ ++ spin_lock_irq(&target_lruvec->lru_lock); ++ sc->anon_cost = target_lruvec->anon_cost; ++ sc->file_cost = target_lruvec->file_cost; ++ spin_unlock_irq(&target_lruvec->lru_lock); ++ ++ /* ++ * Target desirable inactive:active list ratios for the anon ++ * and file LRU lists. ++ */ ++ if (!sc->force_deactivate) { ++ unsigned long refaults; ++ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_ANON); ++ if (refaults != target_lruvec->refaults[0] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) ++ sc->may_deactivate |= DEACTIVATE_ANON; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_ANON; ++ ++ /* ++ * When refaults are being observed, it means a new ++ * workingset is being established. Deactivate to get ++ * rid of any stale active pages quickly. ++ */ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_FILE); ++ if (refaults != target_lruvec->refaults[1] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) ++ sc->may_deactivate |= DEACTIVATE_FILE; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_FILE; ++ } else ++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; ++ ++ /* ++ * If we have plenty of inactive file pages that aren't ++ * thrashing, try to reclaim those first before touching ++ * anonymous pages. ++ */ ++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); ++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) ++ sc->cache_trim_mode = 1; ++ else ++ sc->cache_trim_mode = 0; ++ ++ /* ++ * Prevent the reclaimer from falling into the cache trap: as ++ * cache pages start out inactive, every cache fault will tip ++ * the scan balance towards the file LRU. And as the file LRU ++ * shrinks, so does the window for rotation from references. ++ * This means we have a runaway feedback loop where a tiny ++ * thrashing file LRU becomes infinitely more attractive than ++ * anon pages. Try to detect this based on file LRU size. ++ */ ++ if (!cgroup_reclaim(sc)) { ++ unsigned long total_high_wmark = 0; ++ unsigned long free, anon; ++ int z; ++ ++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); ++ file = node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE); ++ ++ for (z = 0; z < MAX_NR_ZONES; z++) { ++ struct zone *zone = &pgdat->node_zones[z]; ++ ++ if (!managed_zone(zone)) ++ continue; ++ ++ total_high_wmark += high_wmark_pages(zone); ++ } ++ ++ /* ++ * Consider anon: if that's low too, this isn't a ++ * runaway file reclaim problem, but rather just ++ * extreme pressure. Reclaim as per usual then. ++ */ ++ anon = node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ sc->file_is_tiny = ++ file + free <= total_high_wmark && ++ !(sc->may_deactivate & DEACTIVATE_ANON) && ++ anon >> sc->priority; ++ } ++} ++ + /* + * Determine how aggressively the anon and file LRU lists should be + * scanned. +@@ -3138,109 +3241,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; +- unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + again: +- /* +- * Flush the memory cgroup stats, so that we read accurate per-memcg +- * lruvec stats for heuristics. +- */ +- mem_cgroup_flush_stats(); +- + memset(&sc->nr, 0, sizeof(sc->nr)); + + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + +- /* +- * Determine the scan balance between anon and file LRUs. 
+- */ +- spin_lock_irq(&target_lruvec->lru_lock); +- sc->anon_cost = target_lruvec->anon_cost; +- sc->file_cost = target_lruvec->file_cost; +- spin_unlock_irq(&target_lruvec->lru_lock); +- +- /* +- * Target desirable inactive:active list ratios for the anon +- * and file LRU lists. +- */ +- if (!sc->force_deactivate) { +- unsigned long refaults; +- +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_ANON); +- if (refaults != target_lruvec->refaults[0] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) +- sc->may_deactivate |= DEACTIVATE_ANON; +- else +- sc->may_deactivate &= ~DEACTIVATE_ANON; +- +- /* +- * When refaults are being observed, it means a new +- * workingset is being established. Deactivate to get +- * rid of any stale active pages quickly. +- */ +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_FILE); +- if (refaults != target_lruvec->refaults[1] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) +- sc->may_deactivate |= DEACTIVATE_FILE; +- else +- sc->may_deactivate &= ~DEACTIVATE_FILE; +- } else +- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; +- +- /* +- * If we have plenty of inactive file pages that aren't +- * thrashing, try to reclaim those first before touching +- * anonymous pages. +- */ +- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) +- sc->cache_trim_mode = 1; +- else +- sc->cache_trim_mode = 0; +- +- /* +- * Prevent the reclaimer from falling into the cache trap: as +- * cache pages start out inactive, every cache fault will tip +- * the scan balance towards the file LRU. And as the file LRU +- * shrinks, so does the window for rotation from references. +- * This means we have a runaway feedback loop where a tiny +- * thrashing file LRU becomes infinitely more attractive than +- * anon pages. Try to detect this based on file LRU size. +- */ +- if (!cgroup_reclaim(sc)) { +- unsigned long total_high_wmark = 0; +- unsigned long free, anon; +- int z; +- +- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); +- file = node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE); +- +- for (z = 0; z < MAX_NR_ZONES; z++) { +- struct zone *zone = &pgdat->node_zones[z]; +- if (!managed_zone(zone)) +- continue; +- +- total_high_wmark += high_wmark_pages(zone); +- } +- +- /* +- * Consider anon: if that's low too, this isn't a +- * runaway file reclaim problem, but rather just +- * extreme pressure. Reclaim as per usual then. 
+- */ +- anon = node_page_state(pgdat, NR_INACTIVE_ANON); +- +- sc->file_is_tiny = +- file + free <= total_high_wmark && +- !(sc->may_deactivate & DEACTIVATE_ANON) && +- anon >> sc->priority; +- } ++ prepare_scan_count(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + + +From patchwork Wed Jul 6 22:00:13 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908702 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id E07C1C43334 + for ; Wed, 6 Jul 2022 22:00:59 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id E767D6B0075; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id E26676B0078; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id CC6F16B007B; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com + [216.40.44.11]) + by kanga.kvack.org (Postfix) with ESMTP id B2AAC6B0075 + for ; Wed, 6 Jul 2022 18:00:57 -0400 (EDT) +Received: from smtpin09.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay11.hostedemail.com (Postfix) with ESMTP id 94CBE80B22 + for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) +X-FDA: 79658045754.09.169939E +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf07.hostedemail.com (Postfix) with ESMTP id 3DA0B4002F + for ; Wed, 6 Jul 2022 22:00:57 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + l6-20020a25bf86000000b00668c915a3f2so12477298ybk.4 + for ; Wed, 06 Jul 2022 15:00:56 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=GyLn4alIs4ulph3wPn6kG/6c2qN7BZlAW/LU7V/wtB8=; + b=fqvYzFJd6diatCK9xOi35jB4AbI1jOxd0dc3zbIWXBRd/oZCSL2ChL+LrZ+NDYE03d + TIPGwoUneWvzbc4OXeOfpb0FtGxmdhwy/nlPnMgq+BH+J79K/39lDuK/WznYk1HI+hzN + zL7bsRal3Q8YUC5jRMId0XoVcP/vuEU/M54E4rAJ15EBntL/F6yfHEySvrSBBtWZhnt0 + 90gyXGuo//w+Jc0ez+vgTHQxHk3TDIFEvyNKpltir9acA6/j0jGHYEfhC/r1UrED+Tt8 + m1PcqYkXSdSfGsO4GbojXKICNGmqT0/82l34NKy0jmCO9o+gJUnrEIDeiTyPT8jYjdXn + eGJQ== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=GyLn4alIs4ulph3wPn6kG/6c2qN7BZlAW/LU7V/wtB8=; + b=acqAzpv5+5bfyJLdmlUD3HKE74OO89v1YBTzb7kB54GormxgfqmSVm1QB75bSQFjxy + iRQ9yyQgUru50WX+ppP3B5N0K0edy9kKYXC5SmAf9PX7QbH0T3UYfJi5KDO5H7cptgny + 6VYcZXgdQ/ammtN/NFjOwImJ1NyoXMVSMkwXMJFoH/hDV1+/EGHNkG40d3ui2cNlIGer + 8oJrkMcYwG1L0yl5Lv1F9vCPZCaUecfxMZFvc0McrXP6BtB3ww3KPyipUsx80uRBu4PN + j9j1OZPr59Q+bLCF+TDiL77E2CPhZvzXxkw0VDq91eoHiQQwNRK+035yqGfK0i2derto + 4qrg== +X-Gm-Message-State: AJIora8TvaxLFo7c8jNCAkmO2MPTlGSpywySY2xnOvzKMd8WGYtTu0Lg + H/Iowy+yks3pdn4k6EN6JEFx/cAzaSo= +X-Google-Smtp-Source: + AGRyM1vs1GQqyrE7d9mjEL5MTBaBKoftkODnUWU3nZauu0DiGFKm6nQgOePB+L8kJ6BOwhlufj40Jzp1R4k= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a25:d043:0:b0:66e:31d6:4606 with SMTP id + 
h64-20020a25d043000000b0066e31d64606mr25539292ybg.241.1657144856519; Wed, 06 + Jul 2022 15:00:56 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:13 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-5-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 04/14] Revert "include/linux/mm_inline.h: fold + __update_lru_size() into its sole caller" +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Miaohe Lin , Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144857; a=rsa-sha256; + cv=none; + b=VytvHlKkiiUMJbwpI1Paeu5xydng7JksWkoUmOEgZClwpKUFRcoyC2S4kA6s+p3bljEt2Y + 85v0iGMF2ImZomoiEOeODI88v8cnakz2h3vV0KOYBRLWcUp1MoAAGc9/CatS2RZcyEZAsJ + 16N+Z8RlZvTLC+lg1BEaccSkBnce95w= +ARC-Authentication-Results: i=1; + imf07.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=fqvYzFJd; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf07.hostedemail.com: domain of + 3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144857; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=GyLn4alIs4ulph3wPn6kG/6c2qN7BZlAW/LU7V/wtB8=; + b=eWydr8xqhEMOpzfboenYQahizL48uc/GlcGbQBWuOSIiHMGD4xSQylZi9Tf6qFNtPbIcfn + a/FGlP15AWqiWFjYLC/dUvNrh+6vOaN3WEAagxoFslzh2bv3QfjEGhMrlwdaaPWaec5Ive + jXzUgyTbXLI8/pbmo1scWGTYiETc8FI= +X-Rspam-User: +X-Rspamd-Server: rspam02 +X-Rspamd-Queue-Id: 3DA0B4002F +Authentication-Results: imf07.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=fqvYzFJd; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf07.hostedemail.com: domain of + 3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3GAbGYgYKCFkPLQ81F7FF7C5.3FDC9ELO-DDBM13B.FI7@flex--yuzhao.bounces.google.com +X-Stat-Signature: 6b6krqahmtizrdq5upykdpctczw885w8 +X-HE-Tag: 1657144857-930305 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +This patch undoes the following refactor: +commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller") + +The upcoming changes to 
include/linux/mm_inline.h will reuse +__update_lru_size(). + +Signed-off-by: Yu Zhao +Reviewed-by: Miaohe Lin +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + include/linux/mm_inline.h | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 7b25b53c474a..fb8aadb81cd6 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -34,7 +34,7 @@ static inline int page_is_file_lru(struct page *page) + return folio_is_file_lru(page_folio(page)); + } + +-static __always_inline void update_lru_size(struct lruvec *lruvec, ++static __always_inline void __update_lru_size(struct lruvec *lruvec, + enum lru_list lru, enum zone_type zid, + long nr_pages) + { +@@ -43,6 +43,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + lru, nr_pages); ++} ++ ++static __always_inline void update_lru_size(struct lruvec *lruvec, ++ enum lru_list lru, enum zone_type zid, ++ long nr_pages) ++{ ++ __update_lru_size(lruvec, lru, zid, nr_pages); + #ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); + #endif + +From patchwork Wed Jul 6 22:00:14 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908703 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 8DA63CCA481 + for ; Wed, 6 Jul 2022 22:01:01 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 005E26B007D; Wed, 6 Jul 2022 18:01:00 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id E9B606B007B; Wed, 6 Jul 2022 18:00:59 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id CC4B46B007D; Wed, 6 Jul 2022 18:00:59 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0010.hostedemail.com + [216.40.44.10]) + by kanga.kvack.org (Postfix) with ESMTP id BA0E36B0078 + for ; Wed, 6 Jul 2022 18:00:59 -0400 (EDT) +Received: from smtpin23.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay06.hostedemail.com (Postfix) with ESMTP id 7E56E34725 + for ; Wed, 6 Jul 2022 22:00:59 +0000 (UTC) +X-FDA: 79658045838.23.084DC6A +Received: from mail-io1-f74.google.com (mail-io1-f74.google.com + [209.85.166.74]) + by imf02.hostedemail.com (Postfix) with ESMTP id B44F280009 + for ; Wed, 6 Jul 2022 22:00:58 +0000 (UTC) +Received: by mail-io1-f74.google.com with SMTP id + h7-20020a05660224c700b0067898a33ceaso3543048ioe.13 + for ; Wed, 06 Jul 2022 15:00:58 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=y74IoQ1Un60Xq7yBx41XqudQ9pnNmGNgLv+0SiGV5r4=; + b=p293qdy+AJ1NK8wVIFYa38QTJD9CsNtfxZWrFxc99swgPytMvFTFgMhkjdcKezzZie + 
yrDLuqEO4g2bHuYcfru6gtGl/vlEBzugJSUw9t9SSuHD0KPbwuSBuj6k/Z4E6o/3VSjs + nmEwp3FaQzQrq+AvQ75NBZLJcjJnu2S/L2SRP5n2jtLL27l7UQfJTw+nlDEN61Y6wnKm + cTbYVguOwFUEjdFi2ghze0M0n87A9CNsBCyQHS9wRzczRWbW6m+LMwO/fsge9KEjZcyq + WUlwLSCnJuEi3hDOUrhrpLVnbT1LO6KIzff4/TXK4ud4HZ+BORPfFQeF2zBQpAIt8foH + VdwQ== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=y74IoQ1Un60Xq7yBx41XqudQ9pnNmGNgLv+0SiGV5r4=; + b=EWxosmI9v8pE6BcCI05EJUbjYKXIh1pLe5c6N0so7Z1m/PNwBo99ulR0RkBk+8Fmsr + 84tyZLy+Pyf35k8XxobhAfMup7WIk+mjRYfkHpt7/gxC3CR3vJEm+WqI+pDZ614pUCR3 + N6ibQFwTKBqpvRJDUwhkC5n8ePOAIkrRwZz7JGXj4eiWWJSZGxUDhwqV9gi7CHQfo8Lr + yHt/gyUcmJDvTu8Fy8aP7r187IjoODs5rbqKu518ZAL20ceKmq+HT3FFv02CyDgkXObe + H8JjcI1Ovt/TvJlosala45+Ckpmt3TNX1+aCLmAaarDpkTxNHVYvWWYlylLQp+itl3t2 + Fj5w== +X-Gm-Message-State: AJIora8xQHdkFAa4pcN+RMWZYlVPfQhLR90DF0MW6/oxA9WDgXVAJA1y + HfRDCS36QboLTpSfrlTvo6hF0/eqWKQ= +X-Google-Smtp-Source: + AGRyM1vTPigokpkBMxkuw/ymV5qWW3cjnNF2AOB7Hi8viYhEQm+kOAzrEtDgBoJ1BwoaUWa5EKU0D3T6qsI= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a05:6638:2114:b0:33e:8e12:e5ee with SMTP id + n20-20020a056638211400b0033e8e12e5eemr22734068jaj.281.1657144858015; Wed, 06 + Jul 2022 15:00:58 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:14 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-6-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 05/14] mm: multi-gen LRU: groundwork +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144858; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=y74IoQ1Un60Xq7yBx41XqudQ9pnNmGNgLv+0SiGV5r4=; + b=oTaGMrRapdp+kh/gHJqHBTnDMs7aDxVAwUqI4ZEVOcywjneTv7eRea/YKenPJ3SUgTezbZ + dNcy3RS0hdffVNFHBdEbaanNSV29TaH7bgFF9LgENwvVZGMsc8+qWj5aOjTjXDI7lU9B66 + y9zhOk8IIuWtM+JWdB8zZv2w73QNkgI= +ARC-Authentication-Results: i=1; + imf02.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=p293qdy+; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf02.hostedemail.com: domain of + 3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com + designates 209.85.166.74 as permitted sender) + 
smtp.mailfrom=3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144858; a=rsa-sha256; + cv=none; + b=5jGHjx/lCilMy07izrrxr4gRusLHe7TVfK6eNjlXnYmVVS2CdTSjlX6iI6cgO5jY/6Otqp + 7rnHHbSwj6t4vUkRkbfhWehDTUsU9TXEcEaZ8NHjLgX8tJZID/D2dcfA1Z/Ae/1iB6tbQa + vuAWajsuByCUT6SlkfXfwe+TOdR4BNI= +X-Rspamd-Server: rspam11 +X-Rspam-User: +X-Stat-Signature: gxdxu955m74iz4kx3hhyt9yhtcgejep5 +X-Rspamd-Queue-Id: B44F280009 +Authentication-Results: imf02.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=p293qdy+; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf02.hostedemail.com: domain of + 3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com + designates 209.85.166.74 as permitted sender) + smtp.mailfrom=3GgbGYgYKCFsRNSA3H9HH9E7.5HFEBGNQ-FFDO35D.HK9@flex--yuzhao.bounces.google.com +X-HE-Tag: 1657144858-162393 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Evictable pages are divided into multiple generations for each lruvec. +The youngest generation number is stored in lrugen->max_seq for both +anon and file types as they are aged on an equal footing. The oldest +generation numbers are stored in lrugen->min_seq[] separately for anon +and file types as clean file pages can be evicted regardless of swap +constraints. These three variables are monotonically increasing. + +Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits +in order to fit into the gen counter in folio->flags. Each truncated +generation number is an index to lrugen->lists[]. The sliding window +technique is used to track at least MIN_NR_GENS and at most +MAX_NR_GENS generations. The gen counter stores a value within [1, +MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it +stores 0. + +There are two conceptually independent procedures: "the aging", which +produces young generations, and "the eviction", which consumes old +generations. They form a closed-loop system, i.e., "the page reclaim". +Both procedures can be invoked from userspace for the purposes of +working set estimation and proactive reclaim. These techniques are +commonly used to optimize job scheduling (bin packing) in data +centers [1][2]. + +To avoid confusion, the terms "hot" and "cold" will be applied to the +multi-gen LRU, as a new convention; the terms "active" and "inactive" +will be applied to the active/inactive LRU, as usual. + +The protection of hot pages and the selection of cold pages are based +on page access channels and patterns. There are two access channels: +one through page tables and the other through file descriptors. The +protection of the former channel is by design stronger because: +1. The uncertainty in determining the access patterns of the former + channel is higher due to the approximation of the accessed bit. +2. The cost of evicting the former channel is higher due to the TLB + flushes required and the likelihood of encountering the dirty bit. +3. The penalty of underprotecting the former channel is higher because + applications usually do not prepare themselves for major page + faults like they do for blocked I/O. E.g., GUI applications + commonly use dedicated I/O threads to avoid blocking rendering + threads. +There are also two access patterns: one with temporal locality and the +other without. 
For the reasons listed above, the former channel is +assumed to follow the former pattern unless VM_SEQ_READ or +VM_RAND_READ is present; the latter channel is assumed to follow the +latter pattern unless outlying refaults have been observed [3][4]. + +The next patch will address the "outlying refaults". Three macros, +i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are +added in this patch to make the entire patchset less diffy. + +A page is added to the youngest generation on faulting. The aging +needs to check the accessed bit at least twice before handing this +page over to the eviction. The first check takes care of the accessed +bit set on the initial fault; the second check makes sure this page +has not been used since then. This protocol, AKA second chance, +requires a minimum of two generations, hence MIN_NR_GENS. + +[1] https://dl.acm.org/doi/10.1145/3297858.3304053 +[2] https://dl.acm.org/doi/10.1145/3503222.3507731 +[3] https://lwn.net/Articles/495543/ +[4] https://lwn.net/Articles/815342/ + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + fs/fuse/dev.c | 3 +- + include/linux/mm.h | 2 + + include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++ + include/linux/mmzone.h | 100 +++++++++++++++++ + include/linux/page-flags-layout.h | 13 ++- + include/linux/page-flags.h | 4 +- + include/linux/sched.h | 4 + + kernel/bounds.c | 5 + + mm/Kconfig | 8 ++ + mm/huge_memory.c | 3 +- + mm/memcontrol.c | 2 + + mm/memory.c | 25 +++++ + mm/mm_init.c | 6 +- + mm/mmzone.c | 2 + + mm/swap.c | 9 +- + mm/vmscan.c | 75 +++++++++++++ + 16 files changed, 423 insertions(+), 13 deletions(-) + +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index 0e537e580dc1..5d36015071d2 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -777,7 +777,8 @@ static int fuse_check_page(struct page *page) + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | +- 1 << PG_waiters))) { ++ 1 << PG_waiters | ++ LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } +diff --git a/include/linux/mm.h b/include/linux/mm.h +index cf3d0d673f6b..ed5393e5930d 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1060,6 +1060,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); + #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) + #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) ++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) ++#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) + + /* + * Define the bit shifts to access each section. 
For non-existent +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index fb8aadb81cd6..2ff703900fd0 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -40,6 +40,9 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, + { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ lockdep_assert_held(&lruvec->lru_lock); ++ WARN_ON_ONCE(nr_pages != (int)nr_pages); ++ + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + lru, nr_pages); +@@ -101,11 +104,177 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) + return lru; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return true; ++} ++ ++static inline bool lru_gen_in_fault(void) ++{ ++ return current->in_lru_fault; ++} ++ ++static inline int lru_gen_from_seq(unsigned long seq) ++{ ++ return seq % MAX_NR_GENS; ++} ++ ++static inline int folio_lru_gen(struct folio *folio) ++{ ++ unsigned long flags = READ_ONCE(folio->flags); ++ ++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) ++{ ++ unsigned long max_seq = lruvec->lrugen.max_seq; ++ ++ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); ++ ++ /* see the comment on MIN_NR_GENS */ ++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); ++} ++ ++static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, ++ int old_gen, int new_gen) ++{ ++ int type = folio_is_file_lru(folio); ++ int zone = folio_zonenum(folio); ++ int delta = folio_nr_pages(folio); ++ enum lru_list lru = type * LRU_INACTIVE_FILE; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); ++ ++ if (old_gen >= 0) ++ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], ++ lrugen->nr_pages[old_gen][type][zone] - delta); ++ if (new_gen >= 0) ++ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], ++ lrugen->nr_pages[new_gen][type][zone] + delta); ++ ++ /* addition */ ++ if (old_gen < 0) { ++ if (lru_gen_is_active(lruvec, new_gen)) ++ lru += LRU_ACTIVE; ++ __update_lru_size(lruvec, lru, zone, delta); ++ return; ++ } ++ ++ /* deletion */ ++ if (new_gen < 0) { ++ if (lru_gen_is_active(lruvec, old_gen)) ++ lru += LRU_ACTIVE; ++ __update_lru_size(lruvec, lru, zone, -delta); ++ return; ++ } ++} ++ ++static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) ++{ ++ unsigned long seq; ++ unsigned long flags; ++ int gen = folio_lru_gen(folio); ++ int type = folio_is_file_lru(folio); ++ int zone = folio_zonenum(folio); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); ++ ++ if (folio_test_unevictable(folio)) ++ return false; ++ /* ++ * There are three common cases for this page: ++ * 1. If it's hot, e.g., freshly faulted in or previously hot and ++ * migrated, add it to the youngest generation. ++ * 2. If it's cold but can't be evicted immediately, i.e., an anon page ++ * not in swapcache or a dirty page pending writeback, add it to the ++ * second oldest generation. ++ * 3. Everything else (clean, cold) is added to the oldest generation. 
++ */ ++ if (folio_test_active(folio)) ++ seq = lrugen->max_seq; ++ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || ++ (folio_test_reclaim(folio) && ++ (folio_test_dirty(folio) || folio_test_writeback(folio)))) ++ seq = lrugen->min_seq[type] + 1; ++ else ++ seq = lrugen->min_seq[type]; ++ ++ gen = lru_gen_from_seq(seq); ++ flags = (gen + 1UL) << LRU_GEN_PGOFF; ++ /* see the comment on MIN_NR_GENS about PG_active */ ++ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); ++ ++ lru_gen_update_size(lruvec, folio, -1, gen); ++ /* for folio_rotate_reclaimable() */ ++ if (reclaiming) ++ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); ++ else ++ list_add(&folio->lru, &lrugen->lists[gen][type][zone]); ++ ++ return true; ++} ++ ++static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) ++{ ++ unsigned long flags; ++ int gen = folio_lru_gen(folio); ++ ++ if (gen < 0) ++ return false; ++ ++ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); ++ ++ /* for folio_migrate_flags() */ ++ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; ++ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); ++ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ ++ lru_gen_update_size(lruvec, folio, gen, -1); ++ list_del(&folio->lru); ++ ++ return true; ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_in_fault(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static __always_inline + void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) + { + enum lru_list lru = folio_lru_list(folio); + ++ if (lru_gen_add_folio(lruvec, folio, false)) ++ return; ++ + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + if (lru != LRU_UNEVICTABLE) +@@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) + { + enum lru_list lru = folio_lru_list(folio); + ++ if (lru_gen_add_folio(lruvec, folio, true)) ++ return; ++ + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + /* This is not expected to be used on LRU_UNEVICTABLE */ +@@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) + { + enum lru_list lru = folio_lru_list(folio); + ++ if (lru_gen_del_folio(lruvec, folio, false)) ++ return; ++ + if (lru != LRU_UNEVICTABLE) + list_del(&folio->lru); + update_lru_size(lruvec, lru, folio_zonenum(folio), +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index aab70355d64f..c90c2282044e 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -314,6 +314,102 @@ enum lruvec_flags { + */ + }; + ++#endif /* !__GENERATING_BOUNDS_H */ ++ ++/* ++ * Evictable pages are divided into multiple generations. The youngest and the ++ * oldest generation numbers, max_seq and min_seq, are monotonically increasing. ++ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An ++ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the ++ * corresponding generation. 
The gen counter in folio->flags stores gen+1 while ++ * a page is on one of lrugen->lists[]. Otherwise it stores 0. ++ * ++ * A page is added to the youngest generation on faulting. The aging needs to ++ * check the accessed bit at least twice before handing this page over to the ++ * eviction. The first check takes care of the accessed bit set on the initial ++ * fault; the second check makes sure this page hasn't been used since then. ++ * This process, AKA second chance, requires a minimum of two generations, ++ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive ++ * LRU, e.g., /proc/vmstat, these two generations are considered active; the ++ * rest of generations, if they exist, are considered inactive. See ++ * lru_gen_is_active(). ++ * ++ * PG_active is always cleared while a page is on one of lrugen->lists[] so that ++ * the aging needs not to worry about it. And it's set again when a page ++ * considered active is isolated for non-reclaiming purposes, e.g., migration. ++ * See lru_gen_add_folio() and lru_gen_del_folio(). ++ * ++ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the ++ * number of categories of the active/inactive LRU when keeping track of ++ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits ++ * in folio->flags. ++ */ ++#define MIN_NR_GENS 2U ++#define MAX_NR_GENS 4U ++ ++#ifndef __GENERATING_BOUNDS_H ++ ++struct lruvec; ++ ++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) ++#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) ++ ++#ifdef CONFIG_LRU_GEN ++ ++enum { ++ LRU_GEN_ANON, ++ LRU_GEN_FILE, ++}; ++ ++/* ++ * The youngest generation number is stored in max_seq for both anon and file ++ * types as they are aged on an equal footing. The oldest generation numbers are ++ * stored in min_seq[] separately for anon and file types as clean file pages ++ * can be evicted regardless of swap constraints. ++ * ++ * Normally anon and file min_seq are in sync. But if swapping is constrained, ++ * e.g., out of swap space, file min_seq is allowed to advance and leave anon ++ * min_seq behind. ++ * ++ * The number of pages in each generation is eventually consistent and therefore ++ * can be transiently negative. 
++ */ ++struct lru_gen_struct { ++ /* the aging increments the youngest generation number */ ++ unsigned long max_seq; ++ /* the eviction increments the oldest generation numbers */ ++ unsigned long min_seq[ANON_AND_FILE]; ++ /* the multi-gen LRU lists, lazily sorted on eviction */ ++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the multi-gen LRU sizes, eventually consistent */ ++ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++}; ++ ++void lru_gen_init_lruvec(struct lruvec *lruvec); ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg); ++void lru_gen_exit_memcg(struct mem_cgroup *memcg); ++#endif ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_lruvec(struct lruvec *lruvec) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) ++{ ++} ++#endif ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct lruvec { + struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ +@@ -331,6 +427,10 @@ struct lruvec { + unsigned long refaults[ANON_AND_FILE]; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; ++#ifdef CONFIG_LRU_GEN ++ /* evictable pages divided into generations */ ++ struct lru_gen_struct lrugen; ++#endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h +index ef1e3e736e14..240905407a18 100644 +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -55,7 +55,8 @@ + #define SECTIONS_WIDTH 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ ++ <= BITS_PER_LONG - NR_PAGEFLAGS + #define NODES_WIDTH NODES_SHIFT + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) + #error "Vmemmap: No space for nodes field in page flags" +@@ -89,8 +90,8 @@ + #define LAST_CPUPID_SHIFT 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ +- <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS + #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT + #else + #define LAST_CPUPID_WIDTH 0 +@@ -100,10 +101,12 @@ + #define LAST_CPUPID_NOT_IN_PAGE_FLAGS + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ +- > BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS + #error "Not enough bits in page flags" + #endif + ++#define LRU_REFS_WIDTH 0 ++ + #endif + #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index e66f7aa3191d..8d466d724852 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -1059,7 +1059,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_active | \ +- 1UL << PG_unevictable | __PG_MLOCKED) ++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + + /* + * Flags checked when a page is prepped for return by the page allocator. 
+@@ -1070,7 +1070,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) + * alloc-free cycle to prevent from reusing the page. + */ + #define PAGE_FLAGS_CHECK_AT_PREP \ +- (PAGEFLAGS_MASK & ~__PG_HWPOISON) ++ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) + + #define PAGE_FLAGS_PRIVATE \ + (1UL << PG_private | 1UL << PG_private_2) +diff --git a/include/linux/sched.h b/include/linux/sched.h +index c46f3a63b758..744340a96ace 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -912,6 +912,10 @@ struct task_struct { + #ifdef CONFIG_MEMCG + unsigned in_user_fault:1; + #endif ++#ifdef CONFIG_LRU_GEN ++ /* whether the LRU algorithm may apply to this access */ ++ unsigned in_lru_fault:1; ++#endif + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif +diff --git a/kernel/bounds.c b/kernel/bounds.c +index 9795d75b09b2..5ee60777d8e4 100644 +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -22,6 +22,11 @@ int main(void) + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + #endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); ++#ifdef CONFIG_LRU_GEN ++ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); ++#else ++ DEFINE(LRU_GEN_WIDTH, 0); ++#endif + /* End of constants */ + + return 0; +diff --git a/mm/Kconfig b/mm/Kconfig +index 169e64192e48..cee109f3128a 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1130,6 +1130,14 @@ config PTE_MARKER_UFFD_WP + purposes. It is required to enable userfaultfd write protection on + file-backed memory types like shmem and hugetlbfs. + ++config LRU_GEN ++ bool "Multi-Gen LRU" ++ depends on MMU ++ # make sure folio->flags has enough spare bits ++ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP ++ help ++ A high performance LRU implementation to overcommit memory. 
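The generation bookkeeping introduced by the groundwork patch can be checked with a standalone calculation: lru_gen_from_seq() is simply seq % MAX_NR_GENS, the two generations closest to max_seq are the ones reported as "active", and folio->flags stores gen + 1 so that 0 can mean "not on a multi-gen LRU list", which is why kernel/bounds.c sizes the field as order_base_2(MAX_NR_GENS + 1) = 3 bits. The program below only illustrates that arithmetic; the window values are made up and nothing in it comes from the kernel tree.

#include <stdio.h>

#define MAX_NR_GENS 4UL

/* mirrors lru_gen_from_seq() from the patch: gen indexes lrugen->lists[] */
static unsigned long gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

int main(void)
{
	unsigned long max_seq = 7, min_seq = 4;	/* assumed window of 4 generations */
	unsigned long seq;

	for (seq = min_seq; seq <= max_seq; seq++) {
		unsigned long gen = gen_from_seq(seq);
		/* see lru_gen_is_active(): the two youngest generations are active */
		int active = gen == gen_from_seq(max_seq) ||
			     gen == gen_from_seq(max_seq - 1);

		printf("seq %lu -> gen %lu, stored in folio->flags as %lu (%s)\n",
		       seq, gen, gen + 1, active ? "active" : "inactive");
	}
	return 0;
}

With this window, seq 6 and 7 map to the two active generations (second chance, MIN_NR_GENS = 2), while seq 4 and 5 are the eviction candidates.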
++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 834f288b3769..5500583e35b8 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2370,7 +2370,8 @@ static void __split_huge_page_tail(struct page *head, int tail, + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_REFS_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 618c366a2f07..7d58e8a73ece 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5105,6 +5105,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) + + static void mem_cgroup_free(struct mem_cgroup *memcg) + { ++ lru_gen_exit_memcg(memcg); + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); + } +@@ -5163,6 +5164,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) + memcg->deferred_split_queue.split_queue_len = 0; + #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ lru_gen_init_memcg(memcg); + return memcg; + fail: + mem_cgroup_id_remove(memcg); +diff --git a/mm/memory.c b/mm/memory.c +index 49500390b91b..85d3961c2bd5 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5091,6 +5091,27 @@ static inline void mm_account_fault(struct pt_regs *regs, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); + } + ++#ifdef CONFIG_LRU_GEN ++static void lru_gen_enter_fault(struct vm_area_struct *vma) ++{ ++ /* the LRU algorithm doesn't apply to sequential or random reads */ ++ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); ++} ++ ++static void lru_gen_exit_fault(void) ++{ ++ current->in_lru_fault = false; ++} ++#else ++static void lru_gen_enter_fault(struct vm_area_struct *vma) ++{ ++} ++ ++static void lru_gen_exit_fault(void) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * By the time we get here, we already hold the mm semaphore + * +@@ -5122,11 +5143,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + ++ lru_gen_enter_fault(vma); ++ + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); + ++ lru_gen_exit_fault(); ++ + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 9ddaf0e1b0ab..0d7b2bd2454a 100644 +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_REFS_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +diff --git a/mm/mmzone.c b/mm/mmzone.c +index 0ae7571e35ab..68e1511be12d 100644 +--- a/mm/mmzone.c ++++ b/mm/mmzone.c +@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec) + * Poison its list head, so that any 
operations on it would crash. + */ + list_del(&lruvec->lists[LRU_UNEVICTABLE]); ++ ++ lru_gen_init_lruvec(lruvec); + } + + #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +diff --git a/mm/swap.c b/mm/swap.c +index 034bb24879a3..b062729b340f 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -460,6 +460,11 @@ void folio_add_lru(struct folio *folio) + VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + ++ /* see the comment in lru_gen_add_folio() */ ++ if (lru_gen_enabled() && !folio_test_unevictable(folio) && ++ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) ++ folio_set_active(folio); ++ + folio_get(folio); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); +@@ -551,7 +556,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) + + static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) + { +- if (PageActive(page) && !PageUnevictable(page)) { ++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + int nr_pages = thp_nr_pages(page); + + del_page_from_lru_list(page, lruvec); +@@ -666,7 +671,7 @@ void deactivate_file_folio(struct folio *folio) + */ + void deactivate_page(struct page *page) + { +- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { ++ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); +diff --git a/mm/vmscan.c b/mm/vmscan.c +index fddb9bd3c6c2..1fcc0feed985 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2992,6 +2992,81 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + return can_demote(pgdat->node_id, sc); + } + ++#ifdef CONFIG_LRU_GEN ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) { ++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; ++ ++ /* for hotadd_new_pgdat() */ ++ if (!lruvec->pgdat) ++ lruvec->pgdat = pgdat; ++ ++ return lruvec; ++ } ++#endif ++ VM_WARN_ON_ONCE(!mem_cgroup_disabled()); ++ ++ return pgdat ? 
&pgdat->__lruvec : NULL; ++} ++ ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_lruvec(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++ ++void lru_gen_exit_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, ++ sizeof(lruvec->lrugen.nr_pages))); ++ } ++} ++#endif ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ ++ return 0; ++}; ++late_initcall(init_lru_gen); ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; + +From patchwork Wed Jul 6 22:00:15 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908704 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id BC3F5C43334 + for ; Wed, 6 Jul 2022 22:01:04 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 12E3C8E0002; Wed, 6 Jul 2022 18:01:02 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 0B84D8E0001; Wed, 6 Jul 2022 18:01:02 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id DAFB38E0002; Wed, 6 Jul 2022 18:01:01 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0016.hostedemail.com + [216.40.44.16]) + by kanga.kvack.org (Postfix) with ESMTP id C1CFC8E0001 + for ; Wed, 6 Jul 2022 18:01:01 -0400 (EDT) +Received: from smtpin27.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay02.hostedemail.com (Postfix) with ESMTP id 8F7AA33A54 + for ; Wed, 6 Jul 2022 22:01:01 +0000 (UTC) +X-FDA: 79658045922.27.E9F2FD6 +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf27.hostedemail.com (Postfix) with ESMTP id C38524001B + for ; Wed, 6 Jul 2022 22:01:00 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + u17-20020a258411000000b0066dfb22644eso11129264ybk.6 + for ; Wed, 06 Jul 2022 15:01:00 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=y0ZxSqiIOv2HRYm553wZrJx5fChLkGPPbLO1qwZgmyQ=; + b=k4czIYvx4CiuCTGm0ZE5CP3ROAwcGkVPLViBUVhaVvkR7uaNKMq35oiGoZrpr9wmyA + 3m25Gt55w07/Zl+RDxl25UcbFclUuv1IhW8RxSswLcgrHkQRPfvrY4sHXWvh8Zx9tcVy + 57vPZrwMAdg5KxxrjfPcq/qdHGTF/uyJnTdFe8v4GztZ5hfTrusX1wVVySS9zGZ/5Iow + Nd9yluqy3C3Vy/90KJx2guGDz9MOF3sU6l1ICpYZ9vNR6C8Rq/+pMVqKsY9lUtmogcQ9 + 4GYcy0Nvop1G8oE5zpjlPJBv9NQtnMO9nw2qaCn4RWoOH37nG4jPSXNMIBpa8zn061RW + FgQg== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + 
h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=y0ZxSqiIOv2HRYm553wZrJx5fChLkGPPbLO1qwZgmyQ=; + b=363aZyaQrxgNeIHPuTaRjAavjP4F8EO5sILVoIfz7F8ymEnUrJ32Mjc0rSEnA9jAM9 + iqLFSFIFzIMWUuTmljy1fFNDL7A2uNdDlNrJRCZ/gZAbXFjDT2j5Dl8E8XzpIRlJl7vn + ZRDzjDR9sVo08B1nyi1AhFP4nb1L2qE8qvTpXkzENYGDSZu7h/AALKU8/CiZpj9hkDwo + lzh5wc2ycnx5mXpDF9ieinUDPgG+YeeoSleAk0FgtV+zyB5xkdIruPNpSueQff+ct/G8 + 5c1VriWHEfaNMcJkuREKnILndD3W/O2w5mkcChSKsKDm9/nd8r2q/EJUgq2xca+u+CAl + jEVQ== +X-Gm-Message-State: AJIora/mFkYkqX5X5k4lKDoDCm8/beFZvhHqMOoV25In9oaB8n7Wpsnu + TVd/VrxMHywvnAlU0/ugaxgGqWgltsw= +X-Google-Smtp-Source: + AGRyM1sRDxtae2IlBgXPvJfXEts8Wxw8Va1kZtVIMGzblX4Mg8zS6Ie6RM5yT6WBMCN4GAE5u4jJ09Jf3oM= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a81:ab4d:0:b0:31c:8655:2207 with SMTP id + d13-20020a81ab4d000000b0031c86552207mr26036050ywk.389.1657144860068; Wed, 06 + Jul 2022 15:01:00 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:15 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-7-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 06/14] mm: multi-gen LRU: minimal implementation +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144861; a=rsa-sha256; + cv=none; + b=oTd2WxyeO8ccfm0UBIeIMD+jFWftz29Vc+53VsIdewSZAb8/4ceMzzXxauEqqrmAUtsLQ4 + sWFeVaIcSbnT8ZbgPae4FumiKT2ISp4qcEqBL74ek6P+YSnzhBoTUB4RYYRJ4JqS5sa2rW + hk5QljWrRnJjE4lY/D16EloP8YSx7T8= +ARC-Authentication-Results: i=1; + imf27.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=k4czIYvx; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf27.hostedemail.com: domain of + 3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144861; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=y0ZxSqiIOv2HRYm553wZrJx5fChLkGPPbLO1qwZgmyQ=; + b=wTs1b9ocf9FcHn9gYIlmuegnIgTo09PHZ8oYnB8j0wXjHhk0Al+NkNixxAvIfaCH4rMsxI + ErhpOzYCe9rwuJ5BAQvblyNUvN2Y5/i9ASXhp2bGy5PaMkTpI8OeOqjiGL9EQQonR3t7UB + j3QLmoVYs9VO0LxxgVoIQEv9nGf6zf8= +X-Stat-Signature: 653crzffxniht38wad94goaie9ebgi3z 
+X-Rspam-User: +X-Rspamd-Server: rspam12 +X-Rspamd-Queue-Id: C38524001B +Authentication-Results: imf27.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=k4czIYvx; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf27.hostedemail.com: domain of + 3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3HAbGYgYKCF0TPUC5JBJJBG9.7JHGDIPS-HHFQ57F.JMB@flex--yuzhao.bounces.google.com +X-HE-Tag: 1657144860-126552 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +To avoid confusion, the terms "promotion" and "demotion" will be +applied to the multi-gen LRU, as a new convention; the terms +"activation" and "deactivation" will be applied to the active/inactive +LRU, as usual. + +The aging produces young generations. Given an lruvec, it increments +max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging +promotes hot pages to the youngest generation when it finds them +accessed through page tables; the demotion of cold pages happens +consequently when it increments max_seq. Promotion in the aging path +does not involve any LRU list operations, only the updates of the gen +counter and lrugen->nr_pages[]; demotion, unless as the result of the +increment of max_seq, requires LRU list operations, e.g., +lru_deactivate_fn(). The aging has the complexity O(nr_hot_pages), +since it is only interested in hot pages. + +The eviction consumes old generations. Given an lruvec, it increments +min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes +empty. A feedback loop modeled after the PID controller monitors +refaults over anon and file types and decides which type to evict when +both types are available from the same generation. + +The protection of pages accessed multiple times through file +descriptors takes place in the eviction path. Each generation is +divided into multiple tiers. A page accessed N times through file +descriptors is in tier order_base_2(N). Tiers do not have dedicated +lrugen->lists[], only bits in folio->flags. The aforementioned +feedback loop also monitors refaults over all tiers and decides when +to protect pages in which tiers (N>1), using the first tier (N=0,1) as +a baseline. The first tier contains single-use unmapped clean pages, +which are most likely the best choices. In contrast to promotion in +the aging path, the protection of a page in the eviction path is +achieved by moving this page to the next generation, i.e., min_seq+1, +if the feedback loop decides so. This approach has the following +advantages: +1. It removes the cost of activation in the buffered access path by + inferring whether pages accessed multiple times through file + descriptors are statistically hot and thus worth protecting in the + eviction path. +2. It takes pages accessed through page tables into account and avoids + overprotecting pages accessed multiple times through file + descriptors. (Pages accessed through page tables are in the first + tier, since N=0.) +3. More tiers provide better protection for pages accessed more than + twice through file descriptors, when under heavy buffered I/O + workloads. 
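
To make the tier arithmetic concrete, here is a minimal standalone C sketch (not part of the kernel changes in this patch) that mirrors the relationship between the access count N, the refs value kept in folio->flags (accesses beyond PG_referenced, i.e., N-1), and the resulting tier order_base_2(N), as implemented by folio_lru_refs() and lru_tier_from_refs() introduced below; the local order_base_2() helper stands in for the kernel macro of the same name, and in the kernel refs is further capped by LRU_REFS_WIDTH so tiers saturate at MAX_NR_TIERS-1.

  /* standalone illustration; order_base_2() stands in for the kernel macro */
  #include <stdio.h>

  static int order_base_2(unsigned long n)
  {
          int order = 0;

          /* smallest order such that (1UL << order) >= n */
          while ((1UL << order) < n)
                  order++;
          return order;
  }

  /* refs = accesses beyond PG_referenced, i.e., N-1 for N >= 1 */
  static int lru_tier_from_refs(int refs)
  {
          return order_base_2(refs + 1);
  }

  int main(void)
  {
          int n;

          for (n = 1; n <= 8; n++)
                  printf("N=%d accesses -> refs=%d -> tier %d\n",
                         n, n - 1, lru_tier_from_refs(n - 1));
          return 0;
  }

Running it prints tier 0 for N=1, tier 1 for N=2, tier 2 for N=3..4 and tier 3 for N=5..8, which is why the first tier (N=0,1) serves as the baseline and higher tiers only exist for pages accessed more than once through file descriptors.
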
+ +Server benchmark results: + Single workload: + fio (buffered I/O): +[30, 32]% + IOPS BW + 5.19-rc1: 2673k 10.2GiB/s + patch1-6: 3491k 13.3GiB/s + + Single workload: + memcached (anon): -[4, 6]% + Ops/sec KB/sec + 5.19-rc1: 1161501.04 45177.25 + patch1-6: 1106168.46 43025.04 + + Configurations: + CPU: two Xeon 6154 + Mem: total 256G + + Node 1 was only used as a ram disk to reduce the variance in the + results. + + patch drivers/block/brd.c < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE; + > page = alloc_pages_node(1, gfp_flags, 0); + EOF + + cat >>/etc/systemd/system.conf <>/etc/memcached.conf </sys/fs/cgroup/user.slice/test/memory.max + echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs + fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \ + --buffered=1 --ioengine=io_uring --iodepth=128 \ + --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ + --rw=randread --random_distribution=random --norandommap \ + --time_based --ramp_time=10m --runtime=5m --group_reporting + + cat memcached.sh + modprobe brd rd_nr=1 rd_size=113246208 + swapoff -a + mkswap /dev/ram0 + swapon /dev/ram0 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \ + --ratio 1:0 --pipeline 8 -d 2000 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \ + --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed + +Client benchmark results: + kswapd profiles: + 5.19-rc1 + 40.33% page_vma_mapped_walk (overhead) + 21.80% lzo1x_1_do_compress (real work) + 7.53% do_raw_spin_lock + 3.95% _raw_spin_unlock_irq + 2.52% vma_interval_tree_iter_next + 2.37% folio_referenced_one + 2.28% vma_interval_tree_subtree_search + 1.97% anon_vma_interval_tree_iter_first + 1.60% ptep_clear_flush + 1.06% __zram_bvec_write + + patch1-6 + 39.03% lzo1x_1_do_compress (real work) + 18.47% page_vma_mapped_walk (overhead) + 6.74% _raw_spin_unlock_irq + 3.97% do_raw_spin_lock + 2.49% ptep_clear_flush + 2.48% anon_vma_interval_tree_iter_first + 1.92% folio_referenced_one + 1.88% __zram_bvec_write + 1.48% memmove + 1.31% vma_interval_tree_iter_next + + Configurations: + CPU: single Snapdragon 7c + Mem: total 4G + + Chrome OS MemoryPressure [1] + +[1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/ + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + include/linux/mm_inline.h | 36 ++ + include/linux/mmzone.h | 41 ++ + include/linux/page-flags-layout.h | 5 +- + kernel/bounds.c | 2 + + mm/Kconfig | 11 + + mm/swap.c | 39 ++ + mm/vmscan.c | 810 +++++++++++++++++++++++++++++- + mm/workingset.c | 110 +++- + 8 files changed, 1044 insertions(+), 10 deletions(-) + +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index 2ff703900fd0..f2b2296a42f9 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -121,6 +121,33 @@ static inline int lru_gen_from_seq(unsigned long seq) + return seq % MAX_NR_GENS; + } + ++static inline int lru_hist_from_seq(unsigned long seq) ++{ ++ return seq % NR_HIST_GENS; ++} ++ ++static inline int 
lru_tier_from_refs(int refs) ++{ ++ VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); ++ ++ /* see the comment in folio_lru_refs() */ ++ return order_base_2(refs + 1); ++} ++ ++static inline int folio_lru_refs(struct folio *folio) ++{ ++ unsigned long flags = READ_ONCE(folio->flags); ++ bool workingset = flags & BIT(PG_workingset); ++ ++ /* ++ * Return the number of accesses beyond PG_referenced, i.e., N-1 if the ++ * total number of accesses is N>1, since N=0,1 both map to the first ++ * tier. lru_tier_from_refs() will account for this off-by-one. Also see ++ * the comment on MAX_NR_TIERS. ++ */ ++ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; ++} ++ + static inline int folio_lru_gen(struct folio *folio) + { + unsigned long flags = READ_ONCE(folio->flags); +@@ -173,6 +200,15 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli + __update_lru_size(lruvec, lru, zone, -delta); + return; + } ++ ++ /* promotion */ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ __update_lru_size(lruvec, lru, zone, -delta); ++ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ /* demotion requires isolation, e.g., lru_deactivate_fn() */ ++ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); + } + + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index c90c2282044e..0d76222501ed 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -347,6 +347,28 @@ enum lruvec_flags { + #define MIN_NR_GENS 2U + #define MAX_NR_GENS 4U + ++/* ++ * Each generation is divided into multiple tiers. A page accessed N times ++ * through file descriptors is in tier order_base_2(N). A page in the first tier ++ * (N=0,1) is marked by PG_referenced unless it was faulted in through page ++ * tables or read ahead. A page in any other tier (N>1) is marked by ++ * PG_referenced and PG_workingset. This implies a minimum of two tiers is ++ * supported without using additional bits in folio->flags. ++ * ++ * In contrast to moving across generations which requires the LRU lock, moving ++ * across tiers only involves atomic operations on folio->flags and therefore ++ * has a negligible cost in the buffered access path. In the eviction path, ++ * comparisons of refaulted/(evicted+protected) from the first tier and the ++ * rest infer whether pages accessed multiple times through file descriptors ++ * are statistically hot and thus worth protecting. ++ * ++ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the ++ * number of categories of the active/inactive LRU when keeping track of ++ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in ++ * folio->flags. ++ */ ++#define MAX_NR_TIERS 4U ++ + #ifndef __GENERATING_BOUNDS_H + + struct lruvec; +@@ -361,6 +383,16 @@ enum { + LRU_GEN_FILE, + }; + ++#define MIN_LRU_BATCH BITS_PER_LONG ++#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) ++ ++/* whether to keep historical stats from evicted generations */ ++#ifdef CONFIG_LRU_GEN_STATS ++#define NR_HIST_GENS MAX_NR_GENS ++#else ++#define NR_HIST_GENS 1U ++#endif ++ + /* + * The youngest generation number is stored in max_seq for both anon and file + * types as they are aged on an equal footing. 
The oldest generation numbers are +@@ -383,6 +415,15 @@ struct lru_gen_struct { + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of evicted+protected */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the first tier doesn't need protection, hence the minus one */ ++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* can be modified without holding the LRU lock */ ++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); +diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h +index 240905407a18..7d79818dc065 100644 +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -106,7 +106,10 @@ + #error "Not enough bits in page flags" + #endif + +-#define LRU_REFS_WIDTH 0 ++/* see the comment on MAX_NR_TIERS */ ++#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \ ++ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ ++ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) + + #endif + #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ +diff --git a/kernel/bounds.c b/kernel/bounds.c +index 5ee60777d8e4..b529182e8b04 100644 +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -24,8 +24,10 @@ int main(void) + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); + #ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); ++ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2); + #else + DEFINE(LRU_GEN_WIDTH, 0); ++ DEFINE(__LRU_REFS_WIDTH, 0); + #endif + /* End of constants */ + +diff --git a/mm/Kconfig b/mm/Kconfig +index cee109f3128a..a93478acf341 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1130,6 +1130,7 @@ config PTE_MARKER_UFFD_WP + purposes. It is required to enable userfaultfd write protection on + file-backed memory types like shmem and hugetlbfs. + ++# multi-gen LRU { + config LRU_GEN + bool "Multi-Gen LRU" + depends on MMU +@@ -1138,6 +1139,16 @@ config LRU_GEN + help + A high performance LRU implementation to overcommit memory. + ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ Do not enable this option unless you plan to look at historical stats ++ from evicted generations for debugging purpose. ++ ++ This option has a per-memcg and per-node memory overhead. 
++# } ++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/swap.c b/mm/swap.c +index b062729b340f..67e7962fbacc 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -405,6 +405,40 @@ static void __lru_cache_activate_folio(struct folio *folio) + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void folio_inc_refs(struct folio *folio) ++{ ++ unsigned long new_flags, old_flags = READ_ONCE(folio->flags); ++ ++ if (folio_test_unevictable(folio)) ++ return; ++ ++ if (!folio_test_referenced(folio)) { ++ folio_set_referenced(folio); ++ return; ++ } ++ ++ if (!folio_test_workingset(folio)) { ++ folio_set_workingset(folio); ++ return; ++ } ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags & LRU_REFS_MASK; ++ if (new_flags == LRU_REFS_MASK) ++ break; ++ ++ new_flags += BIT(LRU_REFS_PGOFF); ++ new_flags |= old_flags & ~LRU_REFS_MASK; ++ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); ++} ++#else ++static void folio_inc_refs(struct folio *folio) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. + * +@@ -417,6 +451,11 @@ static void __lru_cache_activate_folio(struct folio *folio) + */ + void folio_mark_accessed(struct folio *folio) + { ++ if (lru_gen_enabled()) { ++ folio_inc_refs(folio); ++ return; ++ } ++ + if (!folio_test_referenced(folio)) { + folio_set_referenced(folio); + } else if (folio_test_unevictable(folio)) { +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 1fcc0feed985..f768d61e7b85 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1273,9 +1273,11 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, + + if (folio_test_swapcache(folio)) { + swp_entry_t swap = folio_swap_entry(folio); +- mem_cgroup_swapout(folio, swap); ++ ++ /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(folio, target_memcg); ++ mem_cgroup_swapout(folio, swap); + __delete_from_swap_cache(&folio->page, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_page(&folio->page, swap); +@@ -2675,6 +2677,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -2998,6 +3003,17 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + * shorthand helpers + ******************************************************************************/ + ++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ ++#define DEFINE_MAX_SEQ(lruvec) \ ++ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) ++ ++#define DEFINE_MIN_SEQ(lruvec) \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ ++ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ ++ } ++ + #define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ +@@ -3023,6 +3039,764 @@ static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int ni + return pgdat ? 
&pgdat->__lruvec : NULL; + } + ++static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ if (!can_demote(pgdat->node_id, sc) && ++ mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) ++ return 0; ++ ++ return mem_cgroup_swappiness(memcg); ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ /* see the comment on lru_gen_struct */ ++ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && ++ get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop based on Proportional-Integral-Derivative (PID) controller. ++ * ++ * The P term is refaulted/(evicted+protected) from a tier in the generation ++ * currently being evicted; the I term is the exponential moving average of the ++ * P term over the generations previously evicted, using the smoothing factor ++ * 1/2; the D term isn't supported. ++ * ++ * The setpoint (SP) is always the first tier of one type; the process variable ++ * (PV) is either any tier of the other type or any other tier of the same ++ * type. ++ * ++ * The error is the difference between the SP and the PV; the correction is to ++ * turn off protection when SP>PV or turn on protection when SPlrugen; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) ++{ ++ int hist, tier; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; ++ unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ ++ if (!carryover && !clear) ++ return; ++ ++ hist = lru_hist_from_seq(seq); ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ } ++ ++ if (clear) { ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++ } ++} ++ ++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) ++{ ++ /* ++ * Return true if the PV has a limited number of refaults or a lower ++ * refaulted/total than the SP. 
++ */ ++ return pv->refaulted < MIN_LRU_BATCH || ++ pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= ++ (sp->refaulted + 1) * pv->total * pv->gain; ++} ++ ++/****************************************************************************** ++ * the aging ++ ******************************************************************************/ ++ ++/* protect pages accessed multiple times through file descriptors */ ++static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) ++{ ++ int type = folio_is_file_lru(folio); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ unsigned long new_flags, old_flags = READ_ONCE(folio->flags); ++ ++ VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); ++ ++ do { ++ new_gen = (old_gen + 1) % MAX_NR_GENS; ++ ++ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); ++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ /* for folio_end_writeback() */ ++ if (reclaiming) ++ new_flags |= BIT(PG_reclaim); ++ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); ++ ++ lru_gen_update_size(lruvec, folio, old_gen, new_gen); ++ ++ return new_gen; ++} ++ ++static void inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ reset_ctrl_pos(lruvec, type, true); ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++} ++ ++static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) ++{ ++ int gen, type, zone; ++ bool success = false; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ ++ /* find the oldest populated generation */ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { ++ gen = lru_gen_from_seq(min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ goto next; ++ } ++ ++ min_seq[type]++; ++ } ++next: ++ ; ++ } ++ ++ /* see the comment on lru_gen_struct */ ++ if (can_swap) { ++ min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); ++ min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); ++ } ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ if (min_seq[type] == lrugen->min_seq[type]) ++ continue; ++ ++ reset_ctrl_pos(lruvec, type, true); ++ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); ++ success = true; ++ } ++ ++ return success; ++} ++ ++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) ++{ ++ int prev, next; ++ int type, zone; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ ++ if (max_seq != lrugen->max_seq) ++ goto unlock; ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) ++ continue; ++ ++ VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); ++ ++ inc_min_seq(lruvec, type); ++ } ++ ++ /* ++ * Update the active/inactive LRU sizes for compatibility. Both sides of ++ * the current max_seq need to be covered, since max_seq+1 can overlap ++ * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do ++ * overlap, cold/hot inversion happens. 
++ */ ++ prev = lru_gen_from_seq(lrugen->max_seq - 1); ++ next = lru_gen_from_seq(lrugen->max_seq + 1); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_INACTIVE_FILE; ++ long delta = lrugen->nr_pages[prev][type][zone] - ++ lrugen->nr_pages[next][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ __update_lru_size(lruvec, lru, zone, delta); ++ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); ++ } ++ } ++ ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_ctrl_pos(lruvec, type, false); ++ ++ /* make sure preceding modifications appear */ ++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); ++unlock: ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++static unsigned long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq, ++ unsigned long *min_seq, bool can_swap, bool *need_aging) ++{ ++ int gen, type, zone; ++ unsigned long old = 0; ++ unsigned long young = 0; ++ unsigned long total = 0; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ for (type = !can_swap; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ unsigned long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ total += size; ++ if (seq == max_seq) ++ young += size; ++ if (seq + MIN_NR_GENS == max_seq) ++ old += size; ++ } ++ } ++ ++ /* ++ * The aging tries to be lazy to reduce the overhead. On the other hand, ++ * the eviction stalls when the number of generations reaches ++ * MIN_NR_GENS. So ideally, there should be MIN_NR_GENS+1 generations, ++ * hence the first two if's. ++ * ++ * Also it's ideal to spread pages out evenly, meaning 1/(MIN_NR_GENS+1) ++ * of the total number of pages for each generation. A reasonable range ++ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The ++ * eviction cares about the lower bound of cold pages, whereas the aging ++ * cares about the upper bound of hot pages. ++ */ ++ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) ++ *need_aging = true; ++ else if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) ++ *need_aging = false; ++ else if (young * MIN_NR_GENS > total) ++ *need_aging = true; ++ else if (old * (MIN_NR_GENS + 2) < total) ++ *need_aging = true; ++ else ++ *need_aging = false; ++ ++ return total; ++} ++ ++static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool need_aging; ++ unsigned long nr_to_scan; ++ int swappiness = get_swappiness(lruvec, sc); ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_WARN_ON_ONCE(sc->memcg_low_reclaim); ++ ++ mem_cgroup_calculate_protection(NULL, memcg); ++ ++ if (mem_cgroup_below_min(memcg)) ++ return; ++ ++ nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, swappiness, &need_aging); ++ if (!nr_to_scan) ++ return; ++ ++ nr_to_scan >>= mem_cgroup_online(memcg) ? 
sc->priority : 0; ++ ++ if (nr_to_scan && need_aging) ++ inc_max_seq(lruvec, max_seq, swappiness); ++} ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg; ++ ++ VM_WARN_ON_ONCE(!current_is_kswapd()); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ ++ age_lruvec(lruvec, sc); ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++} ++ ++/****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) ++{ ++ bool success; ++ int gen = folio_lru_gen(folio); ++ int type = folio_is_file_lru(folio); ++ int zone = folio_zonenum(folio); ++ int delta = folio_nr_pages(folio); ++ int refs = folio_lru_refs(folio); ++ int tier = lru_tier_from_refs(refs); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); ++ ++ /* unevictable */ ++ if (!folio_evictable(folio)) { ++ success = lru_gen_del_folio(lruvec, folio, true); ++ VM_WARN_ON_ONCE_FOLIO(!success, folio); ++ folio_set_unevictable(folio); ++ lruvec_add_folio(lruvec, folio); ++ __count_vm_events(UNEVICTABLE_PGCULLED, delta); ++ return true; ++ } ++ ++ /* dirty lazyfree */ ++ if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { ++ success = lru_gen_del_folio(lruvec, folio, true); ++ VM_WARN_ON_ONCE_FOLIO(!success, folio); ++ folio_set_swapbacked(folio); ++ lruvec_add_folio_tail(lruvec, folio); ++ return true; ++ } ++ ++ /* protected */ ++ if (tier > tier_idx) { ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ gen = folio_inc_gen(lruvec, folio, false); ++ list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); ++ ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + delta); ++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); ++ return true; ++ } ++ ++ /* waiting for writeback */ ++ if (folio_test_locked(folio) || folio_test_writeback(folio) || ++ (type == LRU_GEN_FILE && folio_test_dirty(folio))) { ++ gen = folio_inc_gen(lruvec, folio, true); ++ list_move(&folio->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) ++{ ++ bool success; ++ ++ /* unmapping inhibited */ ++ if (!sc->may_unmap && folio_mapped(folio)) ++ return false; ++ ++ /* swapping inhibited */ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (folio_test_dirty(folio) || ++ (folio_test_anon(folio) && !folio_test_swapcache(folio)))) ++ return false; ++ ++ /* raced with release_pages() */ ++ if (!folio_try_get(folio)) ++ return false; ++ ++ /* raced with another isolation */ ++ if (!folio_test_clear_lru(folio)) { ++ folio_put(folio); ++ return false; ++ } ++ ++ /* see the comment on MAX_NR_TIERS */ ++ if (!folio_test_referenced(folio)) ++ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); ++ ++ /* for shrink_page_list() */ ++ folio_clear_reclaim(folio); ++ folio_clear_referenced(folio); ++ ++ success = lru_gen_del_folio(lruvec, folio, true); ++ VM_WARN_ON_ONCE_FOLIO(!success, folio); ++ ++ return true; ++} ++ ++static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, ++ int type, int tier, struct 
list_head *list) ++{ ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_LRU_BATCH; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_WARN_ON_ONCE(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return 0; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct folio *folio = lru_to_folio(head); ++ int delta = folio_nr_pages(folio); ++ ++ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); ++ ++ scanned += delta; ++ ++ if (sort_folio(lruvec, folio, tier)) ++ sorted += delta; ++ else if (isolate_folio(lruvec, folio, sc)) { ++ list_add(&folio->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&folio->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) ++ break; ++ } ++ ++ if (skipped) { ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ } ++ ++ if (!remaining || isolated >= MIN_LRU_BATCH) ++ break; ++ } ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ /* ++ * There might not be eligible pages due to reclaim_idx, may_unmap and ++ * may_writepage. Check the remaining to prevent livelock if it's not ++ * making progress. ++ */ ++ return isolated || !remaining ? scanned : 0; ++} ++ ++static int get_tier_idx(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct ctrl_pos sp, pv; ++ ++ /* ++ * To leave a margin for fluctuations, use a larger gain factor (1:2). ++ * This value is chosen because any other tier would have at least twice ++ * as many refaults as the first tier. ++ */ ++ read_ctrl_pos(lruvec, type, 0, 1, &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, 2, &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) ++{ ++ int type, tier; ++ struct ctrl_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare the first tier of anon with that of file to determine which ++ * type to scan. Also need to compare other tiers of the selected type ++ * with the first tier of the other type to determine the last tier (of ++ * the selected type) to evict. 
++ */ ++ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); ++ read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_idx = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ int *type_scanned, struct list_head *list) ++{ ++ int i; ++ int type; ++ int scanned; ++ int tier = -1; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ /* ++ * Try to make the obvious choice first. When anon and file are both ++ * available from the same generation, interpret swappiness 1 as file ++ * first and 200 as anon first. ++ */ ++ if (!swappiness) ++ type = LRU_GEN_FILE; ++ else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) ++ type = LRU_GEN_ANON; ++ else if (swappiness == 1) ++ type = LRU_GEN_FILE; ++ else if (swappiness == 200) ++ type = LRU_GEN_ANON; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ if (tier < 0) ++ tier = get_tier_idx(lruvec, type); ++ ++ scanned = scan_folios(lruvec, sc, type, tier, list); ++ if (scanned) ++ break; ++ ++ type = !type; ++ tier = -1; ++ } ++ ++ *type_scanned = type; ++ ++ return scanned; ++} ++ ++static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int type; ++ int scanned; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct folio *folio; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); ++ ++ scanned += try_to_inc_min_seq(lruvec, swappiness); ++ ++ if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) ++ scanned = 0; ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (list_empty(&list)) ++ return scanned; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ ++ list_for_each_entry(folio, &list, lru) { ++ /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ ++ if (folio_test_workingset(folio)) ++ folio_set_referenced(folio); ++ ++ /* don't add rejected pages to the oldest generation */ ++ if (folio_test_reclaim(folio) && ++ (folio_test_dirty(folio) || folio_test_writeback(folio))) ++ folio_clear_active(folio); ++ else ++ folio_set_active(folio); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++ ++ return scanned; ++} ++ ++static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, ++ bool can_swap, unsigned long reclaimed) ++{ ++ int priority; ++ bool need_aging; ++ unsigned long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ return 0; ++ ++ nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, &need_aging); ++ if (!nr_to_scan) ++ return 0; ++ ++ /* adjust priority if memcg is offline or the target is met */ ++ if (!mem_cgroup_online(memcg)) ++ priority = 0; ++ else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim) ++ priority = DEF_PRIORITY; ++ else ++ priority = sc->priority; ++ ++ nr_to_scan >>= priority; ++ if (!nr_to_scan) ++ return 0; ++ ++ if (!need_aging) ++ return nr_to_scan; ++ ++ /* skip the aging path at the default priority */ ++ if (priority == DEF_PRIORITY) ++ goto done; ++ ++ /* leave the work to lru_gen_age_node() */ ++ if (current_is_kswapd()) ++ return 0; ++ ++ inc_max_seq(lruvec, max_seq, can_swap); ++done: ++ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ unsigned long scanned = 0; ++ unsigned long reclaimed = sc->nr_reclaimed; ++ ++ lru_add_drain(); ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ int delta; ++ int swappiness; ++ unsigned long nr_to_scan; ++ ++ if (sc->may_swap) ++ swappiness = get_swappiness(lruvec, sc); ++ else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) ++ swappiness = 1; ++ else ++ swappiness = 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed); ++ if (!nr_to_scan) ++ break; ++ ++ delta = evict_folios(lruvec, sc, swappiness); ++ if (!delta) ++ break; ++ ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++} ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -3065,6 +3839,16 @@ static int __init init_lru_gen(void) + }; + late_initcall(init_lru_gen); + ++#else /* !CONFIG_LRU_GEN */ ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -3078,6 +3862,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + struct blk_plug plug; + bool scan_adjusted; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -3582,6 +4371,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) 
++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +@@ -3946,12 +4738,17 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + } + #endif + +-static void age_active_anon(struct pglist_data *pgdat, ++static void kswapd_age_node(struct pglist_data *pgdat, + struct scan_control *sc) + { + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } ++ + if (!can_age_anon_pages(pgdat, sc)) + return; + +@@ -4271,12 +5068,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) + sc.may_swap = !nr_boost_reclaim; + + /* +- * Do some background aging of the anon list, to give +- * pages a chance to be referenced before reclaiming. All +- * pages are rotated regardless of classzone as this is +- * about consistent aging. ++ * Do some background aging, to give pages a chance to be ++ * referenced before reclaiming. All pages are rotated ++ * regardless of classzone as this is about consistent aging. + */ +- age_active_anon(pgdat, &sc); ++ kswapd_age_node(pgdat, &sc); + + /* + * If we're getting trouble reclaiming, start doing writepage +diff --git a/mm/workingset.c b/mm/workingset.c +index 592569a8974c..84a9e0ab04ad 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static void *lru_gen_eviction(struct folio *folio) ++{ ++ int hist; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lru_gen_struct *lrugen; ++ int type = folio_is_file_lru(folio); ++ int delta = folio_nr_pages(folio); ++ int refs = folio_lru_refs(folio); ++ int tier = lru_tier_from_refs(refs); ++ struct mem_cgroup *memcg = folio_memcg(folio); ++ struct pglist_data *pgdat = folio_pgdat(folio); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->lrugen; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); ++ ++ hist = lru_hist_from_seq(min_seq); ++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); ++} ++ ++static void lru_gen_refault(struct folio *folio, void *shadow) ++{ ++ int hist, tier, refs; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lru_gen_struct *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = folio_is_file_lru(folio); ++ int delta = folio_nr_pages(folio); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ ++ if (pgdat != folio_pgdat(folio)) ++ return; ++ ++ rcu_read_lock(); ++ ++ memcg = folio_memcg_rcu(folio); ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto unlock; ++ ++ 
lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->lrugen; ++ ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ /* see the comment in folio_lru_refs() */ ++ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; ++ tier = lru_tier_from_refs(refs); ++ ++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); ++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); ++ ++ /* ++ * Count the following two cases as stalls: ++ * 1. For pages accessed through page tables, hotter pages pushed out ++ * hot pages which refaulted immediately. ++ * 2. For pages accessed multiple times through file descriptors, ++ * numbers of accesses might have been out of the range. ++ */ ++ if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) { ++ folio_set_workingset(folio); ++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static void *lru_gen_eviction(struct folio *folio) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct folio *folio, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +360,14 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(folio); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + return pack_shadow(memcgid, pgdat, eviction, + folio_test_workingset(folio)); +@@ -298,7 +398,13 @@ void workingset_refault(struct folio *folio, void *shadow) + int memcgid; + long nr; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(folio, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* + +From patchwork Wed Jul 6 22:00:16 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908705 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 14A6EC433EF + for ; Wed, 6 Jul 2022 22:01:08 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id B6A108E0003; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id AA0BB8E0001; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 91B3F8E0003; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0015.hostedemail.com + [216.40.44.15]) + by kanga.kvack.org (Postfix) with ESMTP id 7CF7B8E0001 + for ; Wed, 6 Jul 2022 18:01:04 -0400 (EDT) +Received: from smtpin10.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay01.hostedemail.com (Postfix) 
with ESMTP id 3A6B760CA7 + for ; Wed, 6 Jul 2022 22:01:04 +0000 (UTC) +X-FDA: 79658046048.10.E729B6A +Received: from mail-io1-f73.google.com (mail-io1-f73.google.com + [209.85.166.73]) + by imf12.hostedemail.com (Postfix) with ESMTP id 654DC40027 + for ; Wed, 6 Jul 2022 22:01:02 +0000 (UTC) +Received: by mail-io1-f73.google.com with SMTP id + k1-20020a5d8741000000b00678ad1103e7so2775981iol.21 + for ; Wed, 06 Jul 2022 15:01:02 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=XKxfVy7bUFxJEpcf3Ov7jSHxjMmYVb5gb0hRKtMPzZA=; + b=KBFBaieE6U899pZedfVW186dkzrS93jrjdIku8VfT3EELG4tmwSu4pbA8t8KgFrkX7 + PNLYIcjPRwCltpKZ41cpDa72lID2PMQjd0C5UzA1EP6Fozv39FS8efLoCNj0H5GROfCg + QMeGKWc2c6xuBh73e/hz1kG0ddQk8uDEqQzdd1hwg6GKOeAe0e98I4co7JiaxOzZQyVa + H3rcYT5ECNNWjJIqW6rJYkUeALUPQkQ6SiSCuxFVIHVt/LqtAYlBu4IfaEL80m1SvJmZ + XAzuOW4B/+BDlzSPXhxDXR3iWNFF0evXZaEn2Xyp6i9pgpXVhqsXUcbDrh/yv+aznbGp + vfZw== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=XKxfVy7bUFxJEpcf3Ov7jSHxjMmYVb5gb0hRKtMPzZA=; + b=wEDdfX1Ca1wRAghCjDtN/218Ar7YzHk1SswV9PWlO2rWdVGpEgzcJ4I72IScKmI1ak + byWww5GGZKkp2et7daVUaRaYUyNsN+JvzNTS4ZkA1+KUJp4sBdk6TL4F/+sKxhpfeXRW + 4O7rUvWmL/AcEhzhNNzOrk8NgMwLZJxEbmWumi3GZaaNwvSzvi/ZnQ4iy7QczcGNuoOD + vHTTjgYxcTl+FO55iQfNa8RXZ2EK68p8Q3s6TnE+enNd0MlFosYB8z+tz3T9tsSSAk7D + L5g2KMDag6shNFwrkU67N/AO/rmf0tvQvfgZZtmfM+fNoppSNpfEruaUg6sf+al7cZaq + Y9Jg== +X-Gm-Message-State: AJIora9J02U2PbT6ikgYVNH5lZ4bF/dPn/RAZ8KsVxnRgQbrFGcLQrwU + eoZuUGfIebQ11JrsGluVoFQjXZwzJak= +X-Google-Smtp-Source: + AGRyM1sRwdKObor4YUQFmQ9ta0XlmIxSt4ZUo1xITqK6sTS54jUHQ2ZrB8LtbtvMYYrpomN4w49bZKlSJRk= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a05:6e02:148c:b0:2dc:38ae:5c6a with SMTP id + n12-20020a056e02148c00b002dc38ae5c6amr2363805ilk.115.1657144861728; Wed, 06 + Jul 2022 15:01:01 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:16 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-8-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 07/14] mm: multi-gen LRU: exploit locality in rmap +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Barry Song , Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144862; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + 
content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=XKxfVy7bUFxJEpcf3Ov7jSHxjMmYVb5gb0hRKtMPzZA=; + b=78gzoYQUEpZ/3nhPL81S9IoTS+tamEn/8D7ioIFwlboSYOhcwIufnOyPh57lBQoFdANuof + SnLww4J7TveiCJa5kFHPwj8xzXM0ANKbJmf4o4cLIVitPhVH7z6V5EFfj457OWAKTjIo6b + NZ86RpRkjWLByNbszbZPPLUZQi27u4U= +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144862; a=rsa-sha256; + cv=none; + b=gbyvyPJZ1QVIBcx+YUE+JKq+Cj69MF+XU4E+AoEjDiVevGW0fLXZdcgIYKhIeTQ4VReTbP + TYy+UJJ7mp48jYOJ43EUlODLaxeez62GyJ6+OwE5GLOHlgIg1MIlrrlMmwrW3B3t4byGOx + 58gkmlSjFRcnFO6DMq3ACgJdURFm7Fo= +ARC-Authentication-Results: i=1; + imf12.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=KBFBaieE; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf12.hostedemail.com: domain of + 3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com + designates 209.85.166.73 as permitted sender) + smtp.mailfrom=3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com +X-Rspam-User: +Authentication-Results: imf12.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=KBFBaieE; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf12.hostedemail.com: domain of + 3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com + designates 209.85.166.73 as permitted sender) + smtp.mailfrom=3HQbGYgYKCF4UQVD6KCKKCHA.8KIHEJQT-IIGR68G.KNC@flex--yuzhao.bounces.google.com +X-Rspamd-Server: rspam06 +X-Rspamd-Queue-Id: 654DC40027 +X-Stat-Signature: upc9kmwzt4t6z55coguwsrw11efy9iir +X-HE-Tag: 1657144862-661235 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Searching the rmap for PTEs mapping each page on an LRU list (to test +and clear the accessed bit) can be expensive because pages from +different VMAs (PA space) are not cache friendly to the rmap (VA +space). For workloads mostly using mapped pages, searching the rmap +can incur the highest CPU cost in the reclaim path. + +This patch exploits spatial locality to reduce the trips into the +rmap. When shrink_page_list() walks the rmap and finds a young PTE, a +new function lru_gen_look_around() scans at most BITS_PER_LONG-1 +adjacent PTEs. On finding another young PTE, it clears the accessed +bit and updates the gen counter of the page mapped by this PTE to +(max_seq%MAX_NR_GENS)+1. 
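+
+As a rough illustration of the look-around described above (a userspace
+sketch, not part of the kernel changes in this patch): the scan window is
+pure address arithmetic, centered on the faulting address and clipped to
+the enclosing PMD range and the VMA.  The PAGE_SIZE and PMD span below are
+the common x86-64 values, and MIN_LRU_BATCH is assumed to equal
+BITS_PER_LONG; all three are assumptions made for the demo.
+
+/* demo of the window clamping used by lru_gen_look_around() */
+#include <stdio.h>
+
+#define PAGE_SIZE	4096UL
+#define PMD_SIZE	(512 * PAGE_SIZE)	/* 2 MiB on x86-64 */
+#define PMD_MASK	(~(PMD_SIZE - 1))
+#define MIN_LRU_BATCH	64UL			/* assumed: BITS_PER_LONG */
+
+static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
+static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
+
+static void scan_window(unsigned long addr, unsigned long vm_start,
+			unsigned long vm_end, unsigned long *startp,
+			unsigned long *endp)
+{
+	unsigned long start = max_ul(addr & PMD_MASK, vm_start);
+	unsigned long end = min_ul(addr | ~PMD_MASK, vm_end - 1) + 1;
+
+	/* clamp an oversized window to MIN_LRU_BATCH pages around addr */
+	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+			end = start + MIN_LRU_BATCH * PAGE_SIZE;
+		else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
+			start = end - MIN_LRU_BATCH * PAGE_SIZE;
+		else {
+			start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
+			end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
+		}
+	}
+	*startp = start;
+	*endp = end;
+}
+
+int main(void)
+{
+	unsigned long start, end;
+
+	/* a young PTE in the middle of a large VMA: the window is centered */
+	scan_window(0x7f0000150000UL, 0x7f0000000000UL, 0x7f0000400000UL,
+		    &start, &end);
+	printf("scan window: [%#lx, %#lx) = %lu pages\n",
+	       start, end, (end - start) / PAGE_SIZE);
+	return 0;
+}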
+ +Server benchmark results: + Single workload: + fio (buffered I/O): no change + + Single workload: + memcached (anon): +[3, 5]% + Ops/sec KB/sec + patch1-6: 1106168.46 43025.04 + patch1-7: 1147696.57 44640.29 + + Configurations: + no change + +Client benchmark results: + kswapd profiles: + patch1-6 + 39.03% lzo1x_1_do_compress (real work) + 18.47% page_vma_mapped_walk (overhead) + 6.74% _raw_spin_unlock_irq + 3.97% do_raw_spin_lock + 2.49% ptep_clear_flush + 2.48% anon_vma_interval_tree_iter_first + 1.92% folio_referenced_one + 1.88% __zram_bvec_write + 1.48% memmove + 1.31% vma_interval_tree_iter_next + + patch1-7 + 48.16% lzo1x_1_do_compress (real work) + 8.20% page_vma_mapped_walk (overhead) + 7.06% _raw_spin_unlock_irq + 2.92% ptep_clear_flush + 2.53% __zram_bvec_write + 2.11% do_raw_spin_lock + 2.02% memmove + 1.93% lru_gen_look_around + 1.56% free_unref_page_list + 1.40% memset + + Configurations: + no change + +Signed-off-by: Yu Zhao +Acked-by: Barry Song +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + include/linux/memcontrol.h | 31 +++++++ + include/linux/mm.h | 5 + + include/linux/mmzone.h | 6 ++ + mm/internal.h | 1 + + mm/memcontrol.c | 1 + + mm/rmap.c | 6 ++ + mm/swap.c | 4 +- + mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++ + 8 files changed, 236 insertions(+), 2 deletions(-) + +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 9ecead1042b9..9d0fea17f9ef 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -444,6 +444,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + * + * For a kmem folio a caller should hold an rcu read lock to protect memcg + * associated with a kmem folio from being released. +@@ -505,6 +506,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + * + * For a kmem page a caller should hold an rcu read lock to protect memcg + * associated with a kmem page from being released. 
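+
+The next hunk adds mem_cgroup_trylock_pages()/mem_cgroup_unlock_pages(),
+which lru_gen_look_around() further down relies on because
+folio_update_gen() requires a stable folio_memcg().  A loose userspace mock
+of that caller-side pattern follows; fake_memcg, the mock_* helpers and the
+C11 atomic are stand-ins chosen only to keep the sketch self-contained, not
+kernel API.
+
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+struct fake_memcg {
+	atomic_int moving_account;	/* nonzero while charges are moving */
+};
+
+static void mock_rcu_read_lock(void)   { }	/* no-ops in this demo */
+static void mock_rcu_read_unlock(void) { }
+
+/* succeed only when no one is moving pages between memcgs */
+static bool mock_trylock_pages(struct fake_memcg *memcg)
+{
+	mock_rcu_read_lock();
+
+	if (!atomic_load(&memcg->moving_account))
+		return true;	/* folio_memcg() stays stable for the caller */
+
+	mock_rcu_read_unlock();
+	return false;
+}
+
+static void mock_unlock_pages(void)
+{
+	mock_rcu_read_unlock();
+}
+
+int main(void)
+{
+	struct fake_memcg memcg = { 0 };
+
+	if (mock_trylock_pages(&memcg)) {
+		puts("stable: safe to batch generation updates");
+		mock_unlock_pages();
+	}
+
+	atomic_store(&memcg.moving_account, 1);
+	if (!mock_trylock_pages(&memcg))
+		puts("memcg is moving: bail out and retry later");
+	return 0;
+}
+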
+@@ -950,6 +952,23 @@ void unlock_page_memcg(struct page *page); + + void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); + ++/* try to stablize folio_memcg() for all the pages in a memcg */ ++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) ++{ ++ rcu_read_lock(); ++ ++ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) ++ return true; ++ ++ rcu_read_unlock(); ++ return false; ++} ++ ++static inline void mem_cgroup_unlock_pages(void) ++{ ++ rcu_read_unlock(); ++} ++ + /* idx can be of type enum memcg_stat_item or node_stat_item */ + static inline void mod_memcg_state(struct mem_cgroup *memcg, + int idx, int val) +@@ -1401,6 +1420,18 @@ static inline void folio_memcg_unlock(struct folio *folio) + { + } + ++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) ++{ ++ /* to match folio_memcg_rcu() */ ++ rcu_read_lock(); ++ return true; ++} ++ ++static inline void mem_cgroup_unlock_pages(void) ++{ ++ rcu_read_unlock(); ++} ++ + static inline void mem_cgroup_handle_over_high(void) + { + } +diff --git a/include/linux/mm.h b/include/linux/mm.h +index ed5393e5930d..981b2e447936 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1523,6 +1523,11 @@ static inline unsigned long folio_pfn(struct folio *folio) + return page_to_pfn(&folio->page); + } + ++static inline struct folio *pfn_folio(unsigned long pfn) ++{ ++ return page_folio(pfn_to_page(pfn)); ++} ++ + static inline atomic_t *folio_pincount_ptr(struct folio *folio) + { + return &folio_page(folio, 1)->compound_pincount; +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 0d76222501ed..4fd7fc16eeb4 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -372,6 +372,7 @@ enum lruvec_flags { + #ifndef __GENERATING_BOUNDS_H + + struct lruvec; ++struct page_vma_mapped_walk; + + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +@@ -427,6 +428,7 @@ struct lru_gen_struct { + }; + + void lru_gen_init_lruvec(struct lruvec *lruvec); ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); +@@ -439,6 +441,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) + { + } + ++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ + #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { +diff --git a/mm/internal.h b/mm/internal.h +index c0f8fbe0445b..3d070582052e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); + void folio_rotate_reclaimable(struct folio *folio); + bool __folio_end_writeback(struct folio *folio); + void deactivate_file_folio(struct folio *folio); ++void folio_activate(struct folio *folio); + + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 7d58e8a73ece..743f8513f1c3 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2777,6 +2777,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference ++ * - mem_cgroup_trylock_pages() + */ + folio->memcg_data = (unsigned long)memcg; + } +diff --git a/mm/rmap.c b/mm/rmap.c +index 5bcb334cd6f2..dce1a56b02f8 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -830,6 +830,12 @@ 
static bool folio_referenced_one(struct folio *folio, + } + + if (pvmw.pte) { ++ if (lru_gen_enabled() && pte_young(*pvmw.pte) && ++ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { ++ lru_gen_look_around(&pvmw); ++ referenced++; ++ } ++ + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* +diff --git a/mm/swap.c b/mm/swap.c +index 67e7962fbacc..131fc76242a3 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -342,7 +342,7 @@ static bool need_activate_page_drain(int cpu) + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; + } + +-static void folio_activate(struct folio *folio) ++void folio_activate(struct folio *folio) + { + if (folio_test_lru(folio) && !folio_test_active(folio) && + !folio_test_unevictable(folio)) { +@@ -362,7 +362,7 @@ static inline void activate_page_drain(int cpu) + { + } + +-static void folio_activate(struct folio *folio) ++void folio_activate(struct folio *folio) + { + struct lruvec *lruvec; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index f768d61e7b85..ec786fc556a7 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1574,6 +1574,11 @@ static unsigned int shrink_page_list(struct list_head *page_list, + if (!sc->may_unmap && folio_mapped(folio)) + goto keep_locked; + ++ /* folio_update_gen() tried to promote this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ folio_mapped(folio) && folio_test_referenced(folio)) ++ goto keep_locked; ++ + /* + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing +@@ -3161,6 +3166,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) + * the aging + ******************************************************************************/ + ++/* promote pages accessed through page tables */ ++static int folio_update_gen(struct folio *folio, int gen) ++{ ++ unsigned long new_flags, old_flags = READ_ONCE(folio->flags); ++ ++ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ do { ++ /* lru_gen_del_folio() has isolated this page? */ ++ if (!(old_flags & LRU_GEN_MASK)) { ++ /* for shrink_page_list() */ ++ new_flags = old_flags | BIT(PG_referenced); ++ continue; ++ } ++ ++ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); ++ ++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ + /* protect pages accessed multiple times through file descriptors */ + static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) + { +@@ -3172,6 +3200,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai + VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); + + do { ++ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ /* folio_update_gen() has promoted this page? 
*/ ++ if (new_gen >= 0 && new_gen != old_gen) ++ return new_gen; ++ + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); +@@ -3186,6 +3219,43 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai + return new_gen; + } + ++static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) ++{ ++ unsigned long pfn = pte_pfn(pte); ++ ++ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); ++ ++ if (!pte_present(pte) || is_zero_pfn(pfn)) ++ return -1; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) ++ return -1; ++ ++ if (WARN_ON_ONCE(!pfn_valid(pfn))) ++ return -1; ++ ++ return pfn; ++} ++ ++static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, ++ struct pglist_data *pgdat) ++{ ++ struct folio *folio; ++ ++ /* try to avoid unnecessary memory loads */ ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ return NULL; ++ ++ folio = pfn_folio(pfn); ++ if (folio_nid(folio) != pgdat->node_id) ++ return NULL; ++ ++ if (folio_memcg_rcu(folio) != memcg) ++ return NULL; ++ ++ return folio; ++} ++ + static void inc_min_seq(struct lruvec *lruvec, int type) + { + struct lru_gen_struct *lrugen = &lruvec->lrugen; +@@ -3387,6 +3457,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + } + ++/* ++ * This function exploits spatial locality when shrink_page_list() walks the ++ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. ++ */ ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++ int i; ++ pte_t *pte; ++ unsigned long start; ++ unsigned long end; ++ unsigned long addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ struct folio *folio = pfn_folio(pvmw->pfn); ++ struct mem_cgroup *memcg = folio_memcg(folio); ++ struct pglist_data *pgdat = folio_pgdat(folio); ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ DEFINE_MAX_SEQ(lruvec); ++ int old_gen, new_gen = lru_gen_from_seq(max_seq); ++ ++ lockdep_assert_held(pvmw->ptl); ++ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); ++ ++ if (spin_is_contended(pvmw->ptl)) ++ return; ++ ++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); ++ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ ++ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { ++ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ end = start + MIN_LRU_BATCH * PAGE_SIZE; ++ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) ++ start = end - MIN_LRU_BATCH * PAGE_SIZE; ++ else { ++ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; ++ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; ++ } ++ } ++ ++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ ++ rcu_read_lock(); ++ arch_enter_lazy_mmu_mode(); ++ ++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn; ++ ++ pfn = get_pte_pfn(pte[i], pvmw->vma, addr); ++ if (pfn == -1) ++ continue; ++ ++ if (!pte_young(pte[i])) ++ continue; ++ ++ folio = get_pfn_folio(pfn, memcg, pgdat); ++ if (!folio) ++ continue; ++ ++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ continue; ++ ++ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && ++ !(folio_test_anon(folio) && folio_test_swapbacked(folio) && ++ !folio_test_swapcache(folio))) ++ folio_mark_dirty(folio); ++ ++ old_gen = folio_lru_gen(folio); ++ if (old_gen < 0) ++ 
folio_set_referenced(folio); ++ else if (old_gen != new_gen) ++ __set_bit(i, bitmap); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ rcu_read_unlock(); ++ ++ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { ++ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { ++ folio = pfn_folio(pte_pfn(pte[i])); ++ folio_activate(folio); ++ } ++ return; ++ } ++ ++ /* folio_update_gen() requires stable folio_memcg() */ ++ if (!mem_cgroup_trylock_pages(memcg)) ++ return; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ ++ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { ++ folio = pfn_folio(pte_pfn(pte[i])); ++ if (folio_memcg_rcu(folio) != memcg) ++ continue; ++ ++ old_gen = folio_update_gen(folio, new_gen); ++ if (old_gen < 0 || old_gen == new_gen) ++ continue; ++ ++ lru_gen_update_size(lruvec, folio, old_gen, new_gen); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_unlock_pages(); ++} ++ + /****************************************************************************** + * the eviction + ******************************************************************************/ +@@ -3423,6 +3601,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) + return true; + } + ++ /* promoted */ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&folio->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ + /* protected */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + +From patchwork Wed Jul 6 22:00:17 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908709 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 2F4F3C433EF + for ; Wed, 6 Jul 2022 22:01:21 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id B67CF8E0008; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id A53EA8E0001; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 7BC5B8E0008; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com + [216.40.44.11]) + by kanga.kvack.org (Postfix) with ESMTP id 613D28E0001 + for ; Wed, 6 Jul 2022 18:01:11 -0400 (EDT) +Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay02.hostedemail.com (Postfix) with ESMTP id 3AABF33A6A + for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) +X-FDA: 79658046342.31.25FB448 +Received: from mail-yw1-f202.google.com (mail-yw1-f202.google.com + [209.85.128.202]) + by imf01.hostedemail.com (Postfix) with ESMTP id E1CB840019 + for ; Wed, 6 Jul 2022 22:01:04 +0000 (UTC) +Received: by mail-yw1-f202.google.com with SMTP id + 00721157ae682-31814f7654dso116292467b3.15 + for ; Wed, 06 Jul 2022 15:01:04 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=QRwOHdNvCJsdEWcZ8PiNBBmz8P6BxE21GfKaWd62Hcs=; + b=B0sFmQhPneIOAV1YVS6vP1oEsRe/BTwVflV6UdX0rzJbZE3r0MadNNURWxHDBukL1I + ELnHPWwed0WOAIrL8nftaw9ahABsSVQtJZPifYycg6l36RW7IRVZKE/FLzqQbao5lQVp + 
2lyTvVaA0fwTYrrOAkppMHFJS9NhtOwiPWkN8qczgGMF/wfTpEMLT1c3gwH7x0wTp0CA + bmGxHDwTUBtMQvnhn6ZHsn3tW2Mue+sW/jt4FZPTcsu1wgfJSmRfIgRB/FRZTem/MRn1 + s04RGx0yhTSGEtt8gc/smm1CW1G6xElKiEo1r8zVeztvFvFMntvooTqGlsQvsu1rVxNL + nxIA== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=QRwOHdNvCJsdEWcZ8PiNBBmz8P6BxE21GfKaWd62Hcs=; + b=c4bdMkLhxyvEnutnBA5XUiYftxDhTV6M0oPnBxTEWM2+ScjFG2RdUzrhfOZMxurWrf + sKZIm+7oIW+QIFcYwXv79hTW6tBrlW/YZfAQk5To3Rg3HYz9y6EONeJBeRq7VD3s7cDK + yCD5V7/rn1zPfpa7e5FCEQ3uaAAzJpmXH0yzMlJovObkLUf5/2H61vCu5Ss3s0nyUzu5 + PmBA7cbVlJg8w2iHFXSYVvkQw+nwkfZPYiZf6a7C4b5cBaaqSjFwp9R1Dj4Dmt/hyfqL + 9aSikv3Dqd00tRhmEqz7CFDN0nFe0RCoyF/1imT4h/wLfpY/PfqAblpzKLs1DVaRiIpK + 0dcg== +X-Gm-Message-State: AJIora/sVJDwRZqeywVvAnGpxiHdOH6QHZPRRxUF3BgVCxqUwhhWX/Sv + qRTmED85dcbhYCcgcB7NXYcxpzqLgic= +X-Google-Smtp-Source: + AGRyM1uf52B2LApundNg2J5h3sPxkMm3CEPkOlcVUbZUCRfPLLNdJaJY9XTnmessJ0elI3BxSTXeuJSaFn0= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a81:2f4c:0:b0:31c:2bee:dfa4 with SMTP id + v73-20020a812f4c000000b0031c2beedfa4mr47320138ywv.483.1657144863343; Wed, 06 + Jul 2022 15:01:03 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:17 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-9-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 08/14] mm: multi-gen LRU: support page table walks +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144870; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=QRwOHdNvCJsdEWcZ8PiNBBmz8P6BxE21GfKaWd62Hcs=; + b=yTdhrGd2Yn7SlvL67mHmk0coJxZY8xT17lW/ewc4fNMOsnnVS1sKnIvZPTnTcn0Fe+dccs + i2sOOxXkGXEhgV1hMozofaMLxhLPzFCWAEqHzOEcXyOK4AUM8ZYrXZOlIFqaID1et19+VY + 9DG+lIYPEo08J5Ku8PkMzTbLZN1d/1w= +ARC-Authentication-Results: i=1; + imf01.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=B0sFmQhP; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf01.hostedemail.com: domain of + 3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com + designates 209.85.128.202 as permitted sender) + smtp.mailfrom=3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com +ARC-Seal: 
i=1; s=arc-20220608; d=hostedemail.com; t=1657144870; a=rsa-sha256; + cv=none; + b=AqNzfeMgehbGAF0NaBoToCygtio3p/CKcEQ2XvVEAyt3GUO/NWzgGf3L1H/PYlFzCPzE7Z + USY6Zs44Owz7ybSkwmXNxexJwWitplxX8dRNKKzWXbZkJ3+tWRbprGyp/NLFp9NFcFhMkz + orvBVvz7eGVPFZ1+kb859dx9H/Ub2G4= +X-Rspam-User: +X-Rspamd-Server: rspam07 +Authentication-Results: imf01.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=B0sFmQhP; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf01.hostedemail.com: domain of + 3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com + designates 209.85.128.202 as permitted sender) + smtp.mailfrom=3HwbGYgYKCGAWSXF8MEMMEJC.AMKJGLSV-KKIT8AI.MPE@flex--yuzhao.bounces.google.com +X-Stat-Signature: z89omp4mfbgn9jqrf7gixf63n1ypp6j5 +X-Rspamd-Queue-Id: E1CB840019 +X-HE-Tag: 1657144864-40541 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +To further exploit spatial locality, the aging prefers to walk page +tables to search for young PTEs and promote hot pages. A kill switch +will be added in the next patch to disable this behavior. When +disabled, the aging relies on the rmap only. + +NB: this behavior has nothing similar with the page table scanning in +the 2.4 kernel [1], which searches page tables for old PTEs, adds cold +pages to swapcache and unmaps them. + +To avoid confusion, the term "iteration" specifically means the +traversal of an entire mm_struct list; the term "walk" will be applied +to page tables and the rmap, as usual. + +An mm_struct list is maintained for each memcg, and an mm_struct +follows its owner task to the new memcg when this task is migrated. +Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls +walk_page_range() with each mm_struct on this list to promote hot +pages before it increments max_seq. + +When multiple page table walkers iterate the same list, each of them +gets a unique mm_struct; therefore they can run concurrently. Page +table walkers ignore any misplaced pages, e.g., if an mm_struct was +migrated, pages it left in the previous memcg will not be promoted +when its current memcg is under reclaim. Similarly, page table walkers +will not promote pages from nodes other than the one under reclaim. + +This patch uses the following optimizations when walking page tables: +1. It tracks the usage of mm_struct's between context switches so that + page table walkers can skip processes that have been sleeping since + the last iteration. +2. It uses generational Bloom filters to record populated branches so + that page table walkers can reduce their search space based on the + query results, e.g., to skip page tables containing mostly holes or + misplaced pages. +3. It takes advantage of the accessed bit in non-leaf PMD entries when + CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. +4. It does not zigzag between a PGD table and the same PMD table + spanning multiple VMAs. IOW, it finishes all the VMAs within the + range of the same PMD table before it returns to a PGD table. This + improves the cache performance for workloads that have large + numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5. 
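+
+Regarding optimization 2 above: the Bloom filters are deliberately small,
+two bitmaps of 1 << 15 bits that the aging flips between generations, each
+tested with two keys carved out of a single hash of the PMD-entry pointer.
+The userspace sketch below is an illustration only, not part of this
+patch; mix64() is a generic mixer standing in for the kernel's hash_ptr().
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#define BLOOM_FILTER_SHIFT	15
+#define BLOOM_FILTER_BITS	(1UL << BLOOM_FILTER_SHIFT)
+
+static uint64_t filter[BLOOM_FILTER_BITS / 64];
+
+/* splitmix64-style mixer; a stand-in for hash_ptr(item, 2 * SHIFT) */
+static uint64_t mix64(uint64_t x)
+{
+	x += 0x9e3779b97f4a7c15ULL;
+	x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
+	x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
+	return x ^ (x >> 31);
+}
+
+/* one hash split into two 15-bit keys, like get_item_key() in this patch */
+static void get_keys(const void *item, uint32_t key[2])
+{
+	uint64_t hash = mix64((uintptr_t)item);
+
+	key[0] = hash & (BLOOM_FILTER_BITS - 1);
+	key[1] = (hash >> BLOOM_FILTER_SHIFT) & (BLOOM_FILTER_BITS - 1);
+}
+
+static void bloom_add(const void *item)
+{
+	uint32_t key[2];
+
+	get_keys(item, key);
+	filter[key[0] / 64] |= 1ULL << (key[0] % 64);
+	filter[key[1] / 64] |= 1ULL << (key[1] % 64);
+}
+
+static bool bloom_test(const void *item)
+{
+	uint32_t key[2];
+
+	get_keys(item, key);
+	return ((filter[key[0] / 64] >> (key[0] % 64)) & 1) &&
+	       ((filter[key[1] / 64] >> (key[1] % 64)) & 1);
+}
+
+int main(void)
+{
+	int pmd_a, pmd_b;	/* pretend these are two PMD entries */
+
+	bloom_add(&pmd_a);
+	printf("pmd_a: %s\n", bloom_test(&pmd_a) ? "maybe populated" : "skip");
+	printf("pmd_b: %s\n", bloom_test(&pmd_b) ? "maybe populated" : "skip");
+	return 0;
+}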
+ +Server benchmark results: + Single workload: + fio (buffered I/O): no change + + Single workload: + memcached (anon): +[8, 10]% + Ops/sec KB/sec + patch1-7: 1147696.57 44640.29 + patch1-8: 1245274.91 48435.66 + + Configurations: + no change + +Client benchmark results: + kswapd profiles: + patch1-7 + 48.16% lzo1x_1_do_compress (real work) + 8.20% page_vma_mapped_walk (overhead) + 7.06% _raw_spin_unlock_irq + 2.92% ptep_clear_flush + 2.53% __zram_bvec_write + 2.11% do_raw_spin_lock + 2.02% memmove + 1.93% lru_gen_look_around + 1.56% free_unref_page_list + 1.40% memset + + patch1-8 + 49.44% lzo1x_1_do_compress (real work) + 6.19% page_vma_mapped_walk (overhead) + 5.97% _raw_spin_unlock_irq + 3.13% get_pfn_folio + 2.85% ptep_clear_flush + 2.42% __zram_bvec_write + 2.08% do_raw_spin_lock + 1.92% memmove + 1.44% alloc_zspage + 1.36% memset + + Configurations: + no change + +Thanks to the following developers for their efforts [3]. + kernel test robot + +[1] https://lwn.net/Articles/23732/ +[2] https://llvm.org/docs/ScudoHardenedAllocator.html +[3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/ + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + fs/exec.c | 2 + + include/linux/memcontrol.h | 5 + + include/linux/mm_types.h | 77 +++ + include/linux/mmzone.h | 56 +- + include/linux/swap.h | 4 + + kernel/exit.c | 1 + + kernel/fork.c | 9 + + kernel/sched/core.c | 1 + + mm/memcontrol.c | 25 + + mm/vmscan.c | 1000 +++++++++++++++++++++++++++++++++++- + 10 files changed, 1163 insertions(+), 17 deletions(-) + +diff --git a/fs/exec.c b/fs/exec.c +index 0989fb8472a1..b1fda634e01a 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1015,6 +1015,7 @@ static int exec_mmap(struct mm_struct *mm) + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; ++ lru_gen_add_mm(mm); + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1030,6 +1031,7 @@ static int exec_mmap(struct mm_struct *mm) + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); + task_unlock(tsk); ++ lru_gen_use_mm(mm); + if (old_mm) { + mmap_read_unlock(old_mm); + BUG_ON(active_mm != old_mm); +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 9d0fea17f9ef..eca62345fdd5 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -350,6 +350,11 @@ struct mem_cgroup { + struct deferred_split deferred_split_queue; + #endif + ++#ifdef CONFIG_LRU_GEN ++ /* per-memcg mm_struct list */ ++ struct lru_gen_mm_list mm_list; ++#endif ++ + struct mem_cgroup_per_node *nodeinfo[]; + }; + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index c29ab4c0cd5c..7db51151a28b 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -3,6 +3,7 @@ + #define _LINUX_MM_TYPES_H + + #include ++#include + + #include + #include +@@ -17,6 +18,7 @@ + #include + #include + #include ++#include + + #include + +@@ -667,6 +669,22 @@ struct mm_struct { + */ + unsigned long ksm_merging_pages; + #endif ++#ifdef CONFIG_LRU_GEN ++ struct { ++ /* this mm_struct is on lru_gen_mm_list */ ++ struct list_head list; ++ /* ++ * Set when switching to this mm_struct, as a hint of ++ * whether it has 
been used since the last time per-node ++ * page table walkers cleared the corresponding bits. ++ */ ++ unsigned long bitmap; ++#ifdef CONFIG_MEMCG ++ /* points to the memcg of "owner" above */ ++ struct mem_cgroup *memcg; ++#endif ++ } lru_gen; ++#endif /* CONFIG_LRU_GEN */ + } __randomize_layout; + + /* +@@ -693,6 +711,65 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) + return (struct cpumask *)&mm->cpu_bitmap; + } + ++#ifdef CONFIG_LRU_GEN ++ ++struct lru_gen_mm_list { ++ /* mm_struct list for page table walkers */ ++ struct list_head fifo; ++ /* protects the list above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_add_mm(struct mm_struct *mm); ++void lru_gen_del_mm(struct mm_struct *mm); ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm); ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++ INIT_LIST_HEAD(&mm->lru_gen.list); ++ mm->lru_gen.bitmap = 0; ++#ifdef CONFIG_MEMCG ++ mm->lru_gen.memcg = NULL; ++#endif ++} ++ ++static inline void lru_gen_use_mm(struct mm_struct *mm) ++{ ++ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ ++ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); ++ ++ if (!(current->flags & PF_KTHREAD)) ++ WRITE_ONCE(mm->lru_gen.bitmap, -1); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_add_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_del_mm(struct mm_struct *mm) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++} ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_use_mm(struct mm_struct *mm) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 4fd7fc16eeb4..0cf0856b484a 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -405,7 +405,7 @@ enum { + * min_seq behind. + * + * The number of pages in each generation is eventually consistent and therefore +- * can be transiently negative. ++ * can be transiently negative when reset_batch_size() is pending. 
+ */ + struct lru_gen_struct { + /* the aging increments the youngest generation number */ +@@ -427,6 +427,53 @@ struct lru_gen_struct { + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + }; + ++enum { ++ MM_LEAF_TOTAL, /* total leaf entries */ ++ MM_LEAF_OLD, /* old leaf entries */ ++ MM_LEAF_YOUNG, /* young leaf entries */ ++ MM_NONLEAF_TOTAL, /* total non-leaf entries */ ++ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ ++ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ ++ NR_MM_STATS ++}; ++ ++/* double-buffering Bloom filters */ ++#define NR_BLOOM_FILTERS 2 ++ ++struct lru_gen_mm_state { ++ /* set to max_seq after each iteration */ ++ unsigned long seq; ++ /* where the current iteration continues (inclusive) */ ++ struct list_head *head; ++ /* where the last iteration ended (exclusive) */ ++ struct list_head *tail; ++ /* to wait for the last page table walker to finish */ ++ struct wait_queue_head wait; ++ /* Bloom filters flip after each iteration */ ++ unsigned long *filters[NR_BLOOM_FILTERS]; ++ /* the mm stats for debugging */ ++ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; ++ /* the number of concurrent page table walkers */ ++ int nr_walkers; ++}; ++ ++struct lru_gen_mm_walk { ++ /* the lruvec under reclaim */ ++ struct lruvec *lruvec; ++ /* unstable max_seq from lru_gen_struct */ ++ unsigned long max_seq; ++ /* the next address within an mm to scan */ ++ unsigned long next_addr; ++ /* to batch promoted pages */ ++ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* to batch the mm stats */ ++ int mm_stats[NR_MM_STATS]; ++ /* total batched items */ ++ int batched; ++ bool can_swap; ++ bool force_scan; ++}; ++ + void lru_gen_init_lruvec(struct lruvec *lruvec); + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + +@@ -477,6 +524,8 @@ struct lruvec { + #ifdef CONFIG_LRU_GEN + /* evictable pages divided into generations */ + struct lru_gen_struct lrugen; ++ /* to concurrently iterate lru_gen_mm_list */ ++ struct lru_gen_mm_state mm_state; + #endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; +@@ -1070,6 +1119,11 @@ typedef struct pglist_data { + + unsigned long flags; + ++#ifdef CONFIG_LRU_GEN ++ /* kswap mm walk data */ ++ struct lru_gen_mm_walk mm_walk; ++#endif ++ + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 0c0fed1b348f..b66cbc7ea93c 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -162,6 +162,10 @@ union swap_header { + */ + struct reclaim_state { + unsigned long reclaimed_slab; ++#ifdef CONFIG_LRU_GEN ++ /* per-thread mm walk data */ ++ struct lru_gen_mm_walk *mm_walk; ++#endif + }; + + #ifdef __KERNEL__ +diff --git a/kernel/exit.c b/kernel/exit.c +index f072959fcab7..f2d4d48ea790 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -466,6 +466,7 @@ void mm_update_next_owner(struct mm_struct *mm) + goto retry; + } + WRITE_ONCE(mm->owner, c); ++ lru_gen_migrate_mm(mm); + task_unlock(c); + put_task_struct(c); + } +diff --git a/kernel/fork.c b/kernel/fork.c +index 9d44f2d46c69..67b7666d7321 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1152,6 +1152,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + goto fail_nocontext; + + mm->user_ns = get_user_ns(user_ns); ++ lru_gen_init_mm(mm); + return mm; + + fail_nocontext: +@@ -1194,6 +1195,7 @@ static inline void __mmput(struct mm_struct *mm) + } + if (mm->binfmt) + module_put(mm->binfmt->module); ++ lru_gen_del_mm(mm); + 
mmdrop(mm); + } + +@@ -2676,6 +2678,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) + get_task_struct(p); + } + ++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { ++ /* lock the task to synchronize with memcg migration */ ++ task_lock(p); ++ lru_gen_add_mm(p->mm); ++ task_unlock(p); ++ } ++ + wake_up_new_task(p); + + /* forking complete and child started to run, tell ptracer */ +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index da0bf6fe9ecd..320d82697037 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5130,6 +5130,7 @@ context_switch(struct rq *rq, struct task_struct *prev, + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ lru_gen_use_mm(next->mm); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 743f8513f1c3..84f3707667bc 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -6133,6 +6133,30 @@ static void mem_cgroup_move_task(void) + } + #endif + ++#ifdef CONFIG_LRU_GEN ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct cgroup_subsys_state *css; ++ ++ /* find the first leader if there is any */ ++ cgroup_taskset_for_each_leader(task, css, tset) ++ break; ++ ++ if (!task) ++ return; ++ ++ task_lock(task); ++ if (task->mm && task->mm->owner == task) ++ lru_gen_migrate_mm(task->mm); ++ task_unlock(task); ++} ++#else ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) + { + if (value == PAGE_COUNTER_MAX) +@@ -6536,6 +6560,7 @@ struct cgroup_subsys memory_cgrp_subsys = { + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, ++ .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, +diff --git a/mm/vmscan.c b/mm/vmscan.c +index ec786fc556a7..8e55a1ce1ae0 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -3024,7 +3026,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + +-static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid) ++static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) + { + struct pglist_data *pgdat = NODE_DATA(nid); + +@@ -3069,6 +3071,372 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + } + ++/****************************************************************************** ++ * mm_struct list ++ ******************************************************************************/ ++ ++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) ++{ ++ static struct lru_gen_mm_list mm_list = { ++ .fifo = LIST_HEAD_INIT(mm_list.fifo), ++ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), ++ }; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ return &memcg->mm_list; ++#endif ++ VM_WARN_ON_ONCE(!mem_cgroup_disabled()); ++ ++ return &mm_list; ++} ++ ++void lru_gen_add_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ ++ 
VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); ++#ifdef CONFIG_MEMCG ++ VM_WARN_ON_ONCE(mm->lru_gen.memcg); ++ mm->lru_gen.memcg = memcg; ++#endif ++ spin_lock(&mm_list->lock); ++ ++ for_each_node_state(nid, N_MEMORY) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ if (!lruvec) ++ continue; ++ ++ /* the first addition since the last iteration */ ++ if (lruvec->mm_state.tail == &mm_list->fifo) ++ lruvec->mm_state.tail = &mm->lru_gen.list; ++ } ++ ++ list_add_tail(&mm->lru_gen.list, &mm_list->fifo); ++ ++ spin_unlock(&mm_list->lock); ++} ++ ++void lru_gen_del_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct lru_gen_mm_list *mm_list; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (list_empty(&mm->lru_gen.list)) ++ return; ++ ++#ifdef CONFIG_MEMCG ++ memcg = mm->lru_gen.memcg; ++#endif ++ mm_list = get_mm_list(memcg); ++ ++ spin_lock(&mm_list->lock); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ if (!lruvec) ++ continue; ++ ++ /* where the last iteration ended (exclusive) */ ++ if (lruvec->mm_state.tail == &mm->lru_gen.list) ++ lruvec->mm_state.tail = lruvec->mm_state.tail->next; ++ ++ /* where the current iteration continues (inclusive) */ ++ if (lruvec->mm_state.head != &mm->lru_gen.list) ++ continue; ++ ++ lruvec->mm_state.head = lruvec->mm_state.head->next; ++ /* the deletion ends the current iteration */ ++ if (lruvec->mm_state.head == &mm_list->fifo) ++ WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); ++ } ++ ++ list_del_init(&mm->lru_gen.list); ++ ++ spin_unlock(&mm_list->lock); ++ ++#ifdef CONFIG_MEMCG ++ mem_cgroup_put(mm->lru_gen.memcg); ++ mm->lru_gen.memcg = NULL; ++#endif ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++ struct mem_cgroup *memcg; ++ ++ lockdep_assert_held(&mm->owner->alloc_lock); ++ ++ /* for mm_update_next_owner() */ ++ if (mem_cgroup_disabled()) ++ return; ++ ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); ++ rcu_read_unlock(); ++ if (memcg == mm->lru_gen.memcg) ++ return; ++ ++ VM_WARN_ON_ONCE(!mm->lru_gen.memcg); ++ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); ++ ++ lru_gen_del_mm(mm); ++ lru_gen_add_mm(mm); ++} ++#endif ++ ++/* ++ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when ++ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of ++ * bits in a bitmap, k is the number of hash functions and n is the number of ++ * inserted items. ++ * ++ * Page table walkers use one of the two filters to reduce their search space. ++ * To get rid of non-leaf entries that no longer have enough leaf entries, the ++ * aging uses the double-buffering technique to flip to the other filter each ++ * time it produces a new generation. For non-leaf entries that have enough ++ * leaf entries, the aging carries them over to the next generation in ++ * walk_pmd_range(); the eviction also report them when walking the rmap ++ * in lru_gen_look_around(). ++ * ++ * For future optimizations: ++ * 1. It's not necessary to keep both filters all the time. The spare one can be ++ * freed after the RCU grace period and reallocated if needed again. ++ * 2. And when reallocating, it's worth scaling its size according to the number ++ * of inserted entries in the other filter, to reduce the memory overhead on ++ * small systems and false positives on large systems. ++ * 3. Jenkins' hash function is an alternative to Knuth's. 
++ */ ++#define BLOOM_FILTER_SHIFT 15 ++ ++static inline int filter_gen_from_seq(unsigned long seq) ++{ ++ return seq % NR_BLOOM_FILTERS; ++} ++ ++static void get_item_key(void *item, int *key) ++{ ++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); ++ ++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); ++ ++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); ++ key[1] = hash >> BLOOM_FILTER_SHIFT; ++} ++ ++static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) ++{ ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = lruvec->mm_state.filters[gen]; ++ if (filter) { ++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); ++ return; ++ } ++ ++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), ++ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); ++ WRITE_ONCE(lruvec->mm_state.filters[gen], filter); ++} ++ ++static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return; ++ ++ get_item_key(item, key); ++ ++ if (!test_bit(key[0], filter)) ++ set_bit(key[0], filter); ++ if (!test_bit(key[1], filter)) ++ set_bit(key[1], filter); ++} ++ ++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_state.filters[gen]); ++ if (!filter) ++ return true; ++ ++ get_item_key(item, key); ++ ++ return test_bit(key[0], filter) && test_bit(key[1], filter); ++} ++ ++static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) ++{ ++ int i; ++ int hist; ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ if (walk) { ++ hist = lru_hist_from_seq(walk->max_seq); ++ ++ for (i = 0; i < NR_MM_STATS; i++) { ++ WRITE_ONCE(lruvec->mm_state.stats[hist][i], ++ lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); ++ walk->mm_stats[i] = 0; ++ } ++ } ++ ++ if (NR_HIST_GENS > 1 && last) { ++ hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); ++ ++ for (i = 0; i < NR_MM_STATS; i++) ++ WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); ++ } ++} ++ ++static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) ++{ ++ int type; ++ unsigned long size = 0; ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); ++ ++ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) ++ return true; ++ ++ clear_bit(key, &mm->lru_gen.bitmap); ++ ++ for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { ++ size += type ? get_mm_counter(mm, MM_FILEPAGES) : ++ get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ } ++ ++ if (size < MIN_LRU_BATCH) ++ return true; ++ ++ if (test_bit(MMF_OOM_REAP_QUEUED, &mm->flags)) ++ return true; ++ ++ return !mmget_not_zero(mm); ++} ++ ++static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, ++ struct mm_struct **iter) ++{ ++ bool first = false; ++ bool last = true; ++ struct mm_struct *mm = NULL; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ struct lru_gen_mm_state *mm_state = &lruvec->mm_state; ++ ++ /* ++ * There are four interesting cases for this page table walker: ++ * 1. It tries to start a new iteration of mm_list with a stale max_seq; ++ * there is nothing left to do. 
++ * 2. It's the first of the current generation, and it needs to reset ++ * the Bloom filter for the next generation. ++ * 3. It reaches the end of mm_list, and it needs to increment ++ * mm_state->seq; the iteration is done. ++ * 4. It's the last of the current generation, and it needs to reset the ++ * mm stats counters for the next generation. ++ */ ++ spin_lock(&mm_list->lock); ++ ++ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); ++ VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); ++ VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); ++ ++ if (walk->max_seq <= mm_state->seq) { ++ if (!*iter) ++ last = false; ++ goto done; ++ } ++ ++ if (!mm_state->nr_walkers) { ++ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); ++ ++ mm_state->head = mm_list->fifo.next; ++ first = true; ++ } ++ ++ while (!mm && mm_state->head != &mm_list->fifo) { ++ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); ++ ++ mm_state->head = mm_state->head->next; ++ ++ /* force scan for those added after the last iteration */ ++ if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { ++ mm_state->tail = mm_state->head; ++ walk->force_scan = true; ++ } ++ ++ if (should_skip_mm(mm, walk)) ++ mm = NULL; ++ } ++ ++ if (mm_state->head == &mm_list->fifo) ++ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++done: ++ if (*iter && !mm) ++ mm_state->nr_walkers--; ++ if (!*iter && mm) ++ mm_state->nr_walkers++; ++ ++ if (mm_state->nr_walkers) ++ last = false; ++ ++ if (*iter || last) ++ reset_mm_stats(lruvec, walk, last); ++ ++ spin_unlock(&mm_list->lock); ++ ++ if (mm && first) ++ reset_bloom_filter(lruvec, walk->max_seq + 1); ++ ++ if (*iter) ++ mmput_async(*iter); ++ ++ *iter = mm; ++ ++ return last; ++} ++ ++static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) ++{ ++ bool success = false; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ struct lru_gen_mm_state *mm_state = &lruvec->mm_state; ++ ++ spin_lock(&mm_list->lock); ++ ++ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); ++ ++ if (max_seq > mm_state->seq && !mm_state->nr_walkers) { ++ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); ++ ++ WRITE_ONCE(mm_state->seq, mm_state->seq + 1); ++ reset_mm_stats(lruvec, NULL, true); ++ success = true; ++ } ++ ++ spin_unlock(&mm_list->lock); ++ ++ return success; ++} ++ + /****************************************************************************** + * refault feedback loop + ******************************************************************************/ +@@ -3219,6 +3587,118 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai + return new_gen; + } + ++static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, ++ int old_gen, int new_gen) ++{ ++ int type = folio_is_file_lru(folio); ++ int zone = folio_zonenum(folio); ++ int delta = folio_nr_pages(folio); ++ ++ VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); ++ VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); ++ ++ walk->batched++; ++ ++ walk->nr_pages[old_gen][type][zone] -= delta; ++ walk->nr_pages[new_gen][type][zone] += delta; ++} ++ ++static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) ++{ ++ int gen, type, zone; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ walk->batched = 0; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ enum lru_list lru = type * LRU_INACTIVE_FILE; ++ int delta = walk->nr_pages[gen][type][zone]; ++ ++ if (!delta) ++ 
continue; ++ ++ walk->nr_pages[gen][type][zone] = 0; ++ WRITE_ONCE(lrugen->nr_pages[gen][type][zone], ++ lrugen->nr_pages[gen][type][zone] + delta); ++ ++ if (lru_gen_is_active(lruvec, gen)) ++ lru += LRU_ACTIVE; ++ __update_lru_size(lruvec, lru, zone, delta); ++ } ++} ++ ++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) ++{ ++ struct address_space *mapping; ++ struct vm_area_struct *vma = args->vma; ++ struct lru_gen_mm_walk *walk = args->private; ++ ++ if (!vma_is_accessible(vma)) ++ return true; ++ ++ if (is_vm_hugetlb_page(vma)) ++ return true; ++ ++ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) ++ return true; ++ ++ if (vma == get_gate_vma(vma->vm_mm)) ++ return true; ++ ++ if (vma_is_anonymous(vma)) ++ return !walk->can_swap; ++ ++ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) ++ return true; ++ ++ mapping = vma->vm_file->f_mapping; ++ if (mapping_unevictable(mapping)) ++ return true; ++ ++ if (shmem_mapping(mapping)) ++ return !walk->can_swap; ++ ++ /* to exclude special mappings like dax, etc. */ ++ return !mapping->a_ops->read_folio; ++} ++ ++/* ++ * Some userspace memory allocators map many single-page VMAs. Instead of ++ * returning back to the PGD table for each of such VMAs, finish an entire PMD ++ * table to reduce zigzags and improve cache performance. ++ */ ++static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, ++ unsigned long *vm_start, unsigned long *vm_end) ++{ ++ unsigned long start = round_up(*vm_end, size); ++ unsigned long end = (start | ~mask) + 1; ++ ++ VM_WARN_ON_ONCE(mask & size); ++ VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); ++ ++ while (args->vma) { ++ if (start >= args->vma->vm_end) { ++ args->vma = args->vma->vm_next; ++ continue; ++ } ++ ++ if (end && end <= args->vma->vm_start) ++ return false; ++ ++ if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { ++ args->vma = args->vma->vm_next; ++ continue; ++ } ++ ++ *vm_start = max(start, args->vma->vm_start); ++ *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; ++ ++ return true; ++ } ++ ++ return false; ++} ++ + static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) + { + unsigned long pfn = pte_pfn(pte); +@@ -3237,8 +3717,28 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned + return pfn; + } + ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) ++{ ++ unsigned long pfn = pmd_pfn(pmd); ++ ++ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); ++ ++ if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) ++ return -1; ++ ++ if (WARN_ON_ONCE(pmd_devmap(pmd))) ++ return -1; ++ ++ if (WARN_ON_ONCE(!pfn_valid(pfn))) ++ return -1; ++ ++ return pfn; ++} ++#endif ++ + static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, +- struct pglist_data *pgdat) ++ struct pglist_data *pgdat, bool can_swap) + { + struct folio *folio; + +@@ -3253,9 +3753,371 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, + if (folio_memcg_rcu(folio) != memcg) + return NULL; + ++ /* file VMAs can contain anon pages from COW */ ++ if (!folio_is_file_lru(folio) && !can_swap) ++ return NULL; ++ + return folio; + } + ++static bool suitable_to_scan(int total, int young) ++{ ++ int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); ++ ++ /* 
suitable if the average number of young PTEs per cacheline is >=1 */ ++ return young * n >= total; ++} ++ ++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, ++ struct mm_walk *args) ++{ ++ int i; ++ pte_t *pte; ++ spinlock_t *ptl; ++ unsigned long addr; ++ int total = 0; ++ int young = 0; ++ struct lru_gen_mm_walk *walk = args->private; ++ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); ++ ++ VM_WARN_ON_ONCE(pmd_leaf(*pmd)); ++ ++ ptl = pte_lockptr(args->mm, pmd); ++ if (!spin_trylock(ptl)) ++ return false; ++ ++ arch_enter_lazy_mmu_mode(); ++ ++ pte = pte_offset_map(pmd, start & PMD_MASK); ++restart: ++ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn; ++ struct folio *folio; ++ ++ total++; ++ walk->mm_stats[MM_LEAF_TOTAL]++; ++ ++ pfn = get_pte_pfn(pte[i], args->vma, addr); ++ if (pfn == -1) ++ continue; ++ ++ if (!pte_young(pte[i])) { ++ walk->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); ++ if (!folio) ++ continue; ++ ++ if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) ++ continue; ++ ++ young++; ++ walk->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && ++ !(folio_test_anon(folio) && folio_test_swapbacked(folio) && ++ !folio_test_swapcache(folio))) ++ folio_mark_dirty(folio); ++ ++ old_gen = folio_update_gen(folio, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(walk, folio, old_gen, new_gen); ++ } ++ ++ if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) ++ goto restart; ++ ++ pte_unmap(pte); ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++ ++ return suitable_to_scan(total, young); ++} ++ ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++{ ++ int i; ++ pmd_t *pmd; ++ spinlock_t *ptl; ++ struct lru_gen_mm_walk *walk = args->private; ++ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); ++ ++ VM_WARN_ON_ONCE(pud_leaf(*pud)); ++ ++ /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ ++ if (*start == -1) { ++ *start = next; ++ return; ++ } ++ ++ i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); ++ if (i && i <= MIN_LRU_BATCH) { ++ __set_bit(i - 1, bitmap); ++ return; ++ } ++ ++ pmd = pmd_offset(pud, *start); ++ ++ ptl = pmd_lockptr(args->mm, pmd); ++ if (!spin_trylock(ptl)) ++ goto done; ++ ++ arch_enter_lazy_mmu_mode(); ++ ++ do { ++ unsigned long pfn; ++ struct folio *folio; ++ unsigned long addr = i ? 
(*start & PMD_MASK) + i * PMD_SIZE : *start; ++ ++ pfn = get_pmd_pfn(pmd[i], vma, addr); ++ if (pfn == -1) ++ goto next; ++ ++ if (!pmd_trans_huge(pmd[i])) { ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ pmdp_test_and_clear_young(vma, addr, pmd + i); ++ goto next; ++ } ++ ++ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); ++ if (!folio) ++ goto next; ++ ++ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) ++ goto next; ++ ++ walk->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && ++ !(folio_test_anon(folio) && folio_test_swapbacked(folio) && ++ !folio_test_swapcache(folio))) ++ folio_mark_dirty(folio); ++ ++ old_gen = folio_update_gen(folio, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(walk, folio, old_gen, new_gen); ++next: ++ i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; ++ } while (i <= MIN_LRU_BATCH); ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++done: ++ *start = -1; ++ bitmap_zero(bitmap, MIN_LRU_BATCH); ++} ++#else ++static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, ++ struct mm_walk *args, unsigned long *bitmap, unsigned long *start) ++{ ++} ++#endif ++ ++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, ++ struct mm_walk *args) ++{ ++ int i; ++ pmd_t *pmd; ++ unsigned long next; ++ unsigned long addr; ++ struct vm_area_struct *vma; ++ unsigned long pos = -1; ++ struct lru_gen_mm_walk *walk = args->private; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; ++ ++ VM_WARN_ON_ONCE(pud_leaf(*pud)); ++ ++ /* ++ * Finish an entire PMD in two passes: the first only reaches to PTE ++ * tables to avoid taking the PMD lock; the second, if necessary, takes ++ * the PMD lock to clear the accessed bit in PMD entries. 
++ */ ++ pmd = pmd_offset(pud, start & PUD_MASK); ++restart: ++ /* walk_pte_range() may call get_next_vma() */ ++ vma = args->vma; ++ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { ++ pmd_t val = pmd_read_atomic(pmd + i); ++ ++ /* for pmd_read_atomic() */ ++ barrier(); ++ ++ next = pmd_addr_end(addr, end); ++ ++ if (!pmd_present(val) || is_huge_zero_pmd(val)) { ++ walk->mm_stats[MM_LEAF_TOTAL]++; ++ continue; ++ } ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ if (pmd_trans_huge(val)) { ++ unsigned long pfn = pmd_pfn(val); ++ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); ++ ++ walk->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (!pmd_young(val)) { ++ walk->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ /* try to avoid unnecessary memory loads */ ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ continue; ++ ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ continue; ++ } ++#endif ++ walk->mm_stats[MM_NONLEAF_TOTAL]++; ++ ++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG ++ if (!pmd_young(val)) ++ continue; ++ ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++#endif ++ if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) ++ continue; ++ ++ walk->mm_stats[MM_NONLEAF_FOUND]++; ++ ++ if (!walk_pte_range(&val, addr, next, args)) ++ continue; ++ ++ walk->mm_stats[MM_NONLEAF_ADDED]++; ++ ++ /* carry over to the next generation */ ++ update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); ++ } ++ ++ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); ++ ++ if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) ++ goto restart; ++} ++ ++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, ++ struct mm_walk *args) ++{ ++ int i; ++ pud_t *pud; ++ unsigned long addr; ++ unsigned long next; ++ struct lru_gen_mm_walk *walk = args->private; ++ ++ VM_WARN_ON_ONCE(p4d_leaf(*p4d)); ++ ++ pud = pud_offset(p4d, start & P4D_MASK); ++restart: ++ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { ++ pud_t val = READ_ONCE(pud[i]); ++ ++ next = pud_addr_end(addr, end); ++ ++ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) ++ continue; ++ ++ walk_pmd_range(&val, addr, next, args); ++ ++ if (walk->batched >= MAX_LRU_BATCH) { ++ end = (addr | ~PUD_MASK) + 1; ++ goto done; ++ } ++ } ++ ++ if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) ++ goto restart; ++ ++ end = round_up(end, P4D_SIZE); ++done: ++ if (!end || !args->vma) ++ return 1; ++ ++ walk->next_addr = max(end, args->vma->vm_start); ++ ++ return -EAGAIN; ++} ++ ++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) ++{ ++ static const struct mm_walk_ops mm_walk_ops = { ++ .test_walk = should_skip_vma, ++ .p4d_entry = walk_pud_range, ++ }; ++ ++ int err; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ walk->next_addr = FIRST_USER_ADDRESS; ++ ++ do { ++ err = -EBUSY; ++ ++ /* folio_update_gen() requires stable folio_memcg() */ ++ if (!mem_cgroup_trylock_pages(memcg)) ++ break; ++ ++ /* the caller might be holding the lock for write */ ++ if (mmap_read_trylock(mm)) { ++ err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); ++ ++ mmap_read_unlock(mm); ++ } ++ ++ mem_cgroup_unlock_pages(); ++ ++ if (walk->batched) { ++ spin_lock_irq(&lruvec->lru_lock); ++ reset_batch_size(lruvec, walk); ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while (err == -EAGAIN); 
++} ++ ++static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) ++{ ++ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; ++ ++ if (pgdat && current_is_kswapd()) { ++ VM_WARN_ON_ONCE(walk); ++ ++ walk = &pgdat->mm_walk; ++ } else if (!pgdat && !walk) { ++ VM_WARN_ON_ONCE(current_is_kswapd()); ++ ++ walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); ++ } ++ ++ current->reclaim_state->mm_walk = walk; ++ ++ return walk; ++} ++ ++static void clear_mm_walk(void) ++{ ++ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; ++ ++ VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); ++ VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); ++ ++ current->reclaim_state->mm_walk = NULL; ++ ++ if (!current_is_kswapd()) ++ kfree(walk); ++} ++ + static void inc_min_seq(struct lruvec *lruvec, int type) + { + struct lru_gen_struct *lrugen = &lruvec->lrugen; +@@ -3307,7 +4169,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + return success; + } + +-static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) ++static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + { + int prev, next; + int type, zone; +@@ -3317,9 +4179,6 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + +- if (max_seq != lrugen->max_seq) +- goto unlock; +- + for (type = 0; type < ANON_AND_FILE; type++) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; +@@ -3357,10 +4216,76 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s + + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +-unlock: ++ + spin_unlock_irq(&lruvec->lru_lock); + } + ++static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, ++ struct scan_control *sc, bool can_swap) ++{ ++ bool success; ++ struct lru_gen_mm_walk *walk; ++ struct mm_struct *mm = NULL; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); ++ ++ /* see the comment in iterate_mm_list() */ ++ if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { ++ success = false; ++ goto done; ++ } ++ ++ /* ++ * If the hardware doesn't automatically set the accessed bit, fallback ++ * to lru_gen_look_around(), which only clears the accessed bit in a ++ * handful of PTEs. Spreading the work out over a period of time usually ++ * is less efficient, but it avoids bursty page faults. 
++ */ ++ if (!arch_has_hw_pte_young()) { ++ success = iterate_mm_list_nowalk(lruvec, max_seq); ++ goto done; ++ } ++ ++ walk = set_mm_walk(NULL); ++ if (!walk) { ++ success = iterate_mm_list_nowalk(lruvec, max_seq); ++ goto done; ++ } ++ ++ walk->lruvec = lruvec; ++ walk->max_seq = max_seq; ++ walk->can_swap = can_swap; ++ walk->force_scan = false; ++ ++ do { ++ success = iterate_mm_list(lruvec, walk, &mm); ++ if (mm) ++ walk_mm(lruvec, mm, walk); ++ ++ cond_resched(); ++ } while (mm); ++done: ++ if (!success) { ++ if (!current_is_kswapd() && !sc->priority) ++ wait_event_killable(lruvec->mm_state.wait, ++ max_seq < READ_ONCE(lrugen->max_seq)); ++ ++ return max_seq < READ_ONCE(lrugen->max_seq); ++ } ++ ++ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); ++ ++ inc_max_seq(lruvec, can_swap); ++ /* either this sees any waiters or they will see updated max_seq */ ++ if (wq_has_sleeper(&lruvec->mm_state.wait)) ++ wake_up_all(&lruvec->mm_state.wait); ++ ++ wakeup_flusher_threads(WB_REASON_VMSCAN); ++ ++ return true; ++} ++ + static unsigned long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq, + unsigned long *min_seq, bool can_swap, bool *need_aging) + { +@@ -3438,7 +4363,7 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) + nr_to_scan >>= mem_cgroup_online(memcg) ? sc->priority : 0; + + if (nr_to_scan && need_aging) +- inc_max_seq(lruvec, max_seq, swappiness); ++ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); + } + + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +@@ -3447,6 +4372,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + + VM_WARN_ON_ONCE(!current_is_kswapd()); + ++ set_mm_walk(pgdat); ++ + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); +@@ -3455,11 +4382,16 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ clear_mm_walk(); + } + + /* + * This function exploits spatial locality when shrink_page_list() walks the +- * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. ++ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If ++ * the scan was done cacheline efficiently, it adds the PMD entry pointing to ++ * the PTE table to the Bloom filter. This forms a feedback loop between the ++ * eviction and the aging. + */ + void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + { +@@ -3468,6 +4400,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + unsigned long start; + unsigned long end; + unsigned long addr; ++ struct lru_gen_mm_walk *walk; ++ int young = 0; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); +@@ -3497,6 +4431,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ walk = current->reclaim_state ? 
current->reclaim_state->mm_walk : NULL; + + rcu_read_lock(); + arch_enter_lazy_mmu_mode(); +@@ -3511,13 +4446,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (!pte_young(pte[i])) + continue; + +- folio = get_pfn_folio(pfn, memcg, pgdat); ++ folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); + if (!folio) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + continue; + ++ young++; ++ + if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) +@@ -3533,7 +4470,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + +- if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { ++ /* feedback from rmap walkers to page table walkers */ ++ if (suitable_to_scan(i, young)) ++ update_bloom_filter(lruvec, max_seq, pvmw->pmd); ++ ++ if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); + folio_activate(folio); +@@ -3545,8 +4486,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (!mem_cgroup_trylock_pages(memcg)) + return; + +- spin_lock_irq(&lruvec->lru_lock); +- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ if (!walk) { ++ spin_lock_irq(&lruvec->lru_lock); ++ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); ++ } + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); +@@ -3557,10 +4500,14 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) + if (old_gen < 0 || old_gen == new_gen) + continue; + +- lru_gen_update_size(lruvec, folio, old_gen, new_gen); ++ if (walk) ++ update_batch_size(walk, folio, old_gen, new_gen); ++ else ++ lru_gen_update_size(lruvec, folio, old_gen, new_gen); + } + +- spin_unlock_irq(&lruvec->lru_lock); ++ if (!walk) ++ spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_unlock_pages(); + } +@@ -3843,6 +4790,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap + struct folio *folio; + enum vm_event_item item; + struct reclaim_stat stat; ++ struct lru_gen_mm_walk *walk; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + +@@ -3879,6 +4827,10 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap + + move_pages_to_lru(lruvec, &list); + ++ walk = current->reclaim_state->mm_walk; ++ if (walk && walk->batched) ++ reset_batch_size(lruvec, walk); ++ + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); +@@ -3936,7 +4888,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + if (current_is_kswapd()) + return 0; + +- inc_max_seq(lruvec, max_seq, can_swap); ++ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) ++ return nr_to_scan; + done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; + } +@@ -3951,6 +4904,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + + blk_start_plug(&plug); + ++ set_mm_walk(lruvec_pgdat(lruvec)); ++ + while (true) { + int delta; + int swappiness; +@@ -3978,6 +4933,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + cond_resched(); + } + ++ clear_mm_walk(); ++ + blk_finish_plug(&plug); + } + +@@ -3994,15 +4951,21 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ ++ lruvec->mm_state.seq = MIN_NR_GENS; ++ init_waitqueue_head(&lruvec->mm_state.wait); + } + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg) + { ++ INIT_LIST_HEAD(&memcg->mm_list.fifo); ++ spin_lock_init(&memcg->mm_list.lock); + } + + void lru_gen_exit_memcg(struct mem_cgroup *memcg) + { ++ int i; + int nid; + + for_each_node(nid) { +@@ -4010,6 +4973,11 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) + + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); ++ ++ for (i = 0; i < NR_BLOOM_FILTERS; i++) { ++ bitmap_free(lruvec->mm_state.filters[i]); ++ lruvec->mm_state.filters[i] = NULL; ++ } + } + } + #endif + +From patchwork Wed Jul 6 22:00:18 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908706 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 4EB4BCCA480 + for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id BDBC98E0005; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id B8B188E0001; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id A05548E0005; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0011.hostedemail.com + [216.40.44.11]) + by kanga.kvack.org (Postfix) with ESMTP id 8B02E8E0001 + for ; Wed, 6 Jul 2022 18:01:06 -0400 (EDT) +Received: from smtpin07.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay08.hostedemail.com (Postfix) with ESMTP id 5322D218C7 + for ; Wed, 6 Jul 2022 22:01:06 +0000 (UTC) +X-FDA: 79658046132.07.9DE38CF +Received: from mail-yw1-f201.google.com (mail-yw1-f201.google.com + [209.85.128.201]) + by imf28.hostedemail.com (Postfix) with ESMTP id 97908C0054 + for ; Wed, 6 Jul 2022 22:01:05 +0000 (UTC) +Received: by mail-yw1-f201.google.com with SMTP id + 00721157ae682-31c8c7138ebso70710887b3.17 + for ; Wed, 06 Jul 2022 15:01:05 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=ZS7i+zWbsuFYZiRlenI/F/Y7PzZj3Cv3ABmuogIV+d0=; + b=Is9nnwDLdoF8cmdhQhl8FEZEIPLZOTCQNPziPrZ3WCv4Hkh+8SM7Qirn2/JzlJe5Qt + IMzoKhGVVu62zPGO2f8uqvwVO7ZBpwGEu3Y0nx+xsR+UR6rSMs9BgDYfSl6hxumhEzXQ + AVU29P45SCq1drQE+AuDu2NsKyQ+R9NLi2XNN7GjQzGIS59mnKnciabxZ70kUwocqXEh + TsuagDSQmmH5SjPkOzOUNm6Sk8f3JEhf7X8a1bPpbg+ozA3KspzkTBjkMrHomLe9ffcm + BFgwNEyH9XBgnj0m4gnfT2SYRWWY1k3MsXJMQ+zIJmqc6vDRB4WpYW/qGMJadOFCZfMM + nXgA== +X-Google-DKIM-Signature: v=1; 
+Date: Wed, 6 Jul 2022 16:00:18 -0600
+In-Reply-To: <20220706220022.968789-1-yuzhao@google.com>
+Message-Id: <20220706220022.968789-10-yuzhao@google.com>
+Mime-Version: 1.0
+References: <20220706220022.968789-1-yuzhao@google.com>
+X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog
+Subject: [PATCH v13 09/14] mm: multi-gen LRU: optimize multiple memcgs
+From: Yu Zhao
+To: Andrew Morton
+Cc: Andi Kleen , Aneesh Kumar , Catalin Marinas ,
+ Dave Hansen , Hillf Danton , Jens Axboe , Johannes Weiner ,
+ Jonathan Corbet , Linus Torvalds , Matthew Wilcox , Mel Gorman ,
+ Michael Larabel , Michal Hocko , Mike Rapoport , Peter Zijlstra ,
+ Tejun Heo , Vlastimil Babka , Will Deacon ,
+ linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org,
+ linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org,
+ page-reclaim@google.com, Yu Zhao , Brian Geffon ,
+ Jan Alexander Steffens , Oleksandr Natalenko , Steven Barrett ,
+ Suleiman Souhlal , Daniel Byrne , Donald Carr ,
+ " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , Konstantin Kharlamov ,
+ Shuang Zhai , Sofia Trinh , Vaibhav Jain
R3DdBW8MzocpYgtBQDvKhNaIKlqhdRg= +X-Rspamd-Server: rspam08 +X-Rspamd-Queue-Id: 97908C0054 +X-Rspam-User: +Authentication-Results: imf28.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=Is9nnwDL; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf28.hostedemail.com: domain of + 3IAbGYgYKCGEXTYG9NFNNFKD.BNLKHMTW-LLJU9BJ.NQF@flex--yuzhao.bounces.google.com + designates 209.85.128.201 as permitted sender) + smtp.mailfrom=3IAbGYgYKCGEXTYG9NFNNFKD.BNLKHMTW-LLJU9BJ.NQF@flex--yuzhao.bounces.google.com +X-Stat-Signature: xt8apxhnez18ydabrirx1u5kimzk5obt +X-HE-Tag: 1657144865-519413 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +When multiple memcgs are available, it is possible to make better +choices based on generations and tiers and therefore improve the +overall performance under global memory pressure. This patch adds a +rudimentary optimization to select memcgs that can drop single-use +unmapped clean pages first. Doing so reduces the chance of going into +the aging path or swapping. These two decisions can be costly. + +A typical example that benefits from this optimization is a server +running mixed types of workloads, e.g., heavy anon workload in one +memcg and heavy buffered I/O workload in the other. + +Though this optimization can be applied to both kswapd and direct +reclaim, it is only added to kswapd to keep the patchset manageable. +Later improvements will cover the direct reclaim path. + +Server benchmark results: + Mixed workloads: + fio (buffered I/O): +[19, 21]% + IOPS BW + patch1-8: 1880k 7343MiB/s + patch1-9: 2252k 8796MiB/s + + memcached (anon): +[119, 123]% + Ops/sec KB/sec + patch1-8: 862768.65 33514.68 + patch1-9: 1911022.12 74234.54 + + Mixed workloads: + fio (buffered I/O): +[75, 77]% + IOPS BW + 5.19-rc1: 1279k 4996MiB/s + patch1-9: 2252k 8796MiB/s + + memcached (anon): +[13, 15]% + Ops/sec KB/sec + 5.19-rc1: 1673524.04 65008.87 + patch1-9: 1911022.12 74234.54 + + Configurations: + (changes since patch 6) + + cat mixed.sh + modprobe brd rd_nr=2 rd_size=56623104 + + swapoff -a + mkswap /dev/ram0 + swapon /dev/ram0 + + mkfs.ext4 /dev/ram1 + mount -t ext4 /dev/ram1 /mnt + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \ + --ratio 1:0 --pipeline 8 -d 2000 + + fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \ + --buffered=1 --ioengine=io_uring --iodepth=128 \ + --iodepth_batch_submit=32 --iodepth_batch_complete=32 \ + --rw=randread --random_distribution=random --norandommap \ + --time_based --ramp_time=10m --runtime=90m --group_reporting & + pid=$! 
+ + sleep 200 + + memtier_benchmark -S /var/run/memcached/memcached.sock \ + -P memcache_binary -n allkeys --key-minimum=1 \ + --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \ + --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed + + kill -INT $pid + wait + +Client benchmark results: + no change (CONFIG_MEMCG=n) + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + mm/vmscan.c | 55 ++++++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 46 insertions(+), 9 deletions(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 8e55a1ce1ae0..f469a2740835 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -129,6 +129,13 @@ struct scan_control { + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + ++#ifdef CONFIG_LRU_GEN ++ /* help make better choices when multiple memcgs are available */ ++ unsigned int memcgs_need_aging:1; ++ unsigned int memcgs_need_swapping:1; ++ unsigned int memcgs_avoid_swapping:1; ++#endif ++ + /* Allocation order */ + s8 order; + +@@ -4372,6 +4379,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + + VM_WARN_ON_ONCE(!current_is_kswapd()); + ++ /* ++ * To reduce the chance of going into the aging path or swapping, which ++ * can be costly, optimistically skip them unless their corresponding ++ * flags were cleared in the eviction path. This improves the overall ++ * performance when multiple memcgs are available. ++ */ ++ if (!sc->memcgs_need_aging) { ++ sc->memcgs_need_aging = true; ++ sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping; ++ sc->memcgs_need_swapping = true; ++ return; ++ } ++ ++ sc->memcgs_need_swapping = true; ++ sc->memcgs_avoid_swapping = true; ++ + set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); +@@ -4781,7 +4804,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw + return scanned; + } + +-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ bool *need_swapping) + { + int type; + int scanned; +@@ -4844,14 +4868,16 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap + + sc->nr_reclaimed += reclaimed; + ++ if (type == LRU_GEN_ANON && need_swapping) ++ *need_swapping = true; ++ + return scanned; + } + + static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, +- bool can_swap, unsigned long reclaimed) ++ bool can_swap, unsigned long reclaimed, bool *need_aging) + { + int priority; +- bool need_aging; + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); +@@ -4861,7 +4887,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + +- nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, &need_aging); ++ nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, need_aging); + if (!nr_to_scan) + return 0; + +@@ -4877,7 +4903,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + if (!nr_to_scan) + return 0; + +- 
if (!need_aging) ++ if (!*need_aging) + return nr_to_scan; + + /* skip the aging path at the default priority */ +@@ -4897,6 +4923,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + struct blk_plug plug; ++ bool need_aging = false; ++ bool need_swapping = false; + unsigned long scanned = 0; + unsigned long reclaimed = sc->nr_reclaimed; + +@@ -4918,21 +4946,30 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + else + swappiness = 0; + +- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed); ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed, &need_aging); + if (!nr_to_scan) +- break; ++ goto done; + +- delta = evict_folios(lruvec, sc, swappiness); ++ delta = evict_folios(lruvec, sc, swappiness, &need_swapping); + if (!delta) +- break; ++ goto done; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + ++ if (sc->memcgs_avoid_swapping && swappiness < 200 && need_swapping) ++ break; ++ + cond_resched(); + } + ++ /* see the comment in lru_gen_age_node() */ ++ if (!need_aging) ++ sc->memcgs_need_aging = false; ++ if (!need_swapping) ++ sc->memcgs_need_swapping = false; ++done: + clear_mm_walk(); + + blk_finish_plug(&plug); + +From patchwork Wed Jul 6 22:00:19 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908707 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 8DE21CCA480 + for ; Wed, 6 Jul 2022 22:01:14 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id D162E8E0006; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id C9D778E0001; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id B177F8E0006; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0014.hostedemail.com + [216.40.44.14]) + by kanga.kvack.org (Postfix) with ESMTP id 9D5508E0001 + for ; Wed, 6 Jul 2022 18:01:07 -0400 (EDT) +Received: from smtpin29.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay12.hostedemail.com (Postfix) with ESMTP id 5B55512053A + for ; Wed, 6 Jul 2022 22:01:07 +0000 (UTC) +X-FDA: 79658046174.29.1D659FF +Received: from mail-yw1-f201.google.com (mail-yw1-f201.google.com + [209.85.128.201]) + by imf23.hostedemail.com (Postfix) with ESMTP id F2017140064 + for ; Wed, 6 Jul 2022 22:01:06 +0000 (UTC) +Received: by mail-yw1-f201.google.com with SMTP id + 00721157ae682-31c9a49a1a8so63946947b3.9 + for ; Wed, 06 Jul 2022 15:01:06 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=t3JqYbFJT9lP6E96sRUzmCzQEu0iJg+mfU6dciROW6I=; + b=K/nKIb14JmIaSQ25G+voEr3Xu6sFBToolWxLX2DrPdbxAa6BpfoEW4/5621Rzsff4D + 1k3G9tp+5ESbNVZCZfqietdtMt6OTAchdy14TXI4WTiTZLglVlIfr80zpxGfIGcphLBv + c2R6icWOjZ0upEVkivTfwH9rKBl233YFlYCWfHzoiU07eBFA2yPOzHZx49n6UFl3tbHt + eSai05q6oFPAPMqEwWKLLg5e2ewTiqoowbahH4nTTyw69dIDZhmip41HFaA0/Sczzyq3 + 
+Date: Wed, 6 Jul 2022 16:00:19 -0600
+In-Reply-To: <20220706220022.968789-1-yuzhao@google.com>
+Message-Id: <20220706220022.968789-11-yuzhao@google.com>
+Mime-Version: 1.0
+References: <20220706220022.968789-1-yuzhao@google.com>
+X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog
+Subject: [PATCH v13 10/14] mm: multi-gen LRU: kill switch
+From: Yu Zhao
+To: Andrew Morton
+Cc: Andi Kleen , Aneesh Kumar , Catalin Marinas ,
+ Dave Hansen , Hillf Danton , Jens Axboe , Johannes Weiner ,
+ Jonathan Corbet , Linus Torvalds , Matthew Wilcox , Mel Gorman ,
+ Michael Larabel , Michal Hocko , Mike Rapoport , Peter Zijlstra ,
+ Tejun Heo , Vlastimil Babka , Will Deacon ,
+ linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org,
+ linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org,
+ page-reclaim@google.com, Yu Zhao , Brian Geffon ,
+ Jan Alexander Steffens , Oleksandr Natalenko , Steven Barrett ,
+ Suleiman Souhlal , Daniel Byrne , Donald Carr ,
+ " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , Konstantin Kharlamov ,
+ Shuang Zhai , Sofia Trinh , Vaibhav Jain
1o9D+n98WPgODBBDXuQgOxdZ/m1FekCnCpnWnR72lB+33NlF8zDMR0jbie23mZMDqsMO/w + cYTpCFhyTn0ribSQdUv7TlFoFBXyRTw= +Authentication-Results: imf23.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b="K/nKIb14"; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf23.hostedemail.com: domain of + 3IgbGYgYKCGMZVaIBPHPPHMF.DPNMJOVY-NNLWBDL.PSH@flex--yuzhao.bounces.google.com + designates 209.85.128.201 as permitted sender) + smtp.mailfrom=3IgbGYgYKCGMZVaIBPHPPHMF.DPNMJOVY-NNLWBDL.PSH@flex--yuzhao.bounces.google.com +X-Stat-Signature: u9yuk5ppb8f6meekzwox11y8u8f8zsip +X-Rspamd-Queue-Id: F2017140064 +X-Rspamd-Server: rspam05 +X-Rspam-User: +X-HE-Tag: 1657144866-771308 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that +can be disabled include: + 0x0001: the multi-gen LRU core + 0x0002: walking page table, when arch_has_hw_pte_young() returns + true + 0x0004: clearing the accessed bit in non-leaf PMD entries, when + CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y + [yYnN]: apply to all the components above +E.g., + echo y >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0007 + echo 5 >/sys/kernel/mm/lru_gen/enabled + cat /sys/kernel/mm/lru_gen/enabled + 0x0005 + +NB: the page table walks happen on the scale of seconds under heavy +memory pressure, in which case the mmap_lock contention is a lesser +concern, compared with the LRU lock contention and the I/O congestion. +So far the only well-known case of the mmap_lock contention happens on +Android, due to Scudo [1] which allocates several thousand VMAs for +merely a few hundred MBs. The SPF and the Maple Tree also have +provided their own assessments [2][3]. However, if walking page tables +does worsen the mmap_lock contention, the kill switch can be used to +disable it. In this case the multi-gen LRU will suffer a minor +performance degradation, as shown previously. + +Clearing the accessed bit in non-leaf PMD entries can also be +disabled, since this behavior was not tested on x86 varieties other +than Intel and AMD. 
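+
+As an illustration only (the bit values are the ones documented above, and
+the value read back depends on the kernel configuration and on whether
+arch_has_hw_pte_young() returns true), clearing just the untested non-leaf
+PMD behavior while keeping the core and the page table walks would look
+like:
+ echo 0x0003 >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0003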
+ +[1] https://source.android.com/devices/tech/debug/scudo +[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/ +[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/ + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + include/linux/cgroup.h | 15 ++- + include/linux/mm_inline.h | 15 ++- + include/linux/mmzone.h | 9 ++ + kernel/cgroup/cgroup-internal.h | 1 - + mm/Kconfig | 6 + + mm/vmscan.c | 231 +++++++++++++++++++++++++++++++- + 6 files changed, 268 insertions(+), 9 deletions(-) + +diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h +index 0d1ada8968d7..1bc0cabf993f 100644 +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp) + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp) + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -708,6 +719,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h +index f2b2296a42f9..4949eda9a9a2 100644 +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -106,10 +106,21 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio) + + #ifdef CONFIG_LRU_GEN + ++#ifdef CONFIG_LRU_GEN_ENABLED + static inline bool lru_gen_enabled(void) + { +- return true; ++ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); ++ ++ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); + } ++#else ++static inline bool lru_gen_enabled(void) ++{ ++ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); ++ ++ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); ++} ++#endif + + static inline bool lru_gen_in_fault(void) + { +@@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, + + VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); + +- if (folio_test_unevictable(folio)) ++ if (folio_test_unevictable(folio) || !lrugen->enabled) + return false; + /* + * There are three common cases for this page: +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 0cf0856b484a..840b7ca8b91f 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -384,6 +384,13 @@ enum { + LRU_GEN_FILE, + 
}; + ++enum { ++ LRU_GEN_CORE, ++ LRU_GEN_MM_WALK, ++ LRU_GEN_NONLEAF_YOUNG, ++ NR_LRU_GEN_CAPS ++}; ++ + #define MIN_LRU_BATCH BITS_PER_LONG + #define MAX_LRU_BATCH (MIN_LRU_BATCH * 128) + +@@ -425,6 +432,8 @@ struct lru_gen_struct { + /* can be modified without holding the LRU lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ /* whether the multi-gen LRU is enabled */ ++ bool enabled; + }; + + enum { +diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h +index 5da09c74228d..c966e55cab29 100644 +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -164,7 +164,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +diff --git a/mm/Kconfig b/mm/Kconfig +index a93478acf341..0c2ef0af0036 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1139,6 +1139,12 @@ config LRU_GEN + help + A high performance LRU implementation to overcommit memory. + ++config LRU_GEN_ENABLED ++ bool "Enable by default" ++ depends on LRU_GEN ++ help ++ This option enables the multi-gen LRU by default. ++ + config LRU_GEN_STATS + bool "Full stats for debugging" + depends on LRU_GEN +diff --git a/mm/vmscan.c b/mm/vmscan.c +index f469a2740835..4c8b475429ed 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -3013,6 +3014,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, + + #ifdef CONFIG_LRU_GEN + ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); ++#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) ++#else ++DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); ++#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) ++#endif ++ + /****************************************************************************** + * shorthand helpers + ******************************************************************************/ +@@ -3890,7 +3899,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area + goto next; + + if (!pmd_trans_huge(pmd[i])) { +- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && ++ get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } +@@ -3988,10 +3998,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + walk->mm_stats[MM_NONLEAF_TOTAL]++; + + #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +- if (!pmd_young(val)) +- continue; ++ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { ++ if (!pmd_young(val)) ++ continue; + +- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); ++ } + #endif + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; +@@ -4249,7 +4261,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. 
+ */ +- if (!arch_has_hw_pte_young()) { ++ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } +@@ -4975,6 +4987,211 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc + blk_finish_plug(&plug); + } + ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ if (lrugen->enabled) { ++ enum lru_list lru; ++ ++ for_each_evictable_lru(lru) { ++ if (!list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ } else { ++ int gen, type, zone; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON_ONCE(lrugen->nr_pages[gen][type][zone]); ++ } ++ } ++ ++ return true; ++} ++ ++static bool fill_evictable(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_LRU_BATCH; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct folio *folio = lru_to_folio(head); ++ ++ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); ++ ++ lruvec_del_folio(lruvec, folio); ++ success = lru_gen_add_folio(lruvec, folio, false); ++ VM_WARN_ON_ONCE(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_evictable(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_LRU_BATCH; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct folio *folio = lru_to_folio(head); ++ ++ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); ++ ++ success = lru_gen_del_folio(lruvec, folio, false); ++ VM_WARN_ON_ONCE(!success); ++ lruvec_add_folio(lruvec, folio); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static void lru_gen_change_state(bool enabled) ++{ ++ static DEFINE_MUTEX(state_mutex); ++ ++ struct mem_cgroup *memcg; ++ ++ cgroup_lock(); ++ cpus_read_lock(); ++ get_online_mems(); ++ mutex_lock(&state_mutex); ++ ++ if (enabled == lru_gen_enabled()) ++ goto unlock; ++ ++ if (enabled) ++ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); ++ else ++ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(memcg, nid); ++ ++ if (!lruvec) ++ continue; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); ++ VM_WARN_ON_ONCE(!state_is_valid(lruvec)); ++ ++ lruvec->lrugen.enabled = enabled; ++ ++ while (!(enabled ? 
fill_evictable(lruvec) : drain_evictable(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ mutex_unlock(&state_mutex); ++ put_online_mems(); ++ cpus_read_unlock(); ++ cgroup_unlock(); ++} ++ ++/****************************************************************************** ++ * sysfs interface ++ ******************************************************************************/ ++ ++static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ unsigned int caps = 0; ++ ++ if (get_cap(LRU_GEN_CORE)) ++ caps |= BIT(LRU_GEN_CORE); ++ ++ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) ++ caps |= BIT(LRU_GEN_MM_WALK); ++ ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) ++ caps |= BIT(LRU_GEN_NONLEAF_YOUNG); ++ ++ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); ++} ++ ++static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ int i; ++ unsigned int caps; ++ ++ if (tolower(*buf) == 'n') ++ caps = 0; ++ else if (tolower(*buf) == 'y') ++ caps = -1; ++ else if (kstrtouint(buf, 0, &caps)) ++ return -EINVAL; ++ ++ for (i = 0; i < NR_LRU_GEN_CAPS; i++) { ++ bool enabled = caps & BIT(i); ++ ++ if (i == LRU_GEN_CORE) ++ lru_gen_change_state(enabled); ++ else if (enabled) ++ static_branch_enable(&lru_gen_caps[i]); ++ else ++ static_branch_disable(&lru_gen_caps[i]); ++ } ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR( ++ enabled, 0644, show_enabled, store_enabled ++); ++ ++static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_enabled_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group lru_gen_attr_group = { ++ .name = "lru_gen", ++ .attrs = lru_gen_attrs, ++}; ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -4985,6 +5202,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled = lru_gen_enabled(); + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); +@@ -5024,6 +5242,9 @@ static int __init init_lru_gen(void) + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + ++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) ++ pr_err("lru_gen: failed to create sysfs group\n"); ++ + return 0; + }; + late_initcall(init_lru_gen); + +From patchwork Wed Jul 6 22:00:20 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908708 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id E259FCCA47C + for ; Wed, 6 Jul 2022 22:01:17 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 1A2F98E0007; Wed, 6 Jul 2022 18:01:09 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 153298E0001; Wed, 6 Jul 2022 18:01:09 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, 
+Date: Wed, 6 Jul 2022 16:00:20 -0600
+In-Reply-To: <20220706220022.968789-1-yuzhao@google.com>
+Message-Id: <20220706220022.968789-12-yuzhao@google.com>
+Mime-Version: 1.0
+References: <20220706220022.968789-1-yuzhao@google.com>
+X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog
+Subject: [PATCH v13 11/14] mm: multi-gen LRU: thrashing prevention
+From: Yu Zhao
+To: Andrew Morton
+Cc: Andi Kleen , Aneesh Kumar , Catalin Marinas ,
+ Dave Hansen , Hillf Danton , Jens Axboe , Johannes Weiner ,
+ Jonathan Corbet , Linus Torvalds , Matthew Wilcox , Mel Gorman ,
+ Michael Larabel , Michal Hocko , Mike Rapoport , Peter Zijlstra ,
+ Tejun Heo , Vlastimil Babka , Will Deacon ,
+ linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org,
+ linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org,
+ page-reclaim@google.com, Yu Zhao , Brian Geffon ,
+ Jan Alexander Steffens , Oleksandr Natalenko , Steven Barrett ,
+ Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144868; a=rsa-sha256; + cv=none; + b=8QjwJzQPm7r/G+Ug8d4Bn/JrZtirxW14NE/TPM5Yuz8TtgqfXHSgDZ0NZs+0NMnmPdFebK + BewOSgj/R+9PisPRBLUEepAkTTAjyW6prOGRhTAKigLh6I3aJOU2/+iXQO+eUvhPXHNnMf + KRnEXerAaMeP1dBwH0VFivF74hvg2OQ= +ARC-Authentication-Results: i=1; + imf22.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=NNZxOJis; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf22.hostedemail.com: domain of + 3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144868; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=m3EW4cfAlntTqnxn3SvhsZvF1ytN+sfDtB6iRdzihvY=; + b=DTQGqCEN2saKpCn2Rlj0DwxYUYns5aLH6ctyLw23CxaYk5FVEKFifd/4msPagn2x3OyYoJ + IHUvwyXUjQkcTa1cZQoQjZtTkZ5tAB3HGWKknBtj00SV590QYCz1tvu/9DdrTQBAJJQVkL + NDOvAf+Q5C0pIHmFotDqxriphq5nQvg= +X-Stat-Signature: ob4t1mrtn1zzw1fgdc6fyo685ij649e9 +X-Rspam-User: +X-Rspamd-Server: rspam12 +X-Rspamd-Queue-Id: 5376DC0059 +Authentication-Results: imf22.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=NNZxOJis; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf22.hostedemail.com: domain of + 3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3IwbGYgYKCGQaWbJCQIQQING.EQONKPWZ-OOMXCEM.QTI@flex--yuzhao.bounces.google.com +X-HE-Tag: 1657144868-301835 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as +requested by many desktop users [1]. + +When set to value N, it prevents the working set of N milliseconds +from getting evicted. The OOM killer is triggered if this working set +cannot be kept in memory. Based on the average human detectable lag +(~100ms), N=1000 usually eliminates intolerable lags due to thrashing. +Larger values like N=3000 make lags less noticeable at the risk of +premature OOM kills. + +Compared with the size-based approach [2], this time-based approach +has the following advantages: +1. It is easier to configure because it is agnostic to applications + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. 
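+
+A minimal usage sketch, using the N=1000 suggestion above (the file takes
+and reports plain milliseconds; the value is read by kswapd on each aging
+pass):
+ echo 1000 >/sys/kernel/mm/lru_gen/min_ttl_ms
+ cat /sys/kernel/mm/lru_gen/min_ttl_ms
+ 1000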
+ +[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/ +[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/ + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + include/linux/mmzone.h | 2 ++ + mm/vmscan.c | 71 +++++++++++++++++++++++++++++++++++++++--- + 2 files changed, 69 insertions(+), 4 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 840b7ca8b91f..472bd5335517 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -419,6 +419,8 @@ struct lru_gen_struct { + unsigned long max_seq; + /* the eviction increments the oldest generation numbers */ + unsigned long min_seq[ANON_AND_FILE]; ++ /* the birth time of each generation in jiffies */ ++ unsigned long timestamps[MAX_NR_GENS]; + /* the multi-gen LRU lists, lazily sorted on eviction */ + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the multi-gen LRU sizes, eventually consistent */ +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 4c8b475429ed..1f2892a0dc41 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4233,6 +4233,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + ++ WRITE_ONCE(lrugen->timestamps[next], jiffies); + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); + +@@ -4359,7 +4360,7 @@ static unsigned long get_nr_evictable(struct lruvec *lruvec, unsigned long max_s + return total; + } + +-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) + { + bool need_aging; + unsigned long nr_to_scan; +@@ -4373,21 +4374,40 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(memcg)) +- return; ++ return false; + + nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, swappiness, &need_aging); + if (!nr_to_scan) +- return; ++ return false; + + nr_to_scan >>= mem_cgroup_online(memcg) ? 
sc->priority : 0; + ++ if (min_ttl) { ++ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); ++ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); ++ ++ if (time_is_after_jiffies(birth + min_ttl)) ++ return false; ++ ++ /* the size is likely too small to be helpful */ ++ if (!nr_to_scan && sc->priority != DEF_PRIORITY) ++ return false; ++ } ++ + if (nr_to_scan && need_aging) + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); ++ ++ return true; + } + ++/* to protect the working set of the last N jiffies */ ++static unsigned long lru_gen_min_ttl __read_mostly; ++ + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + { + struct mem_cgroup *memcg; ++ bool success = false; ++ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + +@@ -4413,12 +4433,28 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + +- age_lruvec(lruvec, sc); ++ if (age_lruvec(lruvec, sc, min_ttl)) ++ success = true; + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + clear_mm_walk(); ++ ++ /* ++ * The main goal is to OOM kill if every generation from all memcgs is ++ * younger than min_ttl. However, another theoretical possibility is all ++ * memcgs are either below min or empty. ++ */ ++ if (!success && !sc->order && mutex_trylock(&oom_lock)) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ }; ++ ++ out_of_memory(&oc); ++ ++ mutex_unlock(&oom_lock); ++ } + } + + /* +@@ -5135,6 +5171,28 @@ static void lru_gen_change_state(bool enabled) + * sysfs interface + ******************************************************************************/ + ++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++} ++ ++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ unsigned int msecs; ++ ++ if (kstrtouint(buf, 0, &msecs)) ++ return -EINVAL; ++ ++ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( ++ min_ttl_ms, 0644, show_min_ttl, store_min_ttl ++); ++ + static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) + { + unsigned int caps = 0; +@@ -5183,6 +5241,7 @@ static struct kobj_attribute lru_gen_enabled_attr = __ATTR( + ); + + static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL + }; +@@ -5198,12 +5257,16 @@ static struct attribute_group lru_gen_attr_group = { + + void lru_gen_init_lruvec(struct lruvec *lruvec) + { ++ int i; + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + + +From patchwork Wed Jul 6 22:00:21 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908710 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by 
smtp.lore.kernel.org (Postfix) with ESMTP id 69F71CCA47C + for ; Wed, 6 Jul 2022 22:01:25 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 57B768E0009; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 4D6338E0001; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 3007F8E0009; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0012.hostedemail.com + [216.40.44.12]) + by kanga.kvack.org (Postfix) with ESMTP id 1256B8E0001 + for ; Wed, 6 Jul 2022 18:01:12 -0400 (EDT) +Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay09.hostedemail.com (Postfix) with ESMTP id AB0F535EC6 + for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) +X-FDA: 79658046342.31.A60FB64 +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf26.hostedemail.com (Postfix) with ESMTP id 4A6EA140028 + for ; Wed, 6 Jul 2022 22:01:10 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + p7-20020a25d807000000b0066e36989a90so7892676ybg.8 + for ; Wed, 06 Jul 2022 15:01:10 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=nff1jLrA4AEpo88lpO2ZCXRvuzs0CKl/TI+ofmEg1y8=; + b=CPQvXMErOqHr1LM+OMqtT0F59XyB+HiQxBX+EbwoUSnPn/FOpbR4dV1NCCwYakR+KD + gThfZIfqp3Y1SzCO2443reP2Soe3KDHNgAEXCZ5YNoeE7AXlAuA2fgD7YeAXZovjmVIh + 7mERrjTMT6/EWjW531e5FNoxfhaMBEMBEgwjAOQ3Km57LeRgBcWr2IgRe48XaW69M16C + KWj2PGLEmurhGwwHU4NVVPpbjL3o7cE3vD/yehuUCz476hIOcC2Nqpn4krz36H5vP68u + MNeJkhynrE7FhYi7+GgffibtX96Vf3x/16YGAxyUCnSyvvk6OhNUeqKo/LQmoS3LAyl4 + LFpw== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=nff1jLrA4AEpo88lpO2ZCXRvuzs0CKl/TI+ofmEg1y8=; + b=VL/nujGONvdvil9k1He7hpkq5LhpScFPlvGxIB7gfBV4qX15+ZTbNG009jHkmfEswJ + LM40W6DV3mGXjx6Gy2MTjobH0jL4c9qrU1ia5WRKzWkXlxaCkDE82vwuaz7rycBaAiPt + JhRi9ADSMoA9G43MZZei2oSwmUoW9WcH4Umy1YImLdHAjkYdJQ+Ss3Q4uYfGGw3866qm + nfc0pXT5KiNC2DMr+Cla/Llx1WlFNi7QIf3AmdpJ9gZTxCC28ikjniRVZN6b7bTrvjnO + iEyt9jKYEk9vW/yTUEzM8L41D+e+Z60AT6T0qi0KACO4Tp3xz77ui1i2Q85btfHs4Uah + 1qwQ== +X-Gm-Message-State: AJIora+YmPGCc9b5W8VNnqsviDKSYwcLGbNwLNCyRBey3F3rMvUMSj7O + PlfF3OKJjs3zxsBvgcOgTGWclCLXuc4= +X-Google-Smtp-Source: + AGRyM1shBkUATwCAbsz8cAeEoY3s7WAj+Jhs0L0rMlWdOOLCX8yRP4QO9OI90Aiszy92GtEPUW7W76UGd7w= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a0d:c486:0:b0:31c:3b63:91fe with SMTP id + g128-20020a0dc486000000b0031c3b6391femr43427605ywd.7.1657144869573; Wed, 06 + Jul 2022 15:01:09 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:21 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-13-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 12/14] mm: multi-gen LRU: debugfs interface +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , 
Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Qi Zheng , Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144870; a=rsa-sha256; + cv=none; + b=o2/nINgmuZxzIEJU+aSCudJDHKhL7ULIt8sF3JC62cV/HuuBLU0B/xVeMeA2f0cfJN2wtO + kh2UubWWhgsh8V4Cx5XQQfC0fnIjU7kesFrHJslbwx0sV7BFvFu/mCMwBdA2zb0NjFYQ+H + 9ov/Z61nItyp9dvXEOPZKlu3qRCy8D4= +ARC-Authentication-Results: i=1; + imf26.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=CPQvXMEr; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf26.hostedemail.com: domain of + 3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144870; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=nff1jLrA4AEpo88lpO2ZCXRvuzs0CKl/TI+ofmEg1y8=; + b=RI+I7W6K9d5xsUHY54+KeCzGoeOqxuVYKkoikwvrtSbya6NSfQOh7+EFtaBhpVNiDwQMte + 1gOSPtlHmqa//TuxixmT7E3h+4+bbMuck8gjgOl+LEQXqAO7KWKyE6sirgzmwX5HwXk8e5 + zWZIZi2rLOPaapJlUtXn2+31FvtGh1c= +Authentication-Results: imf26.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=CPQvXMEr; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf26.hostedemail.com: domain of + 3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3JQbGYgYKCGYcYdLESKSSKPI.GSQPMRYb-QQOZEGO.SVK@flex--yuzhao.bounces.google.com +X-Stat-Signature: oqpxscpz6ano7mm34xg1zaoyrcimtdxo +X-Rspamd-Queue-Id: 4A6EA140028 +X-Rspamd-Server: rspam05 +X-Rspam-User: +X-HE-Tag: 1657144870-268992 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add /sys/kernel/debug/lru_gen for working set estimation and proactive +reclaim. These techniques are commonly used to optimize job scheduling +(bin packing) in data centers [1][2]. + +Compared with the page table-based approach and the PFN-based +approach, this lruvec-based approach has the following advantages: +1. It offers better choices because it is aware of memcgs, NUMA nodes, + shared mappings and unmapped page cache. +2. It is more scalable because it is O(nr_hot_pages), whereas the + PFN-based approach is O(nr_total_pages). + +Add /sys/kernel/debug/lru_gen_full for debugging. 
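+
+E.g., a job scheduler might estimate working sets and then reclaim
+proactively with the command format this interface accepts; the memcg ID,
+node ID, sequence numbers, swappiness and page count below are
+placeholders:
+
+  cat /sys/kernel/debug/lru_gen
+  echo '+ 1 0 4' >/sys/kernel/debug/lru_gen      # aging: create generation 5
+  echo '- 1 0 2 200 1000' >/sys/kernel/debug/lru_gen   # evict generations <= 2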
+ +[1] https://dl.acm.org/doi/10.1145/3297858.3304053 +[2] https://dl.acm.org/doi/10.1145/3503222.3507731 + +Signed-off-by: Yu Zhao +Reviewed-by: Qi Zheng +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +--- + include/linux/nodemask.h | 1 + + mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++- + 2 files changed, 402 insertions(+), 10 deletions(-) + +diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h +index 0f233b76c9ce..292ec0ce0d63 100644 +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -485,6 +485,7 @@ static inline int num_node_state(enum node_states state) + #define first_online_node 0 + #define first_memory_node 0 + #define next_online_node(nid) (MAX_NUMNODES) ++#define next_memory_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1U + #define nr_online_nodes 1U + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 1f2892a0dc41..fbcd298adca7 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -4137,12 +4138,40 @@ static void clear_mm_walk(void) + kfree(walk); + } + +-static void inc_min_seq(struct lruvec *lruvec, int type) ++static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) + { ++ int zone; ++ int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + ++ if (type == LRU_GEN_ANON && !can_swap) ++ goto done; ++ ++ /* prevent cold/hot inversion if force_scan is true */ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ struct list_head *head = &lrugen->lists[old_gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct folio *folio = lru_to_folio(head); ++ ++ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); ++ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); ++ ++ new_gen = folio_inc_gen(lruvec, folio, false); ++ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++done: + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++ ++ return true; + } + + static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +@@ -4188,7 +4217,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) + return success; + } + +-static void inc_max_seq(struct lruvec *lruvec, bool can_swap) ++static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) + { + int prev, next; + int type, zone; +@@ -4202,9 +4231,13 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + +- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); ++ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); + +- inc_min_seq(lruvec, type); ++ while (!inc_min_seq(lruvec, type, can_swap)) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } + } + + /* +@@ -4241,7 +4274,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap) + } + + static bool try_to_inc_max_seq(struct lruvec *lruvec, 
unsigned long max_seq, +- struct scan_control *sc, bool can_swap) ++ struct scan_control *sc, bool can_swap, bool force_scan) + { + bool success; + struct lru_gen_mm_walk *walk; +@@ -4262,7 +4295,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. + */ +- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { ++ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } +@@ -4276,7 +4309,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + walk->lruvec = lruvec; + walk->max_seq = max_seq; + walk->can_swap = can_swap; +- walk->force_scan = false; ++ walk->force_scan = force_scan; + + do { + success = iterate_mm_list(lruvec, walk, &mm); +@@ -4296,7 +4329,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); + +- inc_max_seq(lruvec, can_swap); ++ inc_max_seq(lruvec, can_swap, force_scan); + /* either this sees any waiters or they will see updated max_seq */ + if (wq_has_sleeper(&lruvec->mm_state.wait)) + wake_up_all(&lruvec->mm_state.wait); +@@ -4395,7 +4428,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned + } + + if (nr_to_scan && need_aging) +- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); ++ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); + + return true; + } +@@ -4962,7 +4995,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * + if (current_is_kswapd()) + return 0; + +- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) ++ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) + return nr_to_scan; + done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; +@@ -5251,6 +5284,361 @@ static struct attribute_group lru_gen_attr_group = { + .attrs = lru_gen_attrs, + }; + ++/****************************************************************************** ++ * debugfs interface ++ ******************************************************************************/ ++ ++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct mem_cgroup *memcg; ++ loff_t nr_to_skip = *pos; ++ ++ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); ++ if (!m->private) ++ return ERR_PTR(-ENOMEM); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ if (!nr_to_skip--) ++ return get_lruvec(memcg, nid); ++ } ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ return NULL; ++} ++ ++static void lru_gen_seq_stop(struct seq_file *m, void *v) ++{ ++ if (!IS_ERR_OR_NULL(v)) ++ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); ++ ++ kvfree(m->private); ++ m->private = NULL; ++} ++ ++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ int nid = lruvec_pgdat(v)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(v); ++ ++ ++*pos; ++ ++ nid = next_memory_node(nid); ++ if (nid == MAX_NUMNODES) { ++ memcg = mem_cgroup_iter(NULL, memcg, NULL); ++ if (!memcg) ++ return NULL; ++ ++ nid = first_memory_node; ++ } ++ ++ return get_lruvec(memcg, nid); ++} ++ ++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, ++ unsigned long max_seq, unsigned long *min_seq, ++ unsigned long seq) ++{ ++ int i; ++ int type, tier; ++ int hist = lru_hist_from_seq(seq); ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ seq_printf(m, " %10d", tier); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ const char *s = " "; ++ unsigned long n[3] = {}; ++ ++ if (seq == max_seq) { ++ s = "RT "; ++ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); ++ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); ++ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { ++ s = "rep"; ++ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); ++ } ++ ++ for (i = 0; i < 3; i++) ++ seq_printf(m, " %10lu%c", n[i], s[i]); ++ } ++ seq_putc(m, '\n'); ++ } ++ ++ seq_puts(m, " "); ++ for (i = 0; i < NR_MM_STATS; i++) { ++ const char *s = " "; ++ unsigned long n = 0; ++ ++ if (seq == max_seq && NR_HIST_GENS == 1) { ++ s = "LOYNFA"; ++ n = READ_ONCE(lruvec->mm_state.stats[hist][i]); ++ } else if (seq != max_seq && NR_HIST_GENS > 1) { ++ s = "loynfa"; ++ n = READ_ONCE(lruvec->mm_state.stats[hist][i]); ++ } ++ ++ seq_printf(m, " %10lu%c", n, s[i]); ++ } ++ seq_putc(m, '\n'); ++} ++ ++static int lru_gen_seq_show(struct seq_file *m, void *v) ++{ ++ unsigned long seq; ++ bool full = !debugfs_real_fops(m->file)->write; ++ struct lruvec *lruvec = v; ++ struct lru_gen_struct *lrugen = &lruvec->lrugen; ++ int nid = lruvec_pgdat(lruvec)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (nid == first_memory_node) { ++ const char *path = memcg ? 
m->private : ""; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); ++#endif ++ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); ++ } ++ ++ seq_printf(m, " node %5d\n", nid); ++ ++ if (!full) ++ seq = min_seq[LRU_GEN_ANON]; ++ else if (max_seq >= MAX_NR_GENS) ++ seq = max_seq - MAX_NR_GENS + 1; ++ else ++ seq = 0; ++ ++ for (; seq <= max_seq; seq++) { ++ int type, zone; ++ int gen = lru_gen_from_seq(seq); ++ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); ++ ++ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ unsigned long size = 0; ++ char mark = full && seq < min_seq[type] ? 'x' : ' '; ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); ++ ++ seq_printf(m, " %10lu%c", size, mark); ++ } ++ ++ seq_putc(m, '\n'); ++ ++ if (full) ++ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); ++ } ++ ++ return 0; ++} ++ ++static const struct seq_operations lru_gen_seq_ops = { ++ .start = lru_gen_seq_start, ++ .stop = lru_gen_seq_stop, ++ .next = lru_gen_seq_next, ++ .show = lru_gen_seq_show, ++}; ++ ++static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, ++ bool can_swap, bool force_scan) ++{ ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < max_seq) ++ return 0; ++ ++ if (seq > max_seq) ++ return -EINVAL; ++ ++ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) ++ return -ERANGE; ++ ++ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); ++ ++ return 0; ++} ++ ++static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, ++ int swappiness, unsigned long nr_to_reclaim) ++{ ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq + MIN_NR_GENS > max_seq) ++ return -EINVAL; ++ ++ sc->nr_reclaimed = 0; ++ ++ while (!signal_pending(current)) { ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < min_seq[!swappiness]) ++ return 0; ++ ++ if (sc->nr_reclaimed >= nr_to_reclaim) ++ return 0; ++ ++ if (!evict_folios(lruvec, sc, swappiness, NULL)) ++ return 0; ++ ++ cond_resched(); ++ } ++ ++ return -EINTR; ++} ++ ++static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, ++ struct scan_control *sc, int swappiness, unsigned long opt) ++{ ++ struct lruvec *lruvec; ++ int err = -EINVAL; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) ++ return -EINVAL; ++ ++ if (!mem_cgroup_disabled()) { ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_id(memcg_id); ++#ifdef CONFIG_MEMCG ++ if (memcg && !css_tryget(&memcg->css)) ++ memcg = NULL; ++#endif ++ rcu_read_unlock(); ++ ++ if (!memcg) ++ return -EINVAL; ++ } ++ ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto done; ++ ++ lruvec = get_lruvec(memcg, nid); ++ ++ if (swappiness < 0) ++ swappiness = get_swappiness(lruvec, sc); ++ else if (swappiness > 200) ++ goto done; ++ ++ switch (cmd) { ++ case '+': ++ err = run_aging(lruvec, seq, sc, swappiness, opt); ++ break; ++ case '-': ++ err = run_eviction(lruvec, seq, sc, swappiness, opt); ++ break; ++ } ++done: ++ mem_cgroup_put(memcg); ++ ++ return err; ++} ++ ++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, ++ size_t len, loff_t *pos) ++{ ++ void *buf; ++ char *cur, *next; ++ unsigned int flags; ++ struct blk_plug plug; ++ int err = -EINVAL; ++ struct scan_control sc = { ++ .may_writepage = true, ++ .may_unmap = true, ++ 
.may_swap = true, ++ .reclaim_idx = MAX_NR_ZONES - 1, ++ .gfp_mask = GFP_KERNEL, ++ }; ++ ++ buf = kvmalloc(len + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ if (copy_from_user(buf, src, len)) { ++ kvfree(buf); ++ return -EFAULT; ++ } ++ ++ set_task_reclaim_state(current, &sc.reclaim_state); ++ flags = memalloc_noreclaim_save(); ++ blk_start_plug(&plug); ++ if (!set_mm_walk(NULL)) { ++ err = -ENOMEM; ++ goto done; ++ } ++ ++ next = buf; ++ next[len] = '\0'; ++ ++ while ((cur = strsep(&next, ",;\n"))) { ++ int n; ++ int end; ++ char cmd; ++ unsigned int memcg_id; ++ unsigned int nid; ++ unsigned long seq; ++ unsigned int swappiness = -1; ++ unsigned long opt = -1; ++ ++ cur = skip_spaces(cur); ++ if (!*cur) ++ continue; ++ ++ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, ++ &seq, &end, &swappiness, &end, &opt, &end); ++ if (n < 4 || cur[end]) { ++ err = -EINVAL; ++ break; ++ } ++ ++ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); ++ if (err) ++ break; ++ } ++done: ++ clear_mm_walk(); ++ blk_finish_plug(&plug); ++ memalloc_noreclaim_restore(flags); ++ set_task_reclaim_state(current, NULL); ++ ++ kvfree(buf); ++ ++ return err ? : len; ++} ++ ++static int lru_gen_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &lru_gen_seq_ops); ++} ++ ++static const struct file_operations lru_gen_rw_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .write = lru_gen_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations lru_gen_ro_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ + /****************************************************************************** + * initialization + ******************************************************************************/ +@@ -5308,6 +5696,9 @@ static int __init init_lru_gen(void) + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + ++ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); ++ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); ++ + return 0; + }; + late_initcall(init_lru_gen); + +From patchwork Wed Jul 6 22:00:22 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908711 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 6414FC43334 + for ; Wed, 6 Jul 2022 22:01:29 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 39E278E000A; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 34DB98E0001; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 1A1728E000A; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0017.hostedemail.com + [216.40.44.17]) + by kanga.kvack.org (Postfix) with ESMTP id 047FA8E0001 + for ; Wed, 6 Jul 2022 18:01:13 -0400 (EDT) +Received: from smtpin07.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay06.hostedemail.com (Postfix) with ESMTP id CA9C334906 + for ; Wed, 6 Jul 2022 22:01:12 +0000 (UTC) +X-FDA: 79658046384.07.4AE281A 
+Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf19.hostedemail.com (Postfix) with ESMTP id E1D811A004A + for ; Wed, 6 Jul 2022 22:01:11 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + j11-20020a05690212cb00b006454988d225so12639320ybu.10 + for ; Wed, 06 Jul 2022 15:01:11 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=gpspQZzCiCtLDF1mE2Rbzp3OUWg7vlq70C4xLE3ya+E=; + b=KmRh3W6zCTnYhuu2uLwH/71AGZzl5TVUrtsNnUP5zXTmGsYrVbcqdtCu+MA/r0Ndp0 + Swx6K5/Y1yzZuona+ojX9pyfPH0vSgmsnPUuGuK8IgKoxke8pbVIOMVO1oHB4MFfbJr9 + MZQ2DHsaZhnv+oABy231/ZNYVnut1uI8HXMoZE64GkKDaX0oTm6VD5IWp6Pjb9e4CCS2 + 4l6LRlV0GkUZbtfNu7oRMgYKOcOBXuCtbtOCopiW839uMoofW0liroJ2wElyPDiAsF2j + ZEKcyiLmzwxANf1QRl8D0H0t207nTseUwQuoJ0fGq2geu1GyW7/GzRuxYm66v/+UUfVJ + Ti/g== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=gpspQZzCiCtLDF1mE2Rbzp3OUWg7vlq70C4xLE3ya+E=; + b=Ct8NkvISAcd2F1onRi9j8wNBQ3yVS4sMkwQThZBmSai4nt0pCzUW6MSInM6la2RQ+6 + Iyk/Q6V/4/M1AEzJ1CIyUOjtskptWB7g9JCLcYDV67l3e3cym3CfKO6faANsjcNo61aE + cGyF+8I3UwoMP2XkhiX8e+sh+JyAVS+7v6ah2jAK3rMcN9Qy3pRUpTzse16anYIvPXmH + D/n6XDiuVtka4xdvtVrXH1Ovj7jTQyu5zNSeDpYUHIIuY5HyyWlwP2GqOXO5+3ztetSe + lqHq/pwTeg5OaKzyo1/S4u5j63+cCDsRbst48LWqqY7iSJl7Jqjh9IcuciM5gwWyKVQq + exXg== +X-Gm-Message-State: AJIora8Djp7T6fvZwj7nFJ1nTHsOTMleXrE/THizuhZy3oXIgXemxG6T + WRLpIC0iL2d+my0UEmLvbYJe1kwX4xc= +X-Google-Smtp-Source: + AGRyM1tcS0bhkovBqaAMGcBTFG0LXet+IyIY3UhCyBJaxouYWPrdgSATtWZUnD1044Cxo6jW3UsFPLIHGBk= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a25:73d1:0:b0:66e:aee4:feb3 with SMTP id + o200-20020a2573d1000000b0066eaee4feb3mr1925521ybc.452.1657144871215; Wed, 06 + Jul 2022 15:01:11 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:22 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-14-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 13/14] mm: multi-gen LRU: admin guide +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144872; a=rsa-sha256; + cv=none; + b=JsWQytQvs1ZknqPoqD3qo1TJldLBGiKSTga/ejO8CyQYViqdXml7nvJD7fQyRxXf/sYCeO + o91ZwxjqHFV+Qk45x3ZWpVnbVz5s7Ub1LlWxdnj2ACxVxDi2i4I70KlZDYV1V7+0DLXiwM + Cf5UnSo0xArYOHGQTNNAa/beRpM+U2U= +ARC-Authentication-Results: i=1; + imf19.hostedemail.com; + dkim=pass 
header.d=google.com header.s=20210112 header.b=KmRh3W6z; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf19.hostedemail.com: domain of + 3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144872; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + in-reply-to:in-reply-to:references:references:dkim-signature; + bh=gpspQZzCiCtLDF1mE2Rbzp3OUWg7vlq70C4xLE3ya+E=; + b=7SORwSc3XuCDIhY4Nnt3155Fml8B5PM7q+cxyDoyzRH9f30a8JT7kTOzO43GZtqu1vi0gx + ZvOQWsmLsXdrJ4He9F7TCEfWwHvTKJw2xq5RY+ztHvYdkw0u4ntOGqKfhRRpSNYMieV4f5 + IIb7Tz/BoxO7bW/vk+Bjj4szoFKZSXU= +X-Rspamd-Server: rspam04 +X-Rspam-User: +Authentication-Results: imf19.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=KmRh3W6z; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf19.hostedemail.com: domain of + 3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3JwbGYgYKCGgeafNGUMUUMRK.IUSROTad-SSQbGIQ.UXM@flex--yuzhao.bounces.google.com +X-Stat-Signature: a3k84bgjbfr9z8g5wse9kf3mp6fodfft +X-Rspamd-Queue-Id: E1D811A004A +X-HE-Tag: 1657144871-897432 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add an admin guide. + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +Reviewed-by: Bagas Sanjaya +--- + Documentation/admin-guide/mm/index.rst | 1 + + Documentation/admin-guide/mm/multigen_lru.rst | 156 ++++++++++++++++++ + mm/Kconfig | 3 +- + mm/vmscan.c | 4 + + 4 files changed, 163 insertions(+), 1 deletion(-) + create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst + +diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst +index c21b5823f126..2cf5bae62036 100644 +--- a/Documentation/admin-guide/mm/index.rst ++++ b/Documentation/admin-guide/mm/index.rst +@@ -32,6 +32,7 @@ the Linux memory management. + idle_page_tracking + ksm + memory-hotplug ++ multigen_lru + nommu-mmap + numa_memory_policy + numaperf +diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst +new file mode 100644 +index 000000000000..6355f2b5019d +--- /dev/null ++++ b/Documentation/admin-guide/mm/multigen_lru.rst +@@ -0,0 +1,156 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++============= ++Multi-Gen LRU ++============= ++The multi-gen LRU is an alternative LRU implementation that optimizes ++page reclaim and improves performance under memory pressure. Page ++reclaim decides the kernel's caching policy and ability to overcommit ++memory. It directly impacts the kswapd CPU usage and RAM efficiency. ++ ++Quick start ++=========== ++Build the kernel with the following configurations. 
++ ++* ``CONFIG_LRU_GEN=y`` ++* ``CONFIG_LRU_GEN_ENABLED=y`` ++ ++All set! ++ ++Runtime options ++=============== ++``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the ++following subsections. ++ ++Kill switch ++----------- ++``enabled`` accepts different values to enable or disable the ++following components. Its default value depends on ++``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled ++unless some of them have unforeseen side effects. Writing to ++``enabled`` has no effect when a component is not supported by the ++hardware, and valid values will be accepted even when the main switch ++is off. ++ ++====== =============================================================== ++Values Components ++====== =============================================================== ++0x0001 The main switch for the multi-gen LRU. ++0x0002 Clearing the accessed bit in leaf page table entries in large ++ batches, when MMU sets it (e.g., on x86). This behavior can ++ theoretically worsen lock contention (mmap_lock). If it is ++ disabled, the multi-gen LRU will suffer a minor performance ++ degradation for workloads that contiguously map hot pages, ++ whose accessed bits can be otherwise cleared by fewer larger ++ batches. ++0x0004 Clearing the accessed bit in non-leaf page table entries as ++ well, when MMU sets it (e.g., on x86). This behavior was not ++ verified on x86 varieties other than Intel and AMD. If it is ++ disabled, the multi-gen LRU will suffer a negligible ++ performance degradation. ++[yYnN] Apply to all the components above. ++====== =============================================================== ++ ++E.g., ++:: ++ ++ echo y >/sys/kernel/mm/lru_gen/enabled ++ cat /sys/kernel/mm/lru_gen/enabled ++ 0x0007 ++ echo 5 >/sys/kernel/mm/lru_gen/enabled ++ cat /sys/kernel/mm/lru_gen/enabled ++ 0x0005 ++ ++Thrashing prevention ++-------------------- ++Personal computers are more sensitive to thrashing because it can ++cause janks (lags when rendering UI) and negatively impact user ++experience. The multi-gen LRU offers thrashing prevention to the ++majority of laptop and desktop users who do not have ``oomd``. ++ ++Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of ++``N`` milliseconds from getting evicted. The OOM killer is triggered ++if this working set cannot be kept in memory. In other words, this ++option works as an adjustable pressure relief valve, and when open, it ++terminates applications that are hopefully not being used. ++ ++Based on the average human detectable lag (~100ms), ``N=1000`` usually ++eliminates intolerable janks due to thrashing. Larger values like ++``N=3000`` make janks less noticeable at the risk of premature OOM ++kills. ++ ++The default value ``0`` means disabled. ++ ++Experimental features ++===================== ++``/sys/kernel/debug/lru_gen`` accepts commands described in the ++following subsections. Multiple command lines are supported, so does ++concatenation with delimiters ``,`` and ``;``. ++ ++``/sys/kernel/debug/lru_gen_full`` provides additional stats for ++debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from ++evicted generations in this file. ++ ++Working set estimation ++---------------------- ++Working set estimation measures how much memory an application needs ++in a given time interval, and it is usually done with little impact on ++the performance of the application. E.g., data centers want to ++optimize job scheduling (bin packing) to improve memory utilizations. 
++When a new job comes in, the job scheduler needs to find out whether ++each server it manages can allocate a certain amount of memory for ++this new job before it can pick a candidate. To do so, the job ++scheduler needs to estimate the working sets of the existing jobs. ++ ++When it is read, ``lru_gen`` returns a histogram of numbers of pages ++accessed over different time intervals for each memcg and node. ++``MAX_NR_GENS`` decides the number of bins for each histogram. The ++histograms are noncumulative. ++:: ++ ++ memcg memcg_id memcg_path ++ node node_id ++ min_gen_nr age_in_ms nr_anon_pages nr_file_pages ++ ... ++ max_gen_nr age_in_ms nr_anon_pages nr_file_pages ++ ++Each bin contains an estimated number of pages that have been accessed ++within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages ++and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of ++the former is the largest and that of the latter is the smallest. ++ ++Users can write ``+ memcg_id node_id max_gen_nr ++[can_swap [force_scan]]`` to ``lru_gen`` to create a new generation ++``max_gen_nr+1``. ``can_swap`` defaults to the swap setting and, if it ++is set to ``1``, it forces the scan of anon pages when swap is off, ++and vice versa. ``force_scan`` defaults to ``1`` and, if it is set to ++``0``, it employs heuristics to reduce the overhead, which is likely ++to reduce the coverage as well. ++ ++A typical use case is that a job scheduler writes to ``lru_gen`` at a ++certain time interval to create new generations, and it ranks the ++servers it manages based on the sizes of their cold pages defined by ++this time interval. ++ ++Proactive reclaim ++----------------- ++Proactive reclaim induces page reclaim when there is no memory ++pressure. It usually targets cold pages only. E.g., when a new job ++comes in, the job scheduler wants to proactively reclaim cold pages on ++the server it selected to improve the chance of successfully landing ++this new job. ++ ++Users can write ``- memcg_id node_id min_gen_nr [swappiness ++[nr_to_reclaim]]`` to ``lru_gen`` to evict generations less than or ++equal to ``min_gen_nr``. Note that ``min_gen_nr`` should be less than ++``max_gen_nr-1`` as ``max_gen_nr`` and ``max_gen_nr-1`` are not fully ++aged and therefore cannot be evicted. ``swappiness`` overrides the ++default value in ``/proc/sys/vm/swappiness``. ``nr_to_reclaim`` limits ++the number of pages to evict. ++ ++A typical use case is that a job scheduler writes to ``lru_gen`` ++before it tries to land a new job on a server. If it fails to ++materialize enough cold pages because of the overestimation, it ++retries on the next server according to the ranking result obtained ++from the working set estimation step. This less forceful approach ++limits the impacts on the existing jobs. +diff --git a/mm/Kconfig b/mm/Kconfig +index 0c2ef0af0036..a0f7b6e66410 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1137,7 +1137,8 @@ config LRU_GEN + # make sure folio->flags has enough spare bits + depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP + help +- A high performance LRU implementation to overcommit memory. ++ A high performance LRU implementation to overcommit memory. See ++ Documentation/admin-guide/mm/multigen_lru.rst for details. 
+ + config LRU_GEN_ENABLED + bool "Enable by default" +diff --git a/mm/vmscan.c b/mm/vmscan.c +index fbcd298adca7..7096ff7836db 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -5209,6 +5209,7 @@ static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, c + return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); + } + ++/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ + static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) + { +@@ -5242,6 +5243,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c + return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); + } + ++/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ + static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) + { +@@ -5389,6 +5391,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + seq_putc(m, '\n'); + } + ++/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ + static int lru_gen_seq_show(struct seq_file *m, void *v) + { + unsigned long seq; +@@ -5547,6 +5550,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + return err; + } + ++/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ + static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + size_t len, loff_t *pos) + { + +From patchwork Wed Jul 6 22:00:23 2022 +Content-Type: text/plain; charset="utf-8" +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit +X-Patchwork-Submitter: Yu Zhao +X-Patchwork-Id: 12908712 +Return-Path: +X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on + aws-us-west-2-korg-lkml-1.web.codeaurora.org +Received: from kanga.kvack.org (kanga.kvack.org [205.233.56.17]) + by smtp.lore.kernel.org (Postfix) with ESMTP id 6E4E6C43334 + for ; Wed, 6 Jul 2022 22:01:33 +0000 (UTC) +Received: by kanga.kvack.org (Postfix) + id 59C1C8E000B; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) +Received: by kanga.kvack.org (Postfix, from userid 40) + id 5235B8E0001; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) +X-Delivered-To: int-list-linux-mm@kvack.org +Received: by kanga.kvack.org (Postfix, from userid 63042) + id 350398E000B; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) +X-Delivered-To: linux-mm@kvack.org +Received: from relay.hostedemail.com (smtprelay0012.hostedemail.com + [216.40.44.12]) + by kanga.kvack.org (Postfix) with ESMTP id 225F58E0001 + for ; Wed, 6 Jul 2022 18:01:14 -0400 (EDT) +Received: from smtpin31.hostedemail.com (a10.router.float.18 [10.200.18.1]) + by unirelay08.hostedemail.com (Postfix) with ESMTP id EE41D2169C + for ; Wed, 6 Jul 2022 22:01:13 +0000 (UTC) +X-FDA: 79658046426.31.47294E4 +Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com + [209.85.219.201]) + by imf01.hostedemail.com (Postfix) with ESMTP id 5513940016 + for ; Wed, 6 Jul 2022 22:01:13 +0000 (UTC) +Received: by mail-yb1-f201.google.com with SMTP id + a8-20020a25a188000000b0066839c45fe8so12515135ybi.17 + for ; Wed, 06 Jul 2022 15:01:13 -0700 (PDT) +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=google.com; s=20210112; + h=date:in-reply-to:message-id:mime-version:references:subject:from:to + :cc:content-transfer-encoding; + bh=Y7M5+uMCyjK2Tw3gtvlFnf3s0uMKtiqOOKU+iupOzGc=; + b=RaJYVCw6kQFWZr57Fj6Z+M7CjIu+Fy2mkXaD9icGpAKOAxyz1uufDA95qkMfXqksCy + CttyIsR4+X5trkDvd0W5HTI3/XFLKoLEsiRSAv23qebNkIOkH8cPlNd2JsU/+DVzJUpM + 
TGOZ6teMB/sFPIH8IZKMODnpg+VxKIyScGqlsqOiDoxcPPCMP8e0zolM240kI1HmhYsj + WxZdSDL+OZnX2V8pTDz516/mmCsEM23W0x65TiLdKDGOIFAAkNP/EIcvQWWj8SBUz/dL + a0IGdBEhZobBNts8S/4QPXOFk1zc9TBNhY+OPo4y5YJG3duUWWVQ+373DmVdZPluRI23 + DgVQ== +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20210112; + h=x-gm-message-state:date:in-reply-to:message-id:mime-version + :references:subject:from:to:cc:content-transfer-encoding; + bh=Y7M5+uMCyjK2Tw3gtvlFnf3s0uMKtiqOOKU+iupOzGc=; + b=t8KFMI+odj2H0PYsSR514rWxJS26xzV7AKq6CAfD49T5kfUjPz8wfdAfySr/kBsGFX + Ijo9N8v2aDQNSOqwxiy6N0WnrzD6bgFtRWeblglDP9rnKXQmG38PpjQrFbGbWRu0JwOP + V4GhBBsVBqsEbP6lV54mE8LL+AX6orjSmsdYgGuR7py2ze/69AI+KXkU4wuGGk7f02J6 + NOorMZZljVWHawNiYzwJ/nSCIEDP1RdLxj/QR1X2gsT6fGY0XqrePFMti1n8UBr5vGzF + qDM3r6uoPM0Dl89KQfjhANf8jyajCPr0wd7Ldc2REEmnDU12jZhd2cV3sTQEKMbtFvsH + RiDA== +X-Gm-Message-State: AJIora9JyuHh+WKBn43isO3BKSkb8MvQEqp82Y/5Bs0mEkxpSgPDJzSM + HtWXZ+iDc2EVNjhmgizIp3qSZYJgkRM= +X-Google-Smtp-Source: + AGRyM1tPyG6w7lg37p0dKVbMplDSUgwZboH2lG42opEnpdXZgbjOhtWD7cZCMHKO+sLemtrKnNTphNyTinE= +X-Received: from yuzhao.bld.corp.google.com + ([2620:15c:183:200:b89c:e10a:466e:cf7d]) + (user=yuzhao job=sendgmr) by 2002:a25:b806:0:b0:663:d35d:8b8a with SMTP id + v6-20020a25b806000000b00663d35d8b8amr45647399ybj.69.1657144872662; Wed, 06 + Jul 2022 15:01:12 -0700 (PDT) +Date: Wed, 6 Jul 2022 16:00:23 -0600 +In-Reply-To: <20220706220022.968789-1-yuzhao@google.com> +Message-Id: <20220706220022.968789-15-yuzhao@google.com> +Mime-Version: 1.0 +References: <20220706220022.968789-1-yuzhao@google.com> +X-Mailer: git-send-email 2.37.0.rc0.161.g10f37bed90-goog +Subject: [PATCH v13 14/14] mm: multi-gen LRU: design doc +From: Yu Zhao +To: Andrew Morton +Cc: Andi Kleen , + Aneesh Kumar , + Catalin Marinas , + Dave Hansen , Hillf Danton , + Jens Axboe , Johannes Weiner , + Jonathan Corbet , + Linus Torvalds , + Matthew Wilcox , Mel Gorman , + Michael Larabel , + Michal Hocko , Mike Rapoport , + Peter Zijlstra , Tejun Heo , + Vlastimil Babka , Will Deacon , + linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, + linux-kernel@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org, + page-reclaim@google.com, Yu Zhao , + Brian Geffon , + Jan Alexander Steffens , + Oleksandr Natalenko , + Steven Barrett , + Suleiman Souhlal , Daniel Byrne , + Donald Carr , + " =?utf-8?q?Holger_Hoffst=C3=A4tte?= " , + Konstantin Kharlamov , + Shuang Zhai , Sofia Trinh , + Vaibhav Jain +ARC-Seal: i=1; s=arc-20220608; d=hostedemail.com; t=1657144873; a=rsa-sha256; + cv=none; + b=UrrRpXp7KWnXHmjT/QxuJ33LiGsO02xp/Gl5IKp2przZQE/MN2oPkN0qvS6FM/HpuayBLm + zd3wW1kYV7c+CYfLpUIs4G8pg9A6gNyLzycabKZPgoBu+fqMU04tsshxN75CQVnnpFeUVh + ZD4xhdIcppi7j9nVM9IcKC/45QGbnp4= +ARC-Authentication-Results: i=1; + imf01.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=RaJYVCw6; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf01.hostedemail.com: domain of + 3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com +ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; + d=hostedemail.com; + s=arc-20220608; t=1657144873; + h=from:from:sender:reply-to:subject:subject:date:date: + message-id:message-id:to:to:cc:cc:mime-version:mime-version: + content-type:content-type: + content-transfer-encoding:content-transfer-encoding: + 
in-reply-to:in-reply-to:references:references:dkim-signature; + bh=Y7M5+uMCyjK2Tw3gtvlFnf3s0uMKtiqOOKU+iupOzGc=; + b=CC8ORwOmRVo1ysrsxcLM/w/OQsNgtHVDsWXjTolVPaVGtsBAmORZs9mo/t9qQJXlTbpE6W + MK4e1j+KxvgzJ4hEk7FEh4udfXbo/i2Zs4SIAS1fMWoE8oSUqdpISvSeaeM8m9OTpSMv9b + y/YSdGTLFiLWNyHM+yI8Q6QaQPpR8FA= +X-Rspamd-Server: rspam04 +X-Rspam-User: +Authentication-Results: imf01.hostedemail.com; + dkim=pass header.d=google.com header.s=20210112 header.b=RaJYVCw6; + dmarc=pass (policy=reject) header.from=google.com; + spf=pass (imf01.hostedemail.com: domain of + 3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com + designates 209.85.219.201 as permitted sender) + smtp.mailfrom=3KAbGYgYKCGkfbgOHVNVVNSL.JVTSPUbe-TTRcHJR.VYN@flex--yuzhao.bounces.google.com +X-Stat-Signature: gkifem6ym4fgtjcteqxerconsisp8cqt +X-Rspamd-Queue-Id: 5513940016 +X-HE-Tag: 1657144873-85540 +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.4 +Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: + +Add a design doc. + +Signed-off-by: Yu Zhao +Acked-by: Brian Geffon +Acked-by: Jan Alexander Steffens (heftig) +Acked-by: Oleksandr Natalenko +Acked-by: Steven Barrett +Acked-by: Suleiman Souhlal +Tested-by: Daniel Byrne +Tested-by: Donald Carr +Tested-by: Holger Hoffstätte +Tested-by: Konstantin Kharlamov +Tested-by: Shuang Zhai +Tested-by: Sofia Trinh +Tested-by: Vaibhav Jain +Reviewed-by: Bagas Sanjaya +--- + Documentation/vm/index.rst | 1 + + Documentation/vm/multigen_lru.rst | 159 ++++++++++++++++++++++++++++++ + 2 files changed, 160 insertions(+) + create mode 100644 Documentation/vm/multigen_lru.rst + +diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst +index 575ccd40e30c..4aa12b8be278 100644 +--- a/Documentation/vm/index.rst ++++ b/Documentation/vm/index.rst +@@ -51,6 +51,7 @@ above structured documentation, or deleted if it has served its purpose. + ksm + memory-model + mmu_notifier ++ multigen_lru + numa + overcommit-accounting + page_migration +diff --git a/Documentation/vm/multigen_lru.rst b/Documentation/vm/multigen_lru.rst +new file mode 100644 +index 000000000000..d7062c6a8946 +--- /dev/null ++++ b/Documentation/vm/multigen_lru.rst +@@ -0,0 +1,159 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++============= ++Multi-Gen LRU ++============= ++The multi-gen LRU is an alternative LRU implementation that optimizes ++page reclaim and improves performance under memory pressure. Page ++reclaim decides the kernel's caching policy and ability to overcommit ++memory. It directly impacts the kswapd CPU usage and RAM efficiency. ++ ++Design overview ++=============== ++Objectives ++---------- ++The design objectives are: ++ ++* Good representation of access recency ++* Try to profit from spatial locality ++* Fast paths to make obvious choices ++* Simple self-correcting heuristics ++ ++The representation of access recency is at the core of all LRU ++implementations. In the multi-gen LRU, each generation represents a ++group of pages with similar access recency. Generations establish a ++(time-based) common frame of reference and therefore help make better ++choices, e.g., between different memcgs on a computer or different ++computers in a data center (for job scheduling). ++ ++Exploiting spatial locality improves efficiency when gathering the ++accessed bit. A rmap walk targets a single page and does not try to ++profit from discovering a young PTE. 
A page table walk can sweep all ++the young PTEs in an address space, but the address space can be too ++sparse to make a profit. The key is to optimize both methods and use ++them in combination. ++ ++Fast paths reduce code complexity and runtime overhead. Unmapped pages ++do not require TLB flushes; clean pages do not require writeback. ++These facts are only helpful when other conditions, e.g., access ++recency, are similar. With generations as a common frame of reference, ++additional factors stand out. But obvious choices might not be good ++choices; thus self-correction is necessary. ++ ++The benefits of simple self-correcting heuristics are self-evident. ++Again, with generations as a common frame of reference, this becomes ++attainable. Specifically, pages in the same generation can be ++categorized based on additional factors, and a feedback loop can ++statistically compare the refault percentages across those categories ++and infer which of them are better choices. ++ ++Assumptions ++----------- ++The protection of hot pages and the selection of cold pages are based ++on page access channels and patterns. There are two access channels: ++ ++* Accesses through page tables ++* Accesses through file descriptors ++ ++The protection of the former channel is by design stronger because: ++ ++1. The uncertainty in determining the access patterns of the former ++ channel is higher due to the approximation of the accessed bit. ++2. The cost of evicting the former channel is higher due to the TLB ++ flushes required and the likelihood of encountering the dirty bit. ++3. The penalty of underprotecting the former channel is higher because ++ applications usually do not prepare themselves for major page ++ faults like they do for blocked I/O. E.g., GUI applications ++ commonly use dedicated I/O threads to avoid blocking rendering ++ threads. ++ ++There are also two access patterns: ++ ++* Accesses exhibiting temporal locality ++* Accesses not exhibiting temporal locality ++ ++For the reasons listed above, the former channel is assumed to follow ++the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is ++present, and the latter channel is assumed to follow the latter ++pattern unless outlying refaults have been observed. ++ ++Workflow overview ++================= ++Evictable pages are divided into multiple generations for each ++``lruvec``. The youngest generation number is stored in ++``lrugen->max_seq`` for both anon and file types as they are aged on ++an equal footing. The oldest generation numbers are stored in ++``lrugen->min_seq[]`` separately for anon and file types as clean file ++pages can be evicted regardless of swap constraints. These three ++variables are monotonically increasing. ++ ++Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` ++bits in order to fit into the gen counter in ``folio->flags``. Each ++truncated generation number is an index to ``lrugen->lists[]``. The ++sliding window technique is used to track at least ``MIN_NR_GENS`` and ++at most ``MAX_NR_GENS`` generations. The gen counter stores a value ++within ``[1, MAX_NR_GENS]`` while a page is on one of ++``lrugen->lists[]``; otherwise it stores zero. ++ ++Each generation is divided into multiple tiers. A page accessed ``N`` ++times through file descriptors is in tier ``order_base_2(N)``. Unlike ++generations, tiers do not have dedicated ``lrugen->lists[]``. 
In ++contrast to moving across generations, which requires the LRU lock, ++moving across tiers only involves atomic operations on ++``folio->flags`` and therefore has a negligible cost. A feedback loop ++modeled after the PID controller monitors refaults over all the tiers ++from anon and file types and decides which tiers from which types to ++evict or protect. ++ ++There are two conceptually independent procedures: the aging and the ++eviction. They form a closed-loop system, i.e., the page reclaim. ++ ++Aging ++----- ++The aging produces young generations. Given an ``lruvec``, it ++increments ``max_seq`` when ``max_seq-min_seq+1`` approaches ++``MIN_NR_GENS``. The aging promotes hot pages to the youngest ++generation when it finds them accessed through page tables; the ++demotion of cold pages happens consequently when it increments ++``max_seq``. The aging uses page table walks and rmap walks to find ++young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list`` ++and calls ``walk_page_range()`` with each ``mm_struct`` on this list ++to scan PTEs, and after each iteration, it increments ``max_seq``. For ++the latter, when the eviction walks the rmap and finds a young PTE, ++the aging scans the adjacent PTEs. For both, on finding a young PTE, ++the aging clears the accessed bit and updates the gen counter of the ++page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. ++ ++Eviction ++-------- ++The eviction consumes old generations. Given an ``lruvec``, it ++increments ``min_seq`` when ``lrugen->lists[]`` indexed by ++``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to ++evict from, it first compares ``min_seq[]`` to select the older type. ++If both types are equally old, it selects the one whose first tier has ++a lower refault percentage. The first tier contains single-use ++unmapped clean pages, which are the best bet. The eviction sorts a ++page according to its gen counter if the aging has found this page ++accessed through page tables and updated its gen counter. It also ++moves a page to the next generation, i.e., ``min_seq+1``, if this page ++was accessed multiple times through file descriptors and the feedback ++loop has detected outlying refaults from the tier this page is in. To ++this end, the feedback loop uses the first tier as the baseline, for ++the reason stated earlier. ++ ++Summary ++------- ++The multi-gen LRU can be disassembled into the following parts: ++ ++* Generations ++* Rmap walks ++* Page table walks ++* Bloom filters ++* PID controller ++ ++The aging and the eviction form a producer-consumer model; ++specifically, the latter drives the former by the sliding window over ++generations. Within the aging, rmap walks drive page table walks by ++inserting hot densely populated page tables to the Bloom filters. ++Within the eviction, the PID controller uses refaults as the feedback ++to select types to evict and tiers to protect. diff --git a/sys-kernel/pinephone-sources/files/pp-keyboard.patch b/sys-kernel/pinephone-sources/files/pp-keyboard.patch new file mode 100644 index 0000000..a8e818e --- /dev/null +++ b/sys-kernel/pinephone-sources/files/pp-keyboard.patch @@ -0,0 +1,176 @@ +From d1d849cae12db71aa81ceedaedc1b17a34790367 Mon Sep 17 00:00:00 2001 +From: Samuel Holland +Date: Sat, 19 Jun 2021 18:36:05 -0500 +Subject: [PATCH] Input: kb151 - Add a driver for the KB151 keyboard + +This keyboard is found in the official Pine64 PinePhone keyboard case. +It is connected over I2C and runs a libre firmware. 
+ +Signed-off-by: Samuel Holland +--- + .../dts/allwinner/sun50i-a64-pinephone.dtsi | 64 +++++ + drivers/input/keyboard/Kconfig | 10 + + drivers/input/keyboard/Makefile | 1 + + drivers/input/keyboard/kb151.c | 246 ++++++++++++++++++ + 4 files changed, 321 insertions(+) + create mode 100644 drivers/input/keyboard/kb151.c + +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +index 4ede9fe66020c..0bdc6eceec609 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +@@ -551,6 +551,70 @@ + /* Connected to pogo pins (external spring based pinheader for user addons) */ + &i2c2 { + status = "okay"; ++ ++ keyboard@15 { ++ compatible = "pine64,kb151"; ++ reg = <0x15>; ++ interrupt-parent = <&r_pio>; ++ interrupts = <0 12 IRQ_TYPE_EDGE_FALLING>; /* PL12 */ ++ keypad,num-rows = <6>; ++ keypad,num-columns = <12>; ++ linux,keymap = ; ++ wakeup-source; ++ }; + }; + + &i2s2 { +diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig +index 40a070a2e7f5b..0259e9133f469 100644 +--- a/drivers/input/keyboard/Kconfig ++++ b/drivers/input/keyboard/Kconfig +@@ -353,6 +353,16 @@ config KEYBOARD_HP7XX + To compile this driver as a module, choose M here: the + module will be called jornada720_kbd. + ++config KEYBOARD_KB151 ++ tristate "Pine64 KB151 Keyboard" ++ depends on I2C ++ select CRC8 ++ select INPUT_MATRIXKMAP ++ help ++ Say Y here to enable support for the KB151 keyboard used in the ++ Pine64 PinePhone keyboard case. This driver supports the FLOSS ++ firmware available at https://megous.com/git/pinephone-keyboard/ ++ + config KEYBOARD_LM8323 + tristate "LM8323 keypad chip" + depends on I2C +From 2423aac2d6f5db55da99e11fd799ee66fe6f54c6 Mon Sep 17 00:00:00 2001 +From: Samuel Holland +Date: Mon, 9 Aug 2021 19:30:18 -0500 +Subject: [PATCH] Input: kb151 - Add support for the FN layer + +Signed-off-by: Samuel Holland +--- + .../dts/allwinner/sun50i-a64-pinephone.dtsi | 34 +++++++++++++++++-- + drivers/input/keyboard/kb151.c | 33 ++++++++++-------- + 2 files changed, 51 insertions(+), 16 deletions(-) + +diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +index 0bdc6eceec609..68f5730cf164c 100644 +--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi ++++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinephone.dtsi +@@ -557,7 +557,7 @@ + reg = <0x15>; + interrupt-parent = <&r_pio>; + interrupts = <0 12 IRQ_TYPE_EDGE_FALLING>; /* PL12 */ +- keypad,num-rows = <6>; ++ keypad,num-rows = <12>; + keypad,num-columns = <12>; + linux,keymap = ; ++ MATRIX_KEY(5, 5, KEY_RIGHTALT) ++ ++ /* FN layer */ ++ MATRIX_KEY(6, 1, KEY_BACKSLASH) ++ MATRIX_KEY(6, 2, KEY_BACKSLASH) ++ MATRIX_KEY(6, 3, KEY_DOLLAR) ++ MATRIX_KEY(6, 4, KEY_EURO) ++ MATRIX_KEY(6, 5, KEY_GRAVE) ++ MATRIX_KEY(6, 6, KEY_GRAVE) ++ MATRIX_KEY(6, 7, KEY_MINUS) ++ MATRIX_KEY(6, 8, KEY_EQUAL) ++ MATRIX_KEY(6, 9, KEY_MINUS) ++ MATRIX_KEY(6, 10, KEY_EQUAL) ++ MATRIX_KEY(6, 11, KEY_DELETE) ++ ++ MATRIX_KEY(8, 0, KEY_SYSRQ) ++ MATRIX_KEY(8, 10, KEY_INSERT) ++ ++ MATRIX_KEY(9, 0, KEY_LEFTSHIFT) ++ MATRIX_KEY(9, 8, KEY_HOME) ++ MATRIX_KEY(9, 9, KEY_UP) ++ MATRIX_KEY(9, 10, KEY_END) ++ ++ MATRIX_KEY(10, 1, KEY_LEFTCTRL) ++ MATRIX_KEY(10, 6, KEY_LEFT) ++ MATRIX_KEY(10, 8, KEY_RIGHT) ++ MATRIX_KEY(10, 9, KEY_DOWN) ++ ++ MATRIX_KEY(11, 2, KEY_FN) ++ MATRIX_KEY(11, 3, KEY_LEFTALT) ++ MATRIX_KEY(11, 5, KEY_RIGHTALT)>; + 
wakeup-source; + }; + }; diff --git a/sys-kernel/pinephone-sources/pinephone-sources-5.18.6.ebuild b/sys-kernel/pinephone-sources/pinephone-sources-5.18.6.ebuild deleted file mode 100755 index 4f19ba3..0000000 --- a/sys-kernel/pinephone-sources/pinephone-sources-5.18.6.ebuild +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 1999-2022 Gentoo Authors -# Distributed under the terms of the GNU General Public License v2 - - -EAPI="8" -ETYPE="sources" -K_NOUSENAME="yes" -K_NOSETEXTRAVERSION="yes" -K_SECURITY_UNSUPPORTED="1" -K_WANT_GENPATCHES="base extras experimental" -K_GENPATCHES_VER="9" - -inherit kernel-2 -detect_version -detect_arch - -KEYWORDS="~arm64" - -DEPEND="${RDEPEND} - >=sys-devel/patch-2.7.5" - -DESCRIPTION="Full sources for the Linux kernel with gentoo patchset and patches for the PinePhone" - -MEGI_TAG="orange-pi-5.18-20220615-1100" -SRC_URI="https://github.com/megous/linux/archive/${MEGI_TAG}.tar.gz ${GENPATCHES_URI}" - -PATCHES=( - # Drop Megi's Modem-Power - "${FILESDIR}"/0101-arm64-dts-pinephone-drop-modem-power-node.patch - "${FILESDIR}"/0102-arm64-dts-pinephone-pro-remove-modem-node.patch - # Reparent clocks to lower speed-occillator - "${FILESDIR}"/0103-ccu-sun50i-a64-reparent-clocks-to-lower-speed-oscillator.patch - # Quirk for Kernel-Bug 210681 - "${FILESDIR}"/0104-quirk-kernel-org-bug-210681-firmware_rome_error.patch - # LED patches - "${FILESDIR}"/0105-leds-gpio-make-max_brightness-configurable.patch - "${FILESDIR}"/0106-panic-led.patch - # Bootsplash - "${FILESDIR}"/0201-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch - "${FILESDIR}"/0202-revert-fbcon-remove-no-op-fbcon_set_origin.patch - "${FILESDIR}"/0203-revert-fbcon-remove-soft-scrollback-code.patch -) - -src_prepare() { - default - eapply_user -} - -pkg_postinst() { - kernel-2_pkg_postinst - einfo "To build and install the kernel use the following commands:" - einfo "# make Image modules" - einfo "# make DTC_FLAGS="-@" dtbs" - einfo "# cp arch/arm64/boot/Image /boot" - einfo "# make INSTALL_MOD_PATH=/usr modules_install" - einfo "# make INSTALL_DTBS_PATH=/boot/dtbs dtbs_install" - einfo "You will need to create and initramfs afterwards." 
-	einfo "If you use dracut you can run:"
-	einfo "# dracut -m \"rootfs-block base\" --host-only --kver 5.18.3-pinehone-gentoo-arm64"
-	einfo "Change 5.18.2-pinehone-gentoo-arm64 to your kernel version installed in /lib/modules"
-}
-
-pkg_postrm() {
-	kernel-2_pkg_postrm
-}
-
diff --git a/sys-kernel/pinephone-sources/pinephone-sources-5.19.0.ebuild b/sys-kernel/pinephone-sources/pinephone-sources-5.19.0.ebuild
new file mode 100644
index 0000000..1adb8bc
--- /dev/null
+++ b/sys-kernel/pinephone-sources/pinephone-sources-5.19.0.ebuild
@@ -0,0 +1,81 @@
+# Copyright 1999-2022 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI="8"
+K_NOUSENAME="yes"
+K_NOSETEXTRAVERSION="yes"
+K_SECURITY_UNSUPPORTED="1"
+K_GENPATCHES_VER="1"
+ETYPE="sources"
+inherit kernel-2
+detect_version
+
+KEYWORDS="~arm64"
+
+DEPEND="${RDEPEND}
+	>=sys-devel/patch-2.7.5"
+
+DESCRIPTION="Full sources for the Linux kernel, with megi's PinePhone patches and the Gentoo patchset"
+
+MEGI_TAG="orange-pi-5.19-20220802-0940"
+SRC_URI="https://github.com/megous/linux/archive/${MEGI_TAG}.tar.gz"
+
+PATCHES=(
+	# Gentoo patches
+	"${FILESDIR}"/1500_XATTR_USER_PREFIX.patch
+	"${FILESDIR}"/1510_fs-enable-link-security-restrictions-by-default.patch
+	"${FILESDIR}"/1700_sparc-address-warray-bound-warnings.patch
+	"${FILESDIR}"/2000_BT-Check-key-sizes-only-if-Secure-Simple-Pairing-enabled.patch
+	"${FILESDIR}"/2900_tmp513-Fix-build-issue-by-selecting-CONFIG_REG.patch
+	"${FILESDIR}"/2920_sign-file-patch-for-libressl.patch
+	"${FILESDIR}"/3000_Support-printing-firmware-info.patch
+	"${FILESDIR}"/4567_distro-Gentoo-Kconfig.patch
+	"${FILESDIR}"/5010_enable-cpu-optimizations-universal.patch
+	"${FILESDIR}"/5020_BMQ-and-PDS-io-scheduler-v5.19-r0.patch
+	"${FILESDIR}"/5021_BMQ-and-PDS-gentoo-defaults.patch
+
+	# PinePhone patches
+	"${FILESDIR}"/0101-arm64-dts-pinephone-drop-modem-power-node.patch
+	"${FILESDIR}"/0102-arm64-dts-pinephone-pro-remove-modem-node.patch
+	"${FILESDIR}"/0103-ccu-sun50i-a64-reparent-clocks-to-lower-speed-oscillator.patch
+	"${FILESDIR}"/0104-quirk-kernel-org-bug-210681-firmware_rome_error.patch
+	"${FILESDIR}"/0105-leds-gpio-make-max_brightness-configurable.patch
+	"${FILESDIR}"/0106-panic-led.patch
+
+	# Keyboard
+	"${FILESDIR}"/pp-keyboard.patch
+
+	# Multi-Gen LRU
+	"${FILESDIR}"/Multi-Gen-LRU-Framework.patch
+)
+
+S="${WORKDIR}/linux-${MEGI_TAG}"
+
+src_unpack() {
+	default
+}
+
+src_prepare() {
+	default
+	eapply_user
+}
+
+pkg_postinst() {
+	kernel-2_pkg_postinst
+	einfo "For more info on this patchset and how to report problems, see:"
+	einfo "${HOMEPAGE}"
+	einfo "To build and install the kernel, use the following commands:"
+	einfo "make Image Image.gz modules"
+	einfo "make DTC_FLAGS=\"-@\" dtbs"
+	einfo "make install; make modules_install; make dtbs_install"
+	einfo "If you use the kernel config shipped with this ebuild, also copy dracut-pp.conf to /etc/dracut.conf.d/"
+	einfo "so that the proper kernel modules are loaded into the initramfs."
+	einfo "To cross-compile the PinePhone kernel on an amd64 host, follow https://wiki.gentoo.org/wiki/Cross_build_environment"
+	einfo "to set up a cross toolchain, then create an xmake wrapper like the following and replace make with xmake in the commands above:"
+	einfo "#!/bin/sh"
+	einfo 'exec make ARCH=arm64 CROSS_COMPILE=aarch64-unknown-linux-gnu- INSTALL_MOD_PATH="${SYSROOT}" "$@"'
+}
+
+pkg_postrm() {
+	kernel-2_pkg_postrm
+}
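
A minimal sketch of the xmake wrapper that the pkg_postinst message above suggests, shown here only as an illustration rather than as part of the ebuild: the /usr/local/bin/xmake path is an arbitrary choice, CROSS_COMPILE assumes a crossdev-built aarch64-unknown-linux-gnu toolchain, and SYSROOT is assumed to be exported by the user when installing into a mounted PinePhone root filesystem.

#!/bin/sh
# Hypothetical cross-compile wrapper, saved e.g. as /usr/local/bin/xmake and made executable.
# It forwards any target (Image, Image.gz, modules, dtbs, modules_install, dtbs_install, ...)
# to a cross make for the PinePhone kernel.
# CROSS_COMPILE assumes a crossdev aarch64-unknown-linux-gnu toolchain; adjust to your tuple.
# INSTALL_MOD_PATH picks up ${SYSROOT}, so "xmake modules_install" lands in the target root
# when SYSROOT is set, and behaves like plain modules_install when it is empty.
exec make ARCH=arm64 \
	CROSS_COMPILE=aarch64-unknown-linux-gnu- \
	INSTALL_MOD_PATH="${SYSROOT}" \
	"$@"

With such a wrapper in PATH, the commands printed by pkg_postinst can be run as, for example, xmake Image Image.gz modules followed by xmake dtbs and the corresponding install targets, mirroring the native build steps on an amd64 host.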