From d2a2a4714d80d09b0f8eb6438ab4224690b7121e Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Thu, 8 Aug 2024 11:39:08 -0700
Subject: [PATCH 01/70] exec: Fix ToCToU between perm check and set-uid/gid usage

commit f50733b45d865f91db90919f8311e2127ce5a0cb upstream.

When opening a file for exec via do_filp_open(), permission checking is
done against the file's metadata at that moment, and on success, a file
pointer is passed back. Much later in the execve() code path, the file
metadata (specifically mode, uid, and gid) is used to determine if/how
to set the uid and gid. However, those values may have changed since the
permissions check, meaning the execution may gain unintended privileges.

For example, if a file could change permissions from executable and not
set-id:

---------x 1 root root 16048 Aug 7 13:16 target

to set-id and non-executable:

---S------ 1 root root 16048 Aug 7 13:16 target

it is possible to gain root privileges when execution should have been
disallowed.

While this race condition is rare in real-world scenarios, it has been
observed (and proven exploitable) when package managers are updating the
setuid bits of installed programs. Such files start out world-executable
but are then adjusted to be group-exec with a set-uid bit. For example,
"chmod o-x,u+s target" makes "target" executable only by uid "root" and
gid "cdrom", while also becoming setuid-root:

-rwxr-xr-x 1 root cdrom 16048 Aug 7 13:16 target

becomes:

-rwsr-xr-- 1 root cdrom 16048 Aug 7 13:16 target

But racing the chmod means users without group "cdrom" membership can
get the permission to execute "target" just before the chmod, and when
the chmod finishes, the exec reaches bprm_fill_uid() and performs the
setuid to root, violating the expressed authorization of "only cdrom
group members can setuid to root".

Re-check that we still have execute permissions in case the metadata
has changed. It would be better to keep a copy from the perm-check time,
but until we can do that refactoring, the least-bad option is to do a
full inode_permission() call (under inode lock). It is understood that
this is safe against deadlocks, but hardly optimal.

Reported-by: Marco Vanotti
Tested-by: Marco Vanotti
Suggested-by: Linus Torvalds
Cc: stable@vger.kernel.org
Cc: Eric Biederman
Cc: Alexander Viro
Cc: Christian Brauner
Signed-off-by: Kees Cook
Signed-off-by: Greg Kroah-Hartman
---
 fs/exec.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/exec.c b/fs/exec.c
index 89a9017af7e86f..1cbbef281f8cfe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1609,6 +1609,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
 	unsigned int mode;
 	vfsuid_t vfsuid;
 	vfsgid_t vfsgid;
+	int err;

 	if (!mnt_may_suid(file->f_path.mnt))
 		return;
@@ -1625,12 +1626,17 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)

 	/* Be careful if suid/sgid is set */
 	inode_lock(inode);

-	/* reload atomically mode/uid/gid now that lock held */
+	/* Atomically reload and check mode/uid/gid now that lock held. */
 	mode = inode->i_mode;
 	vfsuid = i_uid_into_vfsuid(idmap, inode);
 	vfsgid = i_gid_into_vfsgid(idmap, inode);
+	err = inode_permission(idmap, inode, MAY_EXEC);
 	inode_unlock(inode);

+	/* Did the exec bit vanish out from under us? Give up. 
*/
+	if (err)
+		return;
+
 	/* We ignore suid/sgid if there are no mappings for them in the ns */
 	if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
 	    !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid))

From bd865c7690020fb18e3f604f1b62706cfae289af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?=
Date: Mon, 3 Jun 2024 12:28:18 +0200
Subject: [PATCH 02/70] ASoC: topology: Clean up route loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit e0e7bc2cbee93778c4ad7d9a792d425ffb5af6f7 upstream.

Instead of using the very long macro name, assign it to a shorter
variable and use that instead. While doing so, we can reduce multiple
if checks using this define to one.

Reviewed-by: Cezary Rojewski
Signed-off-by: Amadeusz Sławiński
Link: https://lore.kernel.org/r/20240603102818.36165-5-amadeuszx.slawinski@linux.intel.com
Signed-off-by: Mark Brown
Signed-off-by: Greg Kroah-Hartman
---
 sound/soc/soc-topology.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index 8b58a7864703ee..e7a2426dd74439 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -1021,6 +1021,7 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg,
 			       struct snd_soc_tplg_hdr *hdr)
 {
 	struct snd_soc_dapm_context *dapm = &tplg->comp->dapm;
+	const size_t maxlen = SNDRV_CTL_ELEM_ID_NAME_MAXLEN;
 	struct snd_soc_tplg_dapm_graph_elem *elem;
 	struct snd_soc_dapm_route *route;
 	int count, i;
@@ -1044,38 +1045,27 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg,
 		tplg->pos += sizeof(struct snd_soc_tplg_dapm_graph_elem);

 		/* validate routes */
-		if (strnlen(elem->source, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) ==
-			    SNDRV_CTL_ELEM_ID_NAME_MAXLEN) {
-			ret = -EINVAL;
-			break;
-		}
-		if (strnlen(elem->sink, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) ==
-			    SNDRV_CTL_ELEM_ID_NAME_MAXLEN) {
-			ret = -EINVAL;
-			break;
-		}
-		if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) ==
-			    SNDRV_CTL_ELEM_ID_NAME_MAXLEN) {
+		if ((strnlen(elem->source, maxlen) == maxlen) ||
+		    (strnlen(elem->sink, maxlen) == maxlen) ||
+		    (strnlen(elem->control, maxlen) == maxlen)) {
 			ret = -EINVAL;
 			break;
 		}

 		route->source = devm_kmemdup(tplg->dev, elem->source,
-					     min(strlen(elem->source),
-						 SNDRV_CTL_ELEM_ID_NAME_MAXLEN),
+					     min(strlen(elem->source), maxlen),
 					     GFP_KERNEL);
 		route->sink = devm_kmemdup(tplg->dev, elem->sink,
-					   min(strlen(elem->sink), SNDRV_CTL_ELEM_ID_NAME_MAXLEN),
+					   min(strlen(elem->sink), maxlen),
 					   GFP_KERNEL);
 		if (!route->source || !route->sink) {
 			ret = -ENOMEM;
 			break;
 		}

-		if (strnlen(elem->control, SNDRV_CTL_ELEM_ID_NAME_MAXLEN) != 0) {
+		if (strnlen(elem->control, maxlen) != 0) {
 			route->control = devm_kmemdup(tplg->dev, elem->control,
-						      min(strlen(elem->control),
-							  SNDRV_CTL_ELEM_ID_NAME_MAXLEN),
+						      min(strlen(elem->control), maxlen),
 						      GFP_KERNEL);
 			if (!route->control) {
 				ret = -ENOMEM;

From 7c3e55d8b4630fe7d34d96bc84e67f341d9de369 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?=
Date: Thu, 13 Jun 2024 11:01:26 +0200
Subject: [PATCH 03/70] ASoC: topology: Fix route memory corruption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 0298f51652be47b79780833e0b63194e1231fa34 upstream.

It was reported that the recent fix for memory corruption during
topology load causes corruption in other cases. Instead of being
overeager with checking the topology, assume that it is properly
formatted and just duplicate the strings.
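As an illustrative sketch of the failure mode (names such as 'dev' and
'src' are stand-ins): duplicating with kmemdup() and a length of
strlen() copies the characters of a string but not its terminating NUL,
while kstrdup() copies the terminator as well:

	/* Sketch only: assumes elem->source is NUL-terminated, which
	 * the strnlen() validation above guarantees. */
	src = devm_kmemdup(dev, elem->source, strlen(elem->source), GFP_KERNEL);
	/* 'src' lacks a trailing '\0': treating it as a C string reads
	 * past the end of the allocation. */
	src = devm_kstrdup(dev, elem->source, GFP_KERNEL);
	/* 'src' is a properly terminated copy of strlen() + 1 bytes. */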
Reported-by: Pierre-Louis Bossart
Closes: https://lore.kernel.org/linux-sound/171812236450.201359.3019210915105428447.b4-ty@kernel.org/T/#m8c4bd5abf453960fde6f826c4b7f84881da63e9d
Suggested-by: Péter Ujfalusi
Signed-off-by: Amadeusz Sławiński
Link: https://lore.kernel.org/r/20240613090126.841189-1-amadeuszx.slawinski@linux.intel.com
Signed-off-by: Mark Brown
Signed-off-by: Greg Kroah-Hartman
---
 sound/soc/soc-topology.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index e7a2426dd74439..7e8fca0b066280 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -1052,21 +1052,15 @@ static int soc_tplg_dapm_graph_elems_load(struct soc_tplg *tplg,
 			break;
 		}

-		route->source = devm_kmemdup(tplg->dev, elem->source,
-					     min(strlen(elem->source), maxlen),
-					     GFP_KERNEL);
-		route->sink = devm_kmemdup(tplg->dev, elem->sink,
-					   min(strlen(elem->sink), maxlen),
-					   GFP_KERNEL);
+		route->source = devm_kstrdup(tplg->dev, elem->source, GFP_KERNEL);
+		route->sink = devm_kstrdup(tplg->dev, elem->sink, GFP_KERNEL);
 		if (!route->source || !route->sink) {
 			ret = -ENOMEM;
 			break;
 		}

 		if (strnlen(elem->control, maxlen) != 0) {
-			route->control = devm_kmemdup(tplg->dev, elem->control,
-						      min(strlen(elem->control), maxlen),
-						      GFP_KERNEL);
+			route->control = devm_kstrdup(tplg->dev, elem->control, GFP_KERNEL);
 			if (!route->control) {
 				ret = -ENOMEM;
 				break;

From 9d91b004df9a6aa2487661e52a398fd808184955 Mon Sep 17 00:00:00 2001
From: Huacai Chen
Date: Sat, 20 Jul 2024 22:40:58 +0800
Subject: [PATCH 04/70] LoongArch: Define __ARCH_WANT_NEW_STAT in unistd.h

commit 7697a0fe0154468f5df35c23ebd7aa48994c2cdc upstream.

The Chromium sandbox apparently wants to deny statx [1] so it can
properly inspect arguments after the sandboxed process later falls back
to fstat. Because there is currently no "fd-only" version of statx, the
sandbox has no way to ensure the path argument is empty without being
able to peek into the sandboxed process's memory.

For architectures able to do newfstatat, though, glibc falls back to
newfstatat after getting -ENOSYS for statx, and the respective SIGSYS
handler [2] then takes care of inspecting the path argument,
transforming allowed newfstatat calls into fstat, which is allowed and
has the same type of return value.

But, as LoongArch is the first architecture to have neither fstat nor
newfstatat, the LoongArch glibc does not attempt any fallback at all
when it gets -ENOSYS for statx -- and there lies the problem.

Actually, back when the LoongArch port was under review, people were
aware of the same problem with sandboxing clone3 [3], so clone was
eventually kept. Unfortunately it seemed at that time no one had
noticed statx, so besides restoring fstat/newfstatat to the LoongArch
uapi (and postponing the problem further), it seems inevitable that we
will need to tackle seccomp deep argument inspection. However, this is
obviously a decision that shouldn't be taken lightly, so we just
restore fstat/newfstatat by defining __ARCH_WANT_NEW_STAT in unistd.h.
This is the simplest solution for now, and we hope the community will
tackle the long-standing problem of seccomp deep argument inspection in
the future [4][5].

Also add "newstat" to syscall_abis_64 in Makefile.syscalls due to
upstream asm-generic changes. For more information, please read this
thread [6].
[1] https://chromium-review.googlesource.com/c/chromium/src/+/2823150 [2] https://chromium.googlesource.com/chromium/src/sandbox/+/c085b51940bd/linux/seccomp-bpf-helpers/sigsys_handlers.cc#355 [3] https://lore.kernel.org/linux-arch/20220511211231.GG7074@brightrain.aerifal.cx/ [4] https://lwn.net/Articles/799557/ [5] https://lpc.events/event/4/contributions/560/attachments/397/640/deep-arg-inspection.pdf [6] https://lore.kernel.org/loongarch/20240226-granit-seilschaft-eccc2433014d@brauner/T/#t Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen Signed-off-by: Greg Kroah-Hartman --- arch/loongarch/include/uapi/asm/unistd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/include/uapi/asm/unistd.h b/arch/loongarch/include/uapi/asm/unistd.h index fcb668984f0336..b344b1f917153b 100644 --- a/arch/loongarch/include/uapi/asm/unistd.h +++ b/arch/loongarch/include/uapi/asm/unistd.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE3 From 5bc2b8f225e9a7470100719121107d4a56c178fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Aug 2024 18:35:53 -0400 Subject: [PATCH 05/70] NFSD: Rewrite synopsis of nfsd_percpu_counters_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5ec39944f874e1ecc09f624a70dfaa8ac3bf9d08 ] In function ‘export_stats_init’, inlined from ‘svc_export_alloc’ at fs/nfsd/export.c:866:6: fs/nfsd/export.c:337:16: warning: ‘nfsd_percpu_counters_init’ accessing 40 bytes in a region of size 0 [-Wstringop-overflow=] 337 | return nfsd_percpu_counters_init(&stats->counter, EXP_STATS_COUNTERS_NUM); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/nfsd/export.c:337:16: note: referencing argument 1 of type ‘struct percpu_counter[0]’ fs/nfsd/stats.h: In function ‘svc_export_alloc’: fs/nfsd/stats.h:40:5: note: in a call to function ‘nfsd_percpu_counters_init’ 40 | int nfsd_percpu_counters_init(struct percpu_counter counters[], int num); | ^~~~~~~~~~~~~~~~~~~~~~~~~ Cc: Amir Goldstein Reviewed-by: Jeff Layton Stable-dep-of: 93483ac5fec6 ("nfsd: expose /proc/net/sunrpc/nfsd in net namespaces") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/stats.c | 2 +- fs/nfsd/stats.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 63797635e1c328..12c299a05433b7 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -76,7 +76,7 @@ static int nfsd_show(struct seq_file *seq, void *v) DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); -int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) +int nfsd_percpu_counters_init(struct percpu_counter *counters, int num) { int i, err = 0; diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index cf5524e7ca0623..a3e9e2f47ec42e 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -37,9 +37,9 @@ extern struct nfsd_stats nfsdstats; extern struct svc_stat nfsd_svcstats; -int nfsd_percpu_counters_init(struct percpu_counter counters[], int num); -void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num); -void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num); +int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); +void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); +void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); int nfsd_stat_init(void); void 
nfsd_stat_shutdown(void); From 9b31d561f47573c9940ef4405d33a5fc1c214acf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Aug 2024 18:35:54 -0400 Subject: [PATCH 06/70] NFSD: Fix frame size warning in svc_export_parse() [ Upstream commit 6939ace1f22681fface7841cdbf34d3204cc94b5 ] fs/nfsd/export.c: In function 'svc_export_parse': fs/nfsd/export.c:737:1: warning: the frame size of 1040 bytes is larger than 1024 bytes [-Wframe-larger-than=] 737 | } On my systems, svc_export_parse() has a stack frame of over 800 bytes, not 1040, but nonetheless, it could do with some reduction. When a struct svc_export is on the stack, it's a temporary structure used as an argument, and not visible as an actual exported FS. No need to reserve space for export_stats in such cases. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310012359.YEw5IrK6-lkp@intel.com/ Cc: Amir Goldstein Reviewed-by: Jeff Layton Stable-dep-of: 4b14885411f7 ("nfsd: make all of the nfsd stats per-network namespace") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/export.c | 32 +++++++++++++++++++++++--------- fs/nfsd/export.h | 4 ++-- fs/nfsd/stats.h | 12 ++++++------ 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 11a0eaa2f91407..b7da17e530077e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -339,12 +339,16 @@ static int export_stats_init(struct export_stats *stats) static void export_stats_reset(struct export_stats *stats) { - nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); + if (stats) + nfsd_percpu_counters_reset(stats->counter, + EXP_STATS_COUNTERS_NUM); } static void export_stats_destroy(struct export_stats *stats) { - nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); + if (stats) + nfsd_percpu_counters_destroy(stats->counter, + EXP_STATS_COUNTERS_NUM); } static void svc_export_put(struct kref *ref) @@ -353,7 +357,8 @@ static void svc_export_put(struct kref *ref) path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); - export_stats_destroy(&exp->ex_stats); + export_stats_destroy(exp->ex_stats); + kfree(exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } @@ -767,13 +772,15 @@ static int svc_export_show(struct seq_file *m, seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); if (export_stats) { - seq_printf(m, "\t%lld\n", exp->ex_stats.start_time); + struct percpu_counter *counter = exp->ex_stats->counter; + + seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); seq_printf(m, "\tfh_stale: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE])); + percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); seq_printf(m, "\tio_read: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ])); + percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); seq_printf(m, "\tio_write: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE])); + percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); seq_putc(m, '\n'); return 0; } @@ -819,7 +826,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; - export_stats_reset(&new->ex_stats); + export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -856,7 +863,14 @@ static struct cache_head *svc_export_alloc(void) if (!i) return 
NULL;

-	if (export_stats_init(&i->ex_stats)) {
+	i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL);
+	if (!i->ex_stats) {
+		kfree(i);
+		return NULL;
+	}
+
+	if (export_stats_init(i->ex_stats)) {
+		kfree(i->ex_stats);
 		kfree(i);
 		return NULL;
 	}
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 2df8ae25aad302..ca9dc230ae3d0b 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -64,10 +64,10 @@ struct svc_export {
 	struct cache_head	h;
 	struct auth_domain *	ex_client;
 	int			ex_flags;
+	int			ex_fsid;
 	struct path		ex_path;
 	kuid_t			ex_anon_uid;
 	kgid_t			ex_anon_gid;
-	int			ex_fsid;
 	unsigned char *		ex_uuid; /* 16 byte fsid */
 	struct nfsd4_fs_locations ex_fslocs;
 	uint32_t		ex_nflavors;
@@ -76,8 +76,8 @@ struct svc_export {
 	struct nfsd4_deviceid_map *ex_devid_map;
 	struct cache_detail	*cd;
 	struct rcu_head		ex_rcu;
-	struct export_stats	ex_stats;
 	unsigned long		ex_xprtsec_modes;
+	struct export_stats	*ex_stats;
 };

 /* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index a3e9e2f47ec42e..14f50c660b619e 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -61,22 +61,22 @@ static inline void nfsd_stats_rc_nocache_inc(void)
 static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
 {
 	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
-	if (exp)
-		percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]);
+	if (exp && exp->ex_stats)
+		percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
 }

 static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
 {
 	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
-	if (exp)
-		percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount);
+	if (exp && exp->ex_stats)
+		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
 }

 static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
 {
 	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
-	if (exp)
-		percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount);
+	if (exp && exp->ex_stats)
+		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
 }

 static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)

From 1257fe22e14ecb2deb12f2df52e118e221c15b4e Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 12 Aug 2024 18:35:55 -0400
Subject: [PATCH 07/70] sunrpc: don't change ->sv_stats if it doesn't exist

[ Upstream commit ab42f4d9a26f1723dcfd6c93fcf768032b2bb5e7 ]

We check for the existence of ->sv_stats elsewhere except in the core
processing code. It appears that only nfsd actually exports these
values anywhere; everybody else just has a write-only copy of sv_stats
in their svc_program.

Add a check for ->sv_stats before every adjustment to allow us to
eliminate the stats struct from all the users who don't report the
stats.
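The resulting guard, repeated at each counter-update site in the diff
below, is a plain NULL check:

	if (serv->sv_stats)
		serv->sv_stats->rpccnt++;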
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/svc.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 691499d1d2315c..03a51c5ec82230 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1377,7 +1377,8 @@ svc_process_common(struct svc_rqst *rqstp) goto err_bad_proc; /* Syntactic check complete */ - serv->sv_stats->rpccnt++; + if (serv->sv_stats) + serv->sv_stats->rpccnt++; trace_svc_process(rqstp, progp->pg_name); aoffset = xdr_stream_pos(xdr); @@ -1429,7 +1430,8 @@ svc_process_common(struct svc_rqst *rqstp) goto close_xprt; err_bad_rpc: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_MISMATCH); /* Only RPCv2 supported */ @@ -1440,7 +1442,8 @@ svc_process_common(struct svc_rqst *rqstp) err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); - serv->sv_stats->rpcbadauth++; + if (serv->sv_stats) + serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of reply status: */ xdr_truncate_encode(xdr, XDR_UNIT * 2); xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); @@ -1450,7 +1453,8 @@ svc_process_common(struct svc_rqst *rqstp) err_bad_prog: dprintk("svc: unknown program %d\n", rqstp->rq_prog); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; @@ -1458,7 +1462,8 @@ svc_process_common(struct svc_rqst *rqstp) svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_mismatch; /* @@ -1472,19 +1477,22 @@ svc_process_common(struct svc_rqst *rqstp) err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; err_garbage_args: svc_printk(rqstp, "failed to decode RPC header\n"); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_garbage_args; goto sendit; err_system_err: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_system_err; goto sendit; } @@ -1536,7 +1544,8 @@ void svc_process(struct svc_rqst *rqstp) out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", be32_to_cpu(*p)); - rqstp->rq_server->sv_stats->rpcbadfmt++; + if (rqstp->rq_server->sv_stats) + rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); } From 032ed4c630cd6ff08494137f9636460c83410d46 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:35:56 -0400 Subject: [PATCH 08/70] nfsd: stop setting ->pg_stats for unused stats [ Upstream commit a2214ed588fb3c5b9824a21cff870482510372bb ] A lot of places are setting a blank svc_stats in ->pg_stats and never utilizing these stats. Remove all of these extra structs as we're not reporting these stats anywhere. 
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/lockd/svc.c | 3 --- fs/nfs/callback.c | 3 --- fs/nfsd/nfssvc.c | 5 ----- 3 files changed, 11 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 6579948070a482..a62331487ebf16 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -712,8 +712,6 @@ static const struct svc_version *nlmsvc_version[] = { #endif }; -static struct svc_stat nlmsvc_stats; - #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ @@ -721,7 +719,6 @@ static struct svc_program nlmsvc_program = { .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ - .pg_stats = &nlmsvc_stats, /* stats table */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 466ebf1d41b2b7..869c88978899c0 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -399,15 +399,12 @@ static const struct svc_version *nfs4_callback_version[] = { [4] = &nfs4_callback_version4, }; -static struct svc_stat nfs4_callback_stats; - static struct svc_program nfs4_callback_program = { .pg_prog = NFS4_CALLBACK, /* RPC service number */ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ .pg_vers = nfs4_callback_version, /* version table */ .pg_name = "NFSv4 callback", /* service name */ .pg_class = "nfs", /* authentication class */ - .pg_stats = &nfs4_callback_stats, .pg_authenticate = nfs_callback_authenticate, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7ef6af908faacb..1c5bdcdcd23955 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -89,7 +89,6 @@ unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, @@ -108,15 +107,11 @@ static struct svc_program nfsd_acl_program = { .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, .pg_authenticate = &svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }; -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { From 465bb0f1f48bcc1b26538e5e33fb462e2733140f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:35:57 -0400 Subject: [PATCH 09/70] sunrpc: pass in the sv_stats struct through svc_create_pooled [ Upstream commit f094323867668d50124886ad884b665de7319537 ] Since only one service actually reports the rpc stats there's not much of a reason to have a pointer to it in the svc_program struct. Adjust the svc_create_pooled function to take the sv_stats as an argument and pass the struct through there as desired instead of getting it from the svc_program->pg_stats. 
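With the new synopsis, a hypothetical service that does not export RPC
statistics could simply pass NULL for the stats argument (my_program,
my_threadfn, and bufsize below are illustrative names, not code from
this patch); nfsd itself passes &nfsd_svcstats, as the diff shows:

	/* Sketch only: a stats-less caller after this change. The
	 * NULL sv_stats is tolerated by the guards added earlier in
	 * this series. */
	serv = svc_create_pooled(&my_program, NULL, bufsize, my_threadfn);
	if (!serv)
		return -ENOMEM;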
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton [ cel: adjusted to apply to v6.6.y ] Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfssvc.c | 3 ++- include/linux/sunrpc/svc.h | 4 +++- net/sunrpc/svc.c | 12 +++++++----- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1c5bdcdcd23955..bd9572ce231015 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -670,7 +670,8 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd); + serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats, + nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index dbf5b21feafe48..1a13ea759af3e1 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -408,7 +408,9 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, void svc_rqst_release_pages(struct svc_rqst *rqstp); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); -struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, +struct svc_serv * svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, + unsigned int bufsize, int (*threadfn)(void *data)); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 03a51c5ec82230..029c49065016ac 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -453,8 +453,8 @@ __svc_init_bc(struct svc_serv *serv) * Create an RPC service */ static struct svc_serv * -__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - int (*threadfn)(void *data)) +__svc_create(struct svc_program *prog, struct svc_stat *stats, + unsigned int bufsize, int npools, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; @@ -466,7 +466,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, serv->sv_name = prog->pg_name; serv->sv_program = prog; kref_init(&serv->sv_refcnt); - serv->sv_stats = prog->pg_stats; + serv->sv_stats = stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; @@ -532,26 +532,28 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, int (*threadfn)(void *data)) { - return __svc_create(prog, bufsize, 1, threadfn); + return __svc_create(prog, NULL, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); /** * svc_create_pooled - Create an RPC service with pooled threads * @prog: the RPC program the new service will handle + * @stats: the stats struct if desired * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. 
*/ struct svc_serv *svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, threadfn); + serv = __svc_create(prog, stats, bufsize, npools, threadfn); if (!serv) goto out_err; return serv; From 791be93cf182568f690dd1e5d9cfac86c4e5bc7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:35:58 -0400 Subject: [PATCH 10/70] sunrpc: remove ->pg_stats from svc_program [ Upstream commit 3f6ef182f144dcc9a4d942f97b6a8ed969f13c95 ] Now that this isn't used anywhere, remove it. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfssvc.c | 1 - include/linux/sunrpc/svc.h | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index bd9572ce231015..acb1c12420055e 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -136,7 +136,6 @@ struct svc_program nfsd_program = { .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ - .pg_stats = &nfsd_svcstats, /* version table */ .pg_authenticate = &svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 1a13ea759af3e1..3d8b215f32d5b0 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -336,7 +336,6 @@ struct svc_program { const struct svc_version **pg_vers; /* version array */ char * pg_name; /* service name */ char * pg_class; /* class name: services sharing authentication */ - struct svc_stat * pg_stats; /* rpc statistics */ enum svc_auth_status (*pg_authenticate)(struct svc_rqst *rqstp); __be32 (*pg_init_request)(struct svc_rqst *, const struct svc_program *, From 2e8076df20f38afc05b98757769a20ffda2ca1c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:35:59 -0400 Subject: [PATCH 11/70] sunrpc: use the struct net as the svc proc private [ Upstream commit 418b9687dece5bd763c09b5c27a801a7e3387be9 ] nfsd is the only thing using this helper, and it doesn't use the private currently. When we switch to per-network namespace stats we will need the struct net * in order to get to the nfsd_net. Use the net as the proc private so we can utilize this when we make the switch over. 
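For reference, a sketch of how that private is consumed once the
counters move into struct nfsd_net (this exact pattern appears in a
later patch in this series):

	static int nfsd_show(struct seq_file *seq, void *v)
	{
		/* The proc entry's private data is now the struct net */
		struct net *net = pde_data(file_inode(seq->file));
		struct nfsd_net *nn = net_generic(net, nfsd_net_id);
		/* ... per-netns counters are then reachable via nn ... */
	}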
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 65fc1297c6dfa4..383860cb1d5b0f 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { - return do_register(net, statp->program->pg_name, statp, proc_ops); + return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); From 5b3a1ecf0790a4c3fa49d6af30b7cf20f3d4a9e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:36:00 -0400 Subject: [PATCH 12/70] nfsd: rename NFSD_NET_* to NFSD_STATS_* [ Upstream commit d98416cc2154053950610bb6880911e3dcbdf8c5 ] We're going to merge the stats all into per network namespace in subsequent patches, rename these nn counters to be consistent with the rest of the stats. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/netns.h | 4 ++-- fs/nfsd/nfscache.c | 4 ++-- fs/nfsd/stats.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ec49b200b79762..c956e60c2356b3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -26,9 +26,9 @@ struct nfsd4_client_tracking_ops; enum { /* cache misses due only to checksum comparison failures */ - NFSD_NET_PAYLOAD_MISSES, + NFSD_STATS_PAYLOAD_MISSES, /* amount of memory (in bytes) currently consumed by the DRC */ - NFSD_NET_DRC_MEM_USAGE, + NFSD_STATS_DRC_MEM_USAGE, NFSD_NET_COUNTERS_NUM }; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 6cd36af2f97e10..b8492e140b8228 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -690,7 +690,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) atomic_read(&nn->num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); seq_printf(m, "mem usage: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); seq_printf(m, "cache hits: %lld\n", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); seq_printf(m, "cache misses: %lld\n", @@ -698,7 +698,7 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "not cached: %lld\n", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); seq_printf(m, "payload misses: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 14f50c660b619e..7ed4325ac69123 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -81,17 +81,17 @@ static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]); } static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount) { - percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + 
percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) { - percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } #ifdef CONFIG_NFSD_V4 From 6f8d6ed3426a17f77628cebfb6a6e2c6f2b2496c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:36:01 -0400 Subject: [PATCH 13/70] nfsd: expose /proc/net/sunrpc/nfsd in net namespaces [ Upstream commit 93483ac5fec62cc1de166051b219d953bb5e4ef4 ] We are running nfsd servers inside of containers with their own network namespace, and we want to monitor these services using the stats found in /proc. However these are not exposed in the proc inside of the container, so we have to bind mount the host /proc into our containers to get at this information. Separate out the stat counters init and the proc registration, and move the proc registration into the pernet operations entry and exit points so that these stats can be exposed inside of network namespaces. This is an intermediate step, this just exposes the global counters in the network namespace. Subsequent patches will move these counters into the per-network namespace container. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsctl.c | 8 +++++--- fs/nfsd/stats.c | 21 ++++++--------------- fs/nfsd/stats.h | 6 ++++-- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index a13e81e450718a..42d0e3c14f0c85 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1532,6 +1532,7 @@ static __net_init int nfsd_net_init(struct net *net) nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); + nfsd_proc_stat_init(net); return 0; @@ -1552,6 +1553,7 @@ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfsd_proc_stat_shutdown(net); nfsd_net_reply_cache_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); @@ -1575,7 +1577,7 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_stat_init(); /* Statistics */ + retval = nfsd_stat_counters_init(); /* Statistics */ if (retval) goto out_free_pnfs; retval = nfsd_drc_slab_create(); @@ -1611,7 +1613,7 @@ static int __init init_nfsd(void) nfsd_lockd_shutdown(); nfsd_drc_slab_free(); out_free_stat: - nfsd_stat_shutdown(); + nfsd_stat_counters_destroy(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: @@ -1628,7 +1630,7 @@ static void __exit exit_nfsd(void) nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); + nfsd_stat_counters_destroy(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 12c299a05433b7..246882eb7f9540 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -108,31 +108,22 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) percpu_counter_destroy(&counters[i]); } -static int nfsd_stat_counters_init(void) +int nfsd_stat_counters_init(void) { return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); } -static void nfsd_stat_counters_destroy(void) +void nfsd_stat_counters_destroy(void) { nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); } -int 
nfsd_stat_init(void) +void nfsd_proc_stat_init(struct net *net) { - int err; - - err = nfsd_stat_counters_init(); - if (err) - return err; - - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); - - return 0; + svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops); } -void nfsd_stat_shutdown(void) +void nfsd_proc_stat_shutdown(struct net *net) { - nfsd_stat_counters_destroy(); - svc_proc_unregister(&init_net, "nfsd"); + svc_proc_unregister(net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 7ed4325ac69123..38811aa7d13e1e 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -40,8 +40,10 @@ extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +int nfsd_stat_counters_init(void); +void nfsd_stat_counters_destroy(void); +void nfsd_proc_stat_init(struct net *net); +void nfsd_proc_stat_shutdown(struct net *net); static inline void nfsd_stats_rc_hits_inc(void) { From b670a59817ece430d26a342f93e03327fb7db3ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:36:02 -0400 Subject: [PATCH 14/70] nfsd: make all of the nfsd stats per-network namespace [ Upstream commit 4b14885411f74b2b0ce0eb2b39d0fffe54e5ca0d ] We have a global set of counters that we modify for all of the nfsd operations, but now that we're exposing these stats across all network namespaces we need to make the stats also be per-network namespace. We already have some caching stats that are per-network namespace, so move these definitions into the same counter and then adjust all the helpers and users of these stats to provide the appropriate nfsd_net struct so that the stats are maintained for the per-network namespace objects. 
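The conversion follows a single pattern throughout the hunks below:
resolve the nfsd_net owning the request's network namespace, then
operate on that namespace's counters, e.g.:

	/* Resolve the per-netns state for this request ... */
	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);

	/* ... and bump the counter owned by that namespace. */
	nfsd_stats_rc_misses_inc(nn);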
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/cache.h | 2 -- fs/nfsd/netns.h | 17 ++++++++++++-- fs/nfsd/nfs4proc.c | 6 ++--- fs/nfsd/nfs4state.c | 3 ++- fs/nfsd/nfscache.c | 36 ++++++------------------------ fs/nfsd/nfsctl.c | 12 +++------- fs/nfsd/nfsfh.c | 3 ++- fs/nfsd/stats.c | 26 ++++++++++++---------- fs/nfsd/stats.h | 54 ++++++++++++++++----------------------------- fs/nfsd/vfs.c | 6 +++-- 10 files changed, 69 insertions(+), 96 deletions(-) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4cbe0434cbb8ce..66a05fefae98ea 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -80,8 +80,6 @@ enum { int nfsd_drc_slab_create(void); void nfsd_drc_slab_free(void); -int nfsd_net_reply_cache_init(struct nfsd_net *nn); -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index c956e60c2356b3..9260248a0790df 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,19 @@ enum { NFSD_STATS_PAYLOAD_MISSES, /* amount of memory (in bytes) currently consumed by the DRC */ NFSD_STATS_DRC_MEM_USAGE, - NFSD_NET_COUNTERS_NUM + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ +#ifdef CONFIG_NFSD_V4 + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ +#endif + NFSD_STATS_COUNTERS_NUM }; /* @@ -169,7 +182,7 @@ struct nfsd_net { atomic_t num_drc_entries; /* Per-netns stats counters */ - struct percpu_counter counter[NFSD_NET_COUNTERS_NUM]; + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 451026f9986b61..ae0057c54ef4ed 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2478,10 +2478,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp) return rpc_success; } -static inline void nfsd4_increment_op_stats(u32 opnum) +static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]); + percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; @@ -2756,7 +2756,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); - nfsd4_increment_op_stats(op->opnum); + nfsd4_increment_op_stats(nn, op->opnum); } fh_put(current_fh); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c7e52d980cd75f..cdad1eaa4a3180 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8422,6 +8422,7 @@ __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) { __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lock *fl; struct 
nfs4_delegation *dp; @@ -8451,7 +8452,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) } break_lease: spin_unlock(&ctx->flc_lock); - nfsd_stats_wdeleg_getattr_inc(); + nfsd_stats_wdeleg_getattr_inc(nn); status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index b8492e140b8228..c52132ecb339d5 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -176,27 +176,6 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } -/** - * nfsd_net_reply_cache_init - per net namespace reply cache set-up - * @nn: nfsd_net being initialized - * - * Returns zero on succes; otherwise a negative errno is returned. - */ -int nfsd_net_reply_cache_init(struct nfsd_net *nn) -{ - return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); -} - -/** - * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down - * @nn: nfsd_net being freed - * - */ -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) -{ - nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); -} - int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -502,7 +481,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { - struct nfsd_net *nn; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; @@ -512,7 +491,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, int rtn = RC_DOIT; if (type == RC_NOCACHE) { - nfsd_stats_rc_nocache_inc(); + nfsd_stats_rc_nocache_inc(nn); goto out; } @@ -522,7 +501,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -540,7 +518,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, freed = nfsd_cacherep_dispose(&dispose); trace_nfsd_drc_gc(nn, freed); - nfsd_stats_rc_misses_inc(); + nfsd_stats_rc_misses_inc(nn); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); goto out; @@ -548,7 +526,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, found_entry: /* We found a matching entry which is either in progress or done. 
*/ nfsd_reply_cache_free_locked(NULL, rp, nn); - nfsd_stats_rc_hits_inc(); + nfsd_stats_rc_hits_inc(nn); rtn = RC_DROPIT; rp = found; @@ -692,11 +670,11 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "mem usage: %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); seq_printf(m, "cache hits: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS])); seq_printf(m, "cache misses: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES])); seq_printf(m, "not cached: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE])); seq_printf(m, "payload misses: %lld\n", percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 42d0e3c14f0c85..d19ddf2fb4fe9b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1524,7 +1524,7 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - retval = nfsd_net_reply_cache_init(nn); + retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; nn->nfsd_versions = NULL; @@ -1554,7 +1554,7 @@ static __net_exit void nfsd_net_exit(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfsd_proc_stat_shutdown(net); - nfsd_net_reply_cache_destroy(nn); + nfsd_stat_counters_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(nn); @@ -1577,12 +1577,9 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_stat_counters_init(); /* Statistics */ - if (retval) - goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) - goto out_free_stat; + goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ retval = create_proc_exports_entry(); if (retval) @@ -1612,8 +1609,6 @@ static int __init init_nfsd(void) out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); -out_free_stat: - nfsd_stat_counters_destroy(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: @@ -1630,7 +1625,6 @@ static void __exit exit_nfsd(void) nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_counters_destroy(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 937be276bb6b48..c2495d98c18928 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -327,6 +327,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -395,7 +396,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) out: trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) - nfsd_stats_fh_stale_inc(exp); + nfsd_stats_fh_stale_inc(nn, exp); return error; } diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 246882eb7f9540..db95c4f34b0e8a 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -34,15 +34,17 @@ struct svc_stat nfsd_svcstats = { static int 
nfsd_show(struct seq_file *seq, void *v) { + struct net *net = pde_data(file_inode(seq->file)); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); @@ -63,10 +65,10 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); } seq_printf(seq, "\nwdeleg_getattr %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif @@ -108,14 +110,14 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) percpu_counter_destroy(&counters[i]); } -int nfsd_stat_counters_init(void) +int nfsd_stat_counters_init(struct nfsd_net *nn) { - return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM); } -void nfsd_stat_counters_destroy(void) +void nfsd_stat_counters_destroy(struct nfsd_net *nn) { - nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM); } void nfsd_proc_stat_init(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 38811aa7d13e1e..c24be4ddbe7d70 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,26 +10,7 @@ #include #include - -enum { - NFSD_STATS_RC_HITS, /* repcache hits */ - NFSD_STATS_RC_MISSES, /* repcache misses */ - NFSD_STATS_RC_NOCACHE, /* uncached reqs */ - NFSD_STATS_FH_STALE, /* FH stale error */ - NFSD_STATS_IO_READ, /* bytes returned to read requests */ - NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ -#ifdef CONFIG_NFSD_V4 - NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ - NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, -#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) - NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ -#endif - NFSD_STATS_COUNTERS_NUM -}; - struct nfsd_stats { - struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - atomic_t th_cnt; /* number of available threads */ }; @@ -40,43 +21,46 @@ extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void 
nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_counters_init(void); -void nfsd_stat_counters_destroy(void); +int nfsd_stat_counters_init(struct nfsd_net *nn); +void nfsd_stat_counters_destroy(struct nfsd_net *nn); void nfsd_proc_stat_init(struct net *net); void nfsd_proc_stat_shutdown(struct net *net); -static inline void nfsd_stats_rc_hits_inc(void) +static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]); } -static inline void nfsd_stats_rc_misses_inc(void) +static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]); } -static inline void nfsd_stats_rc_nocache_inc(void) +static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]); } -static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) +static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn, + struct svc_export *exp) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]); if (exp && exp->ex_stats) percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); } -static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_read_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount); } -static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_write_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount); } @@ -97,9 +81,9 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) } #ifdef CONFIG_NFSD_V4 -static inline void nfsd_stats_wdeleg_getattr_inc(void) +static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); + percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]); } #endif #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d0fdf70ab20d36..1f2a5b22b6498e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -985,7 +985,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsd_stats_io_read_add(fhp->fh_export, host_err); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + nfsd_stats_io_read_add(nn, fhp->fh_export, host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1168,7 +1170,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsd_stats_io_write_add(exp, *cnt); + nfsd_stats_io_write_add(nn, exp, *cnt); fsnotify_modify(file); host_err = 
filemap_check_wb_err(file->f_mapping, since); if (host_err < 0) From 9ae63aab0df8bae904e2467df69f70b7d92de236 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:36:03 -0400 Subject: [PATCH 15/70] nfsd: remove nfsd_stats, make th_cnt a global counter [ Upstream commit e41ee44cc6a473b1f414031782c3b4283d7f3e5f ] This is the last global stat, take it out of the nfsd_stats struct and make it a global part of nfsd, report it the same as always. Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsd.h | 1 + fs/nfsd/nfssvc.c | 5 +++-- fs/nfsd/stats.c | 3 +-- fs/nfsd/stats.h | 6 ------ 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index fe846a360ae18d..d05bd2b811f377 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -69,6 +69,7 @@ extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; extern unsigned long nfsd_drc_mem_used; +extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index acb1c12420055e..e9689db75f6351 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -34,6 +34,7 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC +atomic_t nfsd_th_cnt = ATOMIC_INIT(0); extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) @@ -945,7 +946,7 @@ nfsd(void *vrqstp) current->fs->umask = 0; - atomic_inc(&nfsdstats.th_cnt); + atomic_inc(&nfsd_th_cnt); set_freezable(); @@ -959,7 +960,7 @@ nfsd(void *vrqstp) svc_recv(rqstp); } - atomic_dec(&nfsdstats.th_cnt); + atomic_dec(&nfsd_th_cnt); out: /* Release the thread */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index db95c4f34b0e8a..276bbcd59cdb63 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,7 +27,6 @@ #include "nfsd.h" -struct nfsd_stats nfsdstats; struct svc_stat nfsd_svcstats = { .program = &nfsd_program, }; @@ -47,7 +46,7 @@ static int nfsd_show(struct seq_file *seq, void *v) percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); + seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index c24be4ddbe7d70..5675d283a53730 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,12 +10,6 @@ #include #include -struct nfsd_stats { - atomic_t th_cnt; /* number of available threads */ -}; - -extern struct nfsd_stats nfsdstats; - extern struct svc_stat nfsd_svcstats; int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); From 9eae19001439f5004e1fb189972d929201db29ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 12 Aug 2024 18:36:04 -0400 Subject: [PATCH 16/70] nfsd: make svc_stat per-network namespace instead of global [ Upstream commit 16fb9808ab2c99979f081987752abcbc5b092eac ] The final bit of stats that is global is the rpc svc_stat. Move this into the nfsd_net struct and use that everywhere instead of the global struct. Remove the unused global struct. 
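For context, the per-network-namespace pattern that this series moves nfsd onto generally looks like the sketch below. This is an illustrative sketch only; the struct, id, and callback names are hypothetical, not the actual nfsd ones.

	#include <linux/percpu_counter.h>
	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	/* Hypothetical per-netns state, analogous to struct nfsd_net above. */
	struct demo_net {
		struct percpu_counter counter;
	};

	static unsigned int demo_net_id;

	static int __net_init demo_net_init(struct net *net)
	{
		struct demo_net *dn = net_generic(net, demo_net_id);

		/* One counter instance per namespace, created at netns setup. */
		return percpu_counter_init(&dn->counter, 0, GFP_KERNEL);
	}

	static void __net_exit demo_net_exit(struct net *net)
	{
		struct demo_net *dn = net_generic(net, demo_net_id);

		percpu_counter_destroy(&dn->counter);
	}

	static struct pernet_operations demo_net_ops = {
		.init = demo_net_init,
		.exit = demo_net_exit,
		.id   = &demo_net_id,
		.size = sizeof(struct demo_net),
	};

After register_pernet_subsys(&demo_net_ops), the core allocates one struct demo_net per namespace, and net_generic(net, demo_net_id) returns the instance for a given namespace; this is the same lookup nfsd_show() performs above via net_generic(net, nfsd_net_id).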
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/netns.h | 4 ++++ fs/nfsd/nfsctl.c | 2 ++ fs/nfsd/nfssvc.c | 2 +- fs/nfsd/stats.c | 10 ++++------ fs/nfsd/stats.h | 2 -- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9260248a0790df..9bfca3dda63d33 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -14,6 +14,7 @@ #include #include #include +#include /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -184,6 +185,9 @@ struct nfsd_net { /* Per-netns stats counters */ struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + /* sunrpc svc stats */ + struct svc_stat nfsd_svcstats; + /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d19ddf2fb4fe9b..887035b7446763 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1527,6 +1527,8 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); + nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e9689db75f6351..7911c4b3b5d355 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -670,7 +670,7 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, &nfsd_svcstats, + serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats, nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 276bbcd59cdb63..9f606fa08bd4b8 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,10 +27,6 @@ #include "nfsd.h" -struct svc_stat nfsd_svcstats = { - .program = &nfsd_program, -}; - static int nfsd_show(struct seq_file *seq, void *v) { struct net *net = pde_data(file_inode(seq->file)); @@ -56,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); /* show my rpc info */ - svc_seq_show(seq, &nfsd_svcstats); + svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ @@ -121,7 +117,9 @@ void nfsd_stat_counters_destroy(struct nfsd_net *nn) void nfsd_proc_stat_init(struct net *net) { - svc_proc_register(net, &nfsd_svcstats, &nfsd_proc_ops); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } void nfsd_proc_stat_shutdown(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 5675d283a53730..d2753e975dfd34 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,8 +10,6 @@ #include #include -extern struct svc_stat nfsd_svcstats; - int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); From 26273f5f4cf68b29414e403837093408a9c98e1f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 28 Jun 2024 12:14:58 -0700 Subject: [PATCH 17/70] mm: gup: stop abusing try_grab_folio commit f442fa6141379a20b48ae3efabee827a3d260787 upstream. A kernel warning was reported when pinning folio in CMA memory when launching SEV virtual machine. 
The splat looks like: [ 464.325306] WARNING: CPU: 13 PID: 6734 at mm/gup.c:1313 __get_user_pages+0x423/0x520 [ 464.325464] CPU: 13 PID: 6734 Comm: qemu-kvm Kdump: loaded Not tainted 6.6.33+ #6 [ 464.325477] RIP: 0010:__get_user_pages+0x423/0x520 [ 464.325515] Call Trace: [ 464.325520] [ 464.325523] ? __get_user_pages+0x423/0x520 [ 464.325528] ? __warn+0x81/0x130 [ 464.325536] ? __get_user_pages+0x423/0x520 [ 464.325541] ? report_bug+0x171/0x1a0 [ 464.325549] ? handle_bug+0x3c/0x70 [ 464.325554] ? exc_invalid_op+0x17/0x70 [ 464.325558] ? asm_exc_invalid_op+0x1a/0x20 [ 464.325567] ? __get_user_pages+0x423/0x520 [ 464.325575] __gup_longterm_locked+0x212/0x7a0 [ 464.325583] internal_get_user_pages_fast+0xfb/0x190 [ 464.325590] pin_user_pages_fast+0x47/0x60 [ 464.325598] sev_pin_memory+0xca/0x170 [kvm_amd] [ 464.325616] sev_mem_enc_register_region+0x81/0x130 [kvm_amd] Per the analysis done by yangge, when starting the SEV virtual machine, it will call pin_user_pages_fast(..., FOLL_LONGTERM, ...) to pin the memory. But the page is in the CMA area, so fast GUP will fail and fall back to the slow path due to the longterm pinnable check in try_grab_folio(). The slow path will try to pin the pages, then migrate them out of the CMA area. But the slow path also uses try_grab_folio() to pin the page; it will also fail due to the same check, and then the above warning is triggered. In addition, try_grab_folio() is supposed to be used in the fast path and it elevates the folio refcount by using add-ref-unless-zero. We are guaranteed to have at least one stable reference in the slow path, so a simple atomic add could be used. The performance difference should be trivial, but the misuse may be confusing and misleading. Rename try_grab_folio() to try_grab_folio_fast() and try_grab_page() to try_grab_folio(), and use them in the proper paths. This solves both the abuse and the kernel warning. The proper naming makes their use cases clearer and should prevent abuse in the future. peterx said: : The user will see the pin fails, for gup-slow it further triggers the WARN : right below that failure (as in the original report): : : folio = try_grab_folio(page, page_increm - 1, : foll_flags); : if (WARN_ON_ONCE(!folio)) { <------------------------ here : /* : * Release the 1st page ref if the : * folio is problematic, fail hard. : */ : gup_put_folio(page_folio(page), 1, : foll_flags); : ret = -EFAULT; : goto out; : } [1] https://lore.kernel.org/linux-mm/1719478388-31917-1-git-send-email-yangge1116@126.com/ [shy828301@gmail.com: fix implicit declaration of function try_grab_folio_fast] Link: https://lkml.kernel.org/r/CAHbLzkowMSso-4Nufc9hcMehQsK9PNz3OSu-+eniU-2Mm-xjhA@mail.gmail.com Link: https://lkml.kernel.org/r/20240628191458.2605553-1-yang@os.amperecomputing.com Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"") Signed-off-by: Yang Shi Reported-by: yangge Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Peter Xu Cc: [6.6+] Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/gup.c | 251 ++++++++++++++++++++++++----------------------- mm/huge_memory.c | 6 +- mm/hugetlb.c | 2 +- mm/internal.h | 4 +- 4 files changed, 135 insertions(+), 128 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f50fe2219a13b6..fdd75384160d8d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -97,95 +97,6 @@ static inline struct folio *try_get_folio(struct page *page, int refs) return folio; } -/** - * try_grab_folio() - Attempt to get or pin a folio. 
- * @page: pointer to page to be grabbed - * @refs: the value to (effectively) add to the folio's refcount - * @flags: gup flags: these are the FOLL_* flag values. - * - * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. - * - * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the - * same time. (That's true throughout the get_user_pages*() and - * pin_user_pages*() APIs.) Cases: - * - * FOLL_GET: folio's refcount will be incremented by @refs. - * - * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its pincount will be incremented by @refs. - * - * FOLL_PIN on single-page folios: folio's refcount will be incremented by - * @refs * GUP_PIN_COUNTING_BIAS. - * - * Return: The folio containing @page (with refcount appropriately - * incremented) for success, or NULL upon failure. If neither FOLL_GET - * nor FOLL_PIN was set, that's considered failure, and furthermore, - * a likely bug in the caller, so a warning is also emitted. - */ -struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) -{ - struct folio *folio; - - if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) - return NULL; - - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) - return NULL; - - if (flags & FOLL_GET) - return try_get_folio(page, refs); - - /* FOLL_PIN is set */ - - /* - * Don't take a pin on the zero page - it's not going anywhere - * and it is used in a *lot* of places. - */ - if (is_zero_page(page)) - return page_folio(page); - - folio = try_get_folio(page, refs); - if (!folio) - return NULL; - - /* - * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a - * right zone, so fail and let the caller fall back to the slow - * path. - */ - if (unlikely((flags & FOLL_LONGTERM) && - !folio_is_longterm_pinnable(folio))) { - if (!put_devmap_managed_page_refs(&folio->page, refs)) - folio_put_refs(folio, refs); - return NULL; - } - - /* - * When pinning a large folio, use an exact count to track it. - * - * However, be sure to *also* increment the normal folio - * refcount field at least once, so that the folio really - * is pinned. That's why the refcount from the earlier - * try_get_folio() is left intact. - */ - if (folio_test_large(folio)) - atomic_add(refs, &folio->_pincount); - else - folio_ref_add(folio, - refs * (GUP_PIN_COUNTING_BIAS - 1)); - /* - * Adjust the pincount before re-checking the PTE for changes. - * This is essentially a smp_mb() and is paired with a memory - * barrier in page_try_share_anon_rmap(). - */ - smp_mb__after_atomic(); - - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); - - return folio; -} - static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { @@ -203,58 +114,59 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) } /** - * try_grab_page() - elevate a page's refcount by a flag-dependent amount - * @page: pointer to page to be grabbed - * @flags: gup flags: these are the FOLL_* flag values. + * try_grab_folio() - add a folio's refcount by a flag-dependent amount + * @folio: pointer to folio to be grabbed + * @refs: the value to (effectively) add to the folio's refcount + * @flags: gup flags: these are the FOLL_* flag values * * This might not do anything at all, depending on the flags argument. 
* * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: please see the try_grab_folio() documentation, with - * "refs=1". + * time. * * Return: 0 for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). A negative error code for failure: * - * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not + * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not * be grabbed. + * + * It is called when we have a stable reference for the folio, typically in + * GUP slow path. */ -int __must_check try_grab_page(struct page *page, unsigned int flags) +int __must_check try_grab_folio(struct folio *folio, int refs, + unsigned int flags) { - struct folio *folio = page_folio(page); - if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) return -EREMOTEIO; if (flags & FOLL_GET) - folio_ref_inc(folio); + folio_ref_add(folio, refs); else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ - if (is_zero_page(page)) + if (is_zero_folio(folio)) return 0; /* - * Similar to try_grab_folio(): be sure to *also* - * increment the normal page refcount field at least once, + * Increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { - folio_ref_add(folio, 1); - atomic_add(1, &folio->_pincount); + folio_ref_add(folio, refs); + atomic_add(refs, &folio->_pincount); } else { - folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS); } - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); } return 0; @@ -647,8 +559,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); - /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ - ret = try_grab_page(page, flags); + /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ + ret = try_grab_folio(page_folio(page), 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; @@ -899,7 +811,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(entry); } - ret = try_grab_page(*page, gup_flags); + ret = try_grab_folio(page_folio(*page), 1, gup_flags); if (unlikely(ret)) goto unmap; out: @@ -1302,20 +1214,19 @@ static long __get_user_pages(struct mm_struct *mm, * pages. */ if (page_increm > 1) { - struct folio *folio; + struct folio *folio = page_folio(page); /* * Since we already hold refcount on the * large folio, this should never fail. */ - folio = try_grab_folio(page, page_increm - 1, - foll_flags); - if (WARN_ON_ONCE(!folio)) { + if (try_grab_folio(folio, page_increm - 1, + foll_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. 
*/ - gup_put_folio(page_folio(page), 1, + gup_put_folio(folio, 1, foll_flags); ret = -EFAULT; goto out; @@ -2541,6 +2452,102 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, } } +/** + * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. + * @page: pointer to page to be grabbed + * @refs: the value to (effectively) add to the folio's refcount + * @flags: gup flags: these are the FOLL_* flag values. + * + * "grab" names in this file mean, "look at flags to decide whether to use + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. + * + * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the + * same time. (That's true throughout the get_user_pages*() and + * pin_user_pages*() APIs.) Cases: + * + * FOLL_GET: folio's refcount will be incremented by @refs. + * + * FOLL_PIN on large folios: folio's refcount will be incremented by + * @refs, and its pincount will be incremented by @refs. + * + * FOLL_PIN on single-page folios: folio's refcount will be incremented by + * @refs * GUP_PIN_COUNTING_BIAS. + * + * Return: The folio containing @page (with refcount appropriately + * incremented) for success, or NULL upon failure. If neither FOLL_GET + * nor FOLL_PIN was set, that's considered failure, and furthermore, + * a likely bug in the caller, so a warning is also emitted. + * + * It uses add ref unless zero to elevate the folio refcount and must be called + * in fast path only. + */ +static struct folio *try_grab_folio_fast(struct page *page, int refs, + unsigned int flags) +{ + struct folio *folio; + + /* Raise warn if it is not called in fast GUP */ + VM_WARN_ON_ONCE(!irqs_disabled()); + + if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) + return NULL; + + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return NULL; + + if (flags & FOLL_GET) + return try_get_folio(page, refs); + + /* FOLL_PIN is set */ + + /* + * Don't take a pin on the zero page - it's not going anywhere + * and it is used in a *lot* of places. + */ + if (is_zero_page(page)) + return page_folio(page); + + folio = try_get_folio(page, refs); + if (!folio) + return NULL; + + /* + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. + */ + if (unlikely((flags & FOLL_LONGTERM) && + !folio_is_longterm_pinnable(folio))) { + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); + return NULL; + } + + /* + * When pinning a large folio, use an exact count to track it. + * + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really + * is pinned. That's why the refcount from the earlier + * try_get_folio() is left intact. + */ + if (folio_test_large(folio)) + atomic_add(refs, &folio->_pincount); + else + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. + * This is essentially a smp_mb() and is paired with a memory + * barrier in folio_try_share_anon_rmap_*(). 
+ */ + smp_mb__after_atomic(); + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); + + return folio; +} + #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * Fast-gup relies on pte change detection to avoid concurrent pgtable @@ -2605,7 +2612,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - folio = try_grab_folio(page, 1, flags); + folio = try_grab_folio_fast(page, 1, flags); if (!folio) goto pte_unmap; @@ -2699,7 +2706,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, SetPageReferenced(page); pages[*nr] = page; - if (unlikely(try_grab_page(page, flags))) { + if (unlikely(try_grab_folio(page_folio(page), 1, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } @@ -2808,7 +2815,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -2879,7 +2886,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -2923,7 +2930,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -2963,7 +2970,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7ac2877e76629b..f2816c9a1f3ec8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1056,7 +1056,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); @@ -1214,7 +1214,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); @@ -1475,7 +1475,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (ret) return ERR_PTR(ret); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb7a531fce7174..0acb04c3e95291 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6532,7 +6532,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, * try_grab_page() should always be able to get the page here, * because we hold the ptl lock and have verified pte_present(). 
*/ - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (WARN_ON_ONCE(ret)) { page = ERR_PTR(ret); diff --git a/mm/internal.h b/mm/internal.h index abed947f784b7b..ef8d787a510c5c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -938,8 +938,8 @@ int migrate_device_coherent_page(struct page *page); /* * mm/gup.c */ -struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); -int __must_check try_grab_page(struct page *page, unsigned int flags); +int __must_check try_grab_folio(struct folio *folio, int refs, + unsigned int flags); /* * mm/huge_memory.c From a47b54846ac7c9d4d2083c6bce44fcfc00c35237 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Mon, 15 Jul 2024 17:31:44 +0800 Subject: [PATCH 18/70] nvme/pci: Add APST quirk for Lenovo N60z laptop commit ab091ec536cb7b271983c0c063b17f62f3591583 upstream. There is a hardware power-saving problem with the Lenovo N60z board. When it is turned on and left idle for 10 hours, there is a 20% chance that an NVMe disk will not wake up until reboot. Link: https://lore.kernel.org/all/2B5581C46AC6E335+9c7a81f1-05fb-4fd0-9fbb-108757c21628@uniontech.com Signed-off-by: hmy Signed-off-by: Wentao Guan Signed-off-by: WangYuli Signed-off-by: Keith Busch Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/pci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0fc7aa78b2e5b9..2c3f55877a1134 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2931,6 +2931,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; } + /* + * NVMe SSD drops off the PCIe bus after system idle + * for 10 hours on a Lenovo N60z board. + */ + if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6")) + return NVME_QUIRK_NO_APST; + return 0; } From 20dbad7525c6d58a101526c6af7edebe556d3100 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 24 Apr 2024 18:03:41 +0900 Subject: [PATCH 19/70] genirq/cpuhotplug: Skip suspended interrupts when restoring affinity commit a60dd06af674d3bb76b40da5d722e4a0ecefe650 upstream. irq_restore_affinity_of_irq() restarts managed interrupts unconditionally when the first CPU in the affinity mask comes online. That's correct during normal hotplug operations, but not when resuming from S3 because the drivers are not resumed yet and interrupt delivery is not expected by them. Skip the startup of suspended interrupts and let resume_device_irqs() deal with restoring them. This ensures that irqs are not delivered to drivers during the noirq phase of resuming from S3, after non-boot CPUs are brought back online. Signed-off-by: David Stevens Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240424090341.72236-1-stevensd@chromium.org Cc: Bart Van Assche Signed-off-by: Greg Kroah-Hartman --- kernel/irq/cpuhotplug.c | 11 ++++++++--- kernel/irq/manage.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 5ecd072a34fe72..367e15a2f5705b 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -195,10 +195,15 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - if (irqd_is_managed_and_shutdown(data)) { - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + /* + * Don't restore suspended interrupts here when a system comes back + * from S3. 
They are reenabled via resume_device_irqs(). + */ + if (desc->istate & IRQS_SUSPENDED) return; - } + + if (irqd_is_managed_and_shutdown(data)) + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); /* * If the interrupt can only be directed to a single target diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a054cd5ec08bce..8a936c1ffad390 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -796,10 +796,14 @@ void __enable_irq(struct irq_desc *desc) irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the - * interrupt might be marked NOAUTOEN. So irq_startup() - * needs to be invoked when it gets enabled the first - * time. If it was already started up, then irq_startup() - * will invoke irq_enable() under the hood. + * interrupt might be marked NOAUTOEN so irq_startup() + * needs to be invoked when it gets enabled the first time. + * This is also required when __enable_irq() is invoked for + * a managed and shutdown interrupt from the S3 resume + * path. + * + * If it was already started up, then irq_startup() will + * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; From bcd51480439499d5f8605ba2f3edb5bd0dd81fd8 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Tue, 23 Apr 2024 00:34:13 -0700 Subject: [PATCH 20/70] genirq/cpuhotplug: Retry with cpu_online_mask when migration fails commit 88d724e2301a69c1ab805cd74fc27aa36ae529e0 upstream. When a CPU goes offline, the interrupts affine to that CPU are re-configured. Managed interrupts undergo either migration to other CPUs or shutdown if all CPUs listed in the affinity are offline. The migration of managed interrupts is guaranteed on x86 because there are interrupt vectors reserved. Regular interrupts are migrated to a still online CPU in the affinity mask or if there is no online CPU to any online CPU. This works as long as the still online CPUs in the affinity mask have interrupt vectors available, but in case that none of those CPUs has a vector available the migration fails and the device interrupt becomes stale. This is not any different from the case where the affinity mask does not contain any online CPU, but there is no fallback operation for this. Instead of giving up, retry the migration attempt with the online CPU mask if the interrupt is not managed, as managed interrupts cannot be affected by this problem. Signed-off-by: Dongli Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423073413.79625-1-dongli.zhang@oracle.com Cc: Bart Van Assche Signed-off-by: Greg Kroah-Hartman --- kernel/irq/cpuhotplug.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 367e15a2f5705b..eb86283901565b 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -130,6 +130,22 @@ static bool migrate_one_irq(struct irq_desc *desc) * CPU. */ err = irq_do_set_affinity(d, affinity, false); + + /* + * If there are online CPUs in the affinity mask, but they have no + * vectors left to make the migration work, try to break the + * affinity by migrating to any online CPU. 
+ */ + if (err == -ENOSPC && !irqd_affinity_is_managed(d) && affinity != cpu_online_mask) { + pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with online CPUs\n", + d->irq, cpumask_pr_args(affinity)); + + affinity = cpu_online_mask; + brokeaff = true; + + err = irq_do_set_affinity(d, affinity, false); + } + if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); From dd9542ae7c7ca82ed2d7c185754ba9026361f6bc Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:29 +0000 Subject: [PATCH 21/70] cgroup: Make operations on the cgroup root_list RCU safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d23b5c577715892c87533b13923306acc6243f93 upstream. At present, when we perform operations on the cgroup root_list, we must hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, we can make operations on this list RCU-safe, eliminating the need to hold the cgroup_mutex during traversal. Modifications to the list only occur in the cgroup root setup and destroy paths, which should be infrequent in a production environment. In contrast, traversal may occur frequently. Therefore, making it RCU-safe would be beneficial. Signed-off-by: Yafang Shao Signed-off-by: Tejun Heo To: Michal Koutný Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup-defs.h | 1 + kernel/cgroup/cgroup-internal.h | 3 ++- kernel/cgroup/cgroup.c | 23 ++++++++++++++++------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 265da00a1a8b1b..7e7bc25a1aa31f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -558,6 +558,7 @@ struct cgroup_root { /* A list running through the active hierarchies */ struct list_head root_list; + struct rcu_head rcu; /* Hierarchy-specific flags */ unsigned int flags; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c56071f150f2ae..5e17f01ced9fd2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -170,7 +170,8 @@ extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) + list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ + lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 094f513319259d..d872fff901073f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1313,7 +1313,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1346,7 +1346,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { - list_del(&root->root_list); + list_del_rcu(&root->root_list); cgroup_root_count--; } @@ -1386,7 +1386,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } - BUG_ON(!res_cgroup); + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. Consequently, + * when accessing a cgroup root, the cset_link may have already been + * freed, resulting in a NULL res_cgroup. However, by holding the + * cgroup_mutex, we ensure that res_cgroup can't be NULL. 
+ * If we don't hold cgroup_mutex in the caller, we must do the NULL + * check. + */ return res_cgroup; } @@ -1445,7 +1453,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); @@ -1453,7 +1460,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. + * called with css_set_lock held to prevent task's groups from being modified. + * Must be called with either cgroup_mutex or rcu read lock to prevent the + * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -2014,7 +2023,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; - INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); @@ -2097,7 +2106,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ - list_add(&root->root_list, &cgroup_roots); + list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* From 77100f2e8412dbb84b3e7f1b947c9531cb509492 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Sep 2023 22:03:56 +0000 Subject: [PATCH 22/70] tcp_metrics: optimize tcp_metrics_flush_all() [ Upstream commit 6532e257aa73645e28dee5b2232cc3c88be62083 ] This is inspired by several syzbot reports where tcp_metrics_flush_all() was seen in the traces. We can avoid acquiring tcp_metrics_lock for empty buckets, and we should add one cond_resched() to break potential long loops. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Acked-by: Neal Cardwell Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/ipv4/tcp_metrics.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b71f94a5932ac0..e0883ba709b0bf 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -899,11 +899,13 @@ static void tcp_metrics_flush_all(struct net *net) unsigned int row; for (row = 0; row < max_rows; row++, hb++) { - struct tcp_metrics_block __rcu **pp; + struct tcp_metrics_block __rcu **pp = &hb->chain; bool match; + if (!rcu_access_pointer(*pp)) + continue; + spin_lock_bh(&tcp_metrics_lock); - pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : !refcount_read(&tm_net(tm)->ns.count); @@ -915,6 +917,7 @@ static void tcp_metrics_flush_all(struct net *net) } } spin_unlock_bh(&tcp_metrics_lock); + cond_resched(); } } From be31c9be8764a1b05ab8a2d8eccebfbd78270fc5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 28 Aug 2023 14:00:00 +0200 Subject: [PATCH 23/70] wifi: mac80211: take wiphy lock for MAC addr change [ Upstream commit a26787aa13974fb0b3fb42bfeb4256c1b686e305 ] We want to ensure everything holds the wiphy lock, so also extend that to the MAC change callback. 
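The refactoring follows a common kernel pattern: split the handler into an unlocked worker plus a thin wrapper that takes the lock, so paths that already hold the lock can call the worker directly. A generic sketch of that shape with hypothetical names (the real code uses wiphy_lock()/wiphy_unlock(), as the hunk below shows):

	#include <linux/mutex.h>

	struct demo_dev {
		struct mutex lock;
		int state;
	};

	/* Worker: assumes demo_dev::lock is already held by the caller. */
	static int _demo_change_state(struct demo_dev *d, int new_state)
	{
		lockdep_assert_held(&d->lock);
		d->state = new_state;
		return 0;
	}

	/* Wrapper: the locking boundary that external callers go through. */
	static int demo_change_state(struct demo_dev *d, int new_state)
	{
		int ret;

		mutex_lock(&d->lock);
		ret = _demo_change_state(d, new_state);
		mutex_unlock(&d->lock);
		return ret;
	}

In the hunk below, _ieee80211_change_mac() is the worker and ieee80211_change_mac() is the wrapper that takes the wiphy lock.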
Signed-off-by: Johannes Berg Stable-dep-of: 74a7c93f45ab ("wifi: mac80211: fix change_address deadlock during unregister") Signed-off-by: Sasha Levin --- net/mac80211/iface.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6e3bfb46af44d3..9ac5252c3da005 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -251,9 +251,9 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata return ret; } -static int ieee80211_change_mac(struct net_device *dev, void *addr) +static int _ieee80211_change_mac(struct ieee80211_sub_if_data *sdata, + void *addr) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sockaddr *sa = addr; bool check_dup = true; @@ -278,7 +278,7 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) if (live) drv_remove_interface(local, sdata); - ret = eth_mac_addr(dev, sa); + ret = eth_mac_addr(sdata->dev, sa); if (ret == 0) { memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN); @@ -294,6 +294,19 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) return ret; } +static int ieee80211_change_mac(struct net_device *dev, void *addr) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + int ret; + + wiphy_lock(local->hw.wiphy); + ret = _ieee80211_change_mac(sdata, addr); + wiphy_unlock(local->hw.wiphy); + + return ret; +} + static inline int identical_mac_addr_allowed(int type1, int type2) { return type1 == NL80211_IFTYPE_MONITOR || From e58695f6c5568e84c9e2bc065deae3c9d9c860f1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Oct 2023 12:34:47 +0200 Subject: [PATCH 24/70] wifi: mac80211: fix change_address deadlock during unregister [ Upstream commit 74a7c93f45abba538914a65dd2ef2ea7cf7150e2 ] When using e.g. bonding, and doing a sequence such as # iw wlan0 set type __ap # ip link add name bond1 type bond # ip link set wlan0 master bond1 # iw wlan0 interface del we deadlock, since the wlan0 interface removal will cause bonding to reset the MAC address of wlan0. The locking would be somewhat difficult to fix, but since this only happens during removal, we can simply ignore the MAC address change at this time. Reported-by: syzbot+25b3a0b24216651bc2af@syzkaller.appspotmail.com Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20231012123447.9f9d7fd1f237.Ic3a5ef4391b670941a69cec5592aefc79d9c2890@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/iface.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9ac5252c3da005..52b048807feae5 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -300,6 +300,14 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) struct ieee80211_local *local = sdata->local; int ret; + /* + * This happens during unregistration if there's a bond device + * active (maybe other cases?) and we must get removed from it. + * But we really don't care anymore if it's not registered now. 
+ */ + if (!dev->ieee80211_ptr->registered) + return 0; + wiphy_lock(local->hw.wiphy); ret = _ieee80211_change_mac(sdata, addr); wiphy_unlock(local->hw.wiphy); From 4365d0d660ac38ba3ba867402e028872fec3698e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 27 Sep 2023 11:34:25 +0200 Subject: [PATCH 25/70] fs: Convert to bdev_open_by_dev() [ Upstream commit f4a48bc36cdfae7c603e8e3f2a51e2a283f3f365 ] Convert mount code to use bdev_open_by_dev() and propagate the handle around to bdev_release(). Acked-by: Christoph Hellwig Reviewed-by: Christian Brauner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230927093442.25915-19-jack@suse.cz Signed-off-by: Christian Brauner Stable-dep-of: 6306ff39a7fc ("jfs: fix log->bdev_handle null ptr deref in lbmStartIO") Signed-off-by: Sasha Levin --- fs/cramfs/inode.c | 2 +- fs/romfs/super.c | 2 +- fs/super.c | 15 +++++++++------ include/linux/fs.h | 1 + 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 5ee7d7bbb361ce..2fbf97077ce910 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - blkdev_put(sb->s_bdev, sb); + bdev_release(sb->s_bdev_handle); } kfree(sbi); } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 5c35f6c760377e..b1bdfbc211c3c0 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -593,7 +593,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - blkdev_put(sb->s_bdev, sb); + bdev_release(sb->s_bdev_handle); } #endif } diff --git a/fs/super.c b/fs/super.c index 576abb1ff0403d..b142e71eb8dfdd 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1490,14 +1490,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); + struct bdev_handle *bdev_handle; struct block_device *bdev; - bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev_handle)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev); + return PTR_ERR(bdev_handle); } + bdev = bdev_handle->bdev; /* * This really should be in blkdev_get_by_dev, but right now can't due @@ -1505,7 +1507,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. 
*/ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, sb); + bdev_release(bdev_handle); return -EACCES; } @@ -1521,10 +1523,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, mutex_unlock(&bdev->bd_fsfreeze_mutex); if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - blkdev_put(bdev, sb); + bdev_release(bdev_handle); return -EBUSY; } spin_lock(&sb_lock); + sb->s_bdev_handle = bdev_handle; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) @@ -1657,7 +1660,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - blkdev_put(bdev, sb); + bdev_release(sb->s_bdev_handle); } } diff --git a/include/linux/fs.h b/include/linux/fs.h index 56dce38c478627..5ca9e859c042b3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1223,6 +1223,7 @@ struct super_block { struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; + struct bdev_handle *s_bdev_handle; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; From 9641706cbbc256ce639eef2b182866e07fceb058 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 27 Sep 2023 11:34:30 +0200 Subject: [PATCH 26/70] jfs: Convert to bdev_open_by_dev() [ Upstream commit 898c57f456b537e90493a9e9222226aa3ea66267 ] Convert jfs to use bdev_open_by_dev() and pass the handle around. CC: Dave Kleikamp CC: jfs-discussion@lists.sourceforge.net Acked-by: Christoph Hellwig Acked-by: Dave Kleikamp Reviewed-by: Christian Brauner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230927093442.25915-24-jack@suse.cz Signed-off-by: Christian Brauner Stable-dep-of: 6306ff39a7fc ("jfs: fix log->bdev_handle null ptr deref in lbmStartIO") Signed-off-by: Sasha Levin --- fs/jfs/jfs_logmgr.c | 29 +++++++++++++++-------------- fs/jfs/jfs_logmgr.h | 2 +- fs/jfs/jfs_mount.c | 3 ++- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e855b8fde76ce1..c911d838b8ec8c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) int lmLogOpen(struct super_block *sb) { int rc; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb) mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { - if (log->bdev->bd_dev == sbi->logdev) { + if (log->bdev_handle->bdev->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); @@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, - log, NULL); - if (IS_ERR(bdev)) { - rc = PTR_ERR(bdev); + bdev_handle = bdev_open_by_dev(sbi->logdev, + BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); + if (IS_ERR(bdev_handle)) { + rc = PTR_ERR(bdev_handle); goto free; } - log->bdev = bdev; + log->bdev_handle = bdev_handle; uuid_copy(&log->uuid, &sbi->loguuid); /* @@ -1141,7 +1141,7 @@ int lmLogOpen(struct super_block *sb) lbmLogShutdown(log); close: /* close external log device */ - blkdev_put(bdev, log); + bdev_release(bdev_handle); free: /* free log descriptor */ 
mutex_unlock(&jfs_log_mutex); @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev = sb->s_bdev; + log->bdev_handle = sb->s_bdev_handle; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); @@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; - struct block_device *bdev; + struct bdev_handle *bdev_handle; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); @@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb) * external log as separate logical volume */ list_del(&log->journal_list); - bdev = log->bdev; + bdev_handle = log->bdev_handle; rc = lmLogShutdown(log); - blkdev_put(bdev, log); + bdev_release(bdev_handle); kfree(log); @@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2113,7 +2113,8 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO"); - bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); + bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_WRITE | REQ_SYNC, + GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 805877ce502044..84aa2d25390743 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -356,7 +356,7 @@ struct jfs_log { * before writing syncpt. */ struct list_head journal_list; /* Global list */ - struct block_device *bdev; /* 4: log lv pointer */ + struct bdev_handle *bdev_handle; /* 4: log lv pointer */ int serial; /* 4: log mount serial number */ s64 base; /* @8: log extent address (inline log ) */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 631b8bd3e43849..9b5c6a20b30c83 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -430,7 +430,8 @@ int updateSuper(struct super_block *sb, uint state) if (state == FM_MOUNT) { /* record log's dev_t and mount serial number */ - j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); + j_sb->s_logdev = cpu_to_le32( + new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev)); j_sb->s_logserial = cpu_to_le32(sbi->log->serial); } else if (state == FM_CLEAN) { /* From e4a4435787dd7657805add2dea646f43f06d9979 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Mon, 9 Oct 2023 17:45:57 +0800 Subject: [PATCH 27/70] jfs: fix log->bdev_handle null ptr deref in lbmStartIO [ Upstream commit 6306ff39a7fcb7e9c59a00e6860b933b71a2ed3e ] When sbi->flag has JFS_NOINTEGRITY set in lmLogOpen(), log->bdev_handle is never initialized, so its value will be NULL. Therefore, add a log->no_integrity check in lbmStartIO() to avoid dereferencing the NULL handle. 
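The essence of the fix, condensed into a hypothetical helper for illustration (the real change is the lbmStartIO() hunk below; per the surrounding driver code, the no-integrity branch completes the bio itself instead of submitting it, so the missing device is never reached):

	static struct bio *demo_log_write_bio(struct jfs_log *log)
	{
		struct block_device *bdev = NULL;

		/*
		 * With JFS_NOINTEGRITY, lmLogOpen() never opened a log
		 * device, so log->bdev_handle must not be dereferenced.
		 */
		if (!log->no_integrity)
			bdev = log->bdev_handle->bdev;

		/*
		 * A NULL bdev is tolerated here because the bio is never
		 * submitted on the no-integrity path.
		 */
		return bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
	}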
Reported-and-tested-by: syzbot+23bc20037854bb335d59@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Link: https://lore.kernel.org/r/20231009094557.1398920-1-lizhi.xu@windriver.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/jfs/jfs_logmgr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index c911d838b8ec8c..cb6d1fda66a702 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2110,10 +2110,14 @@ static void lbmStartIO(struct lbuf * bp) { struct bio *bio; struct jfs_log *log = bp->l_log; + struct block_device *bdev = NULL; jfs_info("lbmStartIO"); - bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_WRITE | REQ_SYNC, + if (!log->no_integrity) + bdev = log->bdev_handle->bdev; + + bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); From 3999d26986be87cf1d327e857ae903e965545e7a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Nov 2023 00:11:42 -0500 Subject: [PATCH 28/70] net: don't dump stack on queue timeout [ Upstream commit e316dd1cf1358ff9c44b37c7be273a7dc4349986 ] The top syzbot report for networking (#14 for the entire kernel) is the queue timeout splat. We kept it around for a long time, because in real life it provides pretty strong signal that something is wrong with the driver or the device. Removing it is also likely to break monitoring for those who track it as a kernel warning. Nevertheless, WARN()ings are best suited for catching kernel programming bugs. If a Tx queue gets starved due to a pause storm, priority configuration, or other weirdness - that's obviously a problem, but not a problem we can fix at the kernel level. Bite the bullet and convert the WARN() to a print. Before: NETDEV WATCHDOG: eni1np1 (netdevsim): transmit queue 0 timed out 1975 ms WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:525 dev_watchdog+0x39e/0x3b0 [... completely pointless stack trace of a timer follows ...] Now: netdevsim netdevsim1 eni1np1: NETDEV WATCHDOG: CPU: 0: transmit queue 0 timed out 1769 ms Alternatively we could mark the drivers which syzbot has learned to abuse as "print-instead-of-WARN" selectively. Reported-by: syzbot+d55372214aff0faa1f1f@syzkaller.appspotmail.com Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- net/sched/sch_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 4023c955036b12..6ab9359c1706f1 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -522,8 +522,9 @@ static void dev_watchdog(struct timer_list *t) if (unlikely(timedout_ms)) { trace_net_dev_xmit_timeout(dev, i); - WARN_ONCE(1, "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out %u ms\n", - dev->name, netdev_drivername(dev), i, timedout_ms); + netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", + raw_smp_processor_id(), + i, timedout_ms); netif_freeze_queues(dev); dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); From 0b24b79410866512445270cb4412e5937838548e Mon Sep 17 00:00:00 2001 From: Manas Ghandat Date: Wed, 11 Oct 2023 20:09:37 +0530 Subject: [PATCH 29/70] jfs: fix shift-out-of-bounds in dbJoin [ Upstream commit cca974daeb6c43ea971f8ceff5a7080d7d49ee30 ] Currently, while joining a leaf in the buddy system, there is a shift-out-of-bounds error in the calculation of BUDSIZE. Add the required check before the BUDSIZE calculation and fix the documentation as well. Reported-by: syzbot+411debe54d318eaed386@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=411debe54d318eaed386 Signed-off-by: Manas Ghandat Signed-off-by: Dave Kleikamp Signed-off-by: Sasha Levin --- fs/jfs/jfs_dmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index cb3cda1390adb1..8eec84c651bfba 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2763,7 +2763,9 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: none + * RETURN VALUES: + * 0 - success + * -EIO - i/o error */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2790,6 +2792,10 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ + + if ((newval - tp->dmt_budmin) > BUDMIN) + return -EIO; + budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. From 2dbaa75748ac1dad1e93191783d577e49adbd379 Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Thu, 16 Nov 2023 11:13:52 +0800 Subject: [PATCH 30/70] squashfs: squashfs_read_data need to check if the length is 0 [ Upstream commit eb66b8abae98f869c224f7c852b685ae02144564 ] When the length passed in is 0, squashfs_read_data() should bail out, as proceeding with a zero length causes at least a WARN_ON(). Link: https://lkml.kernel.org/r/20231116031352.40853-1-lizhi.xu@windriver.com Reported-by: syzbot+32d3767580a1ea339a81@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/0000000000000526f2060a30a085@google.com Signed-off-by: Lizhi Xu Reviewed-by: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- fs/squashfs/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 581ce951933901..2dc730800f448d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -321,7 +321,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2, compressed ? 
"" : "un", length); } - if (length < 0 || length > output->length || + if (length <= 0 || length > output->length || (index + length) > msblk->bytes_used) { res = -EIO; goto out; From 984ed0567f5d4243daf609760b60e7e1aa62fa8f Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 13 Nov 2023 16:09:01 +0000 Subject: [PATCH 31/70] Squashfs: fix variable overflow triggered by sysbot [ Upstream commit 12427de9439d68b8e96ba6f50b601ef15f437612 ] Sysbot reports a slab out of bounds write in squashfs_readahead(). This is ultimately caused by a file reporting an (infeasibly) large file size (1407374883553280 bytes) with the minimum block size of 4K. This causes variable overflow. Link: https://lkml.kernel.org/r/20231113160901.6444-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: syzbot+604424eb051c2f696163@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000b1fda20609ede0d1@google.com/ Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- fs/squashfs/file.c | 3 ++- fs/squashfs/file_direct.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 8ba8c4c5077078..e8df6430444b01 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -544,7 +544,8 @@ static void squashfs_readahead(struct readahead_control *ractl) struct squashfs_page_actor *actor; unsigned int nr_pages = 0; struct page **pages; - int i, file_end = i_size_read(inode) >> msblk->block_log; + int i; + loff_t file_end = i_size_read(inode) >> msblk->block_log; unsigned int max_pages = 1UL << shift; readahead_expand(ractl, start, (len | mask) + 1); diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index f1ccad519e28cc..763a3f7a75f6dd 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -26,10 +26,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, struct inode *inode = target_page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; + loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int start_index = target_page->index & ~mask; - int end_index = start_index | mask; + loff_t start_index = target_page->index & ~mask; + loff_t end_index = start_index | mask; int i, n, pages, bytes, res = -ENOMEM; struct page **page; struct squashfs_page_actor *actor; From ca9b877a2e2c04fa28e2aa17ce14d0633cbb73b4 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 26 Dec 2023 15:16:09 +0800 Subject: [PATCH 32/70] reiserfs: fix uninit-value in comp_keys [ Upstream commit dd8f87f21dc3da2eaf46e7401173f935b90b13a8 ] The cpu_key was not initialized in reiserfs_delete_solid_item(), which triggered this issue. 
Reported-and-tested-by: Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_9EA7E746DE92DBC66049A62EDF6ED64CA706@qq.com Signed-off-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/reiserfs/stree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 3676e02a0232a4..4ab8cab6ea6147 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1407,7 +1407,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, INITIALIZE_PATH(path); int item_len = 0; int tb_init = 0; - struct cpu_key cpu_key; + struct cpu_key cpu_key = {}; int retval; int quota_cut_bytes = 0; From 2fdcf3c4ad74baf286621371ec46327a095cd1f8 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 27 Dec 2023 23:19:03 +0800 Subject: [PATCH 33/70] erofs: avoid debugging output for (de)compressed data [ Upstream commit 496530c7c1dfc159d59a75ae00b572f570710c53 ] Syzbot reported a KMSAN warning, erofs: (device loop0): z_erofs_lz4_decompress_mem: failed to decompress -12 in[46, 4050] out[917] ===================================================== BUG: KMSAN: uninit-value in hex_dump_to_buffer+0xae9/0x10f0 lib/hexdump.c:194 .. print_hex_dump+0x13d/0x3e0 lib/hexdump.c:276 z_erofs_lz4_decompress_mem fs/erofs/decompressor.c:252 [inline] z_erofs_lz4_decompress+0x257e/0x2a70 fs/erofs/decompressor.c:311 z_erofs_decompress_pcluster fs/erofs/zdata.c:1290 [inline] z_erofs_decompress_queue+0x338c/0x6460 fs/erofs/zdata.c:1372 z_erofs_runqueue+0x36cd/0x3830 z_erofs_read_folio+0x435/0x810 fs/erofs/zdata.c:1843 The root cause is that the printed decompressed buffer may be filled incompletely due to decompression failure. Since they were once only used for debugging, get rid of them now. Reported-and-tested-by: syzbot+6c746eea496f34b3161d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/000000000000321c24060d7cfa1c@google.com Reviewed-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231227151903.2900413-1-hsiangkao@linux.alibaba.com Signed-off-by: Sasha Levin --- fs/erofs/decompressor.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index d36b3963c0bf3c..aa59788a61e6e4 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -248,15 +248,9 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, if (ret != rq->outputsize) { erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, rq->inputsize, inputmargin, rq->outputsize); - - print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, - 16, 1, src + inputmargin, rq->inputsize, true); - print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, - 16, 1, out, rq->outputsize, true); - if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); - ret = -EIO; + ret = -EFSCORRUPTED; } else { ret = 0; } From 2cea502f58d6a966c34da896571b8459d0b66611 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 12 Jan 2024 16:32:58 -0800 Subject: [PATCH 34/70] net: tls, add test to capture error on large splice [ Upstream commit 034ea1305e659ddae44c19ba8449166fec318e2d ] syzbot found an error with how splice() is handled with a msg greater than 32. This was fixed in previous patch, but lets add a test for it to ensure it continues to work. Signed-off-by: John Fastabend Reviewed-by: Jakub Kicinski Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- tools/testing/selftests/net/tls.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index ad993ab3ac1819..bc36c91c4480f5 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -707,6 +707,20 @@ TEST_F(tls, splice_from_pipe) EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); } +TEST_F(tls, splice_more) +{ + unsigned int f = SPLICE_F_NONBLOCK | SPLICE_F_MORE | SPLICE_F_GIFT; + int send_len = TLS_PAYLOAD_MAX_LEN; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + int i, send_pipe = 1; + int p[2]; + + ASSERT_GE(pipe(p), 0); + EXPECT_GE(write(p[1], mem_send, send_len), 0); + for (i = 0; i < 32; i++) + EXPECT_EQ(splice(p[0], NULL, self->fd, NULL, send_pipe, f), 1); +} + TEST_F(tls, splice_from_pipe2) { int send_len = 16000; From 0252e359afa5714015a3a0846ab83289bd6501df Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Sat, 14 Oct 2023 12:20:15 +0200 Subject: [PATCH 35/70] Input: bcm5974 - check endpoint type before starting traffic [ Upstream commit 2b9c3eb32a699acdd4784d6b93743271b4970899 ] syzbot has found a type mismatch between a USB pipe and the transfer endpoint, which is triggered by the bcm5974 driver[1]. This driver expects the device to provide input interrupt endpoints and if that is not the case, the driver registration should terminate. Repros are available to reproduce this issue with a certain setup for the dummy_hcd, leading to an interrupt/bulk mismatch which is caught in the USB core after calling usb_submit_urb() with the following message: "BOGUS urb xfer, pipe 1 != type 3" Some other device drivers (like the appletouch driver bcm5974 is mainly based on) provide some checking mechanism to make sure that an IN interrupt endpoint is available. In this particular case the endpoint addresses are provided by a config table, so the checking can be targeted to the provided endpoints. Add some basic checking to guarantee that the endpoints available match the expected type for both the trackpad and button endpoints. This issue was only found for the trackpad endpoint, but the checking has been added to the button endpoint as well for the same reasons. Given that there was never a check for the endpoint type, this bug has been there since the first implementation of the driver (f89bd95c5c94). 
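As used in the diff below, usb_check_int_endpoints() apparently takes a zero-terminated array of endpoint addresses (hence the "+ 1" slot) and returns true only if each listed endpoint exists on the interface and is of interrupt type. A rough usage sketch, with an invented endpoint address and probe function:

    static int demo_probe(struct usb_interface *iface,
                          const struct usb_device_id *id)
    {
            static const u8 wanted[] = { 0x81, 0 };  /* EP 1 IN, list end */

            /* bogus descriptors: refuse to bind rather than submit
             * an interrupt URB to a non-interrupt endpoint */
            if (!usb_check_int_endpoints(iface, wanted))
                    return -ENODEV;

            /* ... normal device setup continues here ... */
            return 0;
    }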
[1] https://syzkaller.appspot.com/bug?extid=348331f63b034f89b622 Fixes: f89bd95c5c94 ("Input: bcm5974 - add driver for Macbook Air and Pro Penryn touchpads") Signed-off-by: Javier Carrasco Reported-and-tested-by: syzbot+348331f63b034f89b622@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20231007-topic-bcm5974_bulk-v3-1-d0f38b9d2935@gmail.com Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- drivers/input/mouse/bcm5974.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index ca150618d32f18..953992b458e9f2 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -19,6 +19,7 @@ * Copyright (C) 2006 Nicolas Boichat (nicolas@boichat.ch) */ +#include "linux/usb.h" #include #include #include @@ -193,6 +194,8 @@ enum tp_type { /* list of device capability bits */ #define HAS_INTEGRATED_BUTTON 1 +/* maximum number of supported endpoints (currently trackpad and button) */ +#define MAX_ENDPOINTS 2 /* trackpad finger data block size */ #define FSIZE_TYPE1 (14 * sizeof(__le16)) @@ -891,6 +894,18 @@ static int bcm5974_resume(struct usb_interface *iface) return error; } +static bool bcm5974_check_endpoints(struct usb_interface *iface, + const struct bcm5974_config *cfg) +{ + u8 ep_addr[MAX_ENDPOINTS + 1] = {0}; + + ep_addr[0] = cfg->tp_ep; + if (cfg->tp_type == TYPE1) + ep_addr[1] = cfg->bt_ep; + + return usb_check_int_endpoints(iface, ep_addr); +} + static int bcm5974_probe(struct usb_interface *iface, const struct usb_device_id *id) { @@ -903,6 +918,11 @@ static int bcm5974_probe(struct usb_interface *iface, /* find the product index */ cfg = bcm5974_get_config(udev); + if (!bcm5974_check_endpoints(iface, cfg)) { + dev_err(&iface->dev, "Unexpected non-int endpoint\n"); + return -ENODEV; + } + /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(struct bcm5974), GFP_KERNEL); input_dev = input_allocate_device(); From a2f2e5a4c907250e2c69abffc609226e992cd7f5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Feb 2024 19:12:15 +0100 Subject: [PATCH 36/70] quota: Detect loops in quota tree [ Upstream commit a898cb621ac589b0b9e959309689a027e765aa12 ] Syzbot has found that when it creates corrupted quota files where the quota tree contains a loop, we will deadlock when trying to insert a dquot. Add loop detection into functions traversing the quota tree. Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/quota/quota_tree.c | 128 +++++++++++++++++++++++++++++++----------- fs/quota/quota_v2.c | 15 +++-- 2 files changed, 105 insertions(+), 38 deletions(-) diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 0f1493e0f6d059..254f6359b287fa 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -21,6 +21,12 @@ MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota trie support"); MODULE_LICENSE("GPL"); +/* + * Maximum quota tree depth we support. Only to limit recursion when working + * with the tree.
+ */ +#define MAX_QTREE_DEPTH 6 + #define __QUOTA_QT_PARANOIA static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) @@ -327,27 +333,36 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, /* Insert reference to structure into the trie */ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, - uint *treeblk, int depth) + uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0, newson = 0, newact = 0; __le32 *ref; uint newblk; + int i; if (!buf) return -ENOMEM; - if (!*treeblk) { + if (!blks[depth]) { ret = get_free_dqblk(info); if (ret < 0) goto out_buf; - *treeblk = ret; + for (i = 0; i < depth; i++) + if (ret == blks[i]) { + quota_error(dquot->dq_sb, + "Free block already used in tree: block %u", + ret); + ret = -EIO; + goto out_buf; + } + blks[depth] = ret; memset(buf, 0, info->dqi_usable_bs); newact = 1; } else { - ret = read_blk(info, *treeblk, buf); + ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read tree quota " - "block %u", *treeblk); + "block %u", blks[depth]); goto out_buf; } } @@ -357,8 +372,20 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, info->dqi_blocks - 1); if (ret) goto out_buf; - if (!newblk) + if (!newblk) { newson = 1; + } else { + for (i = 0; i <= depth; i++) + if (newblk == blks[i]) { + quota_error(dquot->dq_sb, + "Cycle in quota tree detected: block %u index %u", + blks[depth], + get_index(info, dquot->dq_id, depth)); + ret = -EIO; + goto out_buf; + } + } + blks[depth + 1] = newblk; if (depth == info->dqi_qtree_depth - 1) { #ifdef __QUOTA_QT_PARANOIA if (newblk) { @@ -370,16 +397,16 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } #endif - newblk = find_free_dqentry(info, dquot, &ret); + blks[depth + 1] = find_free_dqentry(info, dquot, &ret); } else { - ret = do_insert_tree(info, dquot, &newblk, depth+1); + ret = do_insert_tree(info, dquot, blks, depth + 1); } if (newson && ret >= 0) { ref[get_index(info, dquot->dq_id, depth)] = - cpu_to_le32(newblk); - ret = write_blk(info, *treeblk, buf); + cpu_to_le32(blks[depth + 1]); + ret = write_blk(info, blks[depth], buf); } else if (newact && ret < 0) { - put_free_dqblk(info, buf, *treeblk); + put_free_dqblk(info, buf, blks[depth]); } out_buf: kfree(buf); @@ -390,7 +417,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot) { - int tmp = QT_TREEOFF; + uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; #ifdef __QUOTA_QT_PARANOIA if (info->dqi_blocks <= QT_TREEOFF) { @@ -398,7 +425,11 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, return -EIO; } #endif - return do_insert_tree(info, dquot, &tmp, 0); + if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) { + quota_error(dquot->dq_sb, "Quota tree depth too big!"); + return -EIO; + } + return do_insert_tree(info, dquot, blks, 0); } /* @@ -511,19 +542,20 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, /* Remove reference to dquot from tree */ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, - uint *blk, int depth) + uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0; uint newblk; __le32 *ref = (__le32 *)buf; + int i; if (!buf) return -ENOMEM; - ret = read_blk(info, *blk, buf); + ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, 
"Can't read quota data block %u", - *blk); + blks[depth]); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -532,29 +564,38 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (ret) goto out_buf; + for (i = 0; i <= depth; i++) + if (newblk == blks[i]) { + quota_error(dquot->dq_sb, + "Cycle in quota tree detected: block %u index %u", + blks[depth], + get_index(info, dquot->dq_id, depth)); + ret = -EIO; + goto out_buf; + } if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); - newblk = 0; + blks[depth + 1] = 0; } else { - ret = remove_tree(info, dquot, &newblk, depth+1); + blks[depth + 1] = newblk; + ret = remove_tree(info, dquot, blks, depth + 1); } - if (ret >= 0 && !newblk) { - int i; + if (ret >= 0 && !blks[depth + 1]) { ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); /* Block got empty? */ for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++) ; /* Don't put the root block into the free block list */ if (i == (info->dqi_usable_bs >> 2) - && *blk != QT_TREEOFF) { - put_free_dqblk(info, buf, *blk); - *blk = 0; + && blks[depth] != QT_TREEOFF) { + put_free_dqblk(info, buf, blks[depth]); + blks[depth] = 0; } else { - ret = write_blk(info, *blk, buf); + ret = write_blk(info, blks[depth], buf); if (ret < 0) quota_error(dquot->dq_sb, "Can't write quota tree block %u", - *blk); + blks[depth]); } } out_buf: @@ -565,11 +606,15 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, /* Delete dquot from tree */ int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { - uint tmp = QT_TREEOFF; + uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; if (!dquot->dq_off) /* Even not allocated? */ return 0; - return remove_tree(info, dquot, &tmp, 0); + if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) { + quota_error(dquot->dq_sb, "Quota tree depth too big!"); + return -EIO; + } + return remove_tree(info, dquot, blks, 0); } EXPORT_SYMBOL(qtree_delete_dquot); @@ -613,18 +658,20 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, /* Find entry for given id in the tree */ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, - struct dquot *dquot, uint blk, int depth) + struct dquot *dquot, uint *blks, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); loff_t ret = 0; __le32 *ref = (__le32 *)buf; + uint blk; + int i; if (!buf) return -ENOMEM; - ret = read_blk(info, blk, buf); + ret = read_blk(info, blks[depth], buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota tree block %u", - blk); + blks[depth]); goto out_buf; } ret = 0; @@ -636,8 +683,19 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, if (ret) goto out_buf; + /* Check for cycles in the tree */ + for (i = 0; i <= depth; i++) + if (blk == blks[i]) { + quota_error(dquot->dq_sb, + "Cycle in quota tree detected: block %u index %u", + blks[depth], + get_index(info, dquot->dq_id, depth)); + ret = -EIO; + goto out_buf; + } + blks[depth + 1] = blk; if (depth < info->dqi_qtree_depth - 1) - ret = find_tree_dqentry(info, dquot, blk, depth+1); + ret = find_tree_dqentry(info, dquot, blks, depth + 1); else ret = find_block_dqentry(info, dquot, blk); out_buf: @@ -649,7 +707,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot) { - return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); + uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF }; + + if (info->dqi_qtree_depth >= 
MAX_QTREE_DEPTH) { + quota_error(dquot->dq_sb, "Quota tree depth too big!"); + return -EIO; + } + return find_tree_dqentry(info, dquot, blks, 0); } int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index ae99e7b88205b2..7978ab671e0c6a 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -166,14 +166,17 @@ static int v2_read_file_info(struct super_block *sb, int type) i_size_read(sb_dqopt(sb)->files[type])); goto out_free; } - if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) { - quota_error(sb, "Free block number too big (%u >= %u).", - qinfo->dqi_free_blk, qinfo->dqi_blocks); + if (qinfo->dqi_free_blk && (qinfo->dqi_free_blk <= QT_TREEOFF || + qinfo->dqi_free_blk >= qinfo->dqi_blocks)) { + quota_error(sb, "Free block number %u out of range (%u, %u).", + qinfo->dqi_free_blk, QT_TREEOFF, qinfo->dqi_blocks); goto out_free; } - if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) { - quota_error(sb, "Block with free entry too big (%u >= %u).", - qinfo->dqi_free_entry, qinfo->dqi_blocks); + if (qinfo->dqi_free_entry && (qinfo->dqi_free_entry <= QT_TREEOFF || + qinfo->dqi_free_entry >= qinfo->dqi_blocks)) { + quota_error(sb, "Block with free entry %u out of range (%u, %u).", + qinfo->dqi_free_entry, QT_TREEOFF, + qinfo->dqi_blocks); goto out_free; } ret = 0; From 6cee13d8d4e0831bd8e89dcdc0bf6f86c92f3fc2 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 8 Feb 2024 19:28:54 -0700 Subject: [PATCH 37/70] net:rds: Fix possible deadlock in rds_message_put [ Upstream commit f1acf1ac84d2ae97b7889b87223c1064df850069 ] Functions rds_still_queued and rds_clear_recv_queue lock a given socket in order to safely iterate over the incoming rds messages. However, calling rds_inc_put while under this lock creates a potential deadlock. rds_inc_put may eventually call rds_message_purge, which will lock m_rs_lock. This is the incorrect locking order since m_rs_lock is meant to be locked before the socket. To fix this, we move the message item to a local list or variable that won't need rs_recv_lock protection. Then we can safely call rds_inc_put on any item stored locally after rs_recv_lock is released.
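This is the classic detach-then-free pattern; a minimal sketch with made-up types (not the rds code):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct item { struct list_head node; /* ... */ };
    extern void item_put(struct item *it);   /* may take other locks */

    static void drain(spinlock_t *lock, struct list_head *queue)
    {
            LIST_HEAD(to_drop);
            struct item *it, *tmp;
            unsigned long flags;

            spin_lock_irqsave(lock, flags);
            list_splice_init(queue, &to_drop);    /* detach under the lock */
            spin_unlock_irqrestore(lock, flags);

            /* Dropping the references outside the lock means the put
             * path can take its own locks without nesting them under
             * this one, so the lock order stays consistent. */
            list_for_each_entry_safe(it, tmp, &to_drop, node) {
                    list_del_init(&it->node);
                    item_put(it);
            }
    }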
Fixes: bdbe6fbc6a2f ("RDS: recv.c") Reported-by: syzbot+f9db6ff27b9bfdcfeca0@syzkaller.appspotmail.com Reported-by: syzbot+dcd73ff9291e6d34b3ab@syzkaller.appspotmail.com Signed-off-by: Allison Henderson Link: https://lore.kernel.org/r/20240209022854.200292-1-allison.henderson@oracle.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/rds/recv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/rds/recv.c b/net/rds/recv.c index c71b923764fd7c..5627f80013f8b1 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -425,6 +425,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, struct sock *sk = rds_rs_to_sk(rs); int ret = 0; unsigned long flags; + struct rds_incoming *to_drop = NULL; write_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&inc->i_item)) { @@ -435,11 +436,14 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_del_init(&inc->i_item); - rds_inc_put(inc); + to_drop = inc; } } write_unlock_irqrestore(&rs->rs_recv_lock, flags); + if (to_drop) + rds_inc_put(to_drop); + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); return ret; } @@ -758,16 +762,21 @@ void rds_clear_recv_queue(struct rds_sock *rs) struct sock *sk = rds_rs_to_sk(rs); struct rds_incoming *inc, *tmp; unsigned long flags; + LIST_HEAD(to_drop); write_lock_irqsave(&rs->rs_recv_lock, flags); list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); + list_move(&inc->i_item, &to_drop); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + list_for_each_entry_safe(inc, tmp, &to_drop, i_item) { list_del_init(&inc->i_item); rds_inc_put(inc); } - write_unlock_irqrestore(&rs->rs_recv_lock, flags); } /* From 84c176fbecd1523a1d16650ed232c8bc99eddd11 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 14 Feb 2024 11:22:24 +0300 Subject: [PATCH 38/70] net: sctp: fix skb leak in sctp_inq_free() [ Upstream commit 4e45170d9acc2d5ae8f545bf3f2f67504a361338 ] In case of GSO, 'chunk->skb' pointer may point to an entry from fraglist created in 'sctp_packet_gso_append()'. To avoid freeing random fraglist entry (and so undefined behavior and/or memory leak), introduce 'sctp_inq_chunk_free()' helper to ensure that 'chunk->skb' is set to 'chunk->head_skb' (i.e. fraglist head) before calling 'sctp_chunk_free()', and use the aforementioned helper in 'sctp_inq_pop()' as well. Reported-by: syzbot+8bb053b5d63595ab47db@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?id=0d8351bbe54fd04a492c2daab0164138db008042 Fixes: 90017accff61 ("sctp: Add GSO support") Suggested-by: Xin Long Signed-off-by: Dmitry Antipov Acked-by: Xin Long Link: https://lore.kernel.org/r/20240214082224.10168-1-dmantipov@yandex.ru Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/inqueue.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 7182c5a450fb5b..5c165218180588 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -38,6 +38,14 @@ void sctp_inq_init(struct sctp_inq *queue) INIT_WORK(&queue->immediate, NULL); } +/* Properly release the chunk which is being worked on. 
*/ +static inline void sctp_inq_chunk_free(struct sctp_chunk *chunk) +{ + if (chunk->head_skb) + chunk->skb = chunk->head_skb; + sctp_chunk_free(chunk); +} + /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { @@ -53,7 +61,7 @@ void sctp_inq_free(struct sctp_inq *queue) * free it as well. */ if (queue->in_progress) { - sctp_chunk_free(queue->in_progress); + sctp_inq_chunk_free(queue->in_progress); queue->in_progress = NULL; } } @@ -130,9 +138,7 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) goto new_skb; } - if (chunk->head_skb) - chunk->skb = chunk->head_skb; - sctp_chunk_free(chunk); + sctp_inq_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { /* Nothing to do. Next chunk in the packet, please. */ From 5fbbd952e7c395eb6cf7fbc617888874740906bf Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Wed, 14 Feb 2024 09:01:50 +0000 Subject: [PATCH 39/70] pppoe: Fix memory leak in pppoe_sendmsg() [ Upstream commit dc34ebd5c018b0edf47f39d11083ad8312733034 ] syzbot reports a memory leak in pppoe_sendmsg [1]. The problem is in the pppoe_recvmsg() function that handles errors in the wrong order. For the skb_recv_datagram() function, check the pointer to skb for NULL first, and then check the 'error' variable, because the skb_recv_datagram() function can set 'error' to -EAGAIN in a loop but return a correct pointer to socket buffer after a number of attempts, though 'error' remains set to -EAGAIN. skb_recv_datagram __skb_recv_datagram // Loop. if (err == -EAGAIN) then // go to the next loop iteration __skb_try_recv_datagram // if (skb != NULL) then return 'skb' // else if a signal is received then // return -EAGAIN Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with Syzkaller. 
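A condensed sketch of the corrected ordering described above (receive path heavily simplified, not the driver code):

    static int demo_recvmsg(struct sock *sk, unsigned int flags)
    {
            int error = 0;
            struct sk_buff *skb = skb_recv_datagram(sk, flags, &error);

            /* Check the pointer first: a valid skb can be returned even
             * though 'error' still holds -EAGAIN from an earlier loop
             * iteration inside __skb_recv_datagram(). */
            if (!skb)
                    return error;   /* only now is 'error' authoritative */

            /* ... copy the payload out to the caller ... */
            consume_skb(skb);
            return 0;
    }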
Link: https://syzkaller.appspot.com/bug?extid=6bdfd184eac7709e5cc9 [1] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+6bdfd184eac7709e5cc9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6bdfd184eac7709e5cc9 Signed-off-by: Gavrilov Ilia Reviewed-by: Guillaume Nault Link: https://lore.kernel.org/r/20240214085814.3894917-1-Ilia.Gavrilov@infotecs.ru Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ppp/pppoe.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index ba8b6bd8233cad..96cca4ee470a4b 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1007,26 +1007,21 @@ static int pppoe_recvmsg(struct socket *sock, struct msghdr *m, struct sk_buff *skb; int error = 0; - if (sk->sk_state & PPPOX_BOUND) { - error = -EIO; - goto end; - } + if (sk->sk_state & PPPOX_BOUND) + return -EIO; skb = skb_recv_datagram(sk, flags, &error); - if (error < 0) - goto end; + if (!skb) + return error; - if (skb) { - total_len = min_t(size_t, total_len, skb->len); - error = skb_copy_datagram_msg(skb, 0, m, total_len); - if (error == 0) { - consume_skb(skb); - return total_len; - } + total_len = min_t(size_t, total_len, skb->len); + error = skb_copy_datagram_msg(skb, 0, m, total_len); + if (error == 0) { + consume_skb(skb); + return total_len; } kfree_skb(skb); -end: return error; } From ef33f0296893356314a5c79e787bca977bb3c9ec Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 22 Feb 2024 07:56:15 -0800 Subject: [PATCH 40/70] bpf: Replace bpf_lpm_trie_key 0-length array with flexible array [ Upstream commit 896880ff30866f386ebed14ab81ce1ad3710cfc4 ] Replace deprecated 0-length array in struct bpf_lpm_trie_key with flexible array. Found with GCC 13: ../kernel/bpf/lpm_trie.c:207:51: warning: array subscript i is outside array bounds of 'const __u8[0]' {aka 'const unsigned char[]'} [-Warray-bounds=] 207 | *(__be16 *)&key->data[i]); | ^~~~~~~~~~~~~ ../include/uapi/linux/swab.h:102:54: note: in definition of macro '__swab16' 102 | #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x)) | ^ ../include/linux/byteorder/generic.h:97:21: note: in expansion of macro '__be16_to_cpu' 97 | #define be16_to_cpu __be16_to_cpu | ^~~~~~~~~~~~~ ../kernel/bpf/lpm_trie.c:206:28: note: in expansion of macro 'be16_to_cpu' 206 | u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^ | ^~~~~~~~~~~ In file included from ../include/linux/bpf.h:7: ../include/uapi/linux/bpf.h:82:17: note: while referencing 'data' 82 | __u8 data[0]; /* Arbitrary size */ | ^~~~ And found at run-time under CONFIG_FORTIFY_SOURCE: UBSAN: array-index-out-of-bounds in kernel/bpf/lpm_trie.c:218:49 index 0 is out of range for type '__u8 [*]' Changing struct bpf_lpm_trie_key is difficult since has been used by userspace. For example, in Cilium: struct egress_gw_policy_key { struct bpf_lpm_trie_key lpm_key; __u32 saddr; __u32 daddr; }; While direct references to the "data" member haven't been found, there are static initializers what include the final member. For example, the "{}" here: struct egress_gw_policy_key in_key = { .lpm_key = { 32 + 24, {} }, .saddr = CLIENT_IP, .daddr = EXTERNAL_SVC_IP & 0Xffffff, }; To avoid the build time and run time warnings seen with a 0-sized trailing array for struct bpf_lpm_trie_key, introduce a new struct that correctly uses a flexible array for the trailing bytes, struct bpf_lpm_trie_key_u8. 
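In shorthand, the layout change being described is the following (abridged from the uapi header diff later in this patch):

    /* before (deprecated): zero-length trailing array */
    struct bpf_lpm_trie_key {
            __u32 prefixlen;
            __u8  data[0];   /* invisible to array-bounds checking */
    };

    /* after: union'd header plus a true C99 flexible array member */
    struct bpf_lpm_trie_key_u8 {
            union {
                    struct bpf_lpm_trie_key_hdr hdr;
                    __u32 prefixlen;
            };
            __u8 data[];     /* bounds-checkable under FORTIFY/UBSAN */
    };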
As part of this, include the "header" portion (which is just the "prefixlen" member), so it can be used by anything building a bpf_lpm_trie_key that has trailing members that aren't a u8 flexible array (like the self-test[1]), which is named struct bpf_lpm_trie_key_hdr. Unfortunately, C++ refuses to parse the __struct_group() helper, so it is not possible to define struct bpf_lpm_trie_key_hdr directly in struct bpf_lpm_trie_key_u8, so we must open-code the union directly. Adjust the kernel code to use struct bpf_lpm_trie_key_u8 throughout, and for the selftest to use struct bpf_lpm_trie_key_hdr. Add a comment to the UAPI header directing folks to the two new options. Reported-by: Mark Rutland Signed-off-by: Kees Cook Signed-off-by: Daniel Borkmann Acked-by: Gustavo A. R. Silva Closes: https://paste.debian.net/hidden/ca500597/ Link: https://lore.kernel.org/all/202206281009.4332AA33@keescook/ [1] Link: https://lore.kernel.org/bpf/20240222155612.it.533-kees@kernel.org Stable-dep-of: 59f2f841179a ("bpf: Avoid kfree_rcu() under lock in bpf_lpm_trie.") Signed-off-by: Sasha Levin --- Documentation/bpf/map_lpm_trie.rst | 2 +- include/uapi/linux/bpf.h | 19 +++++++++++++++++- kernel/bpf/lpm_trie.c | 20 +++++++++---------- samples/bpf/map_perf_test_user.c | 2 +- samples/bpf/xdp_router_ipv4_user.c | 2 +- tools/include/uapi/linux/bpf.h | 19 +++++++++++++++++- .../selftests/bpf/progs/map_ptr_kern.c | 2 +- tools/testing/selftests/bpf/test_lpm_map.c | 18 ++++++++--------- 8 files changed, 59 insertions(+), 25 deletions(-) diff --git a/Documentation/bpf/map_lpm_trie.rst b/Documentation/bpf/map_lpm_trie.rst index 74d64a30f50073..f9cd579496c9ce 100644 --- a/Documentation/bpf/map_lpm_trie.rst +++ b/Documentation/bpf/map_lpm_trie.rst @@ -17,7 +17,7 @@ significant byte. LPM tries may be created with a maximum prefix length that is a multiple of 8, in the range from 8 to 2048. The key used for lookup and update -operations is a ``struct bpf_lpm_trie_key``, extended by +operations is a ``struct bpf_lpm_trie_key_u8``, extended by ``max_prefixlen/8`` bytes. - For IPv4 addresses the data length is 4 bytes diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fb09fd1767f289..ba6e346c8d669a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -77,12 +77,29 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; -/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +/* Deprecated: use struct bpf_lpm_trie_key_u8 (when the "data" member is needed for + * byte access) or struct bpf_lpm_trie_key_hdr (when using an alternative type for + * the trailing flexible array member) instead. + */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ __u8 data[0]; /* Arbitrary size */ }; +/* Header for bpf_lpm_trie_key structs */ +struct bpf_lpm_trie_key_hdr { + __u32 prefixlen; +}; +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry, with trailing byte array.
*/ +struct bpf_lpm_trie_key_u8 { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + __u8 data[]; /* Arbitrary size */ +}; + struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ __u32 attach_type; /* program attach type (enum bpf_attach_type) */ diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b32be680da6cdc..050fe1ebf0f7d1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -164,13 +164,13 @@ static inline int extract_bit(const u8 *data, size_t index) */ static size_t longest_prefix_match(const struct lpm_trie *trie, const struct lpm_trie_node *node, - const struct bpf_lpm_trie_key *key) + const struct bpf_lpm_trie_key_u8 *key) { u32 limit = min(node->prefixlen, key->prefixlen); u32 prefixlen = 0, i = 0; BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); - BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); + BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32)); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) @@ -229,7 +229,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *found = NULL; - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; if (key->prefixlen > trie->max_prefixlen) return NULL; @@ -309,7 +309,7 @@ static long trie_update_elem(struct bpf_map *map, struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; struct lpm_trie_node __rcu **slot; - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; unsigned int next_bit; size_t matchlen = 0; @@ -437,7 +437,7 @@ static long trie_update_elem(struct bpf_map *map, static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct bpf_lpm_trie_key *key = _key; + struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; unsigned long irq_flags; @@ -536,7 +536,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) sizeof(struct lpm_trie_node)) #define LPM_VAL_SIZE_MIN 1 -#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key_u8) + (X)) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) @@ -565,7 +565,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - - offsetof(struct bpf_lpm_trie_key, data); + offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; spin_lock_init(&trie->lock); @@ -616,7 +616,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key; struct lpm_trie_node **node_stack = NULL; int err = 0, stack_ptr = -1; unsigned int next_bit; @@ -703,7 +703,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) } do_copy: next_key->prefixlen = next_node->prefixlen; - memcpy((void *)next_key + 
offsetof(struct bpf_lpm_trie_key, data), + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data), next_node->data, trie->data_size); free_stack: kfree(node_stack); @@ -715,7 +715,7 @@ static int trie_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - /* Keys must have struct bpf_lpm_trie_key embedded. */ + /* Keys must have struct bpf_lpm_trie_key_u8 embedded. */ return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? -EINVAL : 0; } diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index d2fbcf963cdf6d..07ff471ed6aee0 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -370,7 +370,7 @@ static void run_perf_test(int tasks) static void fill_lpm_trie(void) { - struct bpf_lpm_trie_key *key; + struct bpf_lpm_trie_key_u8 *key; unsigned long value = 0; unsigned int i; int r; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 9d41db09c4800f..266fdd0b025dc6 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -91,7 +91,7 @@ static int recv_msg(struct sockaddr_nl sock_addr, int sock) static void read_route(struct nlmsghdr *nh, int nll) { char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24]; - struct bpf_lpm_trie_key *prefix_key; + struct bpf_lpm_trie_key_u8 *prefix_key; struct rtattr *rt_attr; struct rtmsg *rt_msg; int rtm_family; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fb09fd1767f289..ba6e346c8d669a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -77,12 +77,29 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; -/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +/* Deprecated: use struct bpf_lpm_trie_key_u8 (when the "data" member is needed for + * byte access) or struct bpf_lpm_trie_key_hdr (when using an alternative type for + * the trailing flexible array member) instead. + */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ __u8 data[0]; /* Arbitrary size */ }; +/* Header for bpf_lpm_trie_key structs */ +struct bpf_lpm_trie_key_hdr { + __u32 prefixlen; +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry, with trailing byte array. 
*/ +struct bpf_lpm_trie_key_u8 { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + __u8 data[]; /* Arbitrary size */ +}; + struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ __u32 attach_type; /* program attach type (enum bpf_attach_type) */ diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index 3325da17ec81af..efaf622c28ddec 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -316,7 +316,7 @@ struct lpm_trie { } __attribute__((preserve_access_index)); struct lpm_key { - struct bpf_lpm_trie_key trie_key; + struct bpf_lpm_trie_key_hdr trie_key; __u32 data; }; diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index c028d621c744da..d98c72dc563eaf 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -211,7 +211,7 @@ static void test_lpm_map(int keysize) volatile size_t n_matches, n_matches_after_delete; size_t i, j, n_nodes, n_lookups; struct tlpm_node *t, *list = NULL; - struct bpf_lpm_trie_key *key; + struct bpf_lpm_trie_key_u8 *key; uint8_t *data, *value; int r, map; @@ -331,8 +331,8 @@ static void test_lpm_map(int keysize) static void test_lpm_ipaddr(void) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - struct bpf_lpm_trie_key *key_ipv4; - struct bpf_lpm_trie_key *key_ipv6; + struct bpf_lpm_trie_key_u8 *key_ipv4; + struct bpf_lpm_trie_key_u8 *key_ipv6; size_t key_size_ipv4; size_t key_size_ipv6; int map_fd_ipv4; @@ -423,7 +423,7 @@ static void test_lpm_ipaddr(void) static void test_lpm_delete(void) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - struct bpf_lpm_trie_key *key; + struct bpf_lpm_trie_key_u8 *key; size_t key_size; int map_fd; __u64 value; @@ -532,7 +532,7 @@ static void test_lpm_delete(void) static void test_lpm_get_next_key(void) { LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); - struct bpf_lpm_trie_key *key_p, *next_key_p; + struct bpf_lpm_trie_key_u8 *key_p, *next_key_p; size_t key_size; __u32 value = 0; int map_fd; @@ -693,9 +693,9 @@ static void *lpm_test_command(void *arg) { int i, j, ret, iter, key_size; struct lpm_mt_test_info *info = arg; - struct bpf_lpm_trie_key *key_p; + struct bpf_lpm_trie_key_u8 *key_p; - key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32); + key_size = sizeof(*key_p) + sizeof(__u32); key_p = alloca(key_size); for (iter = 0; iter < info->iter; iter++) for (i = 0; i < MAX_TEST_KEYS; i++) { @@ -717,7 +717,7 @@ static void *lpm_test_command(void *arg) ret = bpf_map_lookup_elem(info->map_fd, key_p, &value); assert(ret == 0 || errno == ENOENT); } else { - struct bpf_lpm_trie_key *next_key_p = alloca(key_size); + struct bpf_lpm_trie_key_u8 *next_key_p = alloca(key_size); ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p); assert(ret == 0 || errno == ENOENT || errno == ENOMEM); } @@ -752,7 +752,7 @@ static void test_lpm_multi_thread(void) /* create a trie */ value_size = sizeof(__u32); - key_size = sizeof(struct bpf_lpm_trie_key) + value_size; + key_size = sizeof(struct bpf_lpm_trie_key_hdr) + value_size; map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, value_size, 100, &opts); /* create 4 threads to test update, delete, lookup and get_next_key */ From 63f13eb5d627e6b24b03bdc3b59516fac4e68056 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 29 Mar 2024 
10:14:39 -0700 Subject: [PATCH 41/70] bpf: Avoid kfree_rcu() under lock in bpf_lpm_trie. [ Upstream commit 59f2f841179aa6a0899cb9cf53659149a35749b7 ] syzbot reported the following lock sequence: cpu 2: grabs timer_base lock spins on bpf_lpm lock cpu 1: grab rcu krcp lock spins on timer_base lock cpu 0: grab bpf_lpm lock spins on rcu krcp lock bpf_lpm lock can be the same. timer_base lock can also be the same due to timer migration. but rcu krcp lock is always per-cpu, so it cannot be the same lock. Hence it's a false positive. To avoid lockdep complaining move kfree_rcu() after spin_unlock. Reported-by: syzbot+1fa663a2100308ab6eab@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240329171439.37813-1-alexei.starovoitov@gmail.com Signed-off-by: Sasha Levin --- kernel/bpf/lpm_trie.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 050fe1ebf0f7d1..d0febf07051edf 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -308,6 +308,7 @@ static long trie_update_elem(struct bpf_map *map, { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; unsigned long irq_flags; @@ -382,7 +383,7 @@ static long trie_update_elem(struct bpf_map *map, trie->n_entries--; rcu_assign_pointer(*slot, new_node); - kfree_rcu(node, rcu); + free_node = node; goto out; } @@ -429,6 +430,7 @@ static long trie_update_elem(struct bpf_map *map, } spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_node, rcu); return ret; } @@ -437,6 +439,7 @@ static long trie_update_elem(struct bpf_map *map, static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *free_node = NULL, *free_parent = NULL; struct bpf_lpm_trie_key_u8 *key = _key; struct lpm_trie_node __rcu **trim, **trim2; struct lpm_trie_node *node, *parent; @@ -506,8 +509,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) else rcu_assign_pointer( *trim2, rcu_access_pointer(parent->child[0])); - kfree_rcu(parent, rcu); - kfree_rcu(node, rcu); + free_parent = parent; + free_node = node; goto out; } @@ -521,10 +524,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); else RCU_INIT_POINTER(*trim, NULL); - kfree_rcu(node, rcu); + free_node = node; out: spin_unlock_irqrestore(&trie->lock, irq_flags); + kfree_rcu(free_parent, rcu); + kfree_rcu(free_node, rcu); return ret; } From 107449cfb2176318600e4763c9d9d267b32b636a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 25 Mar 2024 19:34:01 -0600 Subject: [PATCH 42/70] fs: Annotate struct file_handle with __counted_by() and use struct_size() [ Upstream commit 68d6f4f3fbd9b1baae53e7cf33fb3362b5a21494 ] Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). While there, use struct_size() helper, instead of the open-coded version. 
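A small sketch of how the two annotations cooperate (illustrative struct, not the VFS code):

    #include <linux/overflow.h>
    #include <linux/slab.h>

    struct blob {
            u32 len;
            u8  data[] __counted_by(len);   /* runtime bound = 'len' */
    };

    static struct blob *blob_alloc(u32 n)
    {
            /* struct_size() = sizeof(struct blob) + n * sizeof(u8),
             * with overflow checking, replacing the open-coded sum */
            struct blob *b = kzalloc(struct_size(b, data, n), GFP_KERNEL);

            if (b)
                    b->len = n;   /* set the counter before touching data[] */
            return b;
    }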
[brauner@kernel.org: contains a fix by Edward for an OOB access] Reported-by: syzbot+4139435cb1b34cf759c2@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_A7845DD769577306D813742365E976E3A205@qq.com Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/ZgImCXTdGDTeBvSS@neat Reviewed-by: Jan Kara Signed-off-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/fhandle.c | 6 +++--- include/linux/fs.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 99dcf07cfecfe1..c361d7ff1b88dd 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -40,7 +40,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) return -ENOMEM; @@ -75,7 +75,7 @@ static long do_sys_name_to_handle(const struct path *path, /* copy the mount id */ if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || copy_to_user(ufh, handle, - sizeof(struct file_handle) + handle_bytes)) + struct_size(handle, f_handle, handle_bytes))) retval = -EFAULT; kfree(handle); return retval; @@ -196,7 +196,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, retval = -EINVAL; goto out_err; } - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { retval = -ENOMEM; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ca9e859c042b3..43e640fb4a7f77 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1036,7 +1036,7 @@ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ - unsigned char f_handle[]; + unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) From bae45e9b78e8aebf4ce22e82ab082253f8878fe2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:44 +0000 Subject: [PATCH 43/70] mISDN: fix MISDN_TIME_STAMP handling [ Upstream commit 138b787804f4a10417618e8d1e6e2700539fd88c ] syzbot reports one unsafe call to copy_from_sockptr() [1] Use copy_safe_from_sockptr() instead. 
[1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in data_sock_setsockopt+0x46c/0x4cc drivers/isdn/mISDN/socket.c:417 Read of size 4 at addr ffff0000c6d54083 by task syz-executor406/6167 CPU: 1 PID: 6167 Comm: syz-executor406 Not tainted 6.8.0-rc7-syzkaller-g707081b61156 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call trace: dump_backtrace+0x1b8/0x1e4 arch/arm64/kernel/stacktrace.c:291 show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:298 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd0/0x124 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:377 [inline] print_report+0x178/0x518 mm/kasan/report.c:488 kasan_report+0xd8/0x138 mm/kasan/report.c:601 __asan_report_load_n_noabort+0x1c/0x28 mm/kasan/report_generic.c:391 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] data_sock_setsockopt+0x46c/0x4cc drivers/isdn/mISDN/socket.c:417 do_sock_setsockopt+0x2a0/0x4e0 net/socket.c:2311 __sys_setsockopt+0x128/0x1a8 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __arm64_sys_setsockopt+0xb8/0xd4 net/socket.c:2340 __invoke_syscall arch/arm64/kernel/syscall.c:34 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:48 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:133 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:152 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Fixes: 1b2b03f8e514 ("Add mISDN core files") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Karsten Keil Link: https://lore.kernel.org/r/20240408082845.3957374-3-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/isdn/mISDN/socket.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 2776ca5fc33f39..b215b28cad7b76 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -401,23 +401,23 @@ data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } static int data_sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int len) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, - level, optname, len); + level, optname, optlen); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: - if (copy_from_sockptr(&opt, optval, sizeof(int))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; From ae7f73e64e9bbea215c5dbf7c28421865232b8b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:43 +0000 Subject: [PATCH 44/70] net: add copy_safe_from_sockptr() helper [ Upstream commit 6309863b31dd80317cd7d6824820b44e254e2a9c ] copy_from_sockptr() helper is unsafe, unless callers did the prior check against user provided optlen. Too many callers get this wrong, lets add a helper to fix them and avoid future copy/paste bugs. 
Instead of : if (optlen < sizeof(opt)) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(opt)) { err = -EFAULT; break; } Use : err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408082845.3957374-2-edumazet@google.com Signed-off-by: Jakub Kicinski Stable-dep-of: 7a87441c9651 ("nfc: llcp: fix nfc_llcp_setsockopt() unsafe copies") Signed-off-by: Sasha Levin --- include/linux/sockptr.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f7a..1c1a5d926b1713 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -50,11 +50,36 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, return 0; } +/* Deprecated. + * This is unsafe, unless caller checked user provided optlen. + * Prefer copy_safe_from_sockptr() instead. + */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } +/** + * copy_safe_from_sockptr: copy a struct from sockptr + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @optval: Source address. (in user or kernel space) + * @optlen: Size of @optval data. + * + * Returns: + * * -EINVAL: @optlen < @ksize + * * -EFAULT: access to userspace failed. + * * 0 : @ksize bytes were copied + */ +static inline int copy_safe_from_sockptr(void *dst, size_t ksize, + sockptr_t optval, unsigned int optlen) +{ + if (optlen < ksize) + return -EINVAL; + return copy_from_sockptr(dst, optval, ksize); +} + static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { From 0f106133203021533cb753e80d75896f4ad222f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:45 +0000 Subject: [PATCH 45/70] nfc: llcp: fix nfc_llcp_setsockopt() unsafe copies [ Upstream commit 7a87441c9651ba37842f4809224aca13a554a26f ] syzbot reported unsafe calls to copy_from_sockptr() [1] Use copy_safe_from_sockptr() instead. 
[1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 Read of size 4 at addr ffff88801caa1ec3 by task syz-executor459/5078 CPU: 0 PID: 5078 Comm: syz-executor459 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 do_sock_setsockopt+0x3b1/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfd/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7f7fac07fd89 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 91 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff660eb788 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f7fac07fd89 RDX: 0000000000000000 RSI: 0000000000000118 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000020000a80 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240408082845.3957374-4-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/nfc/llcp_sock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 819157bbb5a2c6..d5344563e525c9 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -252,10 +252,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_RW) { err = -EINVAL; @@ -274,10 +274,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_MIUX) { err = -EINVAL; From 4ea65e2095e9bd151d0469328dd7fc2858feb546 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:43:45 -0400 Subject: [PATCH 46/70] Bluetooth: RFCOMM: Fix not validating setsockopt user input [ Upstream commit a97de7bff13b1cc825c1b1344eaed8d6c2d3e695 ] syzbot reported rfcomm_sock_setsockopt_old() is copying data without checking user input length. 
BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in rfcomm_sock_setsockopt_old net/bluetooth/rfcomm/sock.c:632 [inline] BUG: KASAN: slab-out-of-bounds in rfcomm_sock_setsockopt+0x893/0xa70 net/bluetooth/rfcomm/sock.c:673 Read of size 4 at addr ffff8880209a8bc3 by task syz-executor632/5064 Fixes: 9f2c8a03fbb3 ("Bluetooth: Replace RFCOMM link mode with security level") Fixes: bb23c0ab8246 ("Bluetooth: Add support for deferring RFCOMM connection setup") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/rfcomm/sock.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b54e8a530f55a1..29aa07e9db9d71 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -629,7 +629,7 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, switch (optname) { case RFCOMM_LM: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + if (bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen)) { err = -EFAULT; break; } @@ -664,7 +664,6 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct bt_security sec; int err = 0; - size_t len; u32 opt; BT_DBG("sk %p", sk); @@ -686,11 +685,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_sockptr(&sec, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + if (err) break; - } if (sec.level > BT_SECURITY_HIGH) { err = -EINVAL; @@ -706,10 +703,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); From f2a77188a3964b3c2d198f16903285c8ece8299b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 9 Feb 2024 12:20:59 +0100 Subject: [PATCH 47/70] ext4: fold quota accounting into ext4_xattr_inode_lookup_create() [ Upstream commit 8208c41c43ad5e9b63dce6c45a73e326109ca658 ] When allocating EA inode, quota accounting is done just before ext4_xattr_inode_lookup_create(). Logically these two operations belong together so just fold quota accounting into ext4_xattr_inode_lookup_create(). We also make ext4_xattr_inode_lookup_create() return the looked up / created inode to convert the function to a more standard calling convention. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240209112107.10585-1-jack@suse.cz Signed-off-by: Theodore Ts'o Stable-dep-of: 0a46ef234756 ("ext4: do not create EA inode under buffer lock") Signed-off-by: Sasha Levin --- fs/ext4/xattr.c | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c58cbe9f7809c1..f176c4e8fdcb11 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1571,46 +1571,49 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, /* * Add value of the EA in an inode. 
*/ -static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, - const void *value, size_t value_len, - struct inode **ret_inode) +static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, + struct inode *inode, const void *value, size_t value_len) { struct inode *ea_inode; u32 hash; int err; + /* Account inode & space to quota even if sharing... */ + err = ext4_xattr_inode_alloc_quota(inode, value_len); + if (err) + return ERR_PTR(err); + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); - if (err) { - iput(ea_inode); - return err; - } - - *ret_inode = ea_inode; - return 0; + if (err) + goto out_err; + return ea_inode; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); - if (IS_ERR(ea_inode)) - return PTR_ERR(ea_inode); + if (IS_ERR(ea_inode)) { + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ea_inode; + } err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); - iput(ea_inode); - return err; + goto out_err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); - - *ret_inode = ea_inode; - return 0; + return ea_inode; +out_err: + iput(ea_inode); + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ERR_PTR(err); } /* @@ -1718,16 +1721,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, if (i->value && in_inode) { WARN_ON_ONCE(!i->value_len); - ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); - if (ret) - goto out; - - ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, - i->value_len, - &new_ea_inode); - if (ret) { + new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(new_ea_inode)) { + ret = PTR_ERR(new_ea_inode); new_ea_inode = NULL; - ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } From 737fb7853acd5bc8984f6f42e4bfba3334be8ae1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 Mar 2024 17:26:50 +0100 Subject: [PATCH 48/70] ext4: do not create EA inode under buffer lock [ Upstream commit 0a46ef234756dca04623b7591e8ebb3440622f0b ] ext4_xattr_set_entry() creates new EA inodes while holding buffer lock on the external xattr block. This is problematic as it nests all the allocation locking (which acquires locks on other buffers) under the buffer lock. This can even deadlock when the filesystem is corrupted and e.g. the quota file is set up to contain an xattr block as a data block. Move the allocation of EA inode out of ext4_xattr_set_entry() into the callers.
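The resulting ordering follows a familiar locking rule: take the potentially-allocating step first, then the buffer lock, so the buffer lock stays a leaf lock. A simplified sketch (names taken from the diffs that follow; error handling trimmed):

    /* Prepare the EA inode, which may allocate and lock other
     * buffers, BEFORE lock_buffer() - never under it. */
    ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
                                              i->value, i->value_len);
    if (IS_ERR(ea_inode))
            return PTR_ERR(ea_inode);

    lock_buffer(bs->bh);
    error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
                                 true /* is_block */);
    unlock_buffer(bs->bh);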
Reported-by: syzbot+a43d4f48b8397d0e41a9@syzkaller.appspotmail.com Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240321162657.27420-2-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/xattr.c | 113 +++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 60 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f176c4e8fdcb11..c368ff671d7739 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1625,6 +1625,7 @@ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, + struct inode *new_ea_inode, bool is_block) { struct ext4_xattr_entry *last, *next; @@ -1632,7 +1633,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; - struct inode *new_ea_inode = NULL; size_t old_size, new_size; int ret; @@ -1717,38 +1717,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, old_ea_inode = NULL; goto out; } - } - if (i->value && in_inode) { - WARN_ON_ONCE(!i->value_len); - - new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode, - i->value, i->value_len); - if (IS_ERR(new_ea_inode)) { - ret = PTR_ERR(new_ea_inode); - new_ea_inode = NULL; - goto out; - } - } - if (old_ea_inode) { /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); - if (ret) { - /* Release newly required ref count on new_ea_inode. */ - if (new_ea_inode) { - int err; - - err = ext4_xattr_inode_dec_ref(handle, - new_ea_inode); - if (err) - ext4_warning_inode(new_ea_inode, - "dec ref new_ea_inode err=%d", - err); - ext4_xattr_inode_free_quota(inode, new_ea_inode, - i->value_len); - } + if (ret) goto out; - } ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); @@ -1872,7 +1845,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ret = 0; out: iput(old_ea_inode); - iput(new_ea_inode); return ret; } @@ -1935,9 +1907,21 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, size_t old_ea_inode_quota = 0; unsigned int ea_ino; - #define header(x) ((struct ext4_xattr_header *)(x)) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) { + error = PTR_ERR(ea_inode); + ea_inode = NULL; + goto cleanup; + } + } + if (s->base) { int offset = (char *)s->here - bs->bh->b_data; @@ -1946,6 +1930,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, EXT4_JTR_NONE); if (error) goto cleanup; + lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { @@ -1972,7 +1957,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, - true /* is_block */); + ea_inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -2040,29 +2025,13 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + true /* is_block */); if (error == 
-EFSCORRUPTED) goto bad_block; if (error) goto cleanup; - if (i->value && s->here->e_value_inum) { - /* - * A ref count on ea_inode has been taken as part of the call to - * ext4_xattr_set_entry() above. We would like to drop this - * extra ref but we have to wait until the xattr block is - * initialized and has its own ref count on the ea_inode. - */ - ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, - le32_to_cpu(s->here->e_hash), - &ea_inode); - if (error) { - ea_inode = NULL; - goto cleanup; - } - } - inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), @@ -2215,17 +2184,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, cleanup: if (ea_inode) { - int error2; - - error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); - if (error2) - ext4_warning_inode(ea_inode, "dec ref error=%d", - error2); + if (error) { + int error2; - /* If there was an error, revert the quota charge. */ - if (error) + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); + } iput(ea_inode); } if (ce) @@ -2283,14 +2251,38 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; + struct inode *ea_inode = NULL; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + } + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + false /* is_block */); + if (error) { + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + ext4_xattr_inode_free_quota(inode, ea_inode, + i_size_read(ea_inode)); + iput(ea_inode); + } return error; + } header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); @@ -2299,6 +2291,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } + iput(ea_inode); return 0; } From 5b485efcb6a64daea26a24895fdfaecaa6914715 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 17 Apr 2024 17:25:49 -0400 Subject: [PATCH 49/70] mm/page_table_check: support userfault wr-protect entries [ Upstream commit 8430557fc584657559bfbd5150b6ae1bb90f35a0 ] Allow page_table_check hooks to check over userfaultfd wr-protect criteria upon pgtable updates. The rule is that no writable flag may co-exist with the userfault wr-protect flag. This should be better than c2da319c2e, where we used to only sanitize such issues during a pgtable walk, but when hitting such an issue we had no good way to know where that writable bit came from [1]; the pgtable walk still exposes a kernel bug (which is helpful for triaging) but is not easy to track and debug. Now we switch to tracking the source. It's much easier too with the recent introduction of page table check.
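In sketch form, the rule the check enforces looks like this (x86 pte helper names; it mirrors the page_table_check_pte_flags() hook added in the diff below, simplified):

    /* Sketch: a present pte must never be writable and uffd-wp at
     * the same time, or the wr-protect bit would be silently
     * ignored and writes would go straight through. */
    static void check_uffd_wp_rule(pte_t pte)
    {
            if (pte_present(pte) && pte_uffd_wp(pte))
                    WARN_ON_ONCE(pte_write(pte));
    }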
There are some limitations with using the page table check here for the userfaultfd wr-protect purpose: - It is only enabled with explicit enablement of page table check configs and/or boot parameters, but should be good enough to track at least syzbot issues, as syzbot should enable PAGE_TABLE_CHECK[_ENFORCED] for x86 [1]. We used to have DEBUG_VM but it's now off for most distros, while distros also normally do not enable PAGE_TABLE_CHECK[_ENFORCED], which is similar. - It conditionally works with the ptep_modify_prot API. It will be bypassed when e.g. XEN PV is enabled, however it still works for most other scenarios, which should be the common cases, so it should be good enough. - Hugetlb check is a bit hairy, as the page table check cannot identify hugetlb pte or normal pte via trapping at set_pte_at(), because of the current design where hugetlb maps every layer to pte_t... For example, the default set_huge_pte_at() can invoke set_pte_at() directly and lose the hugetlb context, treating it the same as a normal pte_t. So far it's fine because we have huge_pte_uffd_wp() always equal to pte_uffd_wp() as long as supported (x86 only). It'll be a bigger problem when we define _PAGE_UFFD_WP differently at various pgtable levels, because then one huge_pte_uffd_wp() per arch will stop making sense first. As of now we can leave this for later too. This patch also removes commit c2da319c2e altogether, as we have something better now. [1] https://lore.kernel.org/all/000000000000dce0530615c89210@google.com/ Link: https://lkml.kernel.org/r/20240417212549.2766883-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Pasha Tatashin Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- Documentation/mm/page_table_check.rst | 9 +++++++- arch/x86/include/asm/pgtable.h | 18 +--------------- mm/page_table_check.c | 30 +++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/Documentation/mm/page_table_check.rst b/Documentation/mm/page_table_check.rst index c12838ce6b8de2..c59f22eb6a0f9a 100644 --- a/Documentation/mm/page_table_check.rst +++ b/Documentation/mm/page_table_check.rst @@ -14,7 +14,7 @@ Page table check performs extra verifications at the time when new pages become accessible from the userspace by getting their page table entries (PTEs PMDs etc.) added into the table. -In case of detected corruption, the kernel is crashed. There is a small +In case of most detected corruption, the kernel is crashed. There is a small performance and memory overhead associated with the page table check. Therefore, it is disabled by default, but can be optionally enabled on systems where the extra hardening outweighs the performance costs. Also, because page table check @@ -22,6 +22,13 @@ is synchronous, it can help with debugging double map memory corruption issues, by crashing kernel at the time wrong mapping occurs instead of later which is often the case with memory corruptions bugs. +It can also be used to do page table entry checks over various flags, dump +warnings when illegal combinations of entry flags are detected. Currently, +userfaultfd is the only user of such to sanity check wr-protect bit against +any writable flags. Illegal flag combinations will not directly cause data +corruption in this case immediately, but that will cause read-only data to +be writable, leading to corrupt when the page content is later modified.
+ Double mapping detection logic ============================== diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e02b179ec65989..d03fe4fb41f43c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -387,23 +387,7 @@ static inline pte_t pte_wrprotect(pte_t pte) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { - bool wp = pte_flags(pte) & _PAGE_UFFD_WP; - -#ifdef CONFIG_DEBUG_VM - /* - * Having write bit for wr-protect-marked present ptes is fatal, - * because it means the uffd-wp bit will be ignored and write will - * just go through. - * - * Use any chance of pgtable walking to verify this (e.g., when - * page swapped out or being migrated for all purposes). It means - * something is already wrong. Tell the admin even before the - * process crashes. We also nail it with wrong pgtable setup. - */ - WARN_ON_ONCE(wp && pte_write(pte)); -#endif - - return wp; + return pte_flags(pte) & _PAGE_UFFD_WP; } static inline pte_t pte_mkuffd_wp(pte_t pte) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 6363f93a47c691..509c6ef8de400e 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt @@ -191,6 +193,22 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); +/* Whether the swap entry cached writable information */ +static inline bool swap_cached_writable(swp_entry_t entry) +{ + return is_writable_device_exclusive_entry(entry) || + is_writable_device_private_entry(entry) || + is_writable_migration_entry(entry); +} + +static inline void page_table_check_pte_flags(pte_t pte) +{ + if (pte_present(pte) && pte_uffd_wp(pte)) + WARN_ON_ONCE(pte_write(pte)); + else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) + WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); +} + void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { @@ -199,6 +217,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, if (&init_mm == mm) return; + page_table_check_pte_flags(pte); + for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) @@ -206,11 +226,21 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, } EXPORT_SYMBOL(__page_table_check_ptes_set); +static inline void page_table_check_pmd_flags(pmd_t pmd) +{ + if (pmd_present(pmd) && pmd_uffd_wp(pmd)) + WARN_ON_ONCE(pmd_write(pmd)); + else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) + WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); +} + void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; + page_table_check_pmd_flags(pmd); + __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, From 8a3ac7fb36962c34698f884bd697938054ff2afa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2024 16:08:00 +0000 Subject: [PATCH 50/70] wifi: cfg80211: restrict NL80211_ATTR_TXQ_QUANTUM values [ Upstream commit d1cba2ea8121e7fdbe1328cea782876b1dd80993 ] syzbot is able to trigger softlockups, setting NL80211_ATTR_TXQ_QUANTUM to 2^31. 
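A plausible reading of the lockup, sketched from the round-robin in include/net/fq_impl.h (the field types here are assumptions of the sketch: flow->deficit is a signed int while the quantum arrives as a u32):

    /* Sketch: with quantum = 2^31, the signed deficit wraps negative
     * on the first refill and can never become positive, so the
     * dequeue loop below cycles forever without making progress. */
    if (flow->deficit <= 0) {
            flow->deficit += fq->quantum;  /* 0 + 0x80000000 -> INT_MIN */
            list_move_tail(&flow->flowchain, &tin->old_flows);
            goto begin;
    }

Capping the attribute at INT_MAX in the netlink policy below keeps the refill arithmetic sane.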
We had a similar issue in sch_fq, fixed with commit d9e15a273306 ("pkt_sched: fq: do not accept silly TCA_FQ_QUANTUM") watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [kworker/1:0:24] Modules linked in: irq event stamp: 131135 hardirqs last enabled at (131134): [] __exit_to_kernel_mode arch/arm64/kernel/entry-common.c:85 [inline] hardirqs last enabled at (131134): [] exit_to_kernel_mode+0xdc/0x10c arch/arm64/kernel/entry-common.c:95 hardirqs last disabled at (131135): [] __el1_irq arch/arm64/kernel/entry-common.c:533 [inline] hardirqs last disabled at (131135): [] el1_interrupt+0x24/0x68 arch/arm64/kernel/entry-common.c:551 softirqs last enabled at (125892): [] neigh_hh_init net/core/neighbour.c:1538 [inline] softirqs last enabled at (125892): [] neigh_resolve_output+0x268/0x658 net/core/neighbour.c:1553 softirqs last disabled at (125896): [] local_bh_disable+0x10/0x34 include/linux/bottom_half.h:19 CPU: 1 PID: 24 Comm: kworker/1:0 Not tainted 6.9.0-rc7-syzkaller-gfda5695d692c #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Workqueue: mld mld_ifc_work pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __list_del include/linux/list.h:195 [inline] pc : __list_del_entry include/linux/list.h:218 [inline] pc : list_move_tail include/linux/list.h:310 [inline] pc : fq_tin_dequeue include/net/fq_impl.h:112 [inline] pc : ieee80211_tx_dequeue+0x6b8/0x3b4c net/mac80211/tx.c:3854 lr : __list_del_entry include/linux/list.h:218 [inline] lr : list_move_tail include/linux/list.h:310 [inline] lr : fq_tin_dequeue include/net/fq_impl.h:112 [inline] lr : ieee80211_tx_dequeue+0x67c/0x3b4c net/mac80211/tx.c:3854 sp : ffff800093d36700 x29: ffff800093d36a60 x28: ffff800093d36960 x27: dfff800000000000 x26: ffff0000d800ad50 x25: ffff0000d800abe0 x24: ffff0000d800abf0 x23: ffff0000e0032468 x22: ffff0000e00324d4 x21: ffff0000d800abf0 x20: ffff0000d800abf8 x19: ffff0000d800abf0 x18: ffff800093d363c0 x17: 000000000000d476 x16: ffff8000805519dc x15: ffff7000127a6cc8 x14: 1ffff000127a6cc8 x13: 0000000000000004 x12: ffffffffffffffff x11: ffff7000127a6cc8 x10: 0000000000ff0100 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000000000 x5 : ffff80009287aa08 x4 : 0000000000000008 x3 : ffff80008034c7fc x2 : ffff0000e0032468 x1 : 00000000da0e46b8 x0 : ffff0000e0032470 Call trace: __list_del include/linux/list.h:195 [inline] __list_del_entry include/linux/list.h:218 [inline] list_move_tail include/linux/list.h:310 [inline] fq_tin_dequeue include/net/fq_impl.h:112 [inline] ieee80211_tx_dequeue+0x6b8/0x3b4c net/mac80211/tx.c:3854 wake_tx_push_queue net/mac80211/util.c:294 [inline] ieee80211_handle_wake_tx_queue+0x118/0x274 net/mac80211/util.c:315 drv_wake_tx_queue net/mac80211/driver-ops.h:1350 [inline] schedule_and_wake_txq net/mac80211/driver-ops.h:1357 [inline] ieee80211_queue_skb+0x18e8/0x2244 net/mac80211/tx.c:1664 ieee80211_tx+0x260/0x400 net/mac80211/tx.c:1966 ieee80211_xmit+0x278/0x354 net/mac80211/tx.c:2062 __ieee80211_subif_start_xmit+0xab8/0x122c net/mac80211/tx.c:4338 ieee80211_subif_start_xmit+0xe0/0x438 net/mac80211/tx.c:4532 __netdev_start_xmit include/linux/netdevice.h:4903 [inline] netdev_start_xmit include/linux/netdevice.h:4917 [inline] xmit_one net/core/dev.c:3531 [inline] dev_hard_start_xmit+0x27c/0x938 net/core/dev.c:3547 __dev_queue_xmit+0x1678/0x33fc net/core/dev.c:4341 dev_queue_xmit include/linux/netdevice.h:3091 [inline] neigh_resolve_output+0x558/0x658 net/core/neighbour.c:1563 neigh_output 
include/net/neighbour.h:542 [inline] ip6_finish_output2+0x104c/0x1ee8 net/ipv6/ip6_output.c:137 ip6_finish_output+0x428/0x7a0 net/ipv6/ip6_output.c:222 NF_HOOK_COND include/linux/netfilter.h:303 [inline] ip6_output+0x270/0x594 net/ipv6/ip6_output.c:243 dst_output include/net/dst.h:450 [inline] NF_HOOK+0x160/0x4f0 include/linux/netfilter.h:314 mld_sendpack+0x7b4/0x10f4 net/ipv6/mcast.c:1818 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x840/0xd0c net/ipv6/mcast.c:2650 process_one_work+0x7b8/0x15d4 kernel/workqueue.c:3267 process_scheduled_works kernel/workqueue.c:3348 [inline] worker_thread+0x938/0xef4 kernel/workqueue.c:3429 kthread+0x288/0x310 kernel/kthread.c:388 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:860 Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20240615160800.250667-1-edumazet@google.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index be5c42d6ffbeab..2b2dc46dc701f9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -468,6 +468,10 @@ static struct netlink_range_validation nl80211_punct_bitmap_range = { .max = 0xffff, }; +static struct netlink_range_validation q_range = { + .max = INT_MAX, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -750,7 +754,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, - [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, + [NL80211_ATTR_TXQ_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &q_range), [NL80211_ATTR_HE_CAPABILITY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_he_capa, NL80211_HE_MAX_CAPABILITY_LEN), From 3a2c70baf62b8b5693b6f29a98d7d63c6d30f979 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Dec 2023 05:30:35 +0000 Subject: [PATCH 51/70] ext4: convert ext4_da_do_write_end() to take a folio [ Upstream commit 4d5cdd757d0c74924b629559fccb68d8803ce995 ] There's nothing page-specific happening in ext4_da_do_write_end(); it's merely used for its refcount & lock, both of which are folio properties. Saves four calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231214053035.1018876-1-willy@infradead.org Signed-off-by: Theodore Ts'o Stable-dep-of: 83f4414b8f84 ("ext4: sanity check for NULL pointer after ext4_force_shutdown") Signed-off-by: Sasha Levin --- fs/ext4/inode.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cef119a2476bb4..e24afb80c0f6be 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2966,7 +2966,7 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page) + struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -2977,12 +2977,13 @@ static int ext4_da_do_write_end(struct address_space *mapping, * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. 
*/ - copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + copied = block_write_end(NULL, mapping, pos, len, copied, + &folio->page, NULL); new_i_size = pos + copied; /* - * It's important to update i_size while still holding page lock, - * because page writeout could otherwise come in and zero beyond + * It's important to update i_size while still holding folio lock, + * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= @@ -3000,14 +3001,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, i_size_write(inode, new_i_size); end = (new_i_size - 1) & (PAGE_SIZE - 1); - if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -3046,10 +3047,10 @@ static int ext4_da_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - if (unlikely(copied < len) && !PageUptodate(page)) + if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; - return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); + return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* From 3f6bbe6e07e5239294ecc3d2efa70d1f98aed52e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20G=C5=82adysz?= Date: Wed, 3 Jul 2024 09:01:12 +0200 Subject: [PATCH 52/70] ext4: sanity check for NULL pointer after ext4_force_shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 83f4414b8f84249d538905825b088ff3ae555652 ] Test case: 2 threads write short inline data to a file. In ext4_page_mkwrite the resulting inline data is converted. Handling ext4_grp_locked_error with description "block bitmap and bg descriptor inconsistent: X vs Y free clusters" calls ext4_force_shutdown. The conversion clears EXT4_STATE_MAY_INLINE_DATA but fails for ext4_destroy_inline_data_nolock and ext4_mark_iloc_dirty due to ext4_forced_shutdown. The restoration of inline data fails for the same reason, leaving EXT4_STATE_MAY_INLINE_DATA unset. Without the flag set, the regular path in ext4_da_write_end follows and tries to dereference the folio private pointer that has not been set. The fix returns early with -EIO should the pointer to private be NULL.
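The essence of the guard, as it appears in the fs/ext4/inode.c hunk below:

    /* Bail out instead of dereferencing folio private data that the
     * aborted inline-data restore never attached. */
    if (unlikely(!folio_buffers(folio))) {
            folio_unlock(folio);
            folio_put(folio);
            return -EIO;
    }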
Sample crash report: Unable to handle kernel paging request at virtual address dfff800000000004 KASAN: null-ptr-deref in range [0x0000000000000020-0x0000000000000027] Mem abort info: ESR = 0x0000000096000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault Data abort info: ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [dfff800000000004] address between user and kernel address ranges Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 20274 Comm: syz-executor185 Not tainted 6.9.0-rc7-syzkaller-gfda5695d692c #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __block_commit_write+0x64/0x2b0 fs/buffer.c:2167 lr : __block_commit_write+0x3c/0x2b0 fs/buffer.c:2160 sp : ffff8000a1957600 x29: ffff8000a1957610 x28: dfff800000000000 x27: ffff0000e30e34b0 x26: 0000000000000000 x25: dfff800000000000 x24: dfff800000000000 x23: fffffdffc397c9e0 x22: 0000000000000020 x21: 0000000000000020 x20: 0000000000000040 x19: fffffdffc397c9c0 x18: 1fffe000367bd196 x17: ffff80008eead000 x16: ffff80008ae89e3c x15: 00000000200000c0 x14: 1fffe0001cbe4e04 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000001 x10: 0000000000ff0100 x9 : 0000000000000000 x8 : 0000000000000004 x7 : 0000000000000000 x6 : 0000000000000000 x5 : fffffdffc397c9c0 x4 : 0000000000000020 x3 : 0000000000000020 x2 : 0000000000000040 x1 : 0000000000000020 x0 : fffffdffc397c9c0 Call trace: __block_commit_write+0x64/0x2b0 fs/buffer.c:2167 block_write_end+0xb4/0x104 fs/buffer.c:2253 ext4_da_do_write_end fs/ext4/inode.c:2955 [inline] ext4_da_write_end+0x2c4/0xa40 fs/ext4/inode.c:3028 generic_perform_write+0x394/0x588 mm/filemap.c:3985 ext4_buffered_write_iter+0x2c0/0x4ec fs/ext4/file.c:299 ext4_file_write_iter+0x188/0x1780 call_write_iter include/linux/fs.h:2110 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0x968/0xc3c fs/read_write.c:590 ksys_write+0x15c/0x26c fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __arm64_sys_write+0x7c/0x90 fs/read_write.c:652 __invoke_syscall arch/arm64/kernel/syscall.c:34 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:48 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:133 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:152 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Code: 97f85911 f94002da 91008356 d343fec8 (38796908) ---[ end trace 0000000000000000 ]--- ---------------- Code disassembly (best guess): 0: 97f85911 bl 0xffffffffffe16444 4: f94002da ldr x26, [x22] 8: 91008356 add x22, x26, #0x20 c: d343fec8 lsr x8, x22, #3 * 10: 38796908 ldrb w8, [x8, x25] <-- trapping instruction Reported-by: syzbot+18df508cf00a0598d9a6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=18df508cf00a0598d9a6 Link: https://lore.kernel.org/all/000000000000f19a1406109eb5c5@google.com/T/ Signed-off-by: Wojciech Gładysz Link: https://patch.msgid.link/20240703070112.10235-1-wojciech.gladysz@infogain.com Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/buffer.c | 2 ++ fs/ext4/inode.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index 
12e9a71c693d74..ecd8b47507ff80 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2179,6 +2179,8 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to) struct buffer_head *bh, *head; bh = head = folio_buffers(folio); + if (!bh) + return; blocksize = bh->b_size; block_start = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e24afb80c0f6be..a4ffd1acac6514 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2973,6 +2973,11 @@ static int ext4_da_do_write_end(struct address_space *mapping, bool disksize_changed = false; loff_t new_i_size; + if (unlikely(!folio_buffers(folio))) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. From bd104cbb9d3f50a7b963d8d220d8bee46c085dd0 Mon Sep 17 00:00:00 2001 From: yunshui Date: Thu, 23 May 2024 11:35:20 +0800 Subject: [PATCH 53/70] bpf, net: Use DEV_STAT_INC() [ Upstream commit d9cbd8343b010016fcaabc361c37720dcafddcbe ] syzbot/KCSAN reported that races happen when multiple CPUs update dev->stats.tx_errors concurrently. Adopt the SMP-safe DEV_STATS_INC() to update the dev->stats fields. Reported-by: syzbot Signed-off-by: yunshui Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240523033520.4029314-1-jiangyunshui@kylinos.cn Signed-off-by: Sasha Levin --- net/core/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 8cb44cd29967bb..be313928d272c6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2271,12 +2271,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2378,12 +2378,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; From ae00e6536a2dd54b64b39e9a39548870cf835745 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 21 May 2024 14:23:17 +0800 Subject: [PATCH 54/70] f2fs: fix to do sanity check on F2FS_INLINE_DATA flag in inode during GC [ Upstream commit fc01008c92f40015aeeced94750855a7111b6929 ] syzbot reports a f2fs bug as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/inline.c:258!
CPU: 1 PID: 34 Comm: kworker/u8:2 Not tainted 6.9.0-rc6-syzkaller-00012-g9e4bc4bcae01 #0 RIP: 0010:f2fs_write_inline_data+0x781/0x790 fs/f2fs/inline.c:258 Call Trace: f2fs_write_single_data_page+0xb65/0x1d60 fs/f2fs/data.c:2834 f2fs_write_cache_pages fs/f2fs/data.c:3133 [inline] __f2fs_write_data_pages fs/f2fs/data.c:3288 [inline] f2fs_write_data_pages+0x1efe/0x3a90 fs/f2fs/data.c:3315 do_writepages+0x35b/0x870 mm/page-writeback.c:2612 __writeback_single_inode+0x165/0x10b0 fs/fs-writeback.c:1650 writeback_sb_inodes+0x905/0x1260 fs/fs-writeback.c:1941 wb_writeback+0x457/0xce0 fs/fs-writeback.c:2117 wb_do_writeback fs/fs-writeback.c:2264 [inline] wb_workfn+0x410/0x1090 fs/fs-writeback.c:2304 process_one_work kernel/workqueue.c:3254 [inline] process_scheduled_works+0xa12/0x17c0 kernel/workqueue.c:3335 worker_thread+0x86d/0xd70 kernel/workqueue.c:3416 kthread+0x2f2/0x390 kernel/kthread.c:388 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 The root cause is that an inline_data inode can be fuzzed so that there may be a valid blkaddr in its direct node; once f2fs triggers background GC to migrate the block, it will hit f2fs_bug_on() during dirty page writeback. Let's add a sanity check on the F2FS_INLINE_DATA flag in the inode during GC, so that migrating an inline_data inode's data block is forbidden. Reported-by: syzbot+848062ba19c8782ca5c8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/000000000000d103ce06174d7ec3@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/gc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index afb7c88ba06b2c..888c301ffe8f4c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1563,6 +1563,16 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } + if (f2fs_has_inline_data(inode)) { + iput(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "inode %lx has both inline_data flag and " + "data block, nid=%u, ofs_in_node=%u", + inode->i_ino, dni.nid, ofs_in_node); + continue; + } + err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err == -EAGAIN) { iput(inode); From 263df78166d3a9609b97d28c34029bd01874cbb8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 31 May 2024 10:00:32 +0800 Subject: [PATCH 55/70] f2fs: fix to cover read extent cache access with lock [ Upstream commit d7409b05a64f212735f0d33f5f1602051a886eab ] syzbot reports a f2fs bug as below: BUG: KASAN: slab-use-after-free in sanity_check_extent_cache+0x370/0x410 fs/f2fs/extent_cache.c:46 Read of size 4 at addr ffff8880739ab220 by task syz-executor200/5097 CPU: 0 PID: 5097 Comm: syz-executor200 Not tainted 6.9.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 sanity_check_extent_cache+0x370/0x410 fs/f2fs/extent_cache.c:46 do_read_inode fs/f2fs/inode.c:509 [inline] f2fs_iget+0x33e1/0x46e0 fs/f2fs/inode.c:560 f2fs_nfs_get_inode+0x74/0x100 fs/f2fs/super.c:3237 generic_fh_to_dentry+0x9f/0xf0 fs/libfs.c:1413 exportfs_decode_fh_raw+0x152/0x5f0 fs/exportfs/expfs.c:444 exportfs_decode_fh+0x3c/0x80 fs/exportfs/expfs.c:584 do_handle_to_path fs/fhandle.c:155 [inline]
handle_to_path fs/fhandle.c:210 [inline] do_handle_open+0x495/0x650 fs/fhandle.c:226 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f We missed covering sanity_check_extent_cache() with the extent cache lock, so the race below may happen, resulting in a use-after-free issue. - f2fs_iget - do_read_inode - f2fs_init_read_extent_tree : add largest extent entry in to cache - shrink - f2fs_shrink_read_extent_tree - __shrink_extent_tree - __detach_extent_node : drop largest extent entry - sanity_check_extent_cache : access et->largest w/o lock Let's refactor sanity_check_extent_cache() to avoid extent cache access and call it before f2fs_init_read_extent_tree() to fix this issue. Reported-by: syzbot+74ebe2104433e9dc610d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/00000000000009beea061740a531@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/extent_cache.c | 48 +++++++++++++++++------------------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inode.c | 10 ++++----- 3 files changed, 25 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ad8dfac73bd446..6a9a470345bfc7 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -19,34 +19,24 @@ #include "node.h" #include -bool sanity_check_extent_cache(struct inode *inode) +bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct extent_tree *et = fi->extent_tree[EX_READ]; - struct extent_info *ei; - - if (!et) - return true; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; + struct extent_info ei; - ei = &et->largest; - if (!ei->len) - return true; + get_read_extent_info(&ei, i_ext); - /* Let's drop, if checkpoint got corrupted. */ - if (is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) { - ei->len = 0; - et->largest_updated = true; + if (!ei.len) return true; - } - if (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE) || - !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + if (!f2fs_is_valid_blkaddr(sbi, ei.blk, DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei.blk + ei.len - 1, DATA_GENERIC_ENHANCE)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", __func__, inode->i_ino, - ei->blk, ei->fofs, ei->len); + ei.blk, ei.fofs, ei.len); return false; } return true; @@ -395,24 +385,22 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ - if (i_ext && i_ext->len) { + if (i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); } - goto out; + set_inode_flag(inode, FI_NO_EXTENT); + return; } et = __grab_extent_tree(inode, EX_READ); - if (!i_ext || !i_ext->len) - goto out; - get_read_extent_info(&ei, i_ext); write_lock(&et->lock); - if (atomic_read(&et->node_cnt)) - goto unlock_out; + if (atomic_read(&et->node_cnt) || !ei.len) + goto skip; en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); @@ -424,11 +412,13 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) list_add_tail(&en->list, &eti->extent_list); spin_unlock(&eti->extent_lock); } -unlock_out: +skip: + /* Let's drop, if checkpoint got corrupted.
*/ + if (f2fs_cp_error(sbi)) { + et->largest.len = 0; + et->largest_updated = true; + } write_unlock(&et->lock); -out: - if (!F2FS_I(inode)->extent_tree[EX_READ]) - set_inode_flag(inode, FI_NO_EXTENT); } void f2fs_init_age_extent_tree(struct inode *inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19490dd8321943..00eff023cd9d63 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4189,7 +4189,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -bool sanity_check_extent_cache(struct inode *inode); +bool sanity_check_extent_cache(struct inode *inode, struct page *ipage); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0172f4e503061d..26e857fee631d9 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -511,16 +511,16 @@ static int do_read_inode(struct inode *inode) init_idisk_time(inode); - /* Need all the flag bits */ - f2fs_init_read_extent_tree(inode, node_page); - f2fs_init_age_extent_tree(inode); - - if (!sanity_check_extent_cache(inode)) { + if (!sanity_check_extent_cache(inode, node_page)) { f2fs_put_page(node_page, 1); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); From 3db4395332e7050ef9ddeb3052e6b5019f2a2a59 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Jun 2024 08:25:18 -0400 Subject: [PATCH 56/70] fou: remove warn in gue_gro_receive on unsupported protocol [ Upstream commit dd89a81d850fa9a65f67b4527c0e420d15bf836c ] Drop the WARN_ON_ONCE in gue_gro_receive if the encapsulated type is not known or does not have a GRO handler. Such a packet is easily constructed. Syzbot generates them and sets off this warning. Remove the warning as it is expected and not actionable. The warning was previously reduced from WARN_ON to WARN_ON_ONCE in commit 270136613bf7 ("fou: Do WARN_ON_ONCE in gue_gro_receive for bad proto callbacks"). Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240614122552.1649044-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/fou_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 0c41076e31edad..b38b82ae903de0 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -433,7 +433,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, offloads = NAPI_GRO_CB(skb)->is_ipv6 ?
inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); - if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) + if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); From 6ea10dbb1e6c58384136e9adfd75f81951e423f6 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Thu, 11 Apr 2024 20:05:28 +0800 Subject: [PATCH 57/70] jfs: fix null ptr deref in dtInsertEntry [ Upstream commit ce6dede912f064a855acf6f04a04cbb2c25b8c8c ] [syzbot reported] general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 PID: 5061 Comm: syz-executor404 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 RIP: 0010:dtInsertEntry+0xd0c/0x1780 fs/jfs/jfs_dtree.c:3713 ... [Analyze] In dtInsertEntry(), when the pointer h has the same value as p, after the name is written by UniStrncpy_to_le(), p->header.flag will be cleared. This causes the previously true test "p->header.flag & BT_LEAF" to become false after the name has been written, which leads to entering an incorrect branch and accessing the uninitialized object ih when the condition is evaluated a second time. [Fix] After getting the page, check the freelist first; if freelist == 0, exit dtInsert() and return -EINVAL. Reported-by: syzbot+bba84aef3a26fb93deb9@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Dave Kleikamp Signed-off-by: Sasha Levin --- fs/jfs/jfs_dtree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 031d8f570f581f..5d3127ca68a42d 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -834,6 +834,8 @@ int dtInsert(tid_t tid, struct inode *ip, * the full page. */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + if (p->header.freelist == 0) + return -EINVAL; /* * insert entry for new key From f650148b43949ca9e37e820804bb6026fff404f3 Mon Sep 17 00:00:00 2001 From: Pei Li Date: Tue, 25 Jun 2024 09:42:05 -0700 Subject: [PATCH 58/70] jfs: Fix shift-out-of-bounds in dbDiscardAG [ Upstream commit 7063b80268e2593e58bee8a8d709c2f3ff93e2f2 ] When searching for the next smaller log2 block, BLKSTOL2() returned 0, making the shift exponent -1, which is negative. This patch fixes the issue by exiting the loop directly when a negative shift is found.
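For reference, the undefined behaviour being avoided (an annotated sketch of the fs/jfs/jfs_dmap.c hunk below):

    /* BLKSTOL2() can return 0 here, making l2nb equal to -1; shifting
     * by a negative exponent is undefined behaviour, so break out
     * before the shift instead. */
    l2nb = BLKSTOL2(nblocks) - 1;
    if (unlikely(l2nb < 0))
            break;
    nblocks = 1LL << l2nb;          /* safe: l2nb >= 0 at this point */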
Reported-by: syzbot+61be3359d2ee3467e7e4@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=61be3359d2ee3467e7e4 Signed-off-by: Pei Li Signed-off-by: Dave Kleikamp Signed-off-by: Sasha Levin --- fs/jfs/jfs_dmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 8eec84c651bfba..19eddbc5d616b0 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1626,6 +1626,8 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) } else if (rc == -ENOSPC) { /* search for next smaller log2 block */ l2nb = BLKSTOL2(nblocks) - 1; + if (unlikely(l2nb < 0)) + break; nblocks = 1LL << l2nb; } else { /* Trim any already allocated blocks */ From 5472b587cf71f72fa4be675c30fab677e65c6f63 Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Mon, 17 Jun 2024 15:14:07 +0300 Subject: [PATCH 59/70] fs/ntfs3: Do copy_to_user out of run_lock [ Upstream commit d57431c6f511bf020e474026d9f3123d7bfbea8c ] In order not to call copy_to_user (from fiemap_fill_next_extent) we allocate memory in the kernel, fill it and copy it to user memory after up_read(run_lock). Reported-by: syzbot+36bb70085ef6edc2ebb9@syzkaller.appspotmail.com Signed-off-by: Konstantin Komarov Signed-off-by: Sasha Levin --- fs/ntfs3/frecord.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 424865dfca74ba..45b687aff700be 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -1896,6 +1896,47 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, return REPARSE_LINK; } +/* + * fiemap_fill_next_extent_k - a copy of fiemap_fill_next_extent + * but it accepts kernel address for fi_extents_start + */ +static int fiemap_fill_next_extent_k(struct fiemap_extent_info *fieinfo, + u64 logical, u64 phys, u64 len, u32 flags) +{ + struct fiemap_extent extent; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; + + /* only count the extents */ + if (fieinfo->fi_extents_max == 0) { + fieinfo->fi_extents_mapped++; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; + } + + if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) + return 1; + + if (flags & FIEMAP_EXTENT_DELALLOC) + flags |= FIEMAP_EXTENT_UNKNOWN; + if (flags & FIEMAP_EXTENT_DATA_ENCRYPTED) + flags |= FIEMAP_EXTENT_ENCODED; + if (flags & (FIEMAP_EXTENT_DATA_TAIL | FIEMAP_EXTENT_DATA_INLINE)) + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + + memset(&extent, 0, sizeof(extent)); + extent.fe_logical = logical; + extent.fe_physical = phys; + extent.fe_length = len; + extent.fe_flags = flags; + + dest += fieinfo->fi_extents_mapped; + memcpy(dest, &extent, sizeof(extent)); + + fieinfo->fi_extents_mapped++; + if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) + return 1; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; +} + /* * ni_fiemap - Helper for file_fiemap(). * @@ -1906,6 +1947,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, __u64 vbo, __u64 len) { int err = 0; + struct fiemap_extent __user *fe_u = fieinfo->fi_extents_start; + struct fiemap_extent *fe_k = NULL; struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; struct runs_tree *run; @@ -1953,6 +1996,18 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, goto out; } + /* + * To avoid lock problems replace pointer to user memory by pointer to kernel memory. 
+ */ + fe_k = kmalloc_array(fieinfo->fi_extents_max, + sizeof(struct fiemap_extent), + GFP_NOFS | __GFP_ZERO); + if (!fe_k) { + err = -ENOMEM; + goto out; + } + fieinfo->fi_extents_start = fe_k; + end = vbo + len; alloc_size = le64_to_cpu(attr->nres.alloc_size); if (end > alloc_size) @@ -2041,8 +2096,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, if (vbo + dlen >= end) flags |= FIEMAP_EXTENT_LAST; - err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen, - flags); + err = fiemap_fill_next_extent_k(fieinfo, vbo, lbo, dlen, + flags); + if (err < 0) break; if (err == 1) { @@ -2062,7 +2118,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, if (vbo + bytes >= end) flags |= FIEMAP_EXTENT_LAST; - err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags); + err = fiemap_fill_next_extent_k(fieinfo, vbo, lbo, bytes, + flags); if (err < 0) break; if (err == 1) { @@ -2075,7 +2132,19 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, up_read(run_lock); + /* + * Copy to user memory out of lock + */ + if (copy_to_user(fe_u, fe_k, + fieinfo->fi_extents_max * + sizeof(struct fiemap_extent))) { + err = -EFAULT; + } + out: + /* Restore original pointer. */ + fieinfo->fi_extents_start = fe_u; + kfree(fe_k); return err; } From f95d175a9e5fcf747c3ae87ca2f663cac2d846f6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 15 Jul 2024 14:35:54 +0200 Subject: [PATCH 60/70] ALSA: usb: Fix UBSAN warning in parse_audio_unit() [ Upstream commit 2f38cf730caedaeacdefb7ff35b0a3c1168117f9 ] A malformed USB descriptor may pass the lengthy mixer description with a lot of channels, and this may overflow the 32bit integer shift size, as caught by syzbot UBSAN test. Although this won't cause any real trouble, it's better to address. This patch introduces a sanity check of the number of channels to bail out the parsing when too many channels are found. Reported-by: syzbot+78d5b129a762182225aa@syzkaller.appspotmail.com Closes: https://lore.kernel.org/0000000000000adac5061d3c7355@google.com Link: https://patch.msgid.link/20240715123619.26612-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/usb/mixer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index d1bdb0b93bda0c..8cc2d4937f3403 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -2021,6 +2021,13 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, bmaControls = ftr->bmaControls; } + if (channels > 32) { + usb_audio_info(state->chip, + "usbmixer: too many channels (%d) in unit %d\n", + channels, unitid); + return -EINVAL; + } + /* parse the source unit */ err = parse_audio_unit(state, hdr->bSourceID); if (err < 0) From 49df34d2b7da9e57c839555a2f7877291ce45ad1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Aug 2024 12:51:23 -0700 Subject: [PATCH 61/70] binfmt_flat: Fix corruption when not offsetting data start [ Upstream commit 3eb3cd5992f7a0c37edc8d05b4c38c98758d8671 ] Commit 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") introduced a RISC-V specific variant of the FLAT format which does not allocate any space for the (obsolete) array of shared library pointers. However, it did not disable the code which initializes the array, resulting in the corruption of sizeof(long) bytes before the DATA segment, generally the end of the TEXT segment. 
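The memory layout at stake, sketched (offsets illustrative; the no-offset variant is selected by CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET):

    /*
     * Classic FLAT:    [ TEXT ][ libs[MAX_SHARED_LIBS] ][ DATA ... ]
     * No-offset FLAT:  [ TEXT ][ DATA ... ]
     *
     * Initializing libs[0] in the no-offset case writes sizeof(long)
     * bytes immediately before DATA, i.e. over the end of TEXT.
     */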
Introduce MAX_SHARED_LIBS_UPDATE which depends on the state of CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET to guard the initialization of the shared library pointer region so that it will only be initialized if space is reserved for it. Fixes: 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") Co-developed-by: Stefan O'Rear Signed-off-by: Stefan O'Rear Reviewed-by: Damien Le Moal Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20240807195119.it.782-kees@kernel.org Signed-off-by: Kees Cook Signed-off-by: Sasha Levin --- fs/binfmt_flat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index c26545d71d39a3..cd6d5bbb4b9df5 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -72,8 +72,10 @@ #ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET #define DATA_START_OFFSET_WORDS (0) +#define MAX_SHARED_LIBS_UPDATE (0) #else #define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#define MAX_SHARED_LIBS_UPDATE (MAX_SHARED_LIBS) #endif struct lib_info { @@ -880,7 +882,7 @@ static int load_flat_binary(struct linux_binprm *bprm) return res; /* Update data segment pointers for all libraries */ - for (i = 0; i < MAX_SHARED_LIBS; i++) { + for (i = 0; i < MAX_SHARED_LIBS_UPDATE; i++) { if (!libinfo.lib_list[i].loaded) continue; for (j = 0; j < MAX_SHARED_LIBS; j++) { From 6fb93eeb25e56cbb782b4273baaf8c19ce1da9c6 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 29 Jan 2024 08:40:23 -0600 Subject: [PATCH 62/70] Revert "jfs: fix shift-out-of-bounds in dbJoin" commit e42e29cc442395d62f1a8963ec2dfb700ba6a5d7 upstream. This reverts commit cca974daeb6c43ea971f8ceff5a7080d7d49ee30. The added sanity check is incorrect. BUDMIN is not the wrong value and is too small. Signed-off-by: Dave Kleikamp Signed-off-by: Greg Kroah-Hartman --- fs/jfs/jfs_dmap.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 19eddbc5d616b0..5713994328cbcb 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2765,9 +2765,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * RETURN VALUES: none */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2794,10 +2792,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ - - if ((newval - tp->dmt_budmin) > BUDMIN) - return -EIO; - budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. From e2bf9ba1d3d7280e2c8cc608d39d12c646962fa1 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Tue, 5 Mar 2024 08:49:21 +0100 Subject: [PATCH 63/70] Revert "Input: bcm5974 - check endpoint type before starting traffic" commit 7105e92c60c9cc4112c782d69c172e96b69a43dc upstream. This patch intended to fix a well-known issue in old drivers where the endpoint type is taken for granted, which is often triggered by fuzzers. That was the case for this driver [1], and although the fix seems to be correct, it uncovered another issue that leads to a regression [2] when the endpoints of the current interface are checked. The driver makes use of endpoints that belong to a different interface than the one it binds to (it binds to the third interface, but also accesses an endpoint from a different one). The driver should claim the interfaces it requires, but that is still not the case.
Given that the regression is more severe than the issue found by syzkaller, the best approach is reverting the patch that causes the regression, and trying to fix the underlying problem before checking the endpoint types again. Note that reverting this patch will probably trigger the syzkaller bug at some point. This reverts commit 2b9c3eb32a699acdd4784d6b93743271b4970899. Link: https://syzkaller.appspot.com/bug?extid=348331f63b034f89b622 [1] Link: https://lore.kernel.org/linux-input/87sf161jjc.wl-tiwai@suse.de/ [2] Fixes: 2b9c3eb32a69 ("Input: bcm5974 - check endpoint type before starting traffic") Reported-by: Jacopo Radice Closes: https://bugzilla.suse.com/show_bug.cgi?id=1220030 Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240305-revert_bcm5974_ep_check-v3-1-527198cf6499@gmail.com Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/bcm5974.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index 953992b458e9f2..ca150618d32f18 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -19,7 +19,6 @@ * Copyright (C) 2006 Nicolas Boichat (nicolas@boichat.ch) */ -#include "linux/usb.h" #include #include #include @@ -194,8 +193,6 @@ enum tp_type { /* list of device capability bits */ #define HAS_INTEGRATED_BUTTON 1 -/* maximum number of supported endpoints (currently trackpad and button) */ -#define MAX_ENDPOINTS 2 /* trackpad finger data block size */ #define FSIZE_TYPE1 (14 * sizeof(__le16)) @@ -894,18 +891,6 @@ static int bcm5974_resume(struct usb_interface *iface) return error; } -static bool bcm5974_check_endpoints(struct usb_interface *iface, - const struct bcm5974_config *cfg) -{ - u8 ep_addr[MAX_ENDPOINTS + 1] = {0}; - - ep_addr[0] = cfg->tp_ep; - if (cfg->tp_type == TYPE1) - ep_addr[1] = cfg->bt_ep; - - return usb_check_int_endpoints(iface, ep_addr); -} - static int bcm5974_probe(struct usb_interface *iface, const struct usb_device_id *id) { @@ -918,11 +903,6 @@ static int bcm5974_probe(struct usb_interface *iface, /* find the product index */ cfg = bcm5974_get_config(udev); - if (!bcm5974_check_endpoints(iface, cfg)) { - dev_err(&iface->dev, "Unexpected non-int endpoint\n"); - return -ENODEV; - } - /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(struct bcm5974), GFP_KERNEL); input_dev = input_allocate_device(); From 6419341b6b2bc188b26f9044fe4059aaf5f1bf12 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 23 May 2024 09:21:39 -0400 Subject: [PATCH 64/70] mm/debug_vm_pgtable: drop RANDOM_ORVALUE trick commit 0b1ef4fde7a24909ff2afacffd0d6afa28b73652 upstream. Macro RANDOM_ORVALUE was used to make sure the pgtable entry will be populated with !none data in clear tests. RANDOM_ORVALUE tried to cover most of the bits in a pgtable entry, even though there's no discussion on whether all the bits will be valid. Both S390 and PPC64 have their own masks to avoid touching some bits. Now it's the turn for x86_64. The issue is that a recent report from Mikhail Gavrilov showed this can cause a warning with the newly added pte set check in commit 8430557fc5 on the writable vs. userfaultfd-wp bit: even though the check itself is valid, the random pte is not. We can choose to mask more bits out.
However, the need to set up such random bits is questionable, as the entries are already guaranteed to be !none in the cases below: - For the pte level, the pgtable entry will be installed with a value from pfn_pte(), where the pfn points to a valid page. Hence the pte will already be !none once populated with pfn_pte(). - For levels above pte, the pgtable entry always contains a directory entry, which is also !none. All these cases are good enough for testing a pxx_clear() helper. Instead of extending the bitmask, drop the "set random bits" trick completely. Add some warning guards to make sure the entries will be !none before clear(). Link: https://lkml.kernel.org/r/20240523132139.289719-1-peterx@redhat.com Fixes: 8430557fc584 ("mm/page_table_check: support userfault wr-protect entries") Signed-off-by: Peter Xu Reported-by: Mikhail Gavrilov Link: https://lore.kernel.org/r/CABXGCsMB9A8-X+Np_Q+fWLURYL_0t3Y-MdoNabDM-Lzk58-DGA@mail.gmail.com Tested-by: Mikhail Gavrilov Reviewed-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Aneesh Kumar K.V Cc: Gavin Shan Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/debug_vm_pgtable.c | 31 +++++-------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 13f0d11927074a..68af76ca8bc992 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -39,22 +39,7 @@ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. - * - * On s390 platform, the lower 4 bits are used to identify given page table - * entry type. But these bits might affect the ability to clear entries with - * pxx_clear() because of how dynamic page table folding works on s390. So - * while loading up the entries do not change the lower 4 bits. It does not - * have affect any other platform. Also avoid the 62nd bit on ppc64 that is - * used to mark a pte entry.
*/ -#define S390_SKIP_MASK GENMASK(3, 0) -#if __BITS_PER_LONG == 64 -#define PPC64_SKIP_MASK GENMASK(62, 62) -#else -#define PPC64_SKIP_MASK 0x0 -#endif -#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) -#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) struct pgtable_debug_args { @@ -510,8 +495,7 @@ static void __init pud_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PUD clear\n"); - pud = __pud(pud_val(pud) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pudp, pud); + WARN_ON(pud_none(pud)); pud_clear(args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); @@ -547,8 +531,7 @@ static void __init p4d_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating P4D clear\n"); - p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); - WRITE_ONCE(*args->p4dp, p4d); + WARN_ON(p4d_none(p4d)); p4d_clear(args->p4dp); p4d = READ_ONCE(*args->p4dp); WARN_ON(!p4d_none(p4d)); @@ -581,8 +564,7 @@ static void __init pgd_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PGD clear\n"); - pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pgdp, pgd); + WARN_ON(pgd_none(pgd)); pgd_clear(args->pgdp); pgd = READ_ONCE(*args->pgdp); WARN_ON(!pgd_none(pgd)); @@ -633,10 +615,8 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) if (WARN_ON(!args->ptep)) return; -#ifndef CONFIG_RISCV - pte = __pte(pte_val(pte) | RANDOM_ORVALUE); -#endif set_pte_at(args->mm, args->vaddr, args->ptep, pte); + WARN_ON(pte_none(pte)); flush_dcache_page(page); barrier(); ptep_clear(args->mm, args->vaddr, args->ptep); @@ -649,8 +629,7 @@ static void __init pmd_clear_tests(struct pgtable_debug_args *args) pmd_t pmd = READ_ONCE(*args->pmdp); pr_debug("Validating PMD clear\n"); - pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pmdp, pmd); + WARN_ON(pmd_none(pmd)); pmd_clear(args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); From f3c60ab676bb62e01d004d5b1cf2963a296c8e6a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 7 Dec 2023 08:46:14 -0500 Subject: [PATCH 65/70] cgroup: Move rcu_head up near the top of cgroup_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a7fb0423c201ba12815877a0b5a68a6a1710b23a upstream. Commit d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe") adds a new rcu_head to the cgroup_root structure and uses kvfree_rcu() for freeing the cgroup_root. The current implementation of kvfree_rcu(), however, has the limitation that the offset of the rcu_head structure within the larger data structure must be less than 4096 or the compilation will fail. See the macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h for more information. By putting rcu_head below the large cgroup structure, any change to the cgroup structure that makes it larger runs the risk of causing build failures under certain configurations. Commit 77070eeb8821 ("cgroup: Avoid false cacheline sharing of read mostly rstat_cpu") happens to be the last straw that breaks it. Fix this problem by moving the rcu_head structure up before the cgroup structure.
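[ Editor's illustration, not part of the patch: the offset constraint shown with stand-in C11 types. The struct sizes are invented to force the failing layout; the 4096 limit mirrors __is_kvfree_rcu_offset(). ]

    #include <assert.h>
    #include <stddef.h>

    struct rcu_head_demo { void *next; void (*func)(void *); };
    struct big_demo { char payload[8192]; };  /* stand-in for struct cgroup */

    struct root_rcu_last {            /* old layout: rcu_head after big member */
            struct big_demo cgrp;
            struct rcu_head_demo rcu; /* offset 8192: kvfree_rcu() would fail */
    };

    struct root_rcu_first {           /* new layout: rcu_head near the top */
            struct rcu_head_demo rcu; /* small offset: always safe */
            struct big_demo cgrp;
    };

    static_assert(offsetof(struct root_rcu_first, rcu) < 4096,
                  "rcu_head offset must stay below 4096");
    static_assert(offsetof(struct root_rcu_last, rcu) >= 4096,
                  "this layout would trip the kvfree_rcu() offset check");

    int main(void) { return 0; }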
Fixes: d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ Signed-off-by: Waiman Long Acked-by: Yafang Shao Reviewed-by: Yosry Ahmed Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup-defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7e7bc25a1aa31f..6eefe5153a6ff7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -543,6 +543,10 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; + /* A list running through the active hierarchies */ + struct list_head root_list; + struct rcu_head rcu; /* Must be near the top */ + /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the @@ -556,10 +560,6 @@ struct cgroup_root { /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; - /* A list running through the active hierarchies */ - struct list_head root_list; - struct rcu_head rcu; - /* Hierarchy-specific flags */ unsigned int flags; From 2c770086e0792eddb40e82067179586e90dc9c23 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 15 Aug 2024 13:46:25 +0100 Subject: [PATCH 66/70] KVM: arm64: Don't defer TLB invalidation when zapping table entries commit f62d4c3eb687d87b616b4279acec7862553bda77 upstream. Commit 7657ea920c54 ("KVM: arm64: Use TLBI range-based instructions for unmap") introduced deferred TLB invalidation for the stage-2 page-table so that range-based invalidation can be used for the accumulated addresses. This works fine if the structure of the page-tables remains unchanged, but if entire tables are zapped and subsequently freed then we transiently leave the hardware page-table walker with a reference to freed memory thanks to the translation walk caches. For example, stage2_unmap_walker() will free page-table pages: if (childp) mm_ops->put_page(childp); and issue the TLB invalidation later in kvm_pgtable_stage2_unmap(): if (stage2_unmap_defer_tlb_flush(pgt)) /* Perform the deferred TLB invalidations */ kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); For now, take the conservative approach and invalidate the TLB eagerly when we clear a table entry. Note, however, that the existing level hint passed to __kvm_tlb_flush_vmid_ipa() is incorrect and will be fixed in a subsequent patch. 
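[ Editor's illustration, not part of the patch: the ordering hazard in simplified pseudo-C. All names are invented; the real logic is in stage2_unmap_put_pte() in the diff below. ]

    /* Simplified unmap-walk step for a single stage-2 entry. */
    static void sketch_unmap_entry(struct walk_ctx *ctx)
    {
            clear_entry(ctx->ptep);         /* entry removed from the table */

            if (entry_was_table(ctx->old)) {
                    /*
                     * The walker's walk caches may still point at the child
                     * table, so invalidate eagerly *before* the child page
                     * is freed...
                     */
                    flush_tlb_ipa(ctx->addr, ctx->level);
            }
            /*
             * ...whereas leaf entries can keep relying on the deferred,
             * range-based invalidation issued at the end of the unmap.
             */
            put_child_page(ctx);
    }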
Cc: Raghavendra Rao Ananta Cc: Shaoqin Huang Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-2-will@kernel.org Signed-off-by: Oliver Upton Cc: # 6.6.y only Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/hyp/pgtable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 15aa9bad1c280b..6692327fabe7d8 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -861,9 +861,11 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - if (!stage2_unmap_defer_tlb_flush(pgt)) + if (!stage2_unmap_defer_tlb_flush(pgt) || + kvm_pte_table(ctx->old, ctx->level)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + } } mm_ops->put_page(ctx->ptep); From 60d90e158261b5258fba7b02eea729cd91545b66 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 15 Aug 2024 13:46:26 +0100 Subject: [PATCH 67/70] KVM: arm64: Don't pass a TLBI level hint when zapping table entries commit 36e008323926036650299cfbb2dca704c7aba849 upstream. The TLBI level hints are for leaf entries only, so take care not to pass them incorrectly after clearing a table entry. Cc: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Fixes: 82bb02445de5 ("KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2") Fixes: 6d9d2115c480 ("KVM: arm64: Add support for stage-2 map()/unmap() in generic page-table") Signed-off-by: Will Deacon Reviewed-by: Shaoqin Huang Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240327124853.11206-3-will@kernel.org Signed-off-by: Oliver Upton Cc: # 6.6.y only [will@: Use '0' instead of TLBI_TTL_UNKNOWN to indicate "no level"] Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/hyp/pgtable.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 6692327fabe7d8..ca0bf0b92ca09e 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -523,7 +523,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), 0); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; @@ -861,10 +861,12 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); - if (!stage2_unmap_defer_tlb_flush(pgt) || - kvm_pte_table(ctx->old, ctx->level)) { - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, - ctx->addr, ctx->level); + if (kvm_pte_table(ctx->old, ctx->level)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + 0); + } else if (!stage2_unmap_defer_tlb_flush(pgt)) { + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, + ctx->level); } } From fb6f56244af30b5cdfaa914d544a0a5fd7a2b246 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Thu, 8 Aug 2024 10:35:19 +0200 Subject: [PATCH 68/70] media: Revert "media: dvb-usb: Fix unexpected infinite loop in dvb_usb_read_remote_control()" commit 0c84bde4f37ba27d50e4c70ecacd33fe4a57030d upstream. This reverts commit 2052138b7da52ad5ccaf74f736d00f39a1c9198c. This breaks the TeVii s480 dual DVB-S2 S660. 
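[ Editor's illustration, not part of the patch: the check being reverted, sketched with one invented name (the sketch_ prefix); it demands that the generic bulk control endpoint address be usable in both directions, which is why a device exposing only the bulk-in side fails to probe, as described next. ]

    static int sketch_check_bulk_endpoint(struct dvb_usb_device *d, u8 endpoint)
    {
            int ret;

            if (!endpoint)
                    return 0;       /* endpoint not used by this device */

            /* Demand a bulk-OUT pipe at this address... */
            ret = usb_pipe_type_check(d->udev, usb_sndbulkpipe(d->udev, endpoint));
            if (ret)
                    return ret;

            /* ...and a bulk-IN pipe at the same address. */
            return usb_pipe_type_check(d->udev, usb_rcvbulkpipe(d->udev, endpoint));
    }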
The device has a bulk in endpoint but no corresponding out endpoint, so the device does not pass the "has both receive and send bulk endpoint" test. Seemingly this device does not use dvb_usb_generic_rw() so I have tried removing the generic_bulk_ctrl_endpoint entry, but this resulted in different problems. As we have no explanation yet, revert. $ dmesg | grep -i -e dvb -e dw21 -e usb\ 4 [ 0.999122] usb 1-1: new high-speed USB device number 2 using ehci-pci [ 1.023123] usb 4-1: new high-speed USB device number 2 using ehci-pci [ 1.130247] usb 1-1: New USB device found, idVendor=9022, idProduct=d482, +bcdDevice= 0.01 [ 1.130257] usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 1.152323] usb 4-1: New USB device found, idVendor=9022, idProduct=d481, +bcdDevice= 0.01 [ 1.152329] usb 4-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 [ 6.701033] dvb-usb: found a 'TeVii S480.2 USB' in cold state, will try to +load a firmware [ 6.701178] dvb-usb: downloading firmware from file 'dvb-usb-s660.fw' [ 6.701179] dw2102: start downloading DW210X firmware [ 6.703715] dvb-usb: found a 'Microsoft Xbox One Digital TV Tuner' in cold +state, will try to load a firmware [ 6.703974] dvb-usb: downloading firmware from file 'dvb-usb-dib0700-1.20.fw' [ 6.756432] usb 1-1: USB disconnect, device number 2 [ 6.862119] dvb-usb: found a 'TeVii S480.2 USB' in warm state. [ 6.862194] dvb-usb: TeVii S480.2 USB error while loading driver (-22) [ 6.862209] dvb-usb: found a 'TeVii S480.1 USB' in cold state, will try to +load a firmware [ 6.862244] dvb-usb: downloading firmware from file 'dvb-usb-s660.fw' [ 6.862245] dw2102: start downloading DW210X firmware [ 6.914811] usb 4-1: USB disconnect, device number 2 [ 7.014131] dvb-usb: found a 'TeVii S480.1 USB' in warm state. 
[ 7.014487] dvb-usb: TeVii S480.1 USB error while loading driver (-22) [ 7.014538] usbcore: registered new interface driver dw2102 Closes: https://lore.kernel.org/stable/20240801165146.38991f60@mir/ Fixes: 2052138b7da5 ("media: dvb-usb: Fix unexpected infinite loop in dvb_usb_read_remote_control()") Reported-by: Stefan Lippers-Hollmann Cc: stable@vger.kernel.org Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/dvb-usb/dvb-usb-init.c | 35 +++--------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/drivers/media/usb/dvb-usb/dvb-usb-init.c b/drivers/media/usb/dvb-usb/dvb-usb-init.c index 22d83ac18eb735..fbf58012becdf2 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-init.c +++ b/drivers/media/usb/dvb-usb/dvb-usb-init.c @@ -23,40 +23,11 @@ static int dvb_usb_force_pid_filter_usage; module_param_named(force_pid_filter_usage, dvb_usb_force_pid_filter_usage, int, 0444); MODULE_PARM_DESC(force_pid_filter_usage, "force all dvb-usb-devices to use a PID filter, if any (default: 0)."); -static int dvb_usb_check_bulk_endpoint(struct dvb_usb_device *d, u8 endpoint) -{ - if (endpoint) { - int ret; - - ret = usb_pipe_type_check(d->udev, usb_sndbulkpipe(d->udev, endpoint)); - if (ret) - return ret; - ret = usb_pipe_type_check(d->udev, usb_rcvbulkpipe(d->udev, endpoint)); - if (ret) - return ret; - } - return 0; -} - -static void dvb_usb_clear_halt(struct dvb_usb_device *d, u8 endpoint) -{ - if (endpoint) { - usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, endpoint)); - usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, endpoint)); - } -} - static int dvb_usb_adapter_init(struct dvb_usb_device *d, short *adapter_nrs) { struct dvb_usb_adapter *adap; int ret, n, o; - ret = dvb_usb_check_bulk_endpoint(d, d->props.generic_bulk_ctrl_endpoint); - if (ret) - return ret; - ret = dvb_usb_check_bulk_endpoint(d, d->props.generic_bulk_ctrl_endpoint_response); - if (ret) - return ret; for (n = 0; n < d->props.num_adapters; n++) { adap = &d->adapter[n]; adap->dev = d; @@ -132,8 +103,10 @@ static int dvb_usb_adapter_init(struct dvb_usb_device *d, short *adapter_nrs) * when reloading the driver w/o replugging the device * sometimes a timeout occurs, this helps */ - dvb_usb_clear_halt(d, d->props.generic_bulk_ctrl_endpoint); - dvb_usb_clear_halt(d, d->props.generic_bulk_ctrl_endpoint_response); + if (d->props.generic_bulk_ctrl_endpoint != 0) { + usb_clear_halt(d->udev, usb_sndbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); + usb_clear_halt(d->udev, usb_rcvbulkpipe(d->udev, d->props.generic_bulk_ctrl_endpoint)); + } return 0; From 88042e41534b6213ce0fab5e3132939a78e654d9 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 13 Aug 2024 15:19:01 +0200 Subject: [PATCH 69/70] Revert "ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error" commit fa0db8e568787c665384430eaf2221b299b85367 upstream. This reverts commit 28ab9769117ca944cb6eb537af5599aa436287a4. Sense data can be in either fixed format or descriptor format. SAT-6 revision 1, "10.4.6 Control mode page", defines the D_SENSE bit: "The SATL shall support this bit as defined in SPC-5 with the following exception: if the D_SENSE bit is set to zero (i.e., fixed format sense data), then the SATL should return fixed format sense data for ATA PASS-THROUGH commands." The libata SATL has always kept D_SENSE set to zero by default. (It is, however, possible to change the value using a MODE SELECT SG_IO command.)
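[ Editor's illustration, not part of the patch: the RESPONSE CODE test that the applications mentioned below skip. Per SPC, byte 0 of the sense buffer carries the response code: 0x70/0x71 means fixed format, 0x72/0x73 means descriptor format. ]

    #include <stdbool.h>
    #include <stdint.h>

    /* Returns true if a SCSI sense buffer is in descriptor format. */
    static bool sense_is_descriptor_format(const uint8_t *sense)
    {
            uint8_t response_code = sense[0] & 0x7f;

            return response_code == 0x72 || response_code == 0x73;
    }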
Failed ATA PASS-THROUGH commands correctly respected the D_SENSE bit; successful ATA PASS-THROUGH commands, however, incorrectly returned the sense data in descriptor format (regardless of the D_SENSE bit). Commit 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error") fixed this bug for successful ATA PASS-THROUGH commands. However, after commit 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error"), there were bug reports that hdparm, hddtemp, and udisks were no longer working as expected. These applications incorrectly assume that the returned sense data is in descriptor format, without even looking at the RESPONSE CODE field to see which format the returned sense data is actually in. Considering that there will be broken versions of these applications around roughly forever, we are stuck with being bug compatible with older kernels. Cc: stable@vger.kernel.org # 4.19+ Reported-by: Stephan Eisvogel Reported-by: Christian Heusel Closes: https://lore.kernel.org/linux-ide/0bf3f2f0-0fc6-4ba5-a420-c0874ef82d64@heusel.eu/ Fixes: 28ab9769117c ("ata: libata-scsi: Honor the D_SENSE bit for CK_COND=1 and no error") Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240813131900.1285842-2-cassel@kernel.org Signed-off-by: Niklas Cassel Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-scsi.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 77dbd516a05463..277bf0e8ed0918 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -941,8 +941,19 @@ static void ata_gen_passthru_sense(struct ata_queued_cmd *qc) &sense_key, &asc, &ascq); ata_scsi_set_sense(qc->dev, cmd, sense_key, asc, ascq); } else { - /* ATA PASS-THROUGH INFORMATION AVAILABLE */ - ata_scsi_set_sense(qc->dev, cmd, RECOVERED_ERROR, 0, 0x1D); + /* + * ATA PASS-THROUGH INFORMATION AVAILABLE + * + * Note: we are supposed to call ata_scsi_set_sense(), which + * respects the D_SENSE bit, instead of unconditionally + * generating the sense data in descriptor format. However, + * because hdparm, hddtemp, and udisks incorrectly assume sense + * data in descriptor format, without even looking at the + * RESPONSE CODE field in the returned sense data (to see which + * format the returned sense data is in), we are stuck with + * being bug compatible with older kernels. + */ + scsi_build_sense(cmd, 1, RECOVERED_ERROR, 0, 0x1D); } } From 4c1a2d4cd9a5b6c55739a80c5b9efbca322adad7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 19 Aug 2024 06:04:32 +0200 Subject: [PATCH 70/70] Linux 6.6.47 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://lore.kernel.org/r/20240815131838.311442229@linuxfoundation.org Tested-by: ChromeOS CQ Test Tested-by: Peter Schneider Tested-by: Florian Fainelli Tested-by: Linux Kernel Functional Testing Tested-by: Mark Brown Tested-by: Jon Hunter Tested-by: Ron Economos Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 77de99984c2f18..6b967e135c80f0 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 6 -SUBLEVEL = 46 +SUBLEVEL = 47 EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth