From 933ac5c9945683a1d5d0ce800a2d6b870a1c86a4 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 26 Mar 2024 13:00:10 +0000 Subject: [PATCH 01/17] mirror: Add u16 definition to types.h We'll have a value cast as a u16 in src/kernel/bpf/disasm.c in a future commit. Add the type definition to the relevant header. Signed-off-by: Quentin Monnet --- include/linux/types.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/types.h b/include/linux/types.h index f991d117..d2df5ed0 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -12,6 +12,7 @@ typedef uint64_t u64; typedef __u32 u32; +typedef __u16 u16; typedef __u8 u8; #define __bitwise__ From 2d219fb5a7e447e108fbe204115a5721bf0c8552 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 26 Mar 2024 09:47:27 +0000 Subject: [PATCH 02/17] sync: Update libbpf submodule Pull latest libbpf from mirror. Libbpf version: 1.4.0 Libbpf commit: 20ea95b4505c477af3b6ff6ce9d19cee868ddc5d Signed-off-by: Quentin Monnet --- libbpf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libbpf b/libbpf index 2778cbce..20ea95b4 160000 --- a/libbpf +++ b/libbpf @@ -1 +1 @@ -Subproject commit 2778cbce609aa1e2747a69349f7f46a2f94f0522 +Subproject commit 20ea95b4505c477af3b6ff6ce9d19cee868ddc5d From 75beab84bbafbd0bec811823739b2d9f99325b3c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 6 Mar 2024 19:12:27 -0800 Subject: [PATCH 03/17] bpftool: rename is_internal_mmapable_map into is_mmapable_map It's not restricted to working with "internal" maps, it cares about any map that can be mmap'ed. Reflect that in more succinct and generic name. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20240307031228.42896-6-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- src/gen.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gen.c b/src/gen.c index 1f579eac..a3d72be3 100644 --- a/src/gen.c +++ b/src/gen.c @@ -248,7 +248,7 @@ static const struct btf_type *find_type_for_map(struct btf *btf, const char *map return NULL; } -static bool is_internal_mmapable_map(const struct bpf_map *map, char *buf, size_t sz) +static bool is_mmapable_map(const struct bpf_map *map, char *buf, size_t sz) { if (!bpf_map__is_internal(map) || !(bpf_map__map_flags(map) & BPF_F_MMAPABLE)) return false; @@ -274,7 +274,7 @@ static int codegen_datasecs(struct bpf_object *obj, const char *obj_name) bpf_object__for_each_map(map, obj) { /* only generate definitions for memory-mapped internal maps */ - if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident))) + if (!is_mmapable_map(map, map_ident, sizeof(map_ident))) continue; sec = find_type_for_map(btf, map_ident); @@ -327,7 +327,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name bpf_object__for_each_map(map, obj) { /* only generate definitions for memory-mapped internal maps */ - if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident))) + if (!is_mmapable_map(map, map_ident, sizeof(map_ident))) continue; sec = find_type_for_map(btf, map_ident); @@ -504,7 +504,7 @@ static void codegen_asserts(struct bpf_object *obj, const char *obj_name) ", obj_name); bpf_object__for_each_map(map, obj) { - if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident))) + if (!is_mmapable_map(map, map_ident, sizeof(map_ident))) continue; sec = find_type_for_map(btf, map_ident); @@ -720,7 +720,7 @@ static int gen_trace(struct bpf_object *obj, const 
char *obj_name, const char *h const void *mmap_data = NULL; size_t mmap_size = 0; - if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + if (!is_mmapable_map(map, ident, sizeof(ident))) continue; codegen("\ @@ -782,7 +782,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h bpf_object__for_each_map(map, obj) { const char *mmap_flags; - if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + if (!is_mmapable_map(map, ident, sizeof(ident))) continue; if (bpf_map__map_flags(map) & BPF_F_RDONLY_PROG) @@ -871,7 +871,7 @@ codegen_maps_skeleton(struct bpf_object *obj, size_t map_cnt, bool mmaped) ", i, bpf_map__name(map), i, ident); /* memory-mapped internal maps */ - if (mmaped && is_internal_mmapable_map(map, ident, sizeof(ident))) { + if (mmaped && is_mmapable_map(map, ident, sizeof(ident))) { printf("\ts->maps[%zu].mmaped = (void **)&obj->%s;\n", i, ident); } @@ -1617,7 +1617,7 @@ static int do_subskeleton(int argc, char **argv) /* Also count all maps that have a name */ map_cnt++; - if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + if (!is_mmapable_map(map, ident, sizeof(ident))) continue; map_type_id = bpf_map__btf_value_type_id(map); @@ -1739,7 +1739,7 @@ static int do_subskeleton(int argc, char **argv) /* walk through each symbol and emit the runtime representation */ bpf_object__for_each_map(map, obj) { - if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + if (!is_mmapable_map(map, ident, sizeof(ident))) continue; map_type_id = bpf_map__btf_value_type_id(map); From 6aa4b2e0d80a64f7bea7abc778d4a5e6d9af5e3c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:07:59 -0800 Subject: [PATCH 04/17] bpf: Introduce bpf_arena. Introduce bpf_arena, which is a sparse shared memory region between the bpf program and user space. Use cases: 1. User space mmap-s bpf_arena and uses it as a traditional mmap-ed anonymous region, like memcached or any key/value storage. The bpf program implements an in-kernel accelerator. XDP prog can search for a key in bpf_arena and return a value without going to user space. 2. The bpf program builds arbitrary data structures in bpf_arena (hash tables, rb-trees, sparse arrays), while user space consumes it. 3. bpf_arena is a "heap" of memory from the bpf program's point of view. The user space may mmap it, but bpf program will not convert pointers to user base at run-time to improve bpf program speed. Initially, the kernel vm_area and user vma are not populated. User space can fault in pages within the range. While servicing a page fault, bpf_arena logic will insert a new page into the kernel and user vmas. The bpf program can allocate pages from that region via bpf_arena_alloc_pages(). This kernel function will insert pages into the kernel vm_area. The subsequent fault-in from user space will populate that page into the user vma. The BPF_F_SEGV_ON_FAULT flag at arena creation time can be used to prevent fault-in from user space. In such a case, if a page is not allocated by the bpf program and not present in the kernel vm_area, the user process will segfault. This is useful for use cases 2 and 3 above. bpf_arena_alloc_pages() is similar to user space mmap(). It allocates pages either at a specific address within the arena or allocates a range with the maple tree. bpf_arena_free_pages() is analogous to munmap(), which frees pages and removes the range from the kernel vm_area and from user process vmas. bpf_arena can be used as a bpf program "heap" of up to 4GB. 
The speed of bpf program is more important than ease of sharing with user space. This is use case 3. In such a case, the BPF_F_NO_USER_CONV flag is recommended. It will tell the verifier to treat the rX = bpf_arena_cast_user(rY) instruction as a 32-bit move wX = wY, which will improve bpf prog performance. Otherwise, bpf_arena_cast_user is translated by JIT to conditionally add the upper 32 bits of user vm_start (if the pointer is not NULL) to arena pointers before they are stored into memory. This way, user space sees them as valid 64-bit pointers. Diff https://github.com/llvm/llvm-project/pull/84410 enables LLVM BPF backend generate the bpf_addr_space_cast() instruction to cast pointers between address_space(1) which is reserved for bpf_arena pointers and default address space zero. All arena pointers in a bpf program written in C language are tagged as __attribute__((address_space(1))). Hence, clang provides helpful diagnostics when pointers cross address space. Libbpf and the kernel support only address_space == 1. All other address space identifiers are reserved. rX = bpf_addr_space_cast(rY, /* dst_as */ 1, /* src_as */ 0) tells the verifier that rX->type = PTR_TO_ARENA. Any further operations on PTR_TO_ARENA register have to be in the 32-bit domain. The verifier will mark load/store through PTR_TO_ARENA with PROBE_MEM32. JIT will generate them as kern_vm_start + 32bit_addr memory accesses. The behavior is similar to copy_from_kernel_nofault() except that no address checks are necessary. The address is guaranteed to be in the 4GB range. If the page is not present, the destination register is zeroed on read, and the operation is ignored on write. rX = bpf_addr_space_cast(rY, 0, 1) tells the verifier that rX->type = unknown scalar. If arena->map_flags has BPF_F_NO_USER_CONV set, then the verifier converts such cast instructions to mov32. Otherwise, JIT will emit native code equivalent to: rX = (u32)rY; if (rY) rX |= clear_lo32_bits(arena->user_vm_start); /* replace hi32 bits in rX */ After such conversion, the pointer becomes a valid user pointer within bpf_arena range. The user process can access data structures created in bpf_arena without any additional computations. For example, a linked list built by a bpf program can be walked natively by user space. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Reviewed-by: Barret Rhoden Link: https://lore.kernel.org/bpf/20240308010812.89848-2-alexei.starovoitov@gmail.com --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85ec7fc7..e30d943d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1009,6 +1009,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + BPF_MAP_TYPE_ARENA, __MAX_BPF_MAP_TYPE }; @@ -1396,6 +1397,12 @@ enum { /* BPF token FD is passed in a corresponding command's token_fd field */ BPF_F_TOKEN_FD = (1U << 16), + +/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */ + BPF_F_SEGV_ON_FAULT = (1U << 17), + +/* Do not translate kernel bpf_arena pointers to user pointers */ + BPF_F_NO_USER_CONV = (1U << 18), }; /* Flags for BPF_PROG_QUERY. */ @@ -1467,6 +1474,9 @@ union bpf_attr { * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the * number of hash functions (if 0, the bloom filter will default * to using 5 hash functions). 
+ * + * BPF_MAP_TYPE_ARENA - contains the address where user space + * is going to mmap() the arena. It has to be page aligned. */ __u64 map_extra; From 8940e674079ec4a2f5f043554760ad4af170eece Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:00 -0800 Subject: [PATCH 05/17] bpf: Disasm support for addr_space_cast instruction. LLVM generates rX = addr_space_cast(rY, dst_addr_space, src_addr_space) instruction when pointers in non-zero address space are used by the bpf program. Recognize this insn in uapi and in bpf disassembler. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240308010812.89848-3-alexei.starovoitov@gmail.com --- include/uapi/linux/bpf.h | 4 ++++ src/kernel/bpf/disasm.c | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e30d943d..3c42b9f1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1339,6 +1339,10 @@ enum { */ #define BPF_PSEUDO_KFUNC_CALL 2 +enum bpf_addr_space_cast { + BPF_ADDR_SPACE_CAST = 1, +}; + /* flags for BPF_MAP_UPDATE_ELEM command */ enum { BPF_ANY = 0, /* create new element or update existing */ diff --git a/src/kernel/bpf/disasm.c b/src/kernel/bpf/disasm.c index 82b2dbdd..bd2e2dd0 100644 --- a/src/kernel/bpf/disasm.c +++ b/src/kernel/bpf/disasm.c @@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn) (insn->off == 8 || insn->off == 16 || insn->off == 32); } +static bool is_addr_space_cast(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && + insn->off == BPF_ADDR_SPACE_CAST; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); + } else if (is_addr_space_cast(insn)) { + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + insn->code, insn->dst_reg, + insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', From 39648954b2dbe4f6e0a84eebc2337b8ec9893d45 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:07 -0800 Subject: [PATCH 06/17] bpftool: Recognize arena map type Teach bpftool to recognize arena map type. 
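As a rough usage sketch (not part of this patch, and untested): with an
updated libbpf, the new keyword can be used wherever map types are spelled
out, for instance when creating and pinning an arena by hand. An arena takes
no key/value, max_entries is a number of pages, and it must be created with
BPF_F_MMAPABLE (0x400):

    bpftool map create /sys/fs/bpf/test_arena type arena \
            key 0 value 0 entries 16 name test_arena flags 0x400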
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240308010812.89848-10-alexei.starovoitov@gmail.com --- docs/bpftool-map.rst | 2 +- src/map.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/bpftool-map.rst b/docs/bpftool-map.rst index 3b7ba037..9d6a314d 100644 --- a/docs/bpftool-map.rst +++ b/docs/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** } DESCRIPTION =========== diff --git a/src/map.c b/src/map.c index f98f7bbe..b89bd792 100644 --- a/src/map.c +++ b/src/map.c @@ -1463,7 +1463,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter | user_ringbuf | cgrp_storage }\n" + " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", From f1f54c6b4916cf1b722e2a0f0e43c36d18c8778e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 7 Mar 2024 17:08:08 -0800 Subject: [PATCH 07/17] libbpf: Recognize __arena global variables. LLVM automatically places __arena variables into ".arena.1" ELF section. In order to use such global variables bpf program must include definition of arena map in ".maps" section, like: struct { __uint(type, BPF_MAP_TYPE_ARENA); __uint(map_flags, BPF_F_MMAPABLE); __uint(max_entries, 1000); /* number of pages */ __ulong(map_extra, 2ull << 44); /* start of mmap() region */ } arena SEC(".maps"); libbpf recognizes both uses of arena and creates single `struct bpf_map *` instance in libbpf APIs. ".arena.1" ELF section data is used as initial data image, which is exposed through skeleton and bpf_map__initial_value() to the user, if they need to tune it before the load phase. During load phase, this initial image is copied over into mmap()'ed region corresponding to arena, and discarded. Few small checks here and there had to be added to make sure this approach works with bpf_map__initial_value(), mostly due to hard-coded assumption that map->mmaped is set up with mmap() syscall and should be munmap()'ed. For arena, .arena.1 can be (much) smaller than maximum arena size, so this smaller data size has to be tracked separately. Given it is enforced that there is only one arena for entire bpf_object instance, we just keep it in a separate field. This can be generalized if necessary later. All global variables from ".arena.1" section are accessible from user space via skel->arena->name_of_var. For bss/data/rodata the skeleton/libbpf perform the following sequence: 1. addr = mmap(MAP_ANONYMOUS) 2. user space optionally modifies global vars 3. map_fd = bpf_create_map() 4. bpf_update_map_elem(map_fd, addr) // to store values into the kernel 5. mmap(addr, MAP_FIXED, map_fd) after step 5 user spaces see the values it wrote at step 2 at the same addresses arena doesn't support update_map_elem. Hence skeleton/libbpf do: 1. 
addr = malloc(sizeof SEC ".arena.1") 2. user space optionally modifies global vars 3. map_fd = bpf_create_map(MAP_TYPE_ARENA) 4. real_addr = mmap(map->map_extra, MAP_SHARED | MAP_FIXED, map_fd) 5. memcpy(real_addr, addr) // this will fault-in and allocate pages At the end look and feel of global data vs __arena global data is the same from bpf prog pov. Another complication is: struct { __uint(type, BPF_MAP_TYPE_ARENA); } arena SEC(".maps"); int __arena foo; int bar; ptr1 = &foo; // relocation against ".arena.1" section ptr2 = &arena; // relocation against ".maps" section ptr3 = &bar; // relocation against ".bss" section Fo the kernel ptr1 and ptr2 has point to the same arena's map_fd while ptr3 points to a different global array's map_fd. For the verifier: ptr1->type == unknown_scalar ptr2->type == const_ptr_to_map ptr3->type == ptr_to_map_value After verification, from JIT pov all 3 ptr-s are normal ld_imm64 insns. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240308010812.89848-11-alexei.starovoitov@gmail.com --- src/gen.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/gen.c b/src/gen.c index a3d72be3..4fa4ade1 100644 --- a/src/gen.c +++ b/src/gen.c @@ -120,6 +120,12 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz) static const char *pfxs[] = { ".data", ".rodata", ".bss", ".kconfig" }; int i, n; + /* recognize hard coded LLVM section name */ + if (strcmp(sec_name, ".arena.1") == 0) { + /* this is the name to use in skeleton */ + snprintf(buf, buf_sz, "arena"); + return true; + } for (i = 0, n = ARRAY_SIZE(pfxs); i < n; i++) { const char *pfx = pfxs[i]; @@ -250,6 +256,13 @@ static const struct btf_type *find_type_for_map(struct btf *btf, const char *map static bool is_mmapable_map(const struct bpf_map *map, char *buf, size_t sz) { + size_t tmp_sz; + + if (bpf_map__type(map) == BPF_MAP_TYPE_ARENA && bpf_map__initial_value(map, &tmp_sz)) { + snprintf(buf, sz, "arena"); + return true; + } + if (!bpf_map__is_internal(map) || !(bpf_map__map_flags(map) & BPF_F_MMAPABLE)) return false; From 2a37aa0f9bf5f036341e800529e208e14cfe0185 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 11 Mar 2024 18:37:26 -0700 Subject: [PATCH 08/17] bpftool: Cast pointers for shadow types explicitly. According to a report, skeletons fail to assign shadow pointers when being compiled with C++ programs. Unlike C doing implicit casting for void pointers, C++ requires an explicit casting. To support C++, we do explicit casting for each shadow pointer. Also add struct_ops_module.skel.h to test_cpp to validate C++ compilation as part of BPF selftests. 
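As a rough illustration (the struct name below is made up, not taken from the
patch), the difference comes down to how the two languages treat conversions
from void *, which is what bpf_map__initial_value() returns:

    void *p = bpf_map__initial_value(map, NULL);
    struct my_ops *ops;

    ops = p;                      /* implicit conversion: fine in C, an error in C++ */
    ops = (struct my_ops *)p;     /* explicit cast: accepted by both C and C++ */

Since the generated .skel.h may be included from either language, the
skeleton now always spells out the cast.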
Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240312013726.1780720-1-thinker.li@gmail.com --- src/gen.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gen.c b/src/gen.c index 4fa4ade1..3ce27754 100644 --- a/src/gen.c +++ b/src/gen.c @@ -1131,7 +1131,8 @@ static void gen_st_ops_shadow_init(struct btf *btf, struct bpf_object *obj) continue; codegen("\ \n\ - obj->struct_ops.%1$s = bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ + obj->struct_ops.%1$s = (typeof(obj->struct_ops.%1$s))\n\ + bpf_map__initial_value(obj->maps.%1$s, NULL);\n\ \n\ ", ident); } From 7752997eb9a3aadff50bdc24ba1c6673d0f753eb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 11 Mar 2024 19:32:49 -0700 Subject: [PATCH 09/17] bpftool: Fix missing pids during link show Current 'bpftool link' command does not show pids, e.g., $ tools/build/bpftool/bpftool link ... 4: tracing prog 23 prog_type lsm attach_type lsm_mac target_obj_id 1 target_btf_id 31320 Hack the following change to enable normal libbpf debug output, --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -121,9 +121,9 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) /* we don't want output polluted with libbpf errors if bpf_iter is not * supported */ - default_print = libbpf_set_print(libbpf_print_none); + /* default_print = libbpf_set_print(libbpf_print_none); */ err = pid_iter_bpf__load(skel); - libbpf_set_print(default_print); + /* libbpf_set_print(default_print); */ Rerun the above bpftool command: $ tools/build/bpftool/bpftool link libbpf: prog 'iter': BPF program load failed: Permission denied libbpf: prog 'iter': -- BEGIN PROG LOAD LOG -- 0: R1=ctx() R10=fp0 ; struct task_struct *task = ctx->task; @ pid_iter.bpf.c:69 0: (79) r6 = *(u64 *)(r1 +8) ; R1=ctx() R6_w=ptr_or_null_task_struct(id=1) ; struct file *file = ctx->file; @ pid_iter.bpf.c:68 ... ; struct bpf_link *link = (struct bpf_link *) file->private_data; @ pid_iter.bpf.c:103 80: (79) r3 = *(u64 *)(r8 +432) ; R3_w=scalar() R8=ptr_file() ; if (link->type == bpf_core_enum_value(enum bpf_link_type___local, @ pid_iter.bpf.c:105 81: (61) r1 = *(u32 *)(r3 +12) R3 invalid mem access 'scalar' processed 39 insns (limit 1000000) max_states_per_insn 0 total_states 3 peak_states 3 mark_read 2 -- END PROG LOAD LOG -- libbpf: prog 'iter': failed to load: -13 ... The 'file->private_data' returns a 'void' type and this caused subsequent 'link->type' (insn #81) failed in verification. To fix the issue, restore the previous BPF_CORE_READ so old kernels can also work. With this patch, the 'bpftool link' runs successfully with 'pids'. $ tools/build/bpftool/bpftool link ... 
4: tracing prog 23 prog_type lsm attach_type lsm_mac target_obj_id 1 target_btf_id 31320 pids systemd(1) Fixes: 44ba7b30e84f ("bpftool: Use a local copy of BPF_LINK_TYPE_PERF_EVENT in pid_iter.bpf.c") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Tested-by: Quentin Monnet Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240312023249.3776718-1-yonghong.song@linux.dev --- src/skeleton/pid_iter.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/skeleton/pid_iter.bpf.c b/src/skeleton/pid_iter.bpf.c index 26004f0c..7bdbcac3 100644 --- a/src/skeleton/pid_iter.bpf.c +++ b/src/skeleton/pid_iter.bpf.c @@ -102,8 +102,8 @@ int iter(struct bpf_iter__task_file *ctx) BPF_LINK_TYPE_PERF_EVENT___local)) { struct bpf_link *link = (struct bpf_link *) file->private_data; - if (link->type == bpf_core_enum_value(enum bpf_link_type___local, - BPF_LINK_TYPE_PERF_EVENT___local)) { + if (BPF_CORE_READ(link, type) == bpf_core_enum_value(enum bpf_link_type___local, + BPF_LINK_TYPE_PERF_EVENT___local)) { e.has_bpf_cookie = true; e.bpf_cookie = get_bpf_cookie(link); } From 7ca158b193ea97feb4fa67e2b4dbc48607ebec2b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 16:38:50 -0700 Subject: [PATCH 10/17] bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs Wire up BPF cookie for raw tracepoint programs (both BTF and non-BTF aware variants). This brings them up to part w.r.t. BPF cookie usage with classic tracepoint and fentry/fexit programs. Acked-by: Stanislav Fomichev Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Message-ID: <20240319233852.1977493-4-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c42b9f1..bf80b614 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1664,6 +1664,7 @@ union bpf_attr { struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ From 6c7fc190d7d6bafa48d2da9335e6d47a01740def Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 20 Mar 2024 01:22:41 +0000 Subject: [PATCH 11/17] bpftool: Enable libbpf logs when loading pid_iter in debug mode When trying to load the pid_iter BPF program used to iterate over the PIDs of the processes holding file descriptors to BPF links, we would unconditionally silence libbpf in order to keep the output clean if the kernel does not support iterators and loading fails. Although this is the desirable behaviour in most cases, this may hide bugs in the pid_iter program that prevent it from loading, and it makes it hard to debug such load failures, even in "debug" mode. Instead, it makes more sense to print libbpf's logs when we pass the -d|--debug flag to bpftool, so that users get the logs to investigate failures without having to edit bpftool's source code. 
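For example, after this change a load failure of the pid_iter program can be
investigated straight from the command line (illustrative invocation; the
actual libbpf and verifier output depends on the kernel):

    bpftool -d link show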
Signed-off-by: Quentin Monnet Message-ID: <20240320012241.42991-1-qmo@kernel.org> Signed-off-by: Alexei Starovoitov --- src/pids.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/pids.c b/src/pids.c index 00c77edb..9b898571 100644 --- a/src/pids.c +++ b/src/pids.c @@ -101,7 +101,6 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) char buf[4096 / sizeof(*e) * sizeof(*e)]; struct pid_iter_bpf *skel; int err, ret, fd = -1, i; - libbpf_print_fn_t default_print; *map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL); if (IS_ERR(*map)) { @@ -118,12 +117,18 @@ int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) skel->rodata->obj_type = type; - /* we don't want output polluted with libbpf errors if bpf_iter is not - * supported - */ - default_print = libbpf_set_print(libbpf_print_none); - err = pid_iter_bpf__load(skel); - libbpf_set_print(default_print); + if (!verifier_logs) { + libbpf_print_fn_t default_print; + + /* Unless debug information is on, we don't want the output to + * be polluted with libbpf errors if bpf_iter is not supported. + */ + default_print = libbpf_set_print(libbpf_print_none); + err = pid_iter_bpf__load(skel); + libbpf_set_print(default_print); + } else { + err = pid_iter_bpf__load(skel); + } if (err) { /* too bad, kernel doesn't support BPF iterators yet */ err = 0; From bea9bb862fcf6c56227d0e920652aa9d0359f1ce Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 20 Mar 2024 01:34:57 +0000 Subject: [PATCH 12/17] bpftool: Remove unnecessary source files from bootstrap version Commit d510296d331a ("bpftool: Use syscall/loader program in "prog load" and "gen skeleton" command.") added new files to the list of objects to compile in order to build the bootstrap version of bpftool. As far as I can tell, these objects are unnecessary and were added by mistake; maybe a draft version intended to add support for loading loader programs from the bootstrap version. Anyway, we can remove these object files from the list to make the bootstrap bpftool binary a tad smaller and faster to build. 
Fixes: d510296d331a ("bpftool: Use syscall/loader program in "prog load" and "gen skeleton" command.") Signed-off-by: Quentin Monnet Message-ID: <20240320013457.44808-1-qmo@kernel.org> Signed-off-by: Alexei Starovoitov --- src/Makefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Makefile b/src/Makefile index 048d2214..27d07a65 100644 --- a/src/Makefile +++ b/src/Makefile @@ -177,7 +177,7 @@ HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) $(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o @@ -225,9 +225,6 @@ endif CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) -$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c - $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ - $(OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ From d6b80c61bdff5c5e21921d2728bf1aa0e610dfc8 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 20 Mar 2024 01:41:03 +0000 Subject: [PATCH 13/17] bpftool: Clean up HOST_CFLAGS, HOST_LDFLAGS for bootstrap bpftool Bpftool's Makefile uses $(HOST_CFLAGS) to build the bootstrap version of bpftool, in order to pick the flags for the host (where we run the bootstrap version) and not for the target system (where we plan to run the full bpftool binary). But we pass too much information through this variable. In particular, we set HOST_CFLAGS by copying most of the $(CFLAGS); but we do this after the feature detection for bpftool, which means that $(CFLAGS), hence $(HOST_CFLAGS), contain all macro definitions for using the different optional features. For example, -DHAVE_LLVM_SUPPORT may be passed to the $(HOST_CFLAGS), even though the LLVM disassembler is not used in the bootstrap version, and the related library may even be missing for the host architecture. A similar thing happens with the $(LDFLAGS), that we use unchanged for linking the bootstrap version even though they may contains flags to link against additional libraries. To address the $(HOST_CFLAGS) issue, we move the definition of $(HOST_CFLAGS) earlier in the Makefile, before the $(CFLAGS) update resulting from the feature probing - none of which being relevant to the bootstrap version. To clean up the $(LDFLAGS) for the bootstrap version, we introduce a dedicated $(HOST_LDFLAGS) variable that we base on $(LDFLAGS), before the feature probing as well. On my setup, the following macro and libraries are removed from the compiler invocation to build bpftool after this patch: -DUSE_LIBCAP -DHAVE_LLVM_SUPPORT -I/usr/lib/llvm-17/include -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -lLLVM-17 -L/usr/lib/llvm-17/lib Another advantage of cleaning up these flags is that displaying available features with "bpftool version" becomes more accurate for the bootstrap bpftool, and no longer reflects the features detected (and available only) for the final binary. 
Cc: Jean-Philippe Brucker Signed-off-by: Quentin Monnet Acked-by: Jiri Olsa Message-ID: <20240320014103.45641-1-qmo@kernel.org> Signed-off-by: Alexei Starovoitov --- src/Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Makefile b/src/Makefile index 27d07a65..a3bc6a0c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -87,6 +87,10 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif +HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ + $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) +HOST_LDFLAGS := $(LDFLAGS) + INSTALL ?= install RM ?= rm -f @@ -172,9 +176,6 @@ ifeq ($(filter -DHAVE_LLVM_SUPPORT -DHAVE_LIBBFD_SUPPORT,$(CFLAGS)),) SRCS := $(filter-out jit_disasm.c,$(SRCS)) endif -HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ - $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) - BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) @@ -229,7 +230,7 @@ $(OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) - $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ + $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(HOST_LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ From 5a8620191542036b9588583c6e75d9c5ddce81e7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 24 Mar 2024 20:38:42 -0700 Subject: [PATCH 14/17] bpf: Sync uapi bpf.h to tools directory There is a difference between kernel uapi bpf.h and tools uapi bpf.h. There is no functionality difference, but let us sync properly to make it easy for later bpf.h update. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240325033842.1693553-1-yonghong.song@linux.dev --- include/uapi/linux/bpf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf80b614..9585f534 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1662,9 +1662,10 @@ union bpf_attr { } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ - __u64 name; - __u32 prog_fd; - __aligned_u64 cookie; + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; } raw_tracepoint; struct { /* anonymous struct for BPF_BTF_LOAD */ From a2282a6204e5bc501ffcc8cc2738178b6842bffe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Mar 2024 19:18:32 -0700 Subject: [PATCH 15/17] libbpf, selftests/bpf: Adjust libbpf, bpftool, selftests to match LLVM The selftests use to tell LLVM about special pointers. For LLVM there is nothing "arena" about them. They are simply pointers in a different address space. Hence LLVM diff https://github.com/llvm/llvm-project/pull/85161 renamed: . macro __BPF_FEATURE_ARENA_CAST -> __BPF_FEATURE_ADDR_SPACE_CAST . global variables in __attribute__((address_space(N))) are now placed in section named ".addr_space.N" instead of ".arena.N". Adjust libbpf, bpftool, and selftests to match LLVM. 
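As a sketch of what this means on the BPF C side (the __arena helper macro
spelling is an assumption borrowed from the selftests, not part of this
patch):

    #if defined(__BPF_FEATURE_ADDR_SPACE_CAST)  /* was __BPF_FEATURE_ARENA_CAST */
    #define __arena __attribute__((address_space(1)))
    #else
    #define __arena
    #endif

    int __arena counter;   /* LLVM now emits this into ".addr_space.1" rather
                            * than ".arena.1"; libbpf and bpftool map it to
                            * the "arena" skeleton member as before
                            */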
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315021834.62988-3-alexei.starovoitov@gmail.com --- src/gen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gen.c b/src/gen.c index 3ce27754..786268f1 100644 --- a/src/gen.c +++ b/src/gen.c @@ -121,7 +121,7 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz) int i, n; /* recognize hard coded LLVM section name */ - if (strcmp(sec_name, ".arena.1") == 0) { + if (strcmp(sec_name, ".addr_space.1") == 0) { /* this is the name to use in skeleton */ snprintf(buf, buf_sz, "arena"); return true; From 2e7a423857dd8140d46714845a5a96fb71c7927c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 26 Mar 2024 12:50:01 +0000 Subject: [PATCH 16/17] sync: Pull latest bpftool changes from kernel Syncing latest bpftool commits from kernel repository. Baseline bpf-next commit: e63985ecd22681c7f5975f2e8637187a326b6791 Checkpoint bpf-next commit: 14bb1e8c8d4ad5d9d2febb7d19c70a3cf536e1e5 Baseline bpf commit: 2487007aa3b9fafbd2cb14068f49791ce1d7ede5 Checkpoint bpf commit: 443574b033876c85a35de4c65c14f7fe092222b2 Alexei Starovoitov (4): bpf: Introduce bpf_arena. bpf: Disasm support for addr_space_cast instruction. bpftool: Recognize arena map type libbpf, selftests/bpf: Adjust libbpf, bpftool, selftests to match LLVM Andrii Nakryiko (3): bpftool: rename is_internal_mmapable_map into is_mmapable_map libbpf: Recognize __arena global variables. bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs Kui-Feng Lee (1): bpftool: Cast pointers for shadow types explicitly. Quentin Monnet (3): bpftool: Enable libbpf logs when loading pid_iter in debug mode bpftool: Remove unnecessary source files from bootstrap version bpftool: Clean up HOST_CFLAGS, HOST_LDFLAGS for bootstrap bpftool Yonghong Song (2): bpftool: Fix missing pids during link show bpf: Sync uapi bpf.h to tools directory docs/bpftool-map.rst | 2 +- include/uapi/linux/bpf.h | 20 ++++++++++++++++++-- src/Makefile | 14 ++++++-------- src/gen.c | 34 ++++++++++++++++++++++++---------- src/kernel/bpf/disasm.c | 10 ++++++++++ src/map.c | 2 +- src/pids.c | 19 ++++++++++++------- src/skeleton/pid_iter.bpf.c | 4 ++-- 8 files changed, 74 insertions(+), 31 deletions(-) Signed-off-by: Quentin Monnet --- BPF-CHECKPOINT-COMMIT | 2 +- CHECKPOINT-COMMIT | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/BPF-CHECKPOINT-COMMIT b/BPF-CHECKPOINT-COMMIT index def0a92e..8236b54f 100644 --- a/BPF-CHECKPOINT-COMMIT +++ b/BPF-CHECKPOINT-COMMIT @@ -1 +1 @@ -2487007aa3b9fafbd2cb14068f49791ce1d7ede5 +443574b033876c85a35de4c65c14f7fe092222b2 diff --git a/CHECKPOINT-COMMIT b/CHECKPOINT-COMMIT index a0c8df76..4025fe90 100644 --- a/CHECKPOINT-COMMIT +++ b/CHECKPOINT-COMMIT @@ -1 +1 @@ -e63985ecd22681c7f5975f2e8637187a326b6791 +14bb1e8c8d4ad5d9d2febb7d19c70a3cf536e1e5 From 6c104031d480cd013fa828abcfe9228acdf0a99f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 26 Mar 2024 12:53:06 +0000 Subject: [PATCH 17/17] mirror: Update expected diff with kernel sources A recent patch has touched some portions of bpftool's Makefile that differ between kernel's and mirror's sources. Let's update the diff with the expected differences accordingly, to smoothen future sync ups. 
Signed-off-by: Quentin Monnet --- scripts/sync-kernel-expected-diff.patch | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/scripts/sync-kernel-expected-diff.patch b/scripts/sync-kernel-expected-diff.patch index 769801a1..92496a54 100644 --- a/scripts/sync-kernel-expected-diff.patch +++ b/scripts/sync-kernel-expected-diff.patch @@ -76,7 +76,7 @@ ifneq ($(BPFTOOL_VERSION),) CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' endif -@@ -119,11 +117,7 @@ +@@ -123,11 +121,7 @@ endif ifeq ($(check_feat),1) @@ -89,7 +89,7 @@ endif LIBS = $(LIBBPF) -lelf -lz -@@ -213,7 +207,7 @@ +@@ -214,7 +208,7 @@ $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP) $(QUIET_CLANG)$(CLANG) \ -I$(or $(OUTPUT),.) \ @@ -97,21 +97,17 @@ + -I$(srctree)/include/uapi/ \ -I$(LIBBPF_BOOTSTRAP_INCLUDE) \ -g -O2 -Wall -fno-stack-protector \ - -target bpf -c $< -o $@ -@@ -231,10 +225,10 @@ + --target=bpf -c $< -o $@ +@@ -232,7 +226,7 @@ CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) --$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c -+$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c - $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ - -$(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c +$(OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) -@@ -253,7 +247,7 @@ +@@ -251,7 +245,7 @@ $(call QUIET_CLEAN, feature-detect) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null @@ -120,7 +116,7 @@ $(call QUIET_CLEAN, bpftool) $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d $(Q)$(RM) -- $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h -@@ -269,7 +263,7 @@ +@@ -267,7 +261,7 @@ install: install-bin $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(bash_compdir) @@ -129,7 +125,7 @@ uninstall: $(call QUIET_UNINST, bpftool) -@@ -277,16 +271,16 @@ +@@ -275,16 +269,16 @@ $(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool doc: