Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for LPM trie #4710

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 65 additions & 41 deletions kernel/bpf/lpm_trie.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>

/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)

struct lpm_trie_node;

struct lpm_trie_node {
struct rcu_head rcu;
struct lpm_trie_node __rcu *child[2];
u32 prefixlen;
u32 flags;
Expand All @@ -32,10 +32,11 @@ struct lpm_trie_node {
struct lpm_trie {
struct bpf_map map;
struct lpm_trie_node __rcu *root;
struct bpf_mem_alloc ma;
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
spinlock_t lock;
raw_spinlock_t lock;
};

/* This trie implements a longest prefix match algorithm that can be used to
Expand Down Expand Up @@ -287,17 +288,12 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
return found->data + trie->data_size;
}

static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
const void *value)
{
struct lpm_trie_node *node;
size_t size = sizeof(struct lpm_trie_node) + trie->data_size;

if (value)
size += trie->map.value_size;

node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
trie->map.numa_node);
node = bpf_mem_cache_alloc(&trie->ma);
if (!node)
return NULL;

Expand All @@ -310,6 +306,16 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
return node;
}

static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
{
if (flags == BPF_EXIST)
return -ENOENT;
if (trie->n_entries == trie->map.max_entries)
return -ENOSPC;
trie->n_entries++;
return 0;
}

/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
void *_key, void *value, u64 flags)
Expand All @@ -330,23 +336,15 @@ static long trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;

spin_lock_irqsave(&trie->lock, irq_flags);
raw_spin_lock_irqsave(&trie->lock, irq_flags);

/* Allocate and fill a new node */

if (trie->n_entries == trie->map.max_entries) {
ret = -ENOSPC;
goto out;
}

new_node = lpm_trie_node_alloc(trie, value);
if (!new_node) {
ret = -ENOMEM;
goto out;
}

trie->n_entries++;

new_node->prefixlen = key->prefixlen;
RCU_INIT_POINTER(new_node->child[0], NULL);
RCU_INIT_POINTER(new_node->child[1], NULL);
Expand All @@ -364,8 +362,7 @@ static long trie_update_elem(struct bpf_map *map,
matchlen = longest_prefix_match(trie, node, key);

if (node->prefixlen != matchlen ||
node->prefixlen == key->prefixlen ||
node->prefixlen == trie->max_prefixlen)
node->prefixlen == key->prefixlen)
break;

next_bit = extract_bit(key->data, node->prefixlen);
Expand All @@ -376,6 +373,10 @@ static long trie_update_elem(struct bpf_map *map,
* simply assign the @new_node to that slot and be done.
*/
if (!node) {
ret = trie_check_add_elem(trie, flags);
if (ret)
goto out;

rcu_assign_pointer(*slot, new_node);
goto out;
}
Expand All @@ -384,18 +385,30 @@ static long trie_update_elem(struct bpf_map *map,
* which already has the correct data array set.
*/
if (node->prefixlen == matchlen) {
if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
if (flags == BPF_NOEXIST) {
ret = -EEXIST;
goto out;
}
} else {
ret = trie_check_add_elem(trie, flags);
if (ret)
goto out;
}

new_node->child[0] = node->child[0];
new_node->child[1] = node->child[1];

if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
trie->n_entries--;

rcu_assign_pointer(*slot, new_node);
free_node = node;

goto out;
}

ret = trie_check_add_elem(trie, flags);
if (ret)
goto out;

/* If the new node matches the prefix completely, it must be inserted
* as an ancestor. Simply insert it between @node and *@slot.
*/
Expand All @@ -408,6 +421,7 @@ static long trie_update_elem(struct bpf_map *map,

im_node = lpm_trie_node_alloc(trie, NULL);
if (!im_node) {
trie->n_entries--;
ret = -ENOMEM;
goto out;
}
Expand All @@ -429,16 +443,10 @@ static long trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(*slot, im_node);

out:
if (ret) {
if (new_node)
trie->n_entries--;

kfree(new_node);
kfree(im_node);
}

spin_unlock_irqrestore(&trie->lock, irq_flags);
kfree_rcu(free_node, rcu);
if (ret)
bpf_mem_cache_free(&trie->ma, new_node);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

return ret;
}
Expand All @@ -459,7 +467,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;

spin_lock_irqsave(&trie->lock, irq_flags);
raw_spin_lock_irqsave(&trie->lock, irq_flags);

/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
Expand Down Expand Up @@ -535,9 +543,9 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
free_node = node;

out:
spin_unlock_irqrestore(&trie->lock, irq_flags);
kfree_rcu(free_parent, rcu);
kfree_rcu(free_node, rcu);
bpf_mem_cache_free_rcu(&trie->ma, free_parent);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);

return ret;
}
Expand All @@ -559,6 +567,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
size_t leaf_size;
int err;

/* check sanity of attributes */
if (attr->max_entries == 0 ||
Expand All @@ -581,9 +591,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;

spin_lock_init(&trie->lock);
raw_spin_lock_init(&trie->lock);

/* Allocate intermediate and leaf nodes from the same allocator */
leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
trie->map.value_size;
err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
if (err)
goto free_out;
return &trie->map;

free_out:
bpf_map_area_free(trie);
return ERR_PTR(err);
}

static void trie_free(struct bpf_map *map)
Expand Down Expand Up @@ -615,13 +635,17 @@ static void trie_free(struct bpf_map *map)
continue;
}

kfree(node);
/* No bpf program may access the map, so freeing the
* node without waiting for the extra RCU GP.
*/
bpf_mem_cache_raw_free(node);
RCU_INIT_POINTER(*slot, NULL);
break;
}
}

out:
bpf_mem_alloc_destroy(&trie->ma);
bpf_map_area_free(trie);
}

Expand All @@ -633,7 +657,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
struct lpm_trie_node **node_stack = NULL;
int err = 0, stack_ptr = -1;
unsigned int next_bit;
size_t matchlen;
size_t matchlen = 0;

/* The get_next_key follows postorder. For the 4 node example in
* the top of this file, the trie_get_next_key() returns the following
Expand Down Expand Up @@ -672,7 +696,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
next_bit = extract_bit(key->data, node->prefixlen);
node = rcu_dereference(node->child[next_bit]);
}
if (!node || node->prefixlen != key->prefixlen ||
if (!node || node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM))
goto find_leftmost;

Expand Down
1 change: 0 additions & 1 deletion tools/testing/selftests/bpf/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ bpf-syscall*
test_verifier
test_maps
test_lru_map
test_lpm_map
test_tag
FEATURE-DUMP.libbpf
FEATURE-DUMP.selftests
Expand Down
2 changes: 1 addition & 1 deletion tools/testing/selftests/bpf/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ CLANG_CPUV4 := 1
endif

# Order correspond to 'make run_tests' order
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \
test_sockmap \
test_tcpnotify_user test_sysctl \
test_progs-no_alu32
Expand Down
Loading
Loading