Skip to content

Commit

Permalink
Fix load balanced hairpin traffic for fragmented packets.
Browse files Browse the repository at this point in the history
If we have a UDP load balancer - 10.0.0.10:80 = 10.0.0.3:8080, in order to
determine if the load balanced traffic needs to be hairpinned, the
vip - 10.0.0.10 and the vip port - 80 are stored in the registers before
the packet is load balanced using the below logical flow -

table=6 (ls_in_pre_stateful ), priority=120  ,
  match=(reg0[2] == 1 && ip4.dst == 10.0.0.10 && tcp.dst == 80),
  action=(reg1 = 10.0.0.10; reg2[0..15] = 80; ct_lb_mark;)

These registers are used in the later stages to check if the load balanced
packet needs to be hairpinned or not.

However, if the packet is fragmented we may not be able to match on the
L4 fields (tcp, udp or sctp dest port) and this breaks the hairpin
traffic.

This patch addressed this issue by making use of ct_nw_dst/ct_ip6_dst and
ct_tp_dst conntrack fields to determine the hairpin load balanced
traffic.

In order to not break hardware offload on certain smart nics, care is taken
to match on these fields only for fragmented packets.

Note: Relying on conntrack to reassemble packets is not exactly correct, it
only accidentaly works with the kernel datapath.  In our internal bug
tracking system we have this issue to track this incorrect assumption:
https://issues.redhat.com/browse/FDP-913

Reported-at: https://issues.redhat.com/browse/FDP-905
Fixes: 1139b65 ("Don't blindly save original dst IP and Port to avoid megaflow unwildcarding.")
Acked-by: Dumitru Ceara <[email protected]>
Suggested-by: Dumitru Ceara <[email protected]>
Signed-off-by: Numan Siddique <[email protected]>
(cherry picked from commit 0f806cf)
  • Loading branch information
numansiddique committed Nov 9, 2024
1 parent e5b428e commit 0e2fb83
Show file tree
Hide file tree
Showing 14 changed files with 513 additions and 48 deletions.
3 changes: 3 additions & 0 deletions controller/lflow.c
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,9 @@ add_matches_to_flow_table(const struct sbrec_logical_flow *lflow,
.in_port_sec_ptable = OFTABLE_CHK_IN_PORT_SEC,
.out_port_sec_ptable = OFTABLE_CHK_OUT_PORT_SEC,
.mac_cache_use_table = OFTABLE_MAC_CACHE_USE,
.ct_nw_dst_load_table = OFTABLE_CT_ORIG_NW_DST_LOAD,
.ct_ip6_dst_load_table = OFTABLE_CT_ORIG_IP6_DST_LOAD,
.ct_tp_dst_load_table = OFTABLE_CT_ORIG_TP_DST_LOAD,
.ctrl_meter_id = ctrl_meter_id,
.common_nat_ct_zone = get_common_nat_zone(ldp),
};
Expand Down
4 changes: 4 additions & 0 deletions controller/lflow.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ struct uuid;
#define OFTABLE_CHK_LB_AFFINITY 78
#define OFTABLE_MAC_CACHE_USE 79
#define OFTABLE_CT_ZONE_LOOKUP 80
#define OFTABLE_CT_ORIG_NW_DST_LOAD 81
#define OFTABLE_CT_ORIG_IP6_DST_LOAD 82
#define OFTABLE_CT_ORIG_TP_DST_LOAD 83


struct lflow_ctx_in {
struct ovsdb_idl_index *sbrec_multicast_group_by_name_datapath;
Expand Down
37 changes: 37 additions & 0 deletions controller/physical.c
Original file line number Diff line number Diff line change
Expand Up @@ -2769,5 +2769,42 @@ physical_run(struct physical_ctx *p_ctx,
*/
add_default_drop_flow(p_ctx, OFTABLE_LOG_TO_PHY, flow_table);

/* Table 81, 82 and 83
* Match on ct.trk and ct.est and store the ct_nw_dst, ct_ip6_dst and
* ct_tp_dst in the registers. */
uint32_t ct_state = OVS_CS_F_TRACKED | OVS_CS_F_ESTABLISHED;
match_init_catchall(&match);
ofpbuf_clear(&ofpacts);

/* Add the flow:
* match = (ct.trk && ct.est), action = (reg8 = ct_tp_dst)
* table = 83
*/
match_set_ct_state_masked(&match, ct_state, ct_state);
put_move(MFF_CT_TP_DST, 0, MFF_LOG_CT_ORIG_TP_DST_PORT, 0, 16, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_CT_ORIG_TP_DST_LOAD, 100, 0, &match,
&ofpacts, hc_uuid);

/* Add the flow:
* match = (ct.trk && ct.est && ip4), action = (reg4 = ct_nw_dst)
* table = 81
*/
ofpbuf_clear(&ofpacts);
match_set_dl_type(&match, htons(ETH_TYPE_IP));
put_move(MFF_CT_NW_DST, 0, MFF_LOG_CT_ORIG_NW_DST_ADDR, 0, 32, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_CT_ORIG_NW_DST_LOAD, 100, 0, &match,
&ofpacts, hc_uuid);

/* Add the flow:
* match = (ct.trk && ct.est && ip6), action = (xxreg0 = ct_ip6_dst)
* table = 82
*/
ofpbuf_clear(&ofpacts);
match_set_dl_type(&match, htons(ETH_TYPE_IPV6));
put_move(MFF_CT_IPV6_DST, 0, MFF_LOG_CT_ORIG_IP6_DST_ADDR, 0,
128, &ofpacts);
ofctrl_add_flow(flow_table, OFTABLE_CT_ORIG_IP6_DST_LOAD, 100, 0, &match,
&ofpacts, hc_uuid);

ofpbuf_uninit(&ofpacts);
}
14 changes: 12 additions & 2 deletions include/ovn/actions.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ struct collector_set_ids;
OVNACT(CHK_LB_AFF, ovnact_result) \
OVNACT(SAMPLE, ovnact_sample) \
OVNACT(MAC_CACHE_USE, ovnact_null) \
OVNACT(CT_ORIG_NW_DST, ovnact_result) \
OVNACT(CT_ORIG_IP6_DST, ovnact_result) \
OVNACT(CT_ORIG_TP_DST, ovnact_result) \

/* enum ovnact_type, with a member OVNACT_<ENUM> for each action. */
enum OVS_PACKED_ENUM ovnact_type {
Expand Down Expand Up @@ -415,10 +418,11 @@ struct ovnact_set_queue {
uint16_t queue_id;
};

/* OVNACT_DNS_LOOKUP, OVNACT_CHK_LB_HAIRPIN, OVNACT_CHK_LB_HAIRPIN_REPLY. */
/* OVNACT_DNS_LOOKUP, OVNACT_CHK_LB_HAIRPIN, OVNACT_CHK_LB_HAIRPIN_REPLY,
* OVNACT_CT_ORIG_NW_DST, CT_ORIG_IP6_DST, CT_ORIG_TP_DST */
struct ovnact_result {
struct ovnact ovnact;
struct expr_field dst; /* 1-bit destination field. */
struct expr_field dst; /* destination field. */
};

/* OVNACT_LOG. */
Expand Down Expand Up @@ -934,6 +938,12 @@ struct ovnact_encode_params {
this determines which CT zone to use */
uint32_t mac_cache_use_table; /* OpenFlow table for 'mac_cache_use'
* to resubmit. */
uint32_t ct_nw_dst_load_table; /* OpenFlow table for 'ct_nw_dst'
* to resubmit. */
uint32_t ct_ip6_dst_load_table; /* OpenFlow table for 'ct_ip6_dst'
* to resubmit. */
uint32_t ct_tp_dst_load_table; /* OpenFlow table for 'ct_tp_dst'
* to resubmit. */
};

void ovnacts_encode(const struct ovnact[], size_t ovnacts_len,
Expand Down
4 changes: 4 additions & 0 deletions include/ovn/logical-fields.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ enum ovn_controller_event {
#define MFF_LOG_LB_AFF_MATCH_LR_IP6_ADDR MFF_XXREG1
#define MFF_LOG_LB_AFF_MATCH_PORT MFF_REG8

#define MFF_LOG_CT_ORIG_NW_DST_ADDR MFF_REG4
#define MFF_LOG_CT_ORIG_IP6_DST_ADDR MFF_XXREG0
#define MFF_LOG_CT_ORIG_TP_DST_PORT MFF_REG8

void ovn_init_symtab(struct shash *symtab);

/* MFF_LOG_FLAGS_REG bit assignments */
Expand Down
138 changes: 123 additions & 15 deletions lib/actions.c
Original file line number Diff line number Diff line change
Expand Up @@ -3240,18 +3240,19 @@ ovnact_set_queue_free(struct ovnact_set_queue *a OVS_UNUSED)
}

static void
parse_ovnact_result(struct action_context *ctx, const char *name,
const char *prereq, const struct expr_field *dst,
struct ovnact_result *res)
parse_ovnact_result__(struct action_context *ctx, const char *name,
const char *prereq, const struct expr_field *dst,
struct ovnact_result *res,
int n_bits)
{
lexer_get(ctx->lexer); /* Skip action name. */
lexer_get(ctx->lexer); /* Skip '('. */
if (!lexer_match(ctx->lexer, LEX_T_RPAREN)) {
lexer_error(ctx->lexer, "%s doesn't take any parameters", name);
return;
}
/* Validate that the destination is a 1-bit, modifiable field. */
char *error = expr_type_check(dst, 1, true, ctx->scope);
/* Validate that the destination is n_bits, modifiable field. */
char *error = expr_type_check(dst, n_bits, true, ctx->scope);
if (error) {
lexer_error(ctx->lexer, "%s", error);
free(error);
Expand All @@ -3264,6 +3265,14 @@ parse_ovnact_result(struct action_context *ctx, const char *name,
}
}

static void
parse_ovnact_result(struct action_context *ctx, const char *name,
const char *prereq, const struct expr_field *dst,
struct ovnact_result *res)
{
parse_ovnact_result__(ctx, name, prereq, dst, res, 1);
}

static void
parse_dns_lookup(struct action_context *ctx, const struct expr_field *dst,
struct ovnact_result *dl)
Expand Down Expand Up @@ -4259,22 +4268,40 @@ format_CHK_LB_HAIRPIN_REPLY(const struct ovnact_result *res, struct ds *s)
ds_put_cstr(s, " = chk_lb_hairpin_reply();");
}

static void
encode_result_action___(const struct ovnact_result *res,
uint8_t resubmit_table,
enum mf_field_id dst,
int ofs, int n_bits,
struct ofpbuf *ofpacts)
{
ovs_assert(n_bits <= 128);

struct mf_subfield res_dst = expr_resolve_field(&res->dst);
ovs_assert(res_dst.field);

put_load(0, dst, ofs, n_bits < 64 ? n_bits : 64, ofpacts);
if (n_bits > 64) {
put_load(0, dst, ofs + 64, n_bits - 64, ofpacts);
}

emit_resubmit(ofpacts, resubmit_table);

struct ofpact_reg_move *orm = ofpact_put_REG_MOVE(ofpacts);
orm->dst = res_dst;
orm->src.field = mf_from_id(dst);
orm->src.ofs = ofs;
orm->src.n_bits = n_bits;
}

static void
encode_result_action__(const struct ovnact_result *res,
uint8_t resubmit_table,
int log_flags_result_bit,
struct ofpbuf *ofpacts)
{
struct mf_subfield dst = expr_resolve_field(&res->dst);
ovs_assert(dst.field);
put_load(0, MFF_LOG_FLAGS, log_flags_result_bit, 1, ofpacts);
emit_resubmit(ofpacts, resubmit_table);

struct ofpact_reg_move *orm = ofpact_put_REG_MOVE(ofpacts);
orm->dst = dst;
orm->src.field = mf_from_id(MFF_LOG_FLAGS);
orm->src.ofs = log_flags_result_bit;
orm->src.n_bits = 1;
encode_result_action___(res, resubmit_table, MFF_LOG_FLAGS,
log_flags_result_bit, 1, ofpacts);
}

static void
Expand Down Expand Up @@ -5395,6 +5422,75 @@ encode_MAC_CACHE_USE(const struct ovnact_null *null OVS_UNUSED,
emit_resubmit(ofpacts, ep->mac_cache_use_table);
}

static void
encode_CT_ORIG_NW_DST(const struct ovnact_result *res,
const struct ovnact_encode_params *ep,
struct ofpbuf *ofpacts)
{
encode_result_action___(res, ep->ct_nw_dst_load_table,
MFF_LOG_CT_ORIG_NW_DST_ADDR, 0, 32, ofpacts);
}

static void
parse_CT_ORIG_NW_DST(struct action_context *ctx, const struct expr_field *dst,
struct ovnact_result *res)
{
parse_ovnact_result__(ctx, "ct_nw_dst", NULL, dst, res, 32);
}

static void
format_CT_ORIG_NW_DST(const struct ovnact_result *res, struct ds *s)
{
expr_field_format(&res->dst, s);
ds_put_cstr(s, " = ct_nw_dst();");
}

static void
encode_CT_ORIG_IP6_DST(const struct ovnact_result *res,
const struct ovnact_encode_params *ep,
struct ofpbuf *ofpacts)
{
encode_result_action___(res, ep->ct_ip6_dst_load_table,
MFF_LOG_CT_ORIG_IP6_DST_ADDR, 0, 128, ofpacts);
}

static void
parse_CT_ORIG_IP6_DST(struct action_context *ctx, const struct expr_field *dst,
struct ovnact_result *res)
{
parse_ovnact_result__(ctx, "ct_ip6_dst", NULL, dst, res, 128);
}

static void
format_CT_ORIG_IP6_DST(const struct ovnact_result *res, struct ds *s)
{
expr_field_format(&res->dst, s);
ds_put_cstr(s, " = ct_ip6_dst();");
}

static void
encode_CT_ORIG_TP_DST(const struct ovnact_result *res,
const struct ovnact_encode_params *ep OVS_UNUSED,
struct ofpbuf *ofpacts)
{
encode_result_action___(res, ep->ct_tp_dst_load_table,
MFF_LOG_CT_ORIG_TP_DST_PORT, 0, 16, ofpacts);
}

static void
parse_CT_ORIG_TP_DST(struct action_context *ctx, const struct expr_field *dst,
struct ovnact_result *res)
{
parse_ovnact_result__(ctx, "ct_tp_dst", NULL, dst, res, 16);
}

static void
format_CT_ORIG_TP_DST(const struct ovnact_result *res, struct ds *s)
{
expr_field_format(&res->dst, s);
ds_put_cstr(s, " = ct_tp_dst();");
}

/* Parses an assignment or exchange or put_dhcp_opts action. */
static void
parse_set_action(struct action_context *ctx)
Expand Down Expand Up @@ -5489,6 +5585,18 @@ parse_set_action(struct action_context *ctx)
} else if (lexer_match_id(ctx->lexer, "dhcp_relay_resp_chk")) {
parse_dhcp_relay_chk(
ctx, &lhs, ovnact_put_DHCPV4_RELAY_RESP_CHK(ctx->ovnacts));
} else if (!strcmp(ctx->lexer->token.s, "ct_nw_dst") &&
lexer_lookahead(ctx->lexer) == LEX_T_LPAREN) {
parse_CT_ORIG_NW_DST(ctx, &lhs,
ovnact_put_CT_ORIG_NW_DST(ctx->ovnacts));
} else if (!strcmp(ctx->lexer->token.s, "ct_ip6_dst") &&
lexer_lookahead(ctx->lexer) == LEX_T_LPAREN) {
parse_CT_ORIG_IP6_DST(ctx, &lhs,
ovnact_put_CT_ORIG_IP6_DST(ctx->ovnacts));
} else if (!strcmp(ctx->lexer->token.s, "ct_tp_dst") &&
lexer_lookahead(ctx->lexer) == LEX_T_LPAREN) {
parse_CT_ORIG_TP_DST(ctx, &lhs,
ovnact_put_CT_ORIG_TP_DST(ctx->ovnacts));
} else {
parse_assignment_action(ctx, false, &lhs);
}
Expand Down
57 changes: 54 additions & 3 deletions northd/northd.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ static bool vxlan_mode;
#define REGBIT_ACL_STATELESS "reg0[16]"
#define REGBIT_ACL_HINT_ALLOW_REL "reg0[17]"
#define REGBIT_FROM_ROUTER_PORT "reg0[18]"
#define REGBIT_IP_FRAG "reg0[19]"

#define REG_ORIG_DIP_IPV4 "reg1"
#define REG_ORIG_DIP_IPV6 "xxreg1"
Expand Down Expand Up @@ -6398,6 +6399,13 @@ build_pre_stateful(struct ovn_datapath *od,

/* Note: priority-120 flows are added in build_lb_rules_pre_stateful(). */

/* If the packet is fragmented, set the REGBIT_IP_FRAG reg bit to 1
* as ip.is_frag will not be preserved after conntrack recirculation. */
ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 115,
REGBIT_CONNTRACK_NAT" == 1 && ip.is_frag",
REGBIT_IP_FRAG" = 1; ct_lb_mark;",
lflow_ref);

ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 110,
REGBIT_CONNTRACK_NAT" == 1", "ct_lb_mark;",
lflow_ref);
Expand Down Expand Up @@ -8243,13 +8251,32 @@ build_lb_rules(struct lflow_table *lflows, struct ovn_lb_datapaths *lb_dps,
struct ovn_lb_vip *lb_vip = &lb->vips[i];
struct ovn_northd_lb_vip *lb_vip_nb = &lb->vips_nb[i];
const char *ip_match = NULL;

ds_clear(action);

/* Store the original destination IP to be used when generating
* hairpin flows.
* If the packet is fragmented, then the flow which saves the
* original destination IP (and port) in the "ls_in_pre_stateful"
* stage will not be hit.
*/
if (lb_vip->address_family == AF_INET) {
ip_match = "ip4";
ds_put_format(action, REG_ORIG_DIP_IPV4 " = %s; ",
lb_vip->vip_str);
} else {
ip_match = "ip6";
ds_put_format(action, REG_ORIG_DIP_IPV6 " = %s; ",
lb_vip->vip_str);
}

ds_clear(action);
if (lb_vip->port_str) {
/* Store the original destination port to be used when generating
* hairpin flows.
*/
ds_put_format(action, REG_ORIG_TP_DPORT " = %s; ",
lb_vip->port_str);
}
ds_clear(match);

/* New connections in Ingress table. */
Expand Down Expand Up @@ -8381,8 +8408,32 @@ build_lb_hairpin(const struct ls_stateful_record *ls_stateful_rec,
lflow_ref);

if (ls_stateful_rec->has_lb_vip) {
/* Check if the packet needs to be hairpinned.
* Set REGBIT_HAIRPIN in the original direction and
/* Check if the packet needs to be hairpinned. */

/* In order to check if the fragmented packets needs to be
* hairpinned we need to save the ct tuple original IPv4/v6
* destination and L4 destination port in the registers after
* the conntrack recirculation.
*
* Note: We are assuming that sending the packets to conntrack
* will reassemble the packet and L4 fields will be available.
* It is a risky assumption as ovs-vswitchd doesn't guarantee it
* and userspace datapath doesn't reassemble the fragmented packets
* after conntrack. It is the kernel datapath conntrack behavior.
* We need to find a better way to handle the fragmented packets.
* */
ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 110,
"ct.trk && !ct.rpl && "REGBIT_IP_FRAG" == 1 && ip4",
REG_ORIG_DIP_IPV4 " = ct_nw_dst(); "
REG_ORIG_TP_DPORT " = ct_tp_dst(); next;",
lflow_ref);
ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 110,
"ct.trk && !ct.rpl && "REGBIT_IP_FRAG" == 1 && ip6",
REG_ORIG_DIP_IPV6 " = ct_ip6_dst(); "
REG_ORIG_TP_DPORT " = ct_tp_dst(); next;",
lflow_ref);

/* Set REGBIT_HAIRPIN in the original direction and
* REGBIT_HAIRPIN_REPLY in the reply direction.
*/
ovn_lflow_add_with_hint(
Expand Down
Loading

0 comments on commit 0e2fb83

Please sign in to comment.