diff --git a/src/ipvs/ip_vs_proto_tcp.c b/src/ipvs/ip_vs_proto_tcp.c index 75e5713a6..6acbbcaed 100644 --- a/src/ipvs/ip_vs_proto_tcp.c +++ b/src/ipvs/ip_vs_proto_tcp.c @@ -306,46 +306,88 @@ static void tcp_in_remove_ts(struct tcphdr *tcph) } } -/* use NOP option to replace TCP_OLEN_IP4_ADDR and TCP_OLEN_IP6_ADDR opt */ -static void tcp_in_remove_toa(struct tcphdr *tcph, int af) +/* + * Remove NOP and TOA options preset in the mbuf and compact option space. + * Return the trimmed length on success, otherwise dpvs error num on failure. + * */ +static int tcp_in_prune_options(int af, struct rte_mbuf *mbuf, struct tcphdr *tcph) { - unsigned char *ptr; - int len, i; - uint32_t tcp_opt_len = af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR; + unsigned char *ptr, *fast, *slow; + const unsigned char *l3hdr, *payload; + int i, optlen; + unsigned int pruned; + uint8_t opcode, opsize; ptr = (unsigned char *)(tcph + 1); - len = (tcph->doff << 2) - sizeof(struct tcphdr); - - while (len > 0) { - int opcode = *ptr++; - int opsize; + fast = slow = ptr; + optlen = (tcph->doff << 2) - sizeof(struct tcphdr); + payload = ptr + optlen; + while (optlen > 0) { + opcode = *ptr++; switch (opcode) { case TCP_OPT_EOL: - return; + goto fini; case TCP_OPT_NOP: - len--; + fast++; + optlen--; continue; default: opsize = *ptr++; - if (opsize < 2) /* silly options */ - return; - if (opsize > len) - return; /* partial options */ - if ((opcode == TCP_OPT_ADDR) && (opsize == tcp_opt_len)) { - for (i = 0; i < tcp_opt_len; i++) { - *(ptr - 2 + i) = TCP_OPT_NOP; + if (opsize < 2) /* silly options */ + return EDPVS_INVPKT; + if (opsize > optlen) + return EDPVS_INVPKT; /* partial options */ + if (opcode == TCP_OPT_ADDR) { + for (i = 0; i < opsize; i++) { + fast++; + } + } else { + for (i = 0; i < opsize; i++) { + if (slow != fast) + *slow = *fast; + slow++; + fast++; } - /* DON'T RETURN - * keep search other TCP_OPT_ADDR ,and clear them. - * See https://github.com/iqiyi/dpvs/pull/925 for more detail. */ } ptr += opsize - 2; - len -= opsize; + optlen -= opsize; break; } } + +fini: + if (slow != payload) { + pruned = payload - slow; + while (pruned & 0x3) { /* 4-bytes alignment for tcp options */ + *slow++ = 0; + pruned--; + } + if (slow == payload) + return 0; + /* trim the packet */ + l3hdr = rte_pktmbuf_mtod(mbuf, void *); + if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)) { + memset(slow, 0, pruned); + return EDPVS_INVPKT; + } + if (unlikely(payload - l3hdr > mbuf->pkt_len)) { + memset(slow, 0, pruned); + return EDPVS_INVPKT; + } + memmove(slow, payload, mbuf->pkt_len - (payload - l3hdr)); + rte_pktmbuf_trim(mbuf, pruned); + tcph->doff -= (pruned >> 2); + if (af == AF_INET) + ((struct rte_ipv4_hdr *)l3hdr)->total_length = + htons(ntohs(((struct rte_ipv4_hdr *)l3hdr)->total_length) - pruned); + else + ((struct rte_ipv6_hdr *)l3hdr)->payload_len = + htons(ntohs(((struct rte_ipv6_hdr *)l3hdr)->payload_len) - pruned); + return pruned; + } + return 0; } static int tcp_in_add_proxy_proto(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, @@ -797,15 +839,12 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; - /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ - int iaf, oaf; + int af; /* outbound af */ int iphdrlen; int err, pp_hdr_shift = 0; - iaf = tuplehash_in(conn).af; - oaf = tuplehash_out(conn).af; - - iphdrlen = ((AF_INET6 == oaf) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); + af = tuplehash_out(conn).af; + iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) return EDPVS_INVPKT; @@ -819,41 +858,37 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, /* * for SYN packet - * 1. remove tcp timestamp option - * laddress for different client have diff timestamp. - * 2. save original TCP sequence for seq-adjust later. - * since TCP option will be change. - * 3. add TOA option - * so that RS with TOA module can get real client IP. + * 1. remove tcp timestamp option, + * laddrs for different clients have diff timestamp. + * 2. save original TCP sequence for seq-adjust later + * since TCP option will be changed. */ if (th->syn && !th->ack) { tcp_in_remove_ts(th); - tcp_in_init_seq(conn, mbuf, th); - if (PROXY_PROTOCOL_V1 != PROXY_PROTOCOL_VERSION(conn->pp_version) - && PROXY_PROTOCOL_V2 != PROXY_PROTOCOL_VERSION(conn->pp_version)) { - if (unlikely(tcp_in_add_toa(conn, mbuf, th) != EDPVS_OK)) { - tcp_in_remove_toa(th, iaf); - } - } } - /* add toa/proxy_proto to first data packet */ + /* Add toa/proxy_protocol to the first data packet */ if (ntohl(th->ack_seq) == conn->fnat_seq.fdata_seq && !th->syn && !th->rst /*&& !th->fin*/) { if (PROXY_PROTOCOL_V2 == PROXY_PROTOCOL_VERSION(conn->pp_version) || PROXY_PROTOCOL_V1 == PROXY_PROTOCOL_VERSION(conn->pp_version)) { if (conn->fnat_seq.isn - conn->fnat_seq.delta + 1 == ntohl(th->seq)) { - /* avoid inserting repetitive ppdata when the first rs ack delayed */ + /* avoid inserting repetitive proxy protocol data + * when the first rs ack is delayed */ err = tcp_in_add_proxy_proto(conn, mbuf, th, iphdrlen, &pp_hdr_shift); if (unlikely(EDPVS_OK != err)) RTE_LOG(INFO, IPVS, "%s: insert proxy protocol fail -- %s\n", __func__, dpvs_strerror(err)); th = ((void *)th) + pp_hdr_shift; } - } else { - if (unlikely(tcp_in_add_toa(conn, mbuf, th) != EDPVS_OK)) { - tcp_in_remove_toa(th, iaf); + } else { /* use toa */ + err = tcp_in_add_toa(conn, mbuf, th); + if (unlikely(EDPVS_OK != err)) { + if (tcp_in_prune_options(af, mbuf, th) >= TCP_OLEN_IP4_ADDR + && (err == EDPVS_NOROOM)) { + tcp_in_add_toa(conn, mbuf, th); + } } } } @@ -864,7 +899,7 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, th->source = conn->lport; th->dest = conn->dport; - return tcp_send_csum(oaf, iphdrlen, th, conn, mbuf, conn->in_dev); + return tcp_send_csum(af, iphdrlen, th, conn, mbuf, conn->in_dev); } static int tcp_fnat_out_handler(struct dp_vs_proto *proto,